crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/clients/image_generation/together_image_generation_client.py
CHANGED
@@ -4,6 +4,7 @@ import requests

 from helm.common.cache import CacheConfig, Cache
 from helm.common.file_caches.file_cache import FileCache
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -84,6 +85,7 @@ class TogetherImageGenerationClient(Client):

             response, cached = self._cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             error: str = f"TogetherVisionClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

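For context: the recurring change across the client diffs below is the same one shown above. Each `except` branch that converts an exception into a failed `RequestResult` now calls the new `hexception` helper from `helm.common.hierarchical_logger` first, so the underlying traceback is logged instead of being silently folded into an error string. A minimal sketch of the pattern; the `hexception` body here is a stand-in, since the helper's implementation (helm/common/hierarchical_logger.py, +20 lines in the file list) is not shown in this diff:

    import logging
    import traceback

    logger = logging.getLogger("helm")

    def hexception(e: BaseException) -> None:
        # Stand-in: log the exception together with its full traceback.
        logger.error("".join(traceback.format_exception(type(e), e, e.__traceback__)))

    def guarded_call(do_it):
        try:
            return do_it()
        except RuntimeError as e:
            hexception(e)  # new in 0.5.9: log before returning a failure result
            return f"Client error: {e}"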
helm/clients/megatron_client.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Dict, List
 import traceback
 from helm.common.cache import CacheConfig

+from helm.common.hierarchical_logger import hexception
 from helm.common.request import (
     wrap_request_time,
     EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -103,6 +104,7 @@ class MegatronClient(CachingClient):
         try:
             return self._make_request(request)
         except Exception as e:
+            hexception(e)
             return RequestResult(
                 success=False,
                 cached=False,

helm/clients/mistral_client.py
CHANGED
@@ -1,6 +1,7 @@
 import requests
 from typing import Any, Dict, List, Optional, TypedDict, Union

+from helm.common.hierarchical_logger import hexception
 from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
@@ -156,6 +157,7 @@ class MistralAIClient(CachingClient):

             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"MistralClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

helm/clients/moderation_api_client.py
CHANGED
@@ -1,5 +1,6 @@
 from typing import Any, Dict

+from helm.common.hierarchical_logger import hexception
 from helm.common.request import wrap_request_time
 from helm.common.cache import Cache, CacheConfig
 from helm.common.moderations_api_request import (
@@ -64,6 +65,7 @@ class ModerationAPIClient:

             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
         except openai.OpenAIError as e:
+            hexception(e)
             error: str = f"Moderation API error: {e}"
             return ModerationAPIRequestResult(
                 success=False, cached=False, error=error, flagged=None, flagged_results=None, scores=None

helm/clients/openai_client.py
CHANGED
@@ -10,7 +10,7 @@ from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
 from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog, hwarn
+from helm.common.hierarchical_logger import hlog, hwarn, hexception
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -33,9 +33,12 @@ class OpenAIClientUtils:
    @classmethod
    def is_reasoning_model(cls, model_engine: str) -> bool:
        # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
-        return bool(re.match(r"^o\d+", model_engine))
+        return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))

    # Error OpenAI throws when the image in the prompt violates their content policy
+    HARMFUL_INFORMATION_ERROR: str = (
+        "Invalid prompt: we've limited access to this content for safety reasons. This type of information may be used to benefit or to harm people."  # noqa: E501
+    )
    INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
    INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
    INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
@@ -44,12 +47,10 @@ class OpenAIClientUtils:
    INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
        "The response was filtered due to the prompt triggering Microsoft's content management policy."
    )
-
-    #
-
-
-        "or contact us through our help center at help.openai.com if you keep seeing this error."
-    )
+    # Grok content safety guidelines error message
+    # TODO: Refactor so that this is owned by the Grok client instead.
+    SAFETY_GUIDELINES_GROK_ERROR: str = "Content violates safety guidelines."
+    USAGE_GUIDELINES_GROK_ERROR: str = "Content violates usage guidelines."

    # Set the finish reason to this if the prompt violates OpenAI's content policy
    CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
@@ -74,21 +75,14 @@ class OpenAIClientUtils:
                completions=[empty_completion] * request.num_completions,
                embedding=[],
            )
-        elif cls.
-            # Handle these errors by returning an empty completion to unblock
-            hwarn(f"OpenAI server error for request: {str(request)}")
-            empty_completion = GeneratedOutput(
-                text="",
-                logprob=0,
-                tokens=[],
-                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
-            )
+        elif cls.HARMFUL_INFORMATION_ERROR in str(e):
            return RequestResult(
-                success=
+                success=False,
                cached=False,
-
-                completions=[
+                error="Prompt blocked by OpenAI's safety filter",
+                completions=[],
                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
            )
        elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
            return RequestResult(
@@ -99,7 +93,26 @@ class OpenAIClientUtils:
                embedding=[],
                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
            )
+        elif cls.SAFETY_GUIDELINES_GROK_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Grok API error: Content violates safety guidelines",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+        elif cls.USAGE_GUIDELINES_GROK_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Grok API error: Content violates usage guidelines",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )

+        hexception(e)
        error: str = f"OpenAI error: {e}"
        return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

@@ -158,6 +171,7 @@ class OpenAIClient(CachingClient):
            cache_key = self._get_cache_key(raw_request, request)
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except openai.OpenAIError as e:
+            hexception(e)
            error: str = f"OpenAI error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

@@ -424,6 +438,7 @@ class OpenAIClient(CachingClient):
            cache_key = self._get_cache_key(raw_request, request)
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except openai.OpenAIError as e:
+            hexception(e)
            error: str = f"OpenAI error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

@@ -479,6 +494,7 @@ class OpenAIClient(CachingClient):
            cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except openai.OpenAIError as e:
+            hexception(e)
            error: str = f"OpenAI error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

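For illustration, the widened `is_reasoning_model` check now routes GPT-5 engines through the same reasoning-model path as the o-series. A self-contained restatement of the two regexes from the hunk above (the engine names in the asserts are examples, not names taken from this diff):

    import re

    def is_reasoning_model(model_engine: str) -> bool:
        # o1, o3-mini, o4-... and now any engine name starting with "gpt-5".
        return bool(re.match(r"^o\d+", model_engine)) or bool(re.match(r"^gpt-5", model_engine))

    assert is_reasoning_model("o3-mini")
    assert is_reasoning_model("gpt-5")
    assert not is_reasoning_model("gpt-4o")  # starts with neither "o<digit>" nor "gpt-5"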
helm/clients/openai_responses_client.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union

 from helm.clients.openai_client import OpenAIClientUtils
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import (
     Thinking,
@@ -60,7 +61,28 @@ class OpenAIResponseClient(CachingClient):

    def _make_raw_request(self, request: Request) -> dict[str, Any]:
        input: Union[str, List[Dict[str, Any]]]
-
+
+        if (
+            (request.prompt and request.messages)
+            or (request.prompt and request.multimodal_prompt)
+            or (request.messages and request.multimodal_prompt)
+        ):
+            raise ValueError(
+                f"More than one of `prompt`, `messages` and `multimodal_prompt` was set in request: {request}"
+            )
+
+        if request.messages is not None:
+            # Checks that all messages have a role and some content
+            for message in request.messages:
+                if not message.get("role") or not message.get("content"):
+                    raise ValueError("All messages must have a role and content")
+            # Checks that the last role is "user"
+            if request.messages[-1]["role"] != "user":
+                raise ValueError("Last message must have role 'user'")
+            if request.prompt != "":
+                hwarn("Since message is set, prompt will be ignored")
+            input = request.messages
+        elif request.multimodal_prompt is not None:
            content = []
            request.validate()
            for media_object in request.multimodal_prompt.media_objects:
@@ -101,6 +123,8 @@ class OpenAIResponseClient(CachingClient):
        # Plus other changes
        model_engine: str = request.model_engine
        if OpenAIClientUtils.is_reasoning_model(model_engine):
+            if "reasoning" not in raw_request:
+                raw_request["reasoning"] = {}
            raw_request["reasoning"]["summary"] = "detailed"
            # Avoid error:
            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
@@ -150,9 +174,9 @@ class OpenAIResponseClient(CachingClient):
        ]  # one of "message" or "reasoning" from API observation, but can also include tool calls

        if output_type == "reasoning":
-            reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
+            reasoning_output += "\n\n".join([raw_output["text"] for raw_output in output["summary"]])
        elif output_type == "message":
-            text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
+            text_output += "\n\n".join([raw_output["text"] for raw_output in output["content"]])
        # (Other output types are ignored)

        completion = truncate_and_tokenize_response_text(
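The new checks at the top of `_make_raw_request` make the accepted request shapes explicit: exactly one of `prompt`, `messages`, or `multimodal_prompt`, and chat messages must be well-formed and end on a user turn. A sketch of that contract in isolation (simplified; not the HELM implementation itself):

    from typing import Any, Dict, List, Optional, Union

    def choose_input(prompt: str, messages: Optional[List[Dict[str, Any]]]) -> Union[str, List[Dict[str, Any]]]:
        if prompt and messages:
            raise ValueError("Set only one of `prompt` and `messages`")
        if messages is not None:
            if any(not m.get("role") or not m.get("content") for m in messages):
                raise ValueError("All messages must have a role and content")
            if messages[-1]["role"] != "user":
                raise ValueError("Last message must have role 'user'")
            return messages
        return prompt

    # OK: chat-style input ending with a user turn
    choose_input("", [{"role": "system", "content": "Be terse."}, {"role": "user", "content": "Hi"}])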
helm/clients/openrouter_client.py
ADDED
@@ -0,0 +1,31 @@
+import os
+from typing import Optional
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class OpenRouterClient(OpenAIClient):
+    def __init__(
+        self,
+        tokenizer_name: str,
+        tokenizer: Tokenizer,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        model_name: Optional[str] = None,
+        output_processor: Optional[str] = None,
+    ):
+        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
+        self.base_url = "https://openrouter.ai/api/v1/"
+        super().__init__(
+            tokenizer,
+            tokenizer_name,
+            cache_config=cache_config,
+            output_processor=output_processor,
+            base_url=self.base_url,
+            api_key=self.api_key,
+        )
+        self.model_name = model_name
+
+    def _get_model_for_request(self, request):
+        return self.model_name or request.model
helm/clients/palmyra_client.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Dict, List

 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hwarn
+from helm.common.hierarchical_logger import hexception, hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -99,6 +99,7 @@ class PalmyraClient(CachingClient):

             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"PalmyraClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

helm/clients/reka_client.py
CHANGED
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hwarn
+from helm.common.hierarchical_logger import hexception, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -167,6 +167,7 @@ class RekaClient(CachingClient):

             response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"RekaClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

helm/clients/stanfordhealthcare_azure_openai_client.py
CHANGED
@@ -39,7 +39,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
                tokenizer=tokenizer,
                tokenizer_name=tokenizer_name,
                cache_config=cache_config,
-                api_key=
+                api_key=api_key,
                base_url=base_url,
                azure_openai_deployment_name=openai_model_name,
                api_version=api_version,
@@ -50,7 +50,7 @@ class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
                tokenizer=tokenizer,
                tokenizer_name=tokenizer_name,
                cache_config=cache_config,
-                api_key=
+                api_key=api_key,
                endpoint=endpoint,
                azure_openai_deployment_name=openai_model_name,
                api_version=api_version,
helm/clients/stanfordhealthcare_http_model_client.py
CHANGED
@@ -5,6 +5,7 @@ from dataclasses import asdict
 from typing import Any, Dict, List, Optional

 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.request import (
     wrap_request_time,
     Request,
@@ -82,6 +83,7 @@ class StanfordHealthCareHTTPModelClient(CachingClient, ABC):
                request_time=response["request_time"],
            )
        except requests.exceptions.RequestException as e:
+            hexception(e)
            return RequestResult(success=False, cached=False, error=f"Request error: {e}", completions=[], embedding=[])

    @abstractmethod
helm/clients/test_openrouter_client.py
ADDED
@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
helm/clients/together_client.py
CHANGED
@@ -9,6 +9,7 @@ import requests
 from retrying import retry

 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -99,7 +100,7 @@ class JobNotFinishedError(TogetherClientError):
    pass


-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
@@ -112,6 +113,44 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
    return (input, "")


+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
    """
    Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -235,6 +274,7 @@ class TogetherClient(CachingClient):
        try:
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
        except Exception as error:
+            hexception(error)
            return RequestResult(
                success=False,
                cached=False,
@@ -346,9 +386,8 @@ class TogetherChatClient(CachingClient):
        self._client = Together(api_key=api_key)
        self._together_model = together_model
        self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
        self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
        self.output_processor: Optional[Callable[[str], str]] = (
            get_class_by_name(output_processor) if output_processor else None
        )
@@ -418,6 +457,7 @@ class TogetherChatClient(CachingClient):
            raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
            response = ChatCompletionResponse.model_validate(raw_response)
        except Exception as error:
+            hexception(error)
            return RequestResult(
                success=False,
                cached=False,
@@ -444,15 +484,15 @@ class TogetherChatClient(CachingClient):
            if self.output_processor:
                output_text = self.output_processor(output_text)

+            thinking: Optional[Thinking] = None
            if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-
-
-
-
-                )
-
-            generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
        return RequestResult(
            success=True,
            cached=cached,
@@ -525,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
            raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
            response = CompletionResponse.model_validate(raw_response)
        except Exception as error:
+            hexception(error)
            return RequestResult(
                success=False,
                cached=False,
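The thinking parsers are plain regex splits, and the dispatcher keys off substrings of the model name ("deepseek-r1", "qwen3", "glm-4.5"). For example, a DeepSeek-R1-style completion splits as follows (a self-contained restatement of `_parse_thinking_deepseek_r1` from the hunk above):

    import re
    from typing import Tuple

    def parse_thinking_deepseek_r1(text: str) -> Tuple[str, str]:
        # Split "<think>\n...\n</think>\n\nanswer" into (thinking, answer).
        match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", text, re.DOTALL)
        if match:
            return (match.group(1), match.group(2))
        return (text, "")  # no <think> block: treat everything as output

    thinking, answer = parse_thinking_deepseek_r1("<think>\nAdd 2 and 2.\n</think>\n\n4")
    assert (thinking, answer) == ("Add 2 and 2.", "4")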
helm/clients/vertexai_client.py
CHANGED
@@ -4,6 +4,7 @@ from threading import Lock
 from typing import Any, Dict, Mapping, Optional, List, Union, cast

 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -152,6 +153,7 @@ class VertexAITextClient(VertexAIClient):

            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

@@ -276,8 +278,14 @@ class VertexAIChatClient(VertexAIClient):
                if not candidate.content:
                    raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
                if not candidate.content.parts:
-
-
+                    if candidate.finish_reason == 2:  # MAX_TOKENS
+                        # This means that there is no text output because the maximum number of tokens were
+                        # reached during thinking.
+                        predictions.append({"text": ""})
+                    else:
+                        raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+                else:
+                    predictions.append({"text": candidate.content.text})
            # TODO: Extract more information from the response
            return {"predictions": predictions}

@@ -304,6 +312,7 @@ class VertexAIChatClient(VertexAIClient):
                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
                )
        except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

@@ -434,6 +443,7 @@ class VertexAIChatClient(VertexAIClient):
            cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except requests.exceptions.RequestException as e:
+            hexception(e)
            error: str = f"Gemini Vision error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
        except VertexAIContentBlockedError as e:
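The Vertex AI change distinguishes an empty candidate caused by exhausting the token budget during thinking (finish_reason 2, MAX_TOKENS) from genuinely blocked content, which still raises. The control flow restated with a simplified candidate shape (an illustration, not the Vertex AI SDK types):

    from dataclasses import dataclass, field
    from typing import List

    MAX_TOKENS = 2  # Vertex AI finish_reason value when the token limit is hit

    @dataclass
    class Candidate:
        finish_reason: int = 0
        parts: List[str] = field(default_factory=list)
        text: str = ""

    def extract_text(candidate: Candidate) -> str:
        if not candidate.parts:
            if candidate.finish_reason == MAX_TOKENS:
                # The model spent its whole budget thinking: return empty text
                # rather than treating the response as blocked.
                return ""
            raise RuntimeError(f"No content parts in candidate: {candidate}")
        return candidate.text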
helm/clients/vision_language/huggingface_vision2seq_client.py
CHANGED
@@ -8,7 +8,7 @@ import torch

 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as model_error:
+            hexception(model_error)
            return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])

        for text in result["output"]:
helm/clients/vision_language/huggingface_vlm_client.py
CHANGED
@@ -5,6 +5,7 @@ from transformers import pipeline
 from transformers.pipelines import ImageToTextPipeline

 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.images_utils import open_image
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as e:
+            hexception(e)
            return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])

        output: str = result["generated_text"]
helm/clients/vision_language/idefics_client.py
CHANGED
@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProcessor
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as model_error:
+            hexception(model_error)
            return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])

        for text in result["output"]:
helm/clients/vision_language/open_flamingo_client.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 from huggingface_hub import hf_hub_download

 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
 from helm.common.media_object import TEXT_TYPE
@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as ex:
+            hexception(ex)
            return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])

        completions: List[GeneratedOutput] = []