crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/proxy/server.py
CHANGED
|
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
|
|
|
23
23
|
from helm.common.authentication import Authentication
|
|
24
24
|
from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
|
|
25
25
|
from helm.common.general import ensure_directory_exists
|
|
26
|
-
from helm.common.hierarchical_logger import hlog
|
|
26
|
+
from helm.common.hierarchical_logger import hlog, setup_default_logging
|
|
27
27
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
28
28
|
from helm.common.request import Request
|
|
29
29
|
from helm.common.perspective_api_request import PerspectiveAPIRequest
|
|
@@ -273,6 +273,7 @@ def main():
|
|
|
273
273
|
default="",
|
|
274
274
|
)
|
|
275
275
|
args = parser.parse_args()
|
|
276
|
+
setup_default_logging()
|
|
276
277
|
|
|
277
278
|
register_builtin_configs_from_helm_package()
|
|
278
279
|
register_configs_from_directory(args.base_path)
|
helm/proxy/static/index.css
CHANGED
helm/proxy/static/index.js
CHANGED
|
@@ -282,7 +282,13 @@ $(function () {
|
|
|
282
282
|
requestResult.completions.forEach((completion) => {
|
|
283
283
|
const $contents = $("<span>", {
|
|
284
284
|
title: `logprob: ${completion.logprob}`,
|
|
285
|
-
})
|
|
285
|
+
});
|
|
286
|
+
if (completion.thinking) {
|
|
287
|
+
const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
|
|
288
|
+
$contents.append($thinking);
|
|
289
|
+
}
|
|
290
|
+
const $resultText = completion.tokens.length > 0 ?renderTokens(completion.tokens) : $("<div>").append(completion.text);
|
|
291
|
+
$contents.append($resultText);
|
|
286
292
|
const $metadata = $("<span>", { class: "metadata" });
|
|
287
293
|
$metadata.append(
|
|
288
294
|
$("<span>", { title: "Log probability" }).append(
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class ACIBenchMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for ACIBench."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="aci_bench_accuracy",
|
|
11
|
-
scenario_name="aci_bench",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class CHWCarePlanMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for CHWCarePlan."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="chw_care_plan_accuracy",
|
|
11
|
-
scenario_name="chw_care_plan",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class DischargeMeMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for DischargeMe."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="dischargeme_accuracy",
|
|
11
|
-
scenario_name="dischargeme",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedDialogMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MedDialog."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="med_dialog_accuracy",
|
|
11
|
-
scenario_name="med_dialog",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedalignMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for Medalign."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medalign_accuracy",
|
|
11
|
-
scenario_name="medalign",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MediQAMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MediQA."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medi_qa_accuracy",
|
|
11
|
-
scenario_name="medi_qa",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedicationQAMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MedicationQA."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medication_qa_accuracy",
|
|
11
|
-
scenario_name="medication_qa",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MentalHealthMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MentalHealth."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mental_health_accuracy",
|
|
11
|
-
scenario_name="mental_health",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MIMICBHCMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MIMICBHC."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mimic_bhc_accuracy",
|
|
11
|
-
scenario_name="mimic_bhc",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MIMICRRSMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MIMICRRS."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mimic_rrs_accuracy",
|
|
11
|
-
scenario_name="mimic_rrs",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mtsamples_procedures_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MTSamplesProceduresMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MTSamplesProcedures."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mtsamples_procedures_accuracy",
|
|
11
|
-
scenario_name="mtsamples_procedures",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mtsamples_replicate_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MTSamplesReplicateMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MTSamplesReplicate."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mtsamples_replicate_accuracy",
|
|
11
|
-
scenario_name="mtsamples_replicate",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.starr_patient_instructions_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class StarrPatientInstructionsMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for StarrPatientInstructions."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="starr_patient_instructions_accuracy",
|
|
11
|
-
scenario_name="starr_patient_instructions",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|