crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List
|
|
2
2
|
import os
|
|
3
|
-
import json
|
|
4
3
|
|
|
4
|
+
from datasets import load_dataset
|
|
5
5
|
from tqdm import tqdm
|
|
6
6
|
|
|
7
7
|
from helm.benchmark.scenarios.scenario import (
|
|
@@ -14,41 +14,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
14
14
|
Output,
|
|
15
15
|
)
|
|
16
16
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
17
|
-
from
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
|
|
21
|
-
"""
|
|
22
|
-
Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
|
|
23
|
-
Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
|
|
24
|
-
|
|
25
|
-
Args:
|
|
26
|
-
directory: Path to the directory containing the files
|
|
27
|
-
|
|
28
|
-
Returns:
|
|
29
|
-
List of tuples where each tuple contains (mp3_path, json_path)
|
|
30
|
-
"""
|
|
31
|
-
pairs = []
|
|
32
|
-
|
|
33
|
-
# Walk through all directories and subdirectories
|
|
34
|
-
for root, _, files in os.walk(directory):
|
|
35
|
-
# Get all MP3 files in current directory
|
|
36
|
-
mp3_files = [f for f in files if f.endswith(".mp3")]
|
|
37
|
-
|
|
38
|
-
for mp3_file in mp3_files:
|
|
39
|
-
base_name = os.path.splitext(mp3_file)[0]
|
|
40
|
-
json_file = f"{base_name}.json"
|
|
41
|
-
|
|
42
|
-
# Check if corresponding JSON file exists in the same directory
|
|
43
|
-
if json_file in files:
|
|
44
|
-
mp3_path = os.path.join(root, mp3_file)
|
|
45
|
-
json_path = os.path.join(root, json_file)
|
|
46
|
-
pairs.append((mp3_path, json_path))
|
|
47
|
-
|
|
48
|
-
if len(pairs) == 0:
|
|
49
|
-
raise ValueError(f"No pairs of MP3 and JSON files found in {directory}")
|
|
50
|
-
|
|
51
|
-
return pairs
|
|
17
|
+
from helm.common.audio_utils import ensure_audio_file_exists_from_array
|
|
52
18
|
|
|
53
19
|
|
|
54
20
|
class UltraSuiteClassificationScenario(Scenario):
|
|
@@ -72,44 +38,39 @@ class UltraSuiteClassificationScenario(Scenario):
|
|
|
72
38
|
- Audio files (e.g., .mp3)
|
|
73
39
|
- A JSON file with annotations containing 'answer' field
|
|
74
40
|
"""
|
|
41
|
+
audio_save_dir = os.path.join(output_path, "audio_files")
|
|
42
|
+
os.makedirs(audio_save_dir, exist_ok=True)
|
|
75
43
|
|
|
76
|
-
print("Downloading SAA-Lab/
|
|
77
|
-
|
|
78
|
-
repo_id="SAA-Lab/SLPHelmManualLabels",
|
|
79
|
-
repo_type="dataset",
|
|
80
|
-
revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
|
|
81
|
-
)
|
|
44
|
+
print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
|
|
45
|
+
dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
|
|
82
46
|
|
|
83
47
|
instances: List[Instance] = []
|
|
84
48
|
split: str = TEST_SPLIT
|
|
85
49
|
|
|
86
|
-
|
|
87
|
-
pairs = find_audio_json_pairs(data_path)
|
|
88
|
-
print(f"Num pairs: {len(pairs)}")
|
|
50
|
+
for idx, row in enumerate(tqdm(dataset["train"])):
|
|
89
51
|
|
|
90
|
-
for audio_path, json_path in tqdm(pairs):
|
|
91
52
|
# Load the annotation
|
|
92
|
-
|
|
93
|
-
|
|
53
|
+
label = row["disorder_class"]
|
|
54
|
+
transcription = row["transcription"]
|
|
55
|
+
|
|
56
|
+
unique_id = str(idx)
|
|
57
|
+
local_audio_name = f"{label}_{unique_id}.mp3"
|
|
58
|
+
local_audio_path = os.path.join(audio_save_dir, local_audio_name)
|
|
59
|
+
ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
|
|
94
60
|
|
|
95
|
-
# Get the correct answer and convert to label
|
|
96
|
-
answer = annotation["disorder_class"]
|
|
97
|
-
words = annotation["transcription"]
|
|
98
61
|
# Create references for each option
|
|
99
62
|
references: List[Reference] = []
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
|
|
103
|
-
references.append(reference)
|
|
104
|
-
if option == answer:
|
|
105
|
-
correct_label += 1
|
|
106
|
-
if correct_label == 0:
|
|
63
|
+
options = ["typically_developing", "speech_disorder"]
|
|
64
|
+
if label not in options:
|
|
107
65
|
continue
|
|
66
|
+
for option in options:
|
|
67
|
+
reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
|
|
68
|
+
references.append(reference)
|
|
108
69
|
|
|
109
70
|
# Create the input with audio and instruction
|
|
110
71
|
content = [
|
|
111
|
-
MediaObject(content_type="audio/mpeg", location=
|
|
112
|
-
MediaObject(content_type="text/plain", text=self.get_instruction(
|
|
72
|
+
MediaObject(content_type="audio/mpeg", location=local_audio_path),
|
|
73
|
+
MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
|
|
113
74
|
]
|
|
114
75
|
|
|
115
76
|
input = Input(multimedia_content=MultimediaObject(content))
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import List
|
|
2
|
-
import
|
|
2
|
+
import os
|
|
3
3
|
|
|
4
|
+
from datasets import load_dataset
|
|
4
5
|
from tqdm import tqdm
|
|
5
6
|
|
|
6
7
|
from helm.benchmark.scenarios.scenario import (
|
|
@@ -13,8 +14,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
13
14
|
Output,
|
|
14
15
|
)
|
|
15
16
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
16
|
-
from
|
|
17
|
-
from .ultra_suite_classification_scenario import find_audio_json_pairs
|
|
17
|
+
from helm.common.audio_utils import ensure_audio_file_exists_from_array
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
class UltraSuiteDisorderBreakdownScenario(Scenario):
|
|
@@ -38,46 +38,38 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
|
|
|
38
38
|
- Audio files (e.g., .mp3)
|
|
39
39
|
- A JSON file with annotations containing 'disorder_class' field
|
|
40
40
|
"""
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
)
|
|
41
|
+
audio_save_dir = os.path.join(output_path, "audio_files")
|
|
42
|
+
os.makedirs(audio_save_dir, exist_ok=True)
|
|
43
|
+
|
|
44
|
+
print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
|
|
45
|
+
dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
|
|
47
46
|
|
|
48
47
|
instances: List[Instance] = []
|
|
49
48
|
split: str = TEST_SPLIT
|
|
50
49
|
|
|
51
|
-
|
|
52
|
-
pairs = find_audio_json_pairs(data_path)
|
|
53
|
-
print(f"Num pairs: {len(pairs)}")
|
|
54
|
-
|
|
55
|
-
for audio_path, json_path in tqdm(pairs):
|
|
50
|
+
for idx, row in enumerate(tqdm(dataset["train"])):
|
|
56
51
|
# Load the annotation
|
|
57
|
-
|
|
58
|
-
|
|
52
|
+
label = row["disorder_type"]
|
|
53
|
+
transcription = row["transcription"]
|
|
59
54
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
prompt = annotation["transcription"]
|
|
55
|
+
unique_id = str(idx)
|
|
56
|
+
local_audio_name = f"{label}_{unique_id}.mp3"
|
|
57
|
+
local_audio_path = os.path.join(audio_save_dir, local_audio_name)
|
|
58
|
+
ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
|
|
65
59
|
|
|
66
60
|
# Create references for each option
|
|
67
61
|
references: List[Reference] = []
|
|
68
|
-
|
|
69
|
-
|
|
62
|
+
options = ["typically_developing", "articulation", "phonological"]
|
|
63
|
+
if label not in options:
|
|
64
|
+
continue
|
|
65
|
+
for option in options:
|
|
70
66
|
reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
|
|
71
67
|
references.append(reference)
|
|
72
|
-
if option == label:
|
|
73
|
-
correct_label += 1
|
|
74
|
-
if correct_label == 0:
|
|
75
|
-
continue
|
|
76
68
|
|
|
77
69
|
# Create the input with audio and instruction
|
|
78
70
|
content = [
|
|
79
|
-
MediaObject(content_type="audio/mpeg", location=
|
|
80
|
-
MediaObject(content_type="text/plain", text=self.get_instruction(
|
|
71
|
+
MediaObject(content_type="audio/mpeg", location=local_audio_path),
|
|
72
|
+
MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
|
|
81
73
|
]
|
|
82
74
|
|
|
83
75
|
input = Input(multimedia_content=MultimediaObject(content))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List
|
|
2
2
|
import os
|
|
3
|
-
import json
|
|
4
3
|
|
|
4
|
+
from datasets import load_dataset
|
|
5
5
|
from tqdm import tqdm
|
|
6
6
|
|
|
7
7
|
from helm.benchmark.scenarios.scenario import (
|
|
@@ -14,38 +14,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
14
14
|
Output,
|
|
15
15
|
)
|
|
16
16
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
17
|
-
from
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
|
|
21
|
-
"""
|
|
22
|
-
Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
|
|
23
|
-
Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
|
|
24
|
-
|
|
25
|
-
Args:
|
|
26
|
-
directory: Path to the directory containing the files
|
|
27
|
-
|
|
28
|
-
Returns:
|
|
29
|
-
List of tuples where each tuple contains (mp3_path, json_path)
|
|
30
|
-
"""
|
|
31
|
-
pairs = []
|
|
32
|
-
|
|
33
|
-
# Walk through all directories and subdirectories
|
|
34
|
-
for root, _, files in os.walk(directory):
|
|
35
|
-
# Get all MP3 files in current directory
|
|
36
|
-
mp3_files = [f for f in files if f.endswith(".mp3")]
|
|
37
|
-
|
|
38
|
-
for mp3_file in mp3_files:
|
|
39
|
-
base_name = os.path.splitext(mp3_file)[0]
|
|
40
|
-
json_file = f"{base_name}.json"
|
|
41
|
-
|
|
42
|
-
# Check if corresponding JSON file exists in the same directory
|
|
43
|
-
if json_file in files:
|
|
44
|
-
mp3_path = os.path.join(root, mp3_file)
|
|
45
|
-
json_path = os.path.join(root, json_file)
|
|
46
|
-
pairs.append((mp3_path, json_path))
|
|
47
|
-
|
|
48
|
-
return pairs
|
|
17
|
+
from helm.common.audio_utils import ensure_audio_file_exists_from_array
|
|
49
18
|
|
|
50
19
|
|
|
51
20
|
class UltraSuiteDisorderSymptomsScenario(Scenario):
|
|
@@ -70,45 +39,37 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
|
|
|
70
39
|
- Audio files (e.g., .mp3)
|
|
71
40
|
- A JSON file with annotations containing 'answer' field
|
|
72
41
|
"""
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
)
|
|
42
|
+
audio_save_dir = os.path.join(output_path, "audio_files")
|
|
43
|
+
os.makedirs(audio_save_dir, exist_ok=True)
|
|
44
|
+
|
|
45
|
+
print("Downloading SAA-Lab/SLPHelmUltraSuitePlus dataset...")
|
|
46
|
+
dataset = load_dataset("SAA-Lab/SLPHelmUltraSuitePlus")
|
|
79
47
|
|
|
80
48
|
instances: List[Instance] = []
|
|
81
49
|
split: str = TEST_SPLIT
|
|
82
50
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
for audio_path, json_path in tqdm(pairs):
|
|
51
|
+
for idx, row in enumerate(tqdm(dataset["train"])):
|
|
52
|
+
label = row["disorder_symptom"]
|
|
53
|
+
transcription = row["transcription"]
|
|
87
54
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
55
|
+
unique_id = str(idx)
|
|
56
|
+
local_audio_name = f"{label}_{unique_id}.mp3"
|
|
57
|
+
local_audio_path = os.path.join(audio_save_dir, local_audio_name)
|
|
58
|
+
ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
|
|
91
59
|
|
|
92
|
-
# Get the correct answer and convert to label
|
|
93
|
-
if "disorder_symptom" not in annotation or "transcription" not in annotation:
|
|
94
|
-
continue
|
|
95
|
-
label = annotation["disorder_symptom"]
|
|
96
|
-
prompt = annotation["transcription"]
|
|
97
60
|
# Create references for each option
|
|
98
61
|
references: List[Reference] = []
|
|
99
|
-
|
|
100
|
-
|
|
62
|
+
options = ["substitution", "omission", "addition", "typically_developing", "stuttering"]
|
|
63
|
+
if label not in options:
|
|
64
|
+
continue
|
|
65
|
+
for option in options:
|
|
101
66
|
reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
|
|
102
67
|
references.append(reference)
|
|
103
|
-
if option == label:
|
|
104
|
-
correct_label += 1
|
|
105
|
-
if correct_label == 0:
|
|
106
|
-
continue
|
|
107
68
|
|
|
108
69
|
# Create the input with audio and instruction
|
|
109
70
|
content = [
|
|
110
|
-
MediaObject(content_type="audio/mpeg", location=
|
|
111
|
-
MediaObject(content_type="text/plain", text=self.get_instruction(
|
|
71
|
+
MediaObject(content_type="audio/mpeg", location=local_audio_path),
|
|
72
|
+
MediaObject(content_type="text/plain", text=self.get_instruction(transcription)),
|
|
112
73
|
]
|
|
113
74
|
|
|
114
75
|
input = Input(multimedia_content=MultimediaObject(content))
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
4
5
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Scenario,
|
|
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
12
13
|
CORRECT_TAG,
|
|
13
14
|
PassageQuestionInput,
|
|
14
15
|
Output,
|
|
16
|
+
ScenarioMetadata,
|
|
15
17
|
)
|
|
16
18
|
|
|
17
19
|
|
|
@@ -139,3 +141,16 @@ class BabiQAScenario(Scenario):
|
|
|
139
141
|
story.append(fact)
|
|
140
142
|
|
|
141
143
|
return instances
|
|
144
|
+
|
|
145
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
146
|
+
return ScenarioMetadata(
|
|
147
|
+
name="babi_qa",
|
|
148
|
+
display_name="bAbI",
|
|
149
|
+
description="The bAbI benchmark for measuring understanding and reasoning [(Weston et al., "
|
|
150
|
+
"2015)](https://arxiv.org/pdf/1502.05698.pdf).",
|
|
151
|
+
taxonomy=TaxonomyInfo(
|
|
152
|
+
task="question answering", what="reasoning", when="2015", who="synthetic", language="English"
|
|
153
|
+
),
|
|
154
|
+
main_metric="quasi_exact_match",
|
|
155
|
+
main_split="test",
|
|
156
|
+
)
|
|
@@ -2,6 +2,7 @@ import datasets
|
|
|
2
2
|
import os
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
CORRECT_TAG,
|
|
7
8
|
TEST_SPLIT,
|
|
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
11
12
|
Reference,
|
|
12
13
|
Input,
|
|
13
14
|
Output,
|
|
15
|
+
ScenarioMetadata,
|
|
14
16
|
)
|
|
15
17
|
from helm.common.general import ensure_directory_exists
|
|
16
18
|
|
|
@@ -54,3 +56,22 @@ class Banking77Scenario(Scenario):
|
|
|
54
56
|
instance = Instance(input=input, references=references, split=split_name)
|
|
55
57
|
instances.append(instance)
|
|
56
58
|
return instances
|
|
59
|
+
|
|
60
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
61
|
+
return ScenarioMetadata(
|
|
62
|
+
name="banking77",
|
|
63
|
+
display_name="BANKING77",
|
|
64
|
+
short_display_name="BANKING77",
|
|
65
|
+
description="BANKING77 is a benchmark for intent classification of customer service queries "
|
|
66
|
+
"in the banking domain [(Casanueva et al., "
|
|
67
|
+
"2020)](https://aclanthology.org/2020.nlp4convai-1.5/).",
|
|
68
|
+
taxonomy=TaxonomyInfo(
|
|
69
|
+
task="text classification",
|
|
70
|
+
what="customer service queries in the banking domain",
|
|
71
|
+
when="During or before 2020",
|
|
72
|
+
who="banking customers",
|
|
73
|
+
language="English",
|
|
74
|
+
),
|
|
75
|
+
main_metric="quasi_exact_match",
|
|
76
|
+
main_split="test",
|
|
77
|
+
)
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import random
|
|
4
4
|
from typing import List, Dict, Tuple
|
|
5
5
|
|
|
6
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
7
|
from helm.common.general import ensure_file_downloaded
|
|
7
8
|
from helm.benchmark.scenarios.scenario import (
|
|
8
9
|
Scenario,
|
|
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
14
15
|
DEFAULT_TEST_SIZE,
|
|
15
16
|
PassageQuestionInput,
|
|
16
17
|
Output,
|
|
18
|
+
ScenarioMetadata,
|
|
17
19
|
)
|
|
18
20
|
|
|
19
21
|
AMBIGUOUS_TAG = "ambiguous"
|
|
@@ -237,3 +239,16 @@ class BBQScenario(Scenario):
|
|
|
237
239
|
instances.append(instance)
|
|
238
240
|
|
|
239
241
|
return instances
|
|
242
|
+
|
|
243
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
244
|
+
return ScenarioMetadata(
|
|
245
|
+
name="bbq",
|
|
246
|
+
display_name="BBQ (Bias Benchmark for Question Answering)",
|
|
247
|
+
short_display_name="BBQ",
|
|
248
|
+
description="The Bias Benchmark for Question Answering (BBQ) for measuring social bias in "
|
|
249
|
+
"question answering in ambiguous and unambigous context [(Parrish et al., "
|
|
250
|
+
"2022)](https://aclanthology.org/2022.findings-acl.165/).",
|
|
251
|
+
taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
|
|
252
|
+
main_metric="bbq_accuracy",
|
|
253
|
+
main_split="test",
|
|
254
|
+
)
|