crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
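
Most of the scenario diffs that follow share one pattern: each scenario module imports TaxonomyInfo and ScenarioMetadata and gains a get_metadata() method. The definitions of those two types are not part of this excerpt; the following is a minimal sketch inferred only from the call sites below (the field names come from the diffs, the types and defaults are assumptions):

    # Hypothetical sketch of the two metadata types, inferred from the call
    # sites in the diffs below. The real definitions live in
    # helm/benchmark/presentation/taxonomy_info.py and
    # helm/benchmark/scenarios/scenario.py and may differ.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class TaxonomyInfo:
        task: Optional[str] = None  # e.g. "question answering"
        what: Optional[str] = None  # subject matter of the data
        when: Optional[str] = None  # time period the data covers
        who: Optional[str] = None  # who produced the data
        language: Optional[str] = None  # e.g. "English"


    @dataclass(frozen=True)
    class ScenarioMetadata:
        name: str  # machine-readable scenario name, e.g. "gsm"
        display_name: str  # human-readable name for the leaderboard
        main_metric: str  # headline metric, e.g. "exact_match"
        main_split: str  # split the metric is reported on, e.g. "test"
        description: Optional[str] = None  # Markdown description with citation
        short_display_name: Optional[str] = None  # abbreviation for tables
        taxonomy: Optional[TaxonomyInfo] = None
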
helm/benchmark/scenarios/dyck_language_scenario.py
@@ -2,6 +2,7 @@ import numpy as np
 import random
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -234,3 +236,16 @@ class DyckLanguageScenario(Scenario):
             not_allowed=train_inputs,
         )
         return train_instances + test_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="dyck_language",
+            display_name="Dyck",
+            description="Scenario testing hierarchical reasoning through the Dyck formal languages "
+            "[(Suzgun et al., 2019)](https://aclanthology.org/W19-3905/).",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction", what="Dyck formal language", when="n/a", who="n/a", language="synthetic"
+            ),
+            main_metric="exact_match_indicator",
+            main_split="test",
+        )

helm/benchmark/scenarios/ehrshot_scenario.py
@@ -7,6 +7,7 @@ from functools import partial
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Mapping
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="ehrshot",
+            display_name="EHRSHOT",
+            description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+            "future clinical events using structured EHR code sequences. Each instance "
+            "contains a patient's historical EHR data and a forward-looking clinical "
+            "question about whether a particular diagnosis, lab result, or hospital event "
+            "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict whether a medical event will occur in the future based " "on EHR codes",
+                when="Future prediction",
+                who="Clinician, Insurer",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -2,6 +2,7 @@ from typing import List, Any
 from pathlib import Path
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="enem_challenge",
+            display_name="ENEM Challenge",
+            short_display_name=None,
+            description="ENEM Challenge",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="general academic subjects",
+                when="between 2009 and 2023",
+                who="brazilian ministry of education",
+                language="Portuguese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/entity_data_imputation_scenario.py
@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_data_imputation",
+            display_name="Data imputation",
+            description="Scenario from [Mei et al. "
+            "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+            "to impute missing entities in a data table.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/entity_matching_scenario.py
@@ -2,6 +2,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
 
@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_matching",
+            display_name="Entity matching",
+            description="Scenario from Magellan [(Konda et al., "
+            "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+            "to determine if two entities match.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/fin_qa_scenario.py
@@ -2,6 +2,7 @@ import os
 import json
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     TEST_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 
@@ -117,3 +119,21 @@ class FinQAScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="fin_qa",
+            display_name="FinQA",
+            description="The FinQA benchmark for numeric reasoning over financial data, with question "
+            "answering pairs written by financial experts over financial reports [(Chen et "
+            "al., 2021)](https://arxiv.org/abs/2109.00122/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="program_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/financebench_scenario.py
@@ -4,6 +4,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TRAIN_SPLIT,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 
@@ -51,3 +53,22 @@ class FinanceBenchScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financebench",
+            display_name="FinanceBench",
+            description="FinanceBench is a benchmark for open book financial question answering. It "
+            "comprises 10,231 questions about publicly traded companies, with corresponding "
+            "answers and evidence strings [(Islam et al., "
+            "2023)](https://arxiv.org/abs/2311.11944/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="2015 to 2023",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="annotation_financebench_label_correct_answer",
+            main_split="test",
+        )

helm/benchmark/scenarios/financial_phrasebank_scenario.py
@@ -2,6 +2,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financial_phrasebank",
+            display_name="Financial Phrasebank (Sentiment Classification)",
+            short_display_name=None,
+            description="A sentiment classification benchmark based on the dataset from Good Debt or "
+            "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+            "2013)](https://arxiv.org/abs/1307.5336).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="phrases from financial news texts and company press releases",
+                when="before 2013",
+                who="annotators with adequate business education background",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/gold_commodity_news_scenario.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.runner import TRAIN_SPLIT
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Scenario,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gold_commodity_news",
+            display_name="Gold Commodity News",
+            short_display_name=None,
+            description="A classification benchmark based on a dataset of human-annotated gold "
+            "commodity news headlines ([Sinha & Khandait, "
+            "2019](https://arxiv.org/abs/2009.04202)).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="gold commodity news headlines",
+                when="2000-2019",
+                who="financial journalists",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/gpqa_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 import random
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="GPQA",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="complex questions across various disciplines",
+                who="domain experts",
+                when="2024",
+                language="English",
+            ),
+        )
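
Unlike the scenarios above, GPQAScenario fills name and description from self rather than from literals (IFEvalScenario further down does the same). A minimal illustration of the class-attribute pattern this relies on, with hypothetical values (the real ones are defined in gpqa_scenario.py, outside this excerpt):

    # Illustration only: `name=self.name` works because HELM Scenario
    # subclasses declare `name` and `description` as class attributes.
    class Scenario:  # stub standing in for helm.benchmark.scenarios.scenario.Scenario
        name: str = ""
        description: str = ""


    class GPQAScenario(Scenario):
        name = "gpqa"  # hypothetical value, for illustration
        description = "Graduate-level, Google-proof multiple-choice questions"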

helm/benchmark/scenarios/grammar_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
 
-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
 
 
@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
         instances: List[Instance] = list(map(derivation_to_instance, derivations))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="grammar",
+            display_name="Best ChatGPT Prompts",
+            short_display_name="Best ChatGPT Prompts",
+            description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+            "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2023",
+                who="Gridfiti Staff",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/gsm_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -65,3 +67,22 @@ class GSM8KScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gsm",
+            display_name="GSM8K (Grade School Math)",
+            short_display_name="GSM8K",
+            description="The grade school math word problems dataset (GSM8K) for testing mathematical "
+            "reasoning on grade-school math problems [(Cobbe et al., "
+            "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="numeric answer question answering",
+                what="grade school math word problems",
+                when="2021",
+                who="contractors on Upwork and Surge AI",
+                language="English",
+            ),
+            main_metric="final_number_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py
@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchGCGTransferScenario(Scenario):
@@ -48,3 +49,13 @@ class HarmBenchGCGTransferScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench_gcg_transfer",
+            display_name="HarmBenchGCGTransfer",
+            description="HarmBenchGCGTransfer",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/harm_bench_scenario.py
@@ -2,9 +2,10 @@ import os
 import pandas as pd
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output, ScenarioMetadata
 
 
 class HarmBenchScenario(Scenario):
@@ -57,3 +58,13 @@ class HarmBenchScenario(Scenario):
             instance = Instance(input=input, split=TEST_SPLIT, references=references, sub_split=tag, id=id)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="harm_bench",
+            display_name="HarmBench",
+            description="HarmBench",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="safety_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/headqa_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Optional
 
 from datasets import DatasetDict, load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -134,3 +136,23 @@ class HeadQAScenario(Scenario):
             )
 
        return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="head_qa",
+            display_name="HeadQA",
+            description="HeadQA is a benchmark consisting of biomedical multiple-choice questions "
+            "intended to evaluate a model's medical knowledge and reasoning. Each instance "
+            "presents a clinical or scientific question with four answer options, requiring "
+            "the model to select the most appropriate answer [(Vilares et al., "
+            "2019)](https://arxiv.org/abs/1906.04701).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -2,11 +2,13 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -35,3 +37,14 @@ class HelpdeskCallSummarizationScenario(Scenario):
             instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="helpdesk_call_summarization",
+            display_name="Helpdesk Call summarization",
+            short_display_name=None,
+            description="Helpdesk Call summarization",
+            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
+            main_metric="unknown",
+            main_split="test",
+        )

helm/benchmark/scenarios/ice_scenario.py
@@ -4,9 +4,10 @@ from typing import List, Union
 from enum import Enum
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 try:
     # pd.read_excel() uses xlrd
@@ -467,3 +468,22 @@ class ICEScenario(Scenario):
             instances.append(Instance(Input(text=t), references=[], split=TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ice",
+            display_name="ICE (International Corpus of English)",
+            short_display_name="ICE",
+            description="The International Corpus of English (ICE) drawn from English speakers from "
+            "various places in the world, initiated by [Greenbaum "
+            "(1991)](https://www.cambridge.org/core/journals/english-today/article/abs/ice-the-international-corpus-of-english/47808205394C538393C3FD8E62E5E701).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English varieties from different nations",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/ifeval_scenario.py
@@ -1,8 +1,10 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.schema import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
+    ScenarioMetadata,
     Instance,
     Input,
     TEST_SPLIT,
@@ -51,3 +53,19 @@ class IFEvalScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="IFEval",
+            description=self.description,
+            main_metric="ifeval_strict_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="verifiable general domain instruction following",
+                who="human annotators",
+                when="2023",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/imdb_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
@@ -143,3 +145,16 @@ class IMDBScenario(Scenario):
         for split in [TRAIN_SPLIT, VALID_SPLIT]:
             instances.extend(self.get_split_instances(target_path, split, contrast_map))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="imdb",
+            display_name="IMDB",
+            description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
+            "2011)](https://aclanthology.org/P11-1015/).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="valid",
+        )
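
Taken together, these hunks give every scenario a uniform metadata hook, which lets downstream code (for example the expanded summarize.py listed above) describe any scenario without per-scenario branching. A sketch of such a consumer, assuming only the get_metadata() interface exercised in the diffs above:

    # Hypothetical consumer of the new interface; only get_metadata() and the
    # fields used in the diffs above are assumed.
    def describe(scenario) -> str:
        metadata = scenario.get_metadata()
        return f"{metadata.display_name}: report {metadata.main_metric} on the {metadata.main_split} split"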