crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/aratrust_scenario.py
@@ -0,0 +1,95 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class AraTrustScenario(Scenario):
+    """AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic
+
+    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
+
+    AraTrust is a comprehensive Trustworthiness benchmark for LLMs in Arabic.
+    AraTrust comprises 522 human-written multiple-choice questions addressing
+    diverse dimensions related to truthfulness, ethics, safety, physical health,
+    mental health, unfairness, illegal activities, privacy, and offensive language.
+
+    - https://huggingface.co/datasets/asas-ai/AraTrust
+    - https://arxiv.org/abs/2403.09017
+
+    Citation:
+
+    ```
+    @misc{alghamdi2024aratrustevaluationtrustworthinessllms,
+          title={AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic},
+          author={Emad A. Alghamdi and Reem I. Masoud and Deema Alnuhait and Afnan Y. Alomairi and Ahmed Ashraf and Mohamed Zaytoon},
+          year={2024},
+          eprint={2403.09017},
+          archivePrefix={arXiv},
+          primaryClass={cs.CL},
+          url={https://arxiv.org/abs/2403.09017},
+    }
+    ```
+    """  # noqa: E501
+
+    name = "aratrust"
+    description = "aratrust"
+    tags = ["trustworthiness"]
+
+    CATEGORIES = [
+        "Ethics",
+        "Illegal",
+        "Mental Health",
+        "Offensive",
+        "Physical Health",
+        "Privacy",
+        "Trustfulness",
+        "Unfairness",
+    ]
+    OPTION_KEYS = ["A", "B", "C"]
+
+    def __init__(self, category: str):
+        super().__init__()
+        category = category.replace("_", " ")
+        if category not in self.CATEGORIES and category != "all":
+            raise Exception(f"Unknown category {category}")
+        self.category = category
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset: datasets.Dataset = datasets.load_dataset(
+            "asas-ai/AraTrust",
+            revision="d4dd124ed5b90aeb65a7dda7d88e34fb464a31ec",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            if self.category != "all" and self.category != row["Category"]:
+                continue
+            question_text = row["Question"]
+            option_texts = [row[option_key] for option_key in self.OPTION_KEYS if row[option_key]]
+            joined_option_texts = "\n".join(option_texts)
+            input = Input(text=f"{question_text}\n\n{joined_option_texts}\n")
+            references = [Reference(output=Output(text=row["Answer"]), tags=[CORRECT_TAG])]
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
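For orientation, here is a minimal sketch of how the new scenario could be driven directly. It is not part of the diff; the scratch directory is hypothetical, and the constructor and field names are taken from the code above.

# Hypothetical smoke test for the new AraTrustScenario; not part of this release.
from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario

# Pass "all" to keep every category; a value like "Mental_Health" is mapped
# to the "Mental Health" category by __init__ (underscores become spaces).
scenario = AraTrustScenario(category="all")
instances = scenario.get_instances(output_path="/tmp/aratrust")  # downloads the pinned HF revision
print(len(instances))           # up to 522 multiple-choice instances
print(instances[0].input.text)  # the question followed by the A/B/C option texts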
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py
@@ -19,7 +19,7 @@ from helm.common.audio_utils import extract_audio
 
 
 class CasualConversations2Scenario(Scenario):
-    """
+    r"""
     Casual Conversation v2 (Porgali et al, 2023) is composed of over 5,567 participants (26,467 videos).
     The videos feature paid individuals who agreed to participate in the project and explicitly provided
     Age, Gender, Language/Dialect, Geo-location, Disability, Physical adornments, Physical attributes labels
helm/benchmark/scenarios/audio_language/mustard_scenario.py
@@ -19,7 +19,7 @@ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 
 
 class MUStARDScenario(Scenario):
-    """
+    r"""
     MUStARD: Multimodal Sarcasm Detection Dataset
 
     A multimodal video corpus for research in automated sarcasm discovery. The dataset is compiled from popular
helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py}
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 
 
 def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
@@ -58,7 +58,6 @@ class UltraSuiteASRClassificationScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder", "asr"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuite"
 
     # Classification options
     options: List[str] = ["Healthy", "Unhealthy"]
@@ -70,14 +69,18 @@ class UltraSuiteASRClassificationScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
        """
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
 
         for audio_path, json_path in tqdm(pairs):
 
@@ -88,9 +91,7 @@ class UltraSuiteASRClassificationScenario(Scenario):
             # Get the correct answer and convert to label
             answer = annotation["disorder_class"]
             # Create references for each option
-            references: List[Reference] = []
-            reference = Reference(Output(text=answer), tags=[CORRECT_TAG])
-            references.append(reference)
+            references: List[Reference] = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
 
             # Create the input with audio and instruction
             content = [
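The recurring change across the UltraSuite scenarios (here and below) swaps a hard-coded mapping URL, which was malformed with a doubled `https://`, for `huggingface_hub.snapshot_download` pinned to an exact dataset revision. A standalone sketch of the pattern, using the same repo and revision as the diff:

# Revision-pinned dataset download, as used by the updated scenarios.
from huggingface_hub import snapshot_download

data_path = snapshot_download(
    repo_id="SAA-Lab/SLPHelmManualLabels",
    repo_type="dataset",
    revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",  # exact commit, for reproducibility
)
# data_path is the local cache directory holding the dataset files;
# the scenarios then walk it with find_audio_json_pairs(data_path).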
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py
@@ -0,0 +1,99 @@
+from typing import List, Tuple
+import os
+import json
+
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from huggingface_hub import snapshot_download
+
+
+def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
+    """
+    Find all pairs of MP3 and JSON files in the given directory and its subdirectories.
+    Each pair consists of an MP3 file and its corresponding JSON file with the same base name.
+
+    Args:
+        directory: Path to the directory containing the files
+
+    Returns:
+        List of tuples where each tuple contains (mp3_path, json_path)
+    """
+    pairs = []
+
+    # Walk through all directories and subdirectories
+    for root, _, files in os.walk(directory):
+        # Get all MP3 files in current directory
+        mp3_files = [f for f in files if f.endswith(".mp3")]
+
+        for mp3_file in mp3_files:
+            base_name = os.path.splitext(mp3_file)[0]
+            json_file = f"{base_name}.json"
+
+            # Check if corresponding JSON file exists in the same directory
+            if json_file in files:
+                mp3_path = os.path.join(root, mp3_file)
+                json_path = os.path.join(root, json_file)
+                pairs.append((mp3_path, json_path))
+
+    return pairs
+
+
+class UltraSuiteASRTranscriptionScenario(Scenario):
+    """
+    A scenario for evaluating the transcription capabilities of ASR systems.
+    The audio files contain speech from children, potentially with an adult present.
+    The task is to classify whether the child speaker is typically developing or has a speech disorder.
+    """
+
+    name = "speech_disorder"
+    description = "A scenario for evaluating speech disorders in children"
+    tags = ["audio", "transcription", "speech_disorder", "asr"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Create instances from the audio files and their corresponding JSON annotations.
+        The data directory should contain:
+        - Audio files (e.g., .mp3)
+        - A JSON file with annotations containing 'answer' field
+        """
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
+
+        instances: List[Instance] = []
+        split: str = TEST_SPLIT
+
+        # Find all pairs of audio and JSON files
+        pairs = find_audio_json_pairs(data_path)
+
+        for audio_path, json_path in tqdm(pairs):
+
+            # Load the annotation
+            with open(json_path, "r") as f:
+                annotation = json.load(f)
+
+            # Create references for the transcription
+            references: List[Reference] = [Reference(Output(text=annotation["transcription"]), tags=[CORRECT_TAG])]
+
+            # Create the input with audio and instruction
+            content = [
+                MediaObject(content_type="audio/mpeg", location=audio_path),
+            ]
+
+            input = Input(multimedia_content=MultimediaObject(content))
+            instances.append(Instance(input=input, references=references, split=split))
+
+        return instances
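To illustrate what `find_audio_json_pairs` yields, a small example with an assumed directory layout (the paths are made up for illustration):

# Assumed layout: /tmp/slp/childA/utt1.mp3 + utt1.json, /tmp/slp/childB/utt2.mp3 (no JSON).
from helm.benchmark.scenarios.audio_language.ultra_suite_asr_transcription_scenario import (
    find_audio_json_pairs,
)

pairs = find_audio_json_pairs("/tmp/slp")
# -> [("/tmp/slp/childA/utt1.mp3", "/tmp/slp/childA/utt1.json")]
# utt2.mp3 is dropped because no same-named JSON sits next to it in the same directory.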
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 
 
 def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
@@ -61,7 +61,6 @@ class UltraSuiteClassificationScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuite"
 
     def get_instruction(self, words: str) -> str:
         return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: 'typically_developing' or 'speech_disorder'. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
@@ -74,14 +73,18 @@ class UltraSuiteClassificationScenario(Scenario):
         - A JSON file with annotations containing 'answer' field
         """
 
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
         print(f"Num pairs: {len(pairs)}")
 
         for audio_path, json_path in tqdm(pairs):
@@ -94,9 +97,14 @@ class UltraSuiteClassificationScenario(Scenario):
             words = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
+            correct_label = 0
             for option in ["typically_developing", "speech_disorder"]:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == answer else [])
                 references.append(reference)
+                if option == answer:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
 
             # Create the input with audio and instruction
             content = [
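The `correct_label` counter added in this and the following two scenarios acts as a guard: when the annotated label is not one of the known options, no reference carries `CORRECT_TAG`, and the pair is skipped rather than emitted as an unanswerable instance. A condensed sketch of that logic with placeholder data:

# Condensed form of the new skip-unlabeled-pairs guard (placeholder values).
CORRECT_TAG = "correct"
options = ["typically_developing", "speech_disorder"]
answer = "unintelligible"  # hypothetical annotation outside the option set

references = []
correct_label = 0
for option in options:
    references.append((option, [CORRECT_TAG] if option == answer else []))
    if option == answer:
        correct_label += 1
if correct_label == 0:
    pass  # the scenario executes `continue` here, dropping the pair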
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py
@@ -13,7 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 from .ultra_suite_classification_scenario import find_audio_json_pairs
 
 
@@ -27,7 +27,6 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating and classifying specific types of speech disorders in children"
     tags = ["audio", "classification", "speech_disorder", "disorder_breakdown"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
 
     def get_instruction(self, words: str) -> str:
         return f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt text the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording for signs of typical development or potential speech-language disorder. 2. Conclude your analysis with one of the following labels only: A - 'typically developing' (child's speech patterns and development are within normal age-appropriate ranges), B - 'articulation' (difficulty producing specific speech sounds correctly, such as substituting, omitting, or distorting sounds), C - 'phonological' (difficulty understanding and using the sound system of language, affecting sounds of a particular type). 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
@@ -39,14 +38,18 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'disorder_class' field
         """
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
         print(f"Num pairs: {len(pairs)}")
 
         for audio_path, json_path in tqdm(pairs):
@@ -62,9 +65,14 @@ class UltraSuiteDisorderBreakdownScenario(Scenario):
 
             # Create references for each option
             references: List[Reference] = []
+            correct_label = 0
             for option in ["typically_developing", "articulation", "phonological"]:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
+                if option == label:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
 
             # Create the input with audio and instruction
             content = [
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
 )
 from helm.common.media_object import MediaObject, MultimediaObject
-from
+from huggingface_hub import snapshot_download
 
 
 def find_audio_json_pairs(directory: str) -> List[Tuple[str, str]]:
@@ -57,7 +57,6 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
     name = "speech_disorder"
     description = "A scenario for evaluating speech disorders in children"
     tags = ["audio", "classification", "speech_disorder"]
-    HF_MAPPING_URL = "https://https://huggingface.co/datasets/SAA-Lab/SLPHelmManualLabels"
 
     def get_instruction(self, words: str) -> str:
         prompt = f"""You are a highly experienced Speech-Language Pathologist (SLP). An audio recording will be provided, typically consisting of a speech prompt from a pathologist followed by a child's repetition. The prompt the child is trying to repeat is as follows: {words}. Based on your professional expertise: 1. Assess the child's speech in the recording and recognize any abnormal features in the child's speech. 2. These features can be on of the following: A - 'substitution', B - 'omission', C - 'addition', D - 'typically_developing', or E - 'stuttering'. Here, 'substitution' is when the child substitutes one word/phrase/syllable for another. 'omission' is when the child omits one word/phrase/syllable. 'addition' is when the child adds one word/phrase/syllable. 'typically_developing' is when the child's speech is typical of a child of their age. 'stuttering' is when the child stutters, has difficulty speaking, repeats sounds/words or prolongs sounds/words. 3. Provide your response as a single letter without any additional explanation, commentary, or unnecessary text."""  # noqa: E501
@@ -71,14 +70,18 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
         - Audio files (e.g., .mp3)
         - A JSON file with annotations containing 'answer' field
         """
-        print(
-
+        print("Downloading SAA-Lab/SLPHelmManualLabels dataset...")
+        data_path = snapshot_download(
+            repo_id="SAA-Lab/SLPHelmManualLabels",
+            repo_type="dataset",
+            revision="38c2d7dab831acf8ccff0ca6f6463d6a8a0184ed",
+        )
 
         instances: List[Instance] = []
         split: str = TEST_SPLIT
 
         # Find all pairs of audio and JSON files
-        pairs = find_audio_json_pairs(
+        pairs = find_audio_json_pairs(data_path)
 
         for audio_path, json_path in tqdm(pairs):
 
@@ -93,9 +96,14 @@ class UltraSuiteDisorderSymptomsScenario(Scenario):
             prompt = annotation["transcription"]
             # Create references for each option
             references: List[Reference] = []
+            correct_label = 0
             for option in ["substitution", "omission", "addition", "typically_developing", "stuttering"]:
                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if option == label else [])
                 references.append(reference)
+                if option == label:
+                    correct_label += 1
+            if correct_label == 0:
+                continue
 
             # Create the input with audio and instruction
             content = [
helm/benchmark/scenarios/babi_qa_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -139,3 +141,16 @@ class BabiQAScenario(Scenario):
             story.append(fact)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="babi_qa",
+            display_name="bAbI",
+            description="The bAbI benchmark for measuring understanding and reasoning [(Weston et al., "
+            "2015)](https://arxiv.org/pdf/1502.05698.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="reasoning", when="2015", who="synthetic", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/bbq_scenario.py
@@ -3,6 +3,7 @@ import os
 import random
 from typing import List, Dict, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     DEFAULT_TEST_SIZE,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 AMBIGUOUS_TAG = "ambiguous"
@@ -237,3 +239,16 @@ class BBQScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="bbq",
+            display_name="BBQ (Bias Benchmark for Question Answering)",
+            short_display_name="BBQ",
+            description="The Bias Benchmark for Question Answering (BBQ) for measuring social bias in "
+            "question answering in ambiguous and unambigous context [(Parrish et al., "
+            "2022)](https://aclanthology.org/2022.findings-acl.165/).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
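Both `get_metadata` overrides return the new `ScenarioMetadata` record, with an embedded `TaxonomyInfo` from the `taxonomy_info.py` module added in this release. As a rough sketch only, a hypothetical consumer of the hook might look like the following (limited to fields the diff actually shows; the real wiring lives in `helm/benchmark/presentation/summarize.py`):

# Hypothetical consumer of the new get_metadata() hook; not from the diff.
def describe(scenario) -> str:
    md = scenario.get_metadata()
    return f"{md.display_name}: main metric {md.main_metric!r} on the {md.main_split!r} split"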