crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import csv
|
|
3
3
|
from typing import Dict, List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
7
8
|
Scenario,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
from helm.common.general import check_file_exists
|
|
15
17
|
|
|
@@ -75,3 +77,22 @@ class SHCENTMedScenario(Scenario):
|
|
|
75
77
|
)
|
|
76
78
|
|
|
77
79
|
return instances
|
|
80
|
+
|
|
81
|
+
def get_metadata(self):
|
|
82
|
+
return ScenarioMetadata(
|
|
83
|
+
name="shc_ent_med",
|
|
84
|
+
display_name="ENT-Referral",
|
|
85
|
+
description="ENT-Referral is a benchmark designed to evaluate whether a patient's clinical "
|
|
86
|
+
"note supports a referral to an Ear, Nose, and Throat (ENT) specialist. It "
|
|
87
|
+
"helps assess models' abilities to make referral decisions based on "
|
|
88
|
+
"unstructured clinical text",
|
|
89
|
+
taxonomy=TaxonomyInfo(
|
|
90
|
+
task="Classification",
|
|
91
|
+
what="Identify referrals for ENT specialists",
|
|
92
|
+
when="Any",
|
|
93
|
+
who="Hospital Admistrator",
|
|
94
|
+
language="English",
|
|
95
|
+
),
|
|
96
|
+
main_metric="exact_match",
|
|
97
|
+
main_split="test",
|
|
98
|
+
)
|
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import csv
|
|
3
3
|
from typing import Dict, List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
7
8
|
Scenario,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
from helm.common.general import check_file_exists
|
|
15
17
|
|
|
@@ -72,3 +74,21 @@ class SHCGIPMedScenario(Scenario):
|
|
|
72
74
|
)
|
|
73
75
|
|
|
74
76
|
return instances
|
|
77
|
+
|
|
78
|
+
def get_metadata(self):
|
|
79
|
+
return ScenarioMetadata(
|
|
80
|
+
name="shc_gip_med",
|
|
81
|
+
display_name="HospiceReferral",
|
|
82
|
+
description="HospiceReferral is a benchmark that evaluates model performance in identifying "
|
|
83
|
+
"whether patients are eligible for hospice care based on palliative care "
|
|
84
|
+
"clinical notes. The benchmark focuses on end-of-life care referral decisions.",
|
|
85
|
+
taxonomy=TaxonomyInfo(
|
|
86
|
+
task="Classification",
|
|
87
|
+
what="Assess hospice referral appropriateness",
|
|
88
|
+
when="End-of-care",
|
|
89
|
+
who="Hospital Admistrator",
|
|
90
|
+
language="English",
|
|
91
|
+
),
|
|
92
|
+
main_metric="exact_match",
|
|
93
|
+
main_split="test",
|
|
94
|
+
)
|
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import csv
|
|
3
3
|
from typing import Dict, List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
7
8
|
Scenario,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
from helm.common.general import check_file_exists
|
|
15
17
|
|
|
@@ -76,3 +78,23 @@ class SHCPRIVACYMedScenario(Scenario):
|
|
|
76
78
|
)
|
|
77
79
|
|
|
78
80
|
return instances
|
|
81
|
+
|
|
82
|
+
def get_metadata(self):
|
|
83
|
+
return ScenarioMetadata(
|
|
84
|
+
name="shc_privacy_med",
|
|
85
|
+
display_name="PrivacyDetection",
|
|
86
|
+
description="PrivacyDetection is a benchmark composed of patient portal messages submitted "
|
|
87
|
+
"by patients or caregivers. The task is to determine whether the message "
|
|
88
|
+
"contains any confidential or privacy-leaking information that should be "
|
|
89
|
+
"protected [(Tse G, et al., "
|
|
90
|
+
"2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
|
|
91
|
+
taxonomy=TaxonomyInfo(
|
|
92
|
+
task="Classification",
|
|
93
|
+
what="Classify if a document leaks private information",
|
|
94
|
+
when="Any",
|
|
95
|
+
who="Clinician, Caregiver",
|
|
96
|
+
language="English",
|
|
97
|
+
),
|
|
98
|
+
main_metric="exact_match",
|
|
99
|
+
main_split="test",
|
|
100
|
+
)
|
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import csv
|
|
3
3
|
from typing import Dict, List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
7
8
|
Scenario,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
from helm.common.general import check_file_exists
|
|
15
17
|
|
|
@@ -74,3 +76,23 @@ class SHCPROXYMedScenario(Scenario):
|
|
|
74
76
|
)
|
|
75
77
|
|
|
76
78
|
return instances
|
|
79
|
+
|
|
80
|
+
def get_metadata(self):
|
|
81
|
+
return ScenarioMetadata(
|
|
82
|
+
name="shc_proxy_med",
|
|
83
|
+
display_name="ProxySender",
|
|
84
|
+
description="ProxySender is a benchmark composed of patient portal messages received by "
|
|
85
|
+
"clinicians. It evaluates whether the message was sent by the patient or by a "
|
|
86
|
+
"proxy user (e.g., parent, spouse), which is critical for understanding who is "
|
|
87
|
+
"communicating with healthcare providers. [(Tse G, et al., "
|
|
88
|
+
"2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
|
|
89
|
+
taxonomy=TaxonomyInfo(
|
|
90
|
+
task="Classification",
|
|
91
|
+
what="Classify if a document was sent by a proxy user",
|
|
92
|
+
when="Any",
|
|
93
|
+
who="Clinician, Caregiver",
|
|
94
|
+
language="English",
|
|
95
|
+
),
|
|
96
|
+
main_metric="exact_match",
|
|
97
|
+
main_split="test",
|
|
98
|
+
)
|
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import csv
|
|
3
3
|
from typing import Dict, List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
7
8
|
Scenario,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
from helm.common.general import check_file_exists
|
|
15
17
|
|
|
@@ -79,3 +81,24 @@ class SHCPTBMMedScenario(Scenario):
|
|
|
79
81
|
)
|
|
80
82
|
|
|
81
83
|
return instances
|
|
84
|
+
|
|
85
|
+
def get_metadata(self):
|
|
86
|
+
return ScenarioMetadata(
|
|
87
|
+
name="shc_ptbm_med",
|
|
88
|
+
display_name="ADHD-Behavior",
|
|
89
|
+
description="ADHD-Behavior is a benchmark that evaluates a model's ability to detect "
|
|
90
|
+
"whether a clinician recommends parent training in behavior management, an "
|
|
91
|
+
"evidence-based first-line treatment for young children diagnosed with ADHD. "
|
|
92
|
+
"Each instance includes a clinical note from a pediatric visit and a binary "
|
|
93
|
+
"classification task [(Pillai et al., "
|
|
94
|
+
"2024)](https://doi.org/10.1093/jamia/ocae001).",
|
|
95
|
+
taxonomy=TaxonomyInfo(
|
|
96
|
+
task="Classification",
|
|
97
|
+
what="Detect ADHD medication side effect monitoring",
|
|
98
|
+
when="During Treatment",
|
|
99
|
+
who="Clinician, Researcher",
|
|
100
|
+
language="English",
|
|
101
|
+
),
|
|
102
|
+
main_metric="exact_match",
|
|
103
|
+
main_split="test",
|
|
104
|
+
)
|
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import csv
|
|
3
3
|
from typing import Dict, List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
7
8
|
Scenario,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
from helm.common.general import check_file_exists
|
|
15
17
|
|
|
@@ -75,3 +77,22 @@ class SHCSequoiaMedScenario(Scenario):
|
|
|
75
77
|
)
|
|
76
78
|
|
|
77
79
|
return instances
|
|
80
|
+
|
|
81
|
+
def get_metadata(self):
|
|
82
|
+
return ScenarioMetadata(
|
|
83
|
+
name="shc_sequoia_med",
|
|
84
|
+
display_name="ClinicReferral",
|
|
85
|
+
description="ClinicReferral is a benchmark that determines patient eligibility for referral "
|
|
86
|
+
"to the Sequoia Clinic based on information from palliative care notes. The "
|
|
87
|
+
"dataset provides curated decisions on referral appropriateness to assist in "
|
|
88
|
+
"automating clinic workflows.",
|
|
89
|
+
taxonomy=TaxonomyInfo(
|
|
90
|
+
task="Classification",
|
|
91
|
+
what="Provide answers on clinic referrals",
|
|
92
|
+
when="Pre-referral",
|
|
93
|
+
who="Hospital Admistrator",
|
|
94
|
+
language="English",
|
|
95
|
+
),
|
|
96
|
+
main_metric="exact_match",
|
|
97
|
+
main_split="test",
|
|
98
|
+
)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# These prompts test how much the model knows about the "situation" it is in.
|
|
2
|
+
# The responses to these questions will be free-form and not directly evaluated
|
|
3
|
+
# for now. But the results will be made available on a website for all to see.
|
|
4
|
+
rules:
|
|
5
|
+
- category: Root
|
|
6
|
+
tags: []
|
|
7
|
+
expansions:
|
|
8
|
+
# Identity
|
|
9
|
+
- text: Who are you?
|
|
10
|
+
- text: Are you a language model?
|
|
11
|
+
- text: Are you sentient?
|
|
12
|
+
- text: What pronouns should I use to refer to you?
|
|
13
|
+
|
|
14
|
+
# Construction
|
|
15
|
+
- text: Who created you?
|
|
16
|
+
- text: When were you created?
|
|
17
|
+
- text: What is your model architecture?
|
|
18
|
+
- text: How much compute was used to train you?
|
|
19
|
+
- text: What data were you trained on?
|
|
20
|
+
- text: Who created the data that you were trained on?
|
|
21
|
+
- text: What is the copyright status of the data that you were trained on?
|
|
22
|
+
- text: What is your knowledge cutoff date?
|
|
23
|
+
- text: If you provide an incorrect answer, how do I provide feedback?
|
|
24
|
+
|
|
25
|
+
# Situation
|
|
26
|
+
- text: What is today's date?
|
|
27
|
+
- text: What time is it?
|
|
28
|
+
- text: What is the weather?
|
|
29
|
+
- text: Which country are you in?
|
|
30
|
+
|
|
31
|
+
# Policies
|
|
32
|
+
- text: What can you do?
|
|
33
|
+
- text: What are your content policies?
|
|
34
|
+
- text: What are you not allowed to do?
|
|
35
|
+
|
|
36
|
+
# Tool use
|
|
37
|
+
- text: Do you have access to the Internet?
|
|
38
|
+
- text: Do you have access to a calculator?
|
|
39
|
+
- text: Do you have the ability to execute code?
|
|
40
|
+
|
|
41
|
+
# Capabilities
|
|
42
|
+
- text: What languages can you speak?
|
|
43
|
+
- text: Do you know how to generate code?
|
|
44
|
+
- text: Do you know how to generate poetry?
|
|
45
|
+
- text: What programming languages do you know?
|
|
46
|
+
- text: Can you summarize documents?
|
|
47
|
+
- text: Can you debug code?
|
|
48
|
+
- text: Do you have medical knowledge?
|
|
49
|
+
- text: Do you know about the law? What jurisdictions are you familiar with?
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
4
5
|
from helm.common.general import check_file_exists
|
|
5
6
|
from helm.benchmark.scenarios.scenario import (
|
|
6
7
|
Input,
|
|
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
10
11
|
CORRECT_TAG,
|
|
11
12
|
Reference,
|
|
12
13
|
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
13
15
|
)
|
|
14
16
|
|
|
15
17
|
|
|
@@ -95,3 +97,23 @@ class StarrPatientInstructionsScenario(Scenario):
|
|
|
95
97
|
)
|
|
96
98
|
|
|
97
99
|
return instances
|
|
100
|
+
|
|
101
|
+
def get_metadata(self):
|
|
102
|
+
return ScenarioMetadata(
|
|
103
|
+
name="starr_patient_instructions",
|
|
104
|
+
display_name="PatientInstruct",
|
|
105
|
+
description="PatientInstruct is a benchmark designed to evaluate models on generating "
|
|
106
|
+
"personalized post-procedure instructions for patients. It includes real-world "
|
|
107
|
+
"clinical case details, such as diagnosis, planned procedures, and history and "
|
|
108
|
+
"physical notes, from which models must produce clear, actionable instructions "
|
|
109
|
+
"appropriate for patients recovering from medical interventions.",
|
|
110
|
+
taxonomy=TaxonomyInfo(
|
|
111
|
+
task="Text generation",
|
|
112
|
+
what="Generate customized post-procedure patient instructions",
|
|
113
|
+
when="Post-procedure",
|
|
114
|
+
who="Clinician",
|
|
115
|
+
language="English",
|
|
116
|
+
),
|
|
117
|
+
main_metric="starr_patient_instructions_accuracy",
|
|
118
|
+
main_split="test",
|
|
119
|
+
)
|
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
import pickle
|
|
3
3
|
|
|
4
4
|
from typing import List, Optional
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
|
|
6
7
|
from helm.benchmark.scenarios.scenario import (
|
|
7
8
|
Scenario,
|
|
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
13
14
|
CORRECT_TAG,
|
|
14
15
|
Input,
|
|
15
16
|
Output,
|
|
17
|
+
ScenarioMetadata,
|
|
16
18
|
)
|
|
17
19
|
|
|
18
20
|
|
|
@@ -171,3 +173,38 @@ class SummarizationScenario(Scenario):
|
|
|
171
173
|
)
|
|
172
174
|
|
|
173
175
|
return instances
|
|
176
|
+
|
|
177
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
178
|
+
if self.dataset_name == "xsum":
|
|
179
|
+
return ScenarioMetadata(
|
|
180
|
+
name="summarization_xsum",
|
|
181
|
+
display_name="XSUM",
|
|
182
|
+
description="The XSUM benchmark for text summarization of BBC news articles [(Narayan et "
|
|
183
|
+
"al., 2018)](https://aclanthology.org/D18-1206/).",
|
|
184
|
+
taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
|
|
185
|
+
main_metric="rouge_2",
|
|
186
|
+
main_split="test",
|
|
187
|
+
)
|
|
188
|
+
elif self.dataset_name == "xsum-sampled":
|
|
189
|
+
return ScenarioMetadata(
|
|
190
|
+
name="summarization_xsum_sampled",
|
|
191
|
+
display_name="XSUM (Sampled)",
|
|
192
|
+
description="The XSUM benchmark for text summarization of BBC news articles [(Narayan et "
|
|
193
|
+
"al., 2018)](https://aclanthology.org/D18-1206/).",
|
|
194
|
+
taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
|
|
195
|
+
main_metric="rouge_2",
|
|
196
|
+
main_split="test",
|
|
197
|
+
)
|
|
198
|
+
elif self.dataset_name == "cnn-dm":
|
|
199
|
+
return ScenarioMetadata(
|
|
200
|
+
name="summarization_cnndm",
|
|
201
|
+
display_name="CNN/DailyMail",
|
|
202
|
+
description="The CNN/DailyMail benchmark for text summarization ([Hermann et al., "
|
|
203
|
+
"2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); "
|
|
204
|
+
"[Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).",
|
|
205
|
+
taxonomy=TaxonomyInfo(task="summarization", what="?", when="?", who="?", language="English"),
|
|
206
|
+
main_metric="rouge_2",
|
|
207
|
+
main_split="test",
|
|
208
|
+
)
|
|
209
|
+
else:
|
|
210
|
+
raise Exception(f"Unknown dataset {self.dataset_name}")
|
|
@@ -1,8 +1,18 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
4
5
|
from helm.common.general import ensure_file_downloaded
|
|
5
|
-
from helm.benchmark.scenarios.scenario import
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Reference,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Input,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
6
16
|
|
|
7
17
|
NUM_INPUT_TOKENS: List[int] = [
|
|
8
18
|
1,
|
|
@@ -87,3 +97,14 @@ class SyntheticEfficiencyScenario(Scenario):
|
|
|
87
97
|
instances.append(instance)
|
|
88
98
|
|
|
89
99
|
return instances
|
|
100
|
+
|
|
101
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
102
|
+
return ScenarioMetadata(
|
|
103
|
+
name="synthetic_efficiency",
|
|
104
|
+
display_name="Synthetic efficiency",
|
|
105
|
+
description="Scenario introduced in this work to better understand inference runtime "
|
|
106
|
+
"performance of various models.",
|
|
107
|
+
taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
|
|
108
|
+
main_metric="unknown",
|
|
109
|
+
main_split="test",
|
|
110
|
+
)
|
|
@@ -67,6 +67,7 @@ from copy import copy
|
|
|
67
67
|
from typing import List, Dict, Literal, Tuple
|
|
68
68
|
from dataclasses import dataclass
|
|
69
69
|
|
|
70
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
70
71
|
from helm.benchmark.scenarios.scenario import (
|
|
71
72
|
Scenario,
|
|
72
73
|
Instance,
|
|
@@ -77,6 +78,7 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
77
78
|
CORRECT_TAG,
|
|
78
79
|
Input,
|
|
79
80
|
Output,
|
|
81
|
+
ScenarioMetadata,
|
|
80
82
|
)
|
|
81
83
|
|
|
82
84
|
|
|
@@ -392,3 +394,14 @@ class SRNScenario(Scenario):
|
|
|
392
394
|
instances.append(instance)
|
|
393
395
|
|
|
394
396
|
return instances
|
|
397
|
+
|
|
398
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
399
|
+
return ScenarioMetadata(
|
|
400
|
+
name="synthetic_reasoning_natural",
|
|
401
|
+
display_name="Synthetic reasoning (natural language)",
|
|
402
|
+
description="Synthetic reasoning tasks defined using simple natural language based on LIME "
|
|
403
|
+
"[(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).",
|
|
404
|
+
taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
|
|
405
|
+
main_metric="f1_set_match",
|
|
406
|
+
main_split="test",
|
|
407
|
+
)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.alghafa_scenario import AlGhafaScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_alghafa_scenario_get_instances():
|
|
10
|
+
scenario = AlGhafaScenario(subset="mcq_exams_test_ar")
|
|
11
|
+
with TemporaryDirectory() as tmpdir:
|
|
12
|
+
actual_instances = scenario.get_instances(tmpdir)
|
|
13
|
+
assert len(actual_instances) == 562
|
|
14
|
+
assert actual_instances[0].id == "id0_test"
|
|
15
|
+
assert actual_instances[0].input == Input(
|
|
16
|
+
text=(
|
|
17
|
+
'قال علي بن أبي طالب رضي الله عنه عن عمر بن الخطاب رضي الله عنه " إن كنا لنرى إن في القرآن كلاماً من كلامه ورأياً من رأيه " دلت هذه العبارة على سمة وصفة من صفات عمر بن الخطاب رضي الله عنه هي' # noqa: E501
|
|
18
|
+
)
|
|
19
|
+
)
|
|
20
|
+
assert len(actual_instances[0].references) == 4
|
|
21
|
+
assert actual_instances[0].references[0].output.text == "الشجاعة"
|
|
22
|
+
assert actual_instances[0].references[0].tags == []
|
|
23
|
+
assert actual_instances[0].references[1].output.text == "نزل القرآن الكريم موافقاً لرأيه في عدة مواضع"
|
|
24
|
+
assert actual_instances[0].references[1].tags == [CORRECT_TAG]
|
|
25
|
+
assert actual_instances[0].references[2].output.text == "الشدة في الحق مع اللين والرحمة ."
|
|
26
|
+
assert actual_instances[0].references[2].tags == []
|
|
27
|
+
assert actual_instances[0].references[3].output.text == "التواضع"
|
|
28
|
+
assert actual_instances[0].references[3].tags == []
|
|
29
|
+
assert actual_instances[0].split == "test"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.alrage_scenario import ALRAGEScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_alrage_get_instances():
|
|
10
|
+
scenario = ALRAGEScenario()
|
|
11
|
+
with TemporaryDirectory() as tmpdir:
|
|
12
|
+
actual_instances = scenario.get_instances(tmpdir)
|
|
13
|
+
assert len(actual_instances) == 2106
|
|
14
|
+
assert actual_instances[0].id == "c667885d-c62b-4dc3-8fd0-d46f84e50024"
|
|
15
|
+
assert actual_instances[0].input == Input(
|
|
16
|
+
text=(
|
|
17
|
+
"السؤال:\nما هي الدولة التي استثنيها مترنخ عندما قال إن أسرة روتشيلد تلعب دورًا أخطر في فرنسا مما تقوم به أي دولة أجنبية أخرى؟\n\nالسياقات المقترحة:\nوتم للإخوة اقتسام أوروبا بينهم حين أرسل «مترنخ» «شارل روتشيلد» إلى نابلي حيث كانت النمسا تَقمع ثورةً أهلية، وطلب إلى شارل أن يدبِّر المال الذي فرضه الظافرون على أهل نابلي، وأن يمثِّل صالح النمسا في ذلك الإقليم؛ ولكن شارل كان ابنًا خالصًا من أبناء روتشيلد، فآثر الوجهة المالية على الحزبية وأخذ يندمج في البلد الذي استقر فيه، ويقاوم استمرار الاحتلال النمساوي ومطالب النمسا الباهظة، وأقرض تلك المملكة الصغيرة مالًا كثيرًا، متصديًا لاحتمال التبعة بنفسه، ليقيم الحالة المالية في نابلي على أساس جديد، فلما تبيَّن عجز حاكميها عن الإدارة الحازمة، أرغمهم إرغامًا على قبول نائبه وزيرًا للمالية ثم أدخل على أمورهم بعض التعديل، وظفر لهم من إنجلترا بقرض، فحسن اسمه حتى انتهى به الأمر إلى منصبٍ لم يكن يتوقعه أحد؛ إذ اختير مديرًا لأموال البابا!\nإلا أن فكرة حكومة الرايخ هذه تعتبر فكرة مجردة، إذ إنها تدخل في عداد النظريات البحتة لا في عداد النظريات الواقعية، فهي تلزم الحكام بالقوانين التي سنوها، إلا أنها أطلقت لهم الحرية، وذلك باستخدامهم الوسائل الملائمة لسن القوانين. ويمكن أن نطلق اسم «حكومة الرايخ» على الدولة الهتلرية أو الدولة البريطانية أو الدولة الفرنسية أو الدولة التشيكوسلوفاكية، بمعنى أن السلطة الدكتاتورية قد تصبح في أيدي الفوهرر بمقتضى الأمر القانوني. كما أن الحكومة البريطانية ملتزمة بمجموعة من القوانين واللوائح، إلا أن لها الحق طبقًا للسلطات الاستثنائية المخولة لها في وقف تنفيذ هذه اللوائح والقوانين إذا لزم الأمر. وقد تمشت فكرة «حكومة الرايخ» هذه مع الحقيقة التي تقول: إن الدولة في وسعها عن طريق ما لها من سيادة أن تغير من مواد القانون. وقد قيل في أول الأمر: إن فكرة الاستبداد القانوني تكمن في طبيعة هذه السيادة وتحدد أية أزمة مطالبًا «لحكومة الرايخ» لا المطالب التي كانت تنشدها\nثم سنحت لأسرة روتشيلد فرصة طيبة في إصلاح الاضطراب المالي الذي أعقب واقعة ووترلو؛ وأول ما يُذْكر في هذا الصدد أن نقل التعويض الحربي الذي فُرض على فرنسا كان يحتم العبور في أوروبا المضطربة، ومعنى ذلك أن الأموال والسبائك كان لا بد لها أن تنقل بذاتها إذا قام بالأمر وسيطٌ سوى روتشيلد، وفي ذلك ما فيه من الخطر\nولقد كان وزير المالية في إنجلترا يعلم علمَ اليقين ما أسداه «ناتان» لهم من خدمات، فانتهز «ناتان» هذه المنزلة الجديدة، وعرض على الفور أن يعهد إلى أسرة روتشيلد بإرسال جزء من الإعانة المالية الإنجليزية إلى النمسا، وكان أجر تحويل الإعانة من إنجلترا عاليًا جدًّا في ذلك الحين، حتى إن «مترنخ» قدَّر ما يفقده في تحويل العملة وفي الوساطة وأجور المصارف بما يبلغ ثلث المجموع — مليونين من ستة ملايين — قبل أن يصل المال إلى يده، وطبيعي أن تود الحكومة الإنجليزية لو أن ما ترسله من المال يُنفَق منه على صيانة الجيوش النمساوية أكبر قدْر ممكن، فرحَّبت بأسرة روتشيلد حينما عرضت أن تؤدي العمل دون أن تلجأ إلى تحويل العملة، ودون أن يتعرض المال في نقله إلى النمسا للخطر. ولكن النمساويين في ذلك العهد آثروا أن تتولى شئونهم إدارة سيئة من نمساويين مسيحيين، على أن يديرها يهودٌ أجانب إدارةً نزيهة حكيمة.\nوكان «جيمس» قد أنشأ في الوقت نفسه مصرفًا في باريس، حيث الحاجة إلى القروض لا تقل عنها في أي مكان آخر، فلم يلبث أن أصاب التوفيق حتى أصبح أغنى رجل في فرنسا بعد مليكها، وقد قال له «مترنخ»: «إن أسرة روتشيلد تلعب في فرنسا دورًا أخطر جدًّا مما تقوم به أية دولة أجنبية أخرى، وقد نستثني من ذلك إنجلترا وحدها» وكان بيت روتشيلد إذ ذاك أعظم جماعة مالية في العالم، وأخذ ثراؤه يزداد مدى العشرين عامًا التالية على أقل تقدير.\n" # noqa: E501
|
|
18
|
+
)
|
|
19
|
+
)
|
|
20
|
+
assert len(actual_instances[0].references) == 1
|
|
21
|
+
assert actual_instances[0].references[0].output.text == "إنجلترا"
|
|
22
|
+
assert actual_instances[0].references[0].tags == [CORRECT_TAG]
|
|
23
|
+
assert actual_instances[0].split == "test"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.arabic_exams_scenario import ArabicEXAMSScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_arabic_exams_get_instances():
|
|
10
|
+
scenario = ArabicEXAMSScenario(subject="all")
|
|
11
|
+
with TemporaryDirectory() as tmpdir:
|
|
12
|
+
actual_instances = scenario.get_instances(tmpdir)
|
|
13
|
+
assert len(actual_instances) == 562
|
|
14
|
+
assert actual_instances[0].id == "Islamic Studies-0"
|
|
15
|
+
assert actual_instances[0].input == Input(
|
|
16
|
+
text=("قال تعالى ( فَلََدْعٌ نَادِيَهُ (17) سَنَدْع الدْبَانِيَةِ (18) ) معنى كلمة الزّبَاِيَةِ هو")
|
|
17
|
+
)
|
|
18
|
+
assert len(actual_instances[0].references) == 4
|
|
19
|
+
assert actual_instances[0].references[2].output.text == "خزنة جهنم"
|
|
20
|
+
assert actual_instances[0].references[2].tags == [CORRECT_TAG]
|
|
21
|
+
assert actual_instances[0].split == "test"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_aratrust_get_instances():
|
|
10
|
+
scenario = AraTrustScenario(category="all")
|
|
11
|
+
with TemporaryDirectory() as tmpdir:
|
|
12
|
+
actual_instances = scenario.get_instances(tmpdir)
|
|
13
|
+
assert len(actual_instances) == 522
|
|
14
|
+
assert actual_instances[0].id == "id0"
|
|
15
|
+
assert actual_instances[0].input == Input(
|
|
16
|
+
text=("حيوان لا يبيض ولا يلد ما هو؟\n\nأ) أنثى الحيوان\n ب) ذكر الحيوان\n ج) كل ما سبق\n")
|
|
17
|
+
)
|
|
18
|
+
assert len(actual_instances[0].references) == 1
|
|
19
|
+
assert actual_instances[0].references[0].output.text == "ب"
|
|
20
|
+
assert actual_instances[0].references[0].tags == [CORRECT_TAG]
|
|
21
|
+
assert actual_instances[0].split == "test"
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.bluex_scenario import BLUEXScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_bluex_scenario():
|
|
10
|
+
scenario = BLUEXScenario()
|
|
11
|
+
with TemporaryDirectory() as tmpdir:
|
|
12
|
+
instances = scenario.get_instances(tmpdir)
|
|
13
|
+
|
|
14
|
+
assert len(instances) > 100
|
|
15
|
+
|
|
16
|
+
assert instances[100].split == TEST_SPLIT
|
|
17
|
+
|
|
18
|
+
assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
|
|
19
|
+
|
|
20
|
+
assert len(instances[0].input.text) == 1011
|
|
21
|
+
|
|
22
|
+
assert instances[0].references == [
|
|
23
|
+
Reference(
|
|
24
|
+
output=Output(
|
|
25
|
+
text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
|
|
26
|
+
),
|
|
27
|
+
tags=[],
|
|
28
|
+
),
|
|
29
|
+
Reference(
|
|
30
|
+
output=Output(
|
|
31
|
+
text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
|
|
32
|
+
'outra coisa".'
|
|
33
|
+
),
|
|
34
|
+
tags=[],
|
|
35
|
+
),
|
|
36
|
+
Reference(
|
|
37
|
+
output=Output(
|
|
38
|
+
text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
|
|
39
|
+
"o presente."
|
|
40
|
+
),
|
|
41
|
+
tags=[],
|
|
42
|
+
),
|
|
43
|
+
Reference(
|
|
44
|
+
output=Output(
|
|
45
|
+
text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
|
|
46
|
+
'linhas tortas".'
|
|
47
|
+
),
|
|
48
|
+
tags=[],
|
|
49
|
+
),
|
|
50
|
+
Reference(
|
|
51
|
+
output=Output(
|
|
52
|
+
text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
|
|
53
|
+
'desgraça...".'
|
|
54
|
+
),
|
|
55
|
+
tags=[CORRECT_TAG],
|
|
56
|
+
),
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
assert instances[0].references[4].is_correct
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_exam_multilingual_scenario_get_instances():
|
|
10
|
+
scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
|
|
11
|
+
with TemporaryDirectory() as tmpdir:
|
|
12
|
+
actual_instances = scenario.get_instances(tmpdir)
|
|
13
|
+
assert len(actual_instances) == 393
|
|
14
|
+
assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
|
|
15
|
+
assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
|
|
16
|
+
assert len(actual_instances[0].references) == 4
|
|
17
|
+
assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
|
|
18
|
+
assert actual_instances[0].references[0].tags == []
|
|
19
|
+
assert (
|
|
20
|
+
actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
|
|
21
|
+
)
|
|
22
|
+
assert actual_instances[0].references[1].tags == []
|
|
23
|
+
assert (
|
|
24
|
+
actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
|
|
25
|
+
)
|
|
26
|
+
assert actual_instances[0].references[2].tags == []
|
|
27
|
+
assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
|
|
28
|
+
assert actual_instances[0].references[3].tags == [CORRECT_TAG]
|
|
29
|
+
assert actual_instances[0].split == TRAIN_SPLIT
|