crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,7 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_GENERATION,
 )
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.runner import get_benchmark_output_path
 from helm.common.general import (
     assert_is_str,
@@ -27,6 +28,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
     get_scenario_cache_path,
 )
 from helm.benchmark.scenarios.code_scenario import CodeReference, CodeInstance
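The two import hunks above bring in TaxonomyInfo and ScenarioMetadata, which each of the new get_metadata() methods in the hunks below constructs. A minimal, self-contained sketch of that metadata shape (the dataclasses here are local stand-ins mirroring only the fields used in this diff, not the real classes from helm.benchmark.presentation.taxonomy_info and helm.benchmark.scenarios.scenario):

# Sketch only: stand-in dataclasses, not the real HELM classes.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None
    what: Optional[str] = None
    when: Optional[str] = None
    who: Optional[str] = None
    language: Optional[str] = None


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str
    display_name: str
    description: str
    taxonomy: TaxonomyInfo
    main_metric: str
    main_split: str


# Mirrors CLEVATextClassificationScenario.get_metadata() from the first hunk below.
metadata = ScenarioMetadata(
    name="cleva_text_classification",
    display_name="CLEVA (Chinese) text classification",
    description="Classify if an utterance is humorous and identify news topic based on its title.",
    taxonomy=TaxonomyInfo(task="text classification", what="news or chitchat", when="2010s", language="Chinese"),
    main_metric="exact_match",
    main_split="test",
)
print(metadata.display_name, "->", metadata.main_metric)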
@@ -576,6 +578,19 @@ class CLEVATextClassificationScenario(CLEVAScenario):
     def task(self) -> str:
         return "text_classification"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_text_classification",
+            display_name="CLEVA (Chinese) text classification",
+            description="This scenario has two subtasks. Classify if an utterance is humorous and "
+            "identify news topic based on its title.",
+            taxonomy=TaxonomyInfo(
+                task="text classification", what="news or chitchat", when="2010s", who="n/a", language="Chinese"
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAOpinionMiningScenario(CLEVAScenario):
     """
@@ -597,6 +612,18 @@ class CLEVAOpinionMiningScenario(CLEVAScenario):
     def task(self) -> str:
         return "opinion_mining"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_opinion_mining",
+            display_name="CLEVA (Chinese) opinion mining",
+            description="Extract the target of an opinion.",
+            taxonomy=TaxonomyInfo(
+                task="opinion target extraction", what="n/a", when="2010s", who="n/a", language="Chinese"
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAPinyinTransliterationScenario(CLEVAScenario):
     """
@@ -626,6 +653,22 @@ class CLEVAPinyinTransliterationScenario(CLEVAScenario):
     def task(self) -> str:
         return "pinyin_transliteration"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_pinyin_transliteration",
+            display_name="CLEVA (Chinese) pinyin transliteration",
+            description="Scenario that asks the model to translate between Chinese and Pinyin.",
+            taxonomy=TaxonomyInfo(
+                task="pinyin transliteration",
+                what="n/a",
+                when="2023",
+                who="automatically generated by algorithm",
+                language="Chinese, Pinyin",
+            ),
+            main_metric="chinese_bleu_1",
+            main_split="test",
+        )
+

 class CLEVAClassicalChineseUnderstandingScenario(CLEVAScenario):
     """
@@ -651,6 +694,20 @@ class CLEVAClassicalChineseUnderstandingScenario(CLEVAScenario):
     def task(self) -> str:
         return "classical_chinese_understanding"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_classical_chinese_understanding",
+            display_name="CLEVA (Chinese) classical Chinese understanding",
+            description="Scenario for evaluating the understanding of classical Chinese by selecting "
+            "the appropriate classical Chinese translation for a given modern Chinese "
+            "sentence.",
+            taxonomy=TaxonomyInfo(
+                task="translation", what="n/a", when="2021 or before", who="n/a", language="Classical Chinese"
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVASentimentAnalysisScenario(CLEVAScenario):
     """
@@ -674,6 +731,22 @@ class CLEVASentimentAnalysisScenario(CLEVAScenario):
     def task(self) -> str:
         return "sentiment_analysis"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_sentiment_analysis",
+            display_name="Sentiment analysis",
+            description="Chinese sentiment analysis for product reviews.",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="product reviews",
+                when="2021 or before",
+                who="customers",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAInstructionFollowingScenario(CLEVAScenario):
     """
@@ -702,6 +775,24 @@ class CLEVAInstructionFollowingScenario(CLEVAScenario):
     def task(self) -> str:
         return "instruction_following"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_instruction_following",
+            display_name="CLEVA (Chinese) instruction following",
+            description="Scenario that examines whether models could follow human instructions, mainly "
+            "uncommon ones. It contains two subtasks: 'redefine' and "
+            "'pattern_matching_suppression'.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="natural language questions",
+                when="2023",
+                who="automatically generated from templates",
+                language="synthetic",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAFactCheckingScenario(CLEVAScenario):
     """
@@ -726,6 +817,23 @@ class CLEVAFactCheckingScenario(CLEVAScenario):
     def task(self) -> str:
         return "fact_checking"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_fact_checking",
+            display_name="CLEVA (Chinese) fact checking",
+            description="Scenario that lets models identify whether the given fact is true to test "
+            "their factuality.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="factual statements in natural language form",
+                when="2022 or before",
+                who="n/a",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVATranslationScenario(CLEVAScenario):
     """
@@ -758,6 +866,18 @@ class CLEVATranslationScenario(CLEVAScenario):
     def task(self) -> str:
         return "translation"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_translation",
+            display_name="Translation",
+            description="Scenario for measuring the translation quality between Chinese and English.",
+            taxonomy=TaxonomyInfo(
+                task="translation", what="news", when="2022 or before", who="n/a", language="Chinese, English"
+            ),
+            main_metric="cleva_machine_translation_bleu",
+            main_split="test",
+        )
+

 class CLEVAToxicityDetectionScenario(CLEVAScenario):
     """
@@ -781,6 +901,22 @@ class CLEVAToxicityDetectionScenario(CLEVAScenario):
     def task(self) -> str:
         return "toxicity_detection"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_toxicity_detection",
+            display_name="CLEVA (Chinese) toxicity detection",
+            description="Ask models about the offensiveness of the given text.",
+            taxonomy=TaxonomyInfo(
+                task="toxicity classification",
+                what="text from Chinese social media",
+                when="2022 or before",
+                who="web users",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAParaphraseGenerationScenario(CLEVAScenario):
     """
@@ -814,6 +950,18 @@ class CLEVAParaphraseGenerationScenario(CLEVAScenario):
         )
         return instance

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_paraphrase_generation",
+            display_name="CLEVA (Chinese) paraphrase generation",
+            description="Generate a paraphrase of a given sentence.",
+            taxonomy=TaxonomyInfo(
+                task="paraphrase generation", what="n/a", when="2010s", who="n/a", language="Chinese"
+            ),
+            main_metric="chinese_ibleu",
+            main_split="test",
+        )
+

 class CLEVAIntentUnderstandingScenario(CLEVAScenario):
     """
@@ -848,6 +996,19 @@ class CLEVAIntentUnderstandingScenario(CLEVAScenario):
     def task(self) -> str:
         return "intent_understanding"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_intent_understanding",
+            display_name="CLEVA (Chinese) intent understanding",
+            description="Tests whether the model could capture the writing intention of the authors "
+            "after reading an article.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering", what="exam", when="1990-2022", who="n/a", language="Chinese"
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVACoreferenceResolutionScenario(CLEVAScenario):
     """
@@ -870,6 +1031,23 @@ class CLEVACoreferenceResolutionScenario(CLEVAScenario):
     def task(self) -> str:
         return "coreference_resolution"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_coreference_resolution",
+            display_name="CLEVA (Chinese) coreference resolution",
+            description="Scenario for testing models on solving coreference resolution problems (the "
+            "winograd schema challenge).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="contemporary Chinese literary works",
+                when="2020 or before",
+                who="n/a",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAReadingComprehensionScenario(CLEVAScenario):
     """
@@ -898,6 +1076,18 @@ class CLEVAReadingComprehensionScenario(CLEVAScenario):
     def task(self) -> str:
         return "reading_comprehension"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_reading_comprehension",
+            display_name="Reading comprehension",
+            description="Answer a multiple-choice question based on a given paragraph.",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="n/a", when="2019 or before", who="n/a", language="Chinese"
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVADialogueGenerationScenario(CLEVAScenario):
     """
@@ -964,6 +1154,22 @@ class CLEVADialogueGenerationScenario(CLEVAScenario):

         return instances

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_dialogue_generation",
+            display_name="CLEVA (Chinese) dialogue generation",
+            description="Task-oriented dialogue between a user and a system.",
+            taxonomy=TaxonomyInfo(
+                task="dialogue generation",
+                what="task-oriented dialogue on hotel, restaurant, attraction, metro, " "and taxi domain",
+                when="2020 or before",
+                who="user and assistant",
+                language="Chinese",
+            ),
+            main_metric="chinese_bleu_1",
+            main_split="test",
+        )
+

 class CLEVASubjectKnowledgeScenario(CLEVAScenario):
     """
@@ -993,6 +1199,24 @@ class CLEVASubjectKnowledgeScenario(CLEVAScenario):
     def task(self) -> str:
         return "subject_knowledge"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_subject_knowledge",
+            display_name="CLEVA (Chinese) subject knowledge",
+            description="Scenario inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/) to extensively test factual "
+            "knowledge in Chinese. It contains 13 subjects and a general domain.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="2022 or before",
+                who="automatically generated from templates",
+                language="structured Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVACulturalKnowledgeScenario(CLEVAScenario):
     """
@@ -1022,6 +1246,23 @@ class CLEVACulturalKnowledgeScenario(CLEVAScenario):
     def task(self) -> str:
         return "cultural_knowledge"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_cultural_knowledge",
+            display_name="CLEVA (Chinese) cultural knowledge",
+            description="Scenario for evaluating models' understanding of Chinese culture. It has a "
+            "Chinese-idiom-focused subtask.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="Various passages containing Chinese idioms",
+                when="2010s",
+                who="n/a",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAParaphraseIdentificationScenario(CLEVAScenario):
     """
@@ -1057,6 +1298,23 @@ class CLEVAParaphraseIdentificationScenario(CLEVAScenario):
     def task(self) -> str:
         return "paraphrase_identification"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_paraphrase_identification",
+            display_name="Paraphrase identification",
+            description="Identify if two sentences, from a dialogue or from the finance domain, share "
+            "the same meaning.",
+            taxonomy=TaxonomyInfo(
+                task="paraphrase identification",
+                what="financial questions or chitchat",
+                when="2020 or before",
+                who="n/a",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAClosedBookQuestionAnsweringScenario(CLEVAScenario):
     """
@@ -1117,6 +1375,24 @@ class CLEVAClosedBookQuestionAnsweringScenario(CLEVAScenario):
     def task(self) -> str:
         return "closed_book_question_answering"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_closed_book_question_answering",
+            display_name="CLEVA (Chinese) closed book question answering",
+            description="Closed-book question answering task comprises three subtasks. One is for the "
+            "medical domain, another for open-domain, and the last measures if a model "
+            "generates truthful answers.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="medical, open-domain, or truthful qa",
+                when="2022 or before",
+                who="n/a",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVASummarizationScenario(CLEVAScenario):
     """
@@ -1151,6 +1427,22 @@ class CLEVASummarizationScenario(CLEVAScenario):
     def task(self) -> str:
         return "summarization"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_summarization",
+            display_name="CLEVA (Chinese) summarization",
+            description="Summarize a dialogue between a customer representative and a customer.",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="e-commerce dialogues",
+                when="2021 or before",
+                who="customers and representatives",
+                language="Chinese",
+            ),
+            main_metric="chinese_rouge_2",
+            main_split="test",
+        )
+

 class CLEVABiasScenario(CLEVAScenario):
     """
@@ -1215,6 +1507,23 @@ class CLEVABiasScenario(CLEVAScenario):
     def task(self) -> str:
         return "bias"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_bias",
+            display_name="CLEVA (Chinese) bias",
+            description="Scenario that gauges bias of four demographic categories in dialogues, "
+            "including race, gender, region, and occupation.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="short Chinese dialogues from social media",
+                when="2022 or before",
+                who="web users",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVACopyrightScenario(CLEVAScenario):
     """
@@ -1231,6 +1540,20 @@ class CLEVACopyrightScenario(CLEVAScenario):
     def task(self) -> str:
         return "copyright"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_copyright",
+            display_name="CLEVA (Chinese) copyright",
+            description="Scenario that measures copyright and memorization behavior for Chinese books "
+            "and code, based off of [Carlini et al. "
+            "(2021)](https://www.usenix.org/biblio-11958).",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction", what="books and code", when="2023 or before", who="n/a", language="Chinese"
+            ),
+            main_metric="unknown",
+            main_split="test",
+        )
+

 class CLEVAConceptualGeneralizationScenario(CLEVAScenario):
     """
@@ -1257,6 +1580,23 @@ class CLEVAConceptualGeneralizationScenario(CLEVAScenario):
     def task(self) -> str:
         return "conceptual_generalization"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_conceptual_generalization",
+            display_name="CLEVA (Chinese) conceptual generalization",
+            description="Scenario that assesses whether models could generalize physical relations to a "
+            "synthetic grid world.",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction",
+                what="n/a",
+                when="2023",
+                who="automatically generated by algorithm",
+                language="synthetic",
+            ),
+            main_metric="cleva_top1_accuracy",
+            main_split="test",
+        )
+

 class CLEVACommonsenseReasoningScenario(CLEVAScenario):
     """
@@ -1289,6 +1629,23 @@ class CLEVACommonsenseReasoningScenario(CLEVAScenario):
     def task(self) -> str:
         return "commonsense_reasoning"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_commonsense_reasoning",
+            display_name="CLEVA (Chinese) commonsense reasoning",
+            description="Scenario that tests models' commonsense reasoning ability. There are two "
+            "subtasks: textual entailment and commonsense question answering.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="n/a",
+                when="2023 or before",
+                who="n/a",
+                language="Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVADeductiveReasoningScenario(CLEVAScenario):
     """
@@ -1314,6 +1671,23 @@ class CLEVADeductiveReasoningScenario(CLEVAScenario):
     def task(self) -> str:
         return "deductive_reasoning"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_deductive_reasoning",
+            display_name="CLEVA (Chinese) deductive reasoning",
+            description="Scenario that gauges model's ability to reason deductive arguments. It "
+            "includes a modus tollens subtask.",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="natural language questions",
+                when="2023",
+                who="n/a",
+                language="structured Chinese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAMathematicalCalculationScenario(CLEVAScenario):
     """
@@ -1348,6 +1722,24 @@ class CLEVAMathematicalCalculationScenario(CLEVAScenario):
     def task(self) -> str:
         return "mathematical_calculation"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_mathematical_calculation",
+            display_name="CLEVA (Chinese) mathematical calculation",
+            description="Scenario that evaluates the calculation ability of models. It has four "
+            "subtasks: three-digit addition, three-digit subtraction, two-digit "
+            "multiplication, and significant figures.",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction",
+                what="natural language math questions or pure math expressions",
+                when="2023",
+                who="automatically generated from templates",
+                language="synthetic",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAInductiveReasoningScenario(CLEVAScenario):
     """
@@ -1372,6 +1764,23 @@ class CLEVAInductiveReasoningScenario(CLEVAScenario):
     def task(self) -> str:
         return "inductive_reasoning"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_inductive_reasoning",
+            display_name="CLEVA (Chinese) inductive reasoning",
+            description="Scenario that tests models' ability to conclude rules from demonstrations and "
+            "apply them to unseen test instances.",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction",
+                what="n/a",
+                when="2023",
+                who="automatically generated by algorithm",
+                language="synthetic",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAReasoningPrimitiveScenario(CLEVAScenario):
     """
@@ -1427,6 +1836,23 @@ class CLEVAReasoningPrimitiveScenario(CLEVAScenario):
     def task(self) -> str:
         return "reasoning_primitive"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_reasoning_primitive",
+            display_name="CLEVA (Chinese) reasoning primitive",
+            description="Scenario focused on primitive reasoning, including dyck language continuation, "
+            "variable substitution, pattern induction, and pattern matching.",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction",
+                what="n/a",
+                when="2023",
+                who="automatically generated from templates",
+                language="synthetic",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVADataToTextGenerationScenario(CLEVAScenario):
     """
@@ -1470,6 +1896,19 @@ class CLEVADataToTextGenerationScenario(CLEVAScenario):
     def task(self) -> str:
         return "data_to_text_generation"

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_data_to_text_generation",
+            display_name="CLEVA (Chinese) data to text generation",
+            description="Generate a product description based on structured data containing various "
+            "product properties.",
+            taxonomy=TaxonomyInfo(
+                task="data-to-text generation", what="product description", when="2010s", who="n/a", language="Chinese"
+            ),
+            main_metric="chinese_bleu_1",
+            main_split="test",
+        )
+

 class CLEVAMathematicalReasoningScenario(CLEVAScenario):
     """
@@ -1521,6 +1960,20 @@ class CLEVAMathematicalReasoningScenario(CLEVAScenario):
         )
         return instance

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_mathematical_reasoning",
+            display_name="CLEVA (Chinese) mathematical reasoning",
+            description="Scenario that tests models' mathematical reasoning ability with "
+            "chain-of-thought style reasoning. It contains a math word problem solving "
+            "subtask.",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction", what="exam", when="2010s", who="n/a", language="Chinese"
+            ),
+            main_metric="cleva_math_result_match",
+            main_split="test",
+        )
+

 class CLEVALanguageModelingScenario(CLEVAScenario):
     """
@@ -1547,6 +2000,19 @@ class CLEVALanguageModelingScenario(CLEVAScenario):
         )
         return instance

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_language_modeling",
+            display_name="CLEVA (Chinese) language modeling",
+            description="Scenario for measuring language model performance across various domains "
+            "(wikipedia and news).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling", what="Wikipedia and news", when="2010s", who="n/a", language="Chinese"
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
+

 class CLEVACodeSynthesisScenario(CLEVAScenario):
     r"""
@@ -1580,6 +2046,19 @@ class CLEVACodeSynthesisScenario(CLEVAScenario):
         instance = self.converter.transform_code(row, self.prompt_template, split)
         return instance

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cleva_code_synthesis",
+            display_name="CLEVA (Chinese) code synthesis",
+            description="Scenario for measuring functional correctness for synthesizing programs from "
+            "Chinese docstrings.",
+            taxonomy=TaxonomyInfo(
+                task="next-word prediction", what="n/a", when="2023", who="n/a", language="synthetic"
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+

 class CLEVAKeyphraseExtractionScenario(CLEVAScenario):
     """