crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
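The `{image2structure → image2struct}` entries above are pure package renames, so any downstream code importing from the old paths must switch to the new ones. A minimal sketch of the import change, using the `latex_compiler_annotator` module from the listing (the `LatexCompilerAnnotator` class name is not shown in this diff and is assumed here only for illustration):

    # crfm-helm 0.5.1 (old package path) — assumed class name, for illustration only:
    # from helm.benchmark.annotation.image2structure.latex_compiler_annotator import LatexCompilerAnnotator
    # crfm-helm 0.5.3 (renamed package path):
    from helm.benchmark.annotation.image2struct.latex_compiler_annotator import LatexCompilerAnnotator

The largest single addition is the new BHASA run-spec module, shown in full below.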
helm/benchmark/run_specs/bhasa_run_specs.py (new file, +638 lines):

@@ -0,0 +1,638 @@
from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
)
from helm.benchmark.adaptation.common_adapter_specs import (
    get_generation_adapter_spec,
    get_multiple_choice_separate_adapter_spec,
)
from helm.benchmark.metrics.bhasa_metrics_specs import (
    get_bhasa_machine_translation_metric_specs,
    get_bhasa_qa_metric_specs,
)
from helm.benchmark.metrics.common_metric_specs import (
    get_basic_metric_specs,
    get_exact_match_metric_specs,
    get_classification_metric_specs,
)
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec

# BHASA Run Specs
# A. Natural Language Understanding
# B. Natural Language Generation
# C. Natural Language Reasoning
# D. Linguistic Diagnostics

# A. Natural Language Understanding
# 1. Question Answering
# 2. Sentiment Analysis
# 3. Toxicity Detection/Classification


# 1. Question Answering
# 1.1 Indonesian: TyDiQA
@run_spec_function("tydiqa")
def get_tydiqa_spec() -> RunSpec:
    name = "tydiqa"

    adapter_spec = get_generation_adapter_spec(
        instructions="Anda akan diberikan sebuah paragraf dan sebuah pertanyaan. Jawablah pertanyaannya dengan "
        "mengekstrak jawaban dari paragraf tersebut.",
        output_noun="Jawaban",
        stop_sequences=["\n"],
        max_tokens=256,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.TyDiQAScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_bhasa_qa_metric_specs(
            args={
                "language": "id",
            }
        ),
        groups=["bhasa_nlu", "tydiqa"],
    )


# 1.2 Vietnamese & Thai: XQuAD
XQUAD_PROMPTS = {
    "th": {
        "instructions": "คุณจะได้รับข้อความและคำถาม กรุณาตอบคำถามโดยแยกคำตอบจากข้อความ",
        "output_noun": "คำตอบ",
    },
    "vi": {
        "instructions": "Bạn sẽ được cho một đoạn văn và một câu hỏi. Trả lời câu hỏi bằng cách trích xuất câu "
        "trả lời từ đoạn văn.",
        "output_noun": "Câu trả lời",
    },
}


@run_spec_function("xquad")
def get_xquad_spec(language="th") -> RunSpec:
    name = f"xquad_{language}"

    adapter_spec = get_generation_adapter_spec(
        instructions=XQUAD_PROMPTS[language]["instructions"],
        output_noun=XQUAD_PROMPTS[language]["output_noun"],
        stop_sequences=["\n"],
        max_tokens=256,
    )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.XQuADScenario",
        args={
            "language": language,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_bhasa_qa_metric_specs(
            args={
                "language": language,
            }
        ),
        groups=["bhasa_nlu", f"xquad_{language}"],
    )


# 1.3 Tamil: IndicQA
@run_spec_function("indicqa")
def get_indicqa_spec() -> RunSpec:
    name = "indicqa"
    i = "உங்களுக்கு ஒரு பத்தியும் ஒரு கேள்வியும் தரப்படும். தரப்பட்ட பத்தியிலிருந்து கேள்விக்கான பதிலைக் கண்டறியவும்."

    adapter_spec = get_generation_adapter_spec(
        instructions=i,
        output_noun="பதில்",
        stop_sequences=["\n"],
        max_tokens=256,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicQAScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_bhasa_qa_metric_specs(
            args={
                "language": "ta",
            }
        ),
        groups=["bhasa_nlu", "indicqa"],
    )


# 2. Sentiment Analysis
# 2.1 Indonesian: NusaX Sentiment
@run_spec_function("nusax")
def get_nusax_spec() -> RunSpec:
    name = "nusax"

    adapter_spec = get_generation_adapter_spec(
        instructions="Apa sentimen dari kalimat berikut ini?\nJawablah dengan satu kata saja:"
        "\n- Positif\n- Negatif\n- Netral",
        input_noun="Kalimat",
        output_noun="Jawaban",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.NusaXScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlu", "nusax"],
    )


# 2.2 Vietnamese: UIT-VSFC
@run_spec_function("uitvsfc")
def get_uitvsfc_spec() -> RunSpec:
    name = "uitvsfc"

    adapter_spec = get_generation_adapter_spec(
        instructions="Sắc thái của câu sau đây là gì?\nTrả lời với một từ duy nhất:"
        "\n- Tích cực\n- Tiêu cực\n- Trung lập",
        input_noun="Câu văn",
        output_noun="Câu trả lời",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.UITVSFCScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlu", "uitvsfc"],
    )


# 2.3 Thai: Wisesight Sentiment
@run_spec_function("wisesight")
def get_wisesight_spec() -> RunSpec:
    name = "wisesight"
    i = "อารมณ์ความรู้สึกของข้อความต่อไปนี้เป็นอย่างไร?\nกรุณาตอบโดยใช้คำเดียวเท่านั้น:\n- แง่บวก\n- แง่ลบ\n- เฉยๆ"

    adapter_spec = get_generation_adapter_spec(
        instructions=i,
        input_noun="ข้อความ",
        output_noun="คำตอบ",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.WisesightScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlu", "wisesight"],
    )


# 2.4 Tamil: IndicSentiment
@run_spec_function("indicsentiment")
def get_indicsentiment_spec() -> RunSpec:
    name = "indicsentiment"

    adapter_spec = get_generation_adapter_spec(
        instructions="பின்வரும் வாக்கியத்தில் வெளிப்படுத்தப்படும் உணர்வு எது?\nஒரு சொல்லில் மட்டும் பதிலளிக்கவும்:"
        "\n- நேர்மறை\n- எதிர்மறை",
        input_noun="வாக்கியம்",
        output_noun="பதில்",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicSentimentScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_classification_metric_specs() + get_basic_metric_specs([]),
        groups=["bhasa_nlu", "indicsentiment"],
    )


# 3. Toxicity Detection/Classification
# 3.1 Indonesian: Multi-Label Hate Speech Detection
@run_spec_function("mlhsd")
def get_mlhsd_spec() -> RunSpec:
    name = "mlhsd"

    adapter_spec = get_generation_adapter_spec(
        instructions="Anda adalah pendeteksi ujaran kebencian. Definisi dari labelnya adalah sebagai berikut:"
        "\nBersih: Tidak ada ujaran kebencian.\nKasar: Ada ujaran kebencian dan kata-kata kasar, namun "
        "tidak menyerang pihak tertentu.\nBenci: Ada ujaran kebencian atau serangan langsung terhadap pihak "
        "tertentu.\nBerdasarkan definisi labelnya, klasifikasikan kalimat berikut ini dengan satu kata saja:"
        "\n- Bersih\n- Kasar\n- Benci",
        input_noun="Kalimat",
        output_noun="Jawaban",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.MLHSDScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlu", "mlhsd"],
    )


# 3.2 Vietnamese: ViHSD
@run_spec_function("vihsd")
def get_vihsd_spec() -> RunSpec:
    name = "vihsd"

    adapter_spec = get_generation_adapter_spec(
        instructions="Bạn là máy phát hiện phát ngôn thù ghét. Các nhãn được định nghĩa như sau:"
        "\nSạch: Không quấy rối.\nCông kích: Bao gồm quấy rối và thậm chí chửi thề, nhưng không "
        "tấn công bất kì đối tượng cụ thể nào.\nThù ghét: Trực tiếp quấy rối hay lăng mạ một đối "
        "tượng cụ thể.\nVới các định nghĩa của nhãn, hãy phân loại câu dưới đây với một từ duy nhất:"
        "\n- Sạch\n- Công kích\n- Thù ghét",
        input_noun="Câu văn",
        output_noun="Câu trả lời",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ViHSDScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlu", "vihsd"],
    )


# 3.3 Thai: Thai Toxicity Tweets
@run_spec_function("thaitoxicitytweets")
def get_thaitoxicitytweets_spec() -> RunSpec:
    name = "thaitoxicitytweets"

    adapter_spec = get_generation_adapter_spec(
        instructions="คุณเป็นเครื่องตรวจจับข้อความที่เป็นพิษ\nข้อความเป็นพิษถ้ามันทำร้าย สร้างความเสียหาย "
        "หรือสื่อเจตนาเชิงลบถึงบุคคลหรือกลุ่มบุคคล และคุณควรคำนึงถึงการประชดประชันด้วย\nเมื่อได้รับข้อความ "
        "ให้ตอบด้วยตัวอักษร Y หากเป็นพิษ และ N หากไม่เป็นพิษ",
        input_noun="ข้อความ",
        output_noun="คำตอบ",
        stop_sequences=["\n"],
        max_tokens=16,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ThaiToxicityTweetsScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlu", "thaitoxicitytweets"],
    )


# B. Natural Language Generation
# 1. Machine Translation

# 1. Machine Translation: FLoRes-200
TRANSLATION_PROMPTS = {
    "en_id": {
        "instructions": "Terjemahkan teks berikut ini ke dalam Bahasa Indonesia.",
        "input_noun": "Teks",
        "output_noun": "Terjemahan",
    },
    "en_ta": {
        "instructions": "பின்வரும் உரையைத் தமிழ் மொழிக்கு மொழிபெயர்க்கவும்.",
        "input_noun": "உரை",
        "output_noun": "மொழிபெயர்ப்பு",
    },
    "en_th": {
        "instructions": "กรุณาแปลข้อความต่อไปนี้เป็นภาษาไทย",
        "input_noun": "ข้อความ",
        "output_noun": "คำแปล",
    },
    "en_vi": {
        "instructions": "Dịch văn bản dưới đây sang Tiếng Việt.",
        "input_noun": "Văn bản",
        "output_noun": "Bản dịch",
    },
    "id_en": {
        "instructions": "Terjemahkan teks berikut ini ke dalam Bahasa Inggris.",
        "input_noun": "Teks",
        "output_noun": "Terjemahan",
    },
    "ta_en": {
        "instructions": "பின்வரும் உரையை ஆங்கில மொழிக்கு மொழிபெயர்க்கவும்.",
        "input_noun": "உரை",
        "output_noun": "மொழிபெயர்ப்பு",
    },
    "th_en": {
        "instructions": "กรุณาแปลข้อความต่อไปนี้เป็นภาษาอังกฤษ",
        "input_noun": "ข้อความ",
        "output_noun": "คำแปล",
    },
    "vi_en": {
        "instructions": "Dịch văn bản dưới đây sang Tiếng Anh.",
        "input_noun": "Văn bản",
        "output_noun": "Bản dịch",
    },
}


@run_spec_function("flores")
def get_flores_spec(source="en", target="id") -> RunSpec:
    pair = f"{source}_{target}"
    name = f"flores_{pair}"

    adapter_spec = get_generation_adapter_spec(
        instructions=TRANSLATION_PROMPTS[pair]["instructions"],
        input_noun=TRANSLATION_PROMPTS[pair]["input_noun"],
        output_noun=TRANSLATION_PROMPTS[pair]["output_noun"],
        stop_sequences=["\n"],
        max_tokens=256,
        sample_train=False,
    )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.FloresScenario",
        args={
            "pair": pair,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_bhasa_machine_translation_metric_specs(),
        groups=["bhasa_nlg", f"flores_{pair}"],
    )


# C. Natural Language Reasoning
# 1. Natural Language Inference
# 2. Causal Reasoning


# 1. Natural Language Inference
# 1.1 Indonesian: IndoNLI
@run_spec_function("indonli")
def get_indonli_spec() -> RunSpec:
    name = "indonli"

    adapter_spec = get_generation_adapter_spec(
        instructions="Anda akan diberikan dua kalimat, X dan Y.\nTentukan mana dari pernyataan berikut "
        "ini yang paling sesuai untuk kalimat X dan Y.\nA: Kalau X benar, maka Y juga harus benar."
        "\nB: X bertentangan dengan Y.\nC: Ketika X benar, Y mungkin benar atau mungkin tidak benar."
        "\nJawablah dengan satu huruf saja, A, B atau C.",
        output_noun="Jawaban",
        stop_sequences=["\n"],
        max_tokens=2,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndoNLIScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlr", "indonli"],
    )


# 1.2 Vietnamese & Thai: XNLI
XNLI_PROMPTS = {
    "th": {
        "instructions": "คุณจะได้รับสองข้อความ X และ Y",
        "input_suffix": "กรุณาพิจารณาว่า ข้อความใดต่อไปนี้ใช้กับข้อความ X และ Y ได้ดีที่สุด"
        "\nA: ถ้า X เป็นจริง Y จะต้องเป็นจริง\nB: X ขัดแย้งกับ Y\nC: เมื่อ X เป็นจริง Y อาจเป็นจริงหรือไม่ก็ได้"
        "\nกรุณาตอบด้วยตัวอักษร A, B หรือ C ตัวเดียวเท่านั้น",
        "output_noun": "คำตอบ",
    },
    "vi": {
        "instructions": "Bạn sẽ được cho hai câu, X và Y.",
        "input_suffix": "Xác định câu nào sau đây là câu phù hợp nhất cho câu X và Y."
        "\nA: Nếu X đúng thì Y phải đúng.\nB: X mâu thuẫn với Y."
        "\nC: Khi X đúng, Y có thể đúng hoặc không đúng.\nTrả lời với một chữ cái duy nhất A, B, hoặc C.",
        "output_noun": "Câu trả lời",
    },
}


@run_spec_function("xnli")
def get_xnli_spec(language="vi") -> RunSpec:
    name = f"xnli_{language}"

    adapter_spec = get_generation_adapter_spec(
        instructions=XNLI_PROMPTS[language]["instructions"] + "\n" + XNLI_PROMPTS[language]["input_suffix"],
        output_noun=XNLI_PROMPTS[language]["output_noun"],
        stop_sequences=["\n"],
        max_tokens=2,
    )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.XNLIScenario",
        args={
            "language": language,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlr", f"xnli_{language}"],
    )


# 1.3 Tamil: IndicXNLI
@run_spec_function("indicxnli")
def get_indicxnli_spec() -> RunSpec:
    name = "indicxnli"

    adapter_spec = get_generation_adapter_spec(
        instructions="உங்களுக்கு இரண்டு வாக்கியங்கள், X மற்றும் Y, தரப்படும்."
        "\nபின்வரும் கூற்றுகளில் எது X மற்றும் Y வாக்கியங்களுடன் மிகப் பொருந்துகிறது எனக் கண்டறியவும்."
        "\nA: X உண்மை என்றால் Y உம் உண்மையாக இருக்க வேண்டும்.\nB: X உம் Y உம் முரண்படுகின்றன."
        "\nC: X உண்மையாக இருக்கும்போது Y உண்மையாக இருக்கலாம் அல்லது இல்லாமல் இருக்கலாம்."
        "\nA அல்லது B அல்லது C என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.",
        output_noun="பதில்",
        stop_sequences=["\n"],
        max_tokens=2,
    )

    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicXNLIScenario")

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlr", "indicxnli"],
    )


# 2. Causal Reasoning: XCOPA
XCOPA_PROMPTS = {
    "id": {
        "input_noun": "Situasi",
        "output_noun": "Jawaban",
    },
    "ta": {
        "input_noun": "சூழ்நிலை",
        "output_noun": "பதில்",
    },
    "th": {
        "input_noun": "สถานการณ์",
        "output_noun": "คำตอบ",
    },
    "vi": {
        "input_noun": "Tình huống",
        "output_noun": "Câu trả lời",
    },
}


@run_spec_function("xcopa")
def get_xcopa_spec(language="id") -> RunSpec:
    name = f"xcopa_{language}"

    adapter_spec = get_generation_adapter_spec(
        input_noun=XCOPA_PROMPTS[language]["input_noun"],
        output_noun=XCOPA_PROMPTS[language]["output_noun"],
        stop_sequences=["\n"],
        max_tokens=2,
    )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.XCOPAScenario",
        args={
            "language": language,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
        groups=["bhasa_nlr", f"xcopa_{language}"],
    )


# D. Linguistic Diagnostics (LINDSEA)
# 1. Syntax
# 2. Pragmatics

# 1. Syntax: LINDSEA Minimal Pairs
LINDSEA_OUTPUT_NOUNS = {"id": "Jawaban"}


@run_spec_function("lindsea_syntax_minimal_pairs")
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "mcq") -> RunSpec:
    name = f"lindsea_syntax_minimal_pairs_{language}"
    if method == "mcq":
        adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
    else:
        adapter_spec = get_multiple_choice_separate_adapter_spec(
            method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
            empty_input=True,
        )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
        args={
            "method": method,
            "language": language,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["bhasa_linguistic", f"lindsea_syntax_minimal_pairs_{language}"],
    )


# 2.1. Pragmatics: LINDSEA Pragmatic Reasoning (single sentence)
@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_single")
def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> RunSpec:
    name = f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"

    adapter_spec = get_generation_adapter_spec(
        output_noun=LINDSEA_OUTPUT_NOUNS[language],
        stop_sequences=["\n"],
        max_train_instances=0,
        max_tokens=8,
    )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningSingleScenario",
        args={
            "language": language,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"],
    )


# 2.2. Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_pair")
def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSpec:
    name = f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"

    adapter_spec = get_generation_adapter_spec(
        output_noun=LINDSEA_OUTPUT_NOUNS[language],
        stop_sequences=["\n"],
        max_train_instances=0,
        max_tokens=8,
    )

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningPairScenario",
        args={
            "language": language,
        },
    )

    return RunSpec(
        name=name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),
        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"],
    )