crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/melt_lm_scenarios.py (new file)

@@ -0,0 +1,252 @@
+from typing import Any, Dict, List, Optional
+
+import random
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MELTLMMaskFillingScenario(Scenario):
+    """
+    Scenario for the MELT Masked Language Modeling dataset.
+    """
+
+    name = "melt_lm_mask_filling"
+    description = "Masked Language Modeling scenario."
+    tags = ["language_modeling", "mask_filling"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        masked_ratio: float = 0.1,
+        text_key: str = "text",
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            masked_ratio: The ratio of tokens to mask in the input text. Defaults to 0.1.
+            text_key: The key to use for the text in the dataset. Defaults to "text".
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.masked_ratio = masked_ratio
+        self.text_key = text_key
+        self.revision = revision
+        self.splits = splits
+
+    def _mask_text(self, text: str) -> str:
+        """
+        Mask a portion of the input text.
+        Args:
+            text (str): The input text to mask.
+        Returns:
+            str: The masked text.
+        """
+        tokens = text.split(" ")
+        num_tokens_to_mask = int(len(tokens) * self.masked_ratio)
+        indices_to_mask = random.sample(range(len(tokens)), num_tokens_to_mask)
+        for index in indices_to_mask:
+            tokens[index] = "[MASK]"
+        return " ".join(tokens)
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset: Any = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+
+        for dataset_split_name, helm_split_name in splits.items():
+            for sample in dataset[dataset_split_name]:
+                target_sentence = sample[self.text_key]
+                source_sentence = self._mask_text(target_sentence)
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        random.seed(42)
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTLMMaskFillingMLQAScenario(MELTLMMaskFillingScenario):
+    """
+    Scenario for the MLQA dataset.
+    This dataset is a multilingual question answering dataset.
+    It contains questions in multiple languages and their corresponding
+    answers in the same language. In this scenario, we are using the
+    context of questions in the Vietnamese subset of the MLQA dataset.
+    """
+
+    name = "melt_lm_mask_filling_mlqa"
+    description = "MLQA dataset for masked language modeling."
+    tags = ["language_modeling", "mask_filling"]
+
+    def __init__(self):
+        super().__init__(
+            dataset_name="facebook/mlqa",
+            revision="397ed406c1a7902140303e7faf60fff35b58d285",
+            subset="mlqa.vi.vi",
+            text_key="context",
+            splits={
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+        )
+
+
+class MELTLMSpellingCorrectionScenario(Scenario):
+    """
+    Scenario for the MELT spelling correction dataset.
+    """
+
+    name = "melt_lm_spelling_correction"
+    description = "Spelling Correction scenario."
+    tags = ["language_modeling", "spelling_correction"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        source_key: str = "text",
+        target_key: str = "corrected_text",
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            source_key: The key to use for the source text in the dataset. Defaults to "text".
+            target_key: The key to use for the target text in the dataset. Defaults to "corrected_text".
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.source_key = source_key
+        self.target_key = target_key
+        self.revision = revision
+        self.splits = splits
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset: Any = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        if len(splits) == 1:
+            all_keys = list(splits.keys())
+            dataset = dataset[all_keys[0]].train_test_split(test_size=0.33, seed=42)
+            splits = {
+                "train": TRAIN_SPLIT,
+                "test": TEST_SPLIT,
+            }
+
+        for dataset_split_name, helm_split_name in splits.items():
+            for sample in dataset[dataset_split_name]:
+                source_sentence = sample[self.source_key]
+                target_sentence = sample[self.target_key]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        random.seed(42)
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTLMSpellingCorrectionVSECScenario(MELTLMSpellingCorrectionScenario):
+    """
+    Scenario for the VSEC dataset.
+    The VSEC dataset is a Vietnamese spelling correction dataset.
+    It contains 9,341 pairs of sentences where the first sentence is a misspelled
+    version of the second sentence, which is the correct version.
+    The mistakes are common spelling errors made by Vietnamese speakers and typists.
+    """
+
+    name = "melt_lm_spelling_correction_vsec"
+    description = "VSEC dataset for spelling correction."
+    tags = ["language_modeling", "spelling_correction"]
+
+    def __init__(self):
+        super().__init__(
+            dataset_name="ura-hcmut/VSEC",
+            revision="a6732e131605b5ec24ecc1745c6061c5ae86814e",
+            source_key="text",
+            target_key="correct",
+            splits={
+                TEST_SPLIT: "test",
+            },
+        )