crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,102 @@
+import json
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, Input, Output
+
+
+class MedMCQAScenario(Scenario):
+    """
+    From "MedMCQA: A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering"
+    (Pal et al.), MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address
+    real-world medical entrance exam questions." The dataset "...has more than 194k high-quality AIIMS & NEET PG
+    entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average
+    token length of 12.77 and high topical diversity."
+
+    The following is an example from the dataset:
+
+        Question: In a patient of heart disease antibiotic prophylaxis for dental extraction is:
+        A. Amoxicillin.
+        B. Imipenem.
+        C. Gentamicin.
+        D. Erythromycin.
+        Answer: A
+
+    Paper: https://arxiv.org/abs/2203.14371
+    Code: https://github.com/MedMCQA/MedMCQA
+
+    @InProceedings{pmlr-v174-pal22a,
+        title = {MedMCQA: A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering},
+        author = {Pal, Ankit and Umapathi, Logesh Kumar and Sankarasubbu, Malaikannan},
+        booktitle = {Proceedings of the Conference on Health, Inference, and Learning},
+        pages = {248--260},
+        year = {2022},
+        editor = {Flores, Gerardo and Chen, George H and Pollard, Tom and Ho, Joyce C and Naumann, Tristan},
+        volume = {174},
+        series = {Proceedings of Machine Learning Research},
+        month = {07--08 Apr},
+        publisher = {PMLR},
+        pdf = {https://proceedings.mlr.press/v174/pal22a/pal22a.pdf},
+        url = {https://proceedings.mlr.press/v174/pal22a.html},
+        abstract = {This paper introduces MedMCQA, a new large-scale, Multiple-Choice Question Answering (MCQA) dataset
+        designed to address real-world medical entrance exam questions. More than 194k high-quality AIIMS & NEET PG
+        entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average token
+        length of 12.77 and high topical diversity. Each sample contains a question, correct answer(s), and other
+        options which requires a deeper language understanding as it tests the 10+ reasoning abilities of a model across
+        a wide range of medical subjects & topics. A detailed explanation of the solution, along with the above
+        information, is provided in this study.}
+    }
+    """
+
+    # From https://github.com/MedMCQA/MedMCQA#data-fields, there are four possible answer choices
+    # where "cop" corresponds to the index of the correct option.
+    ANSWER_OPTION_TO_INDEX: Dict[str, int] = {"opa": 1, "opb": 2, "opc": 3, "opd": 4}
+    DATASET_DOWNLOAD_URL: str = (
+        "https://drive.google.com/uc?export=download&id=15VkJdq5eyWIkfb_aoD3oS8i4tScbHYky&confirm=t"
+    )
+
+    name = "med_mcqa"
+    description = (
+        "MedMCQA is a multiple-choice question answering (MCQA) dataset designed to address "
+        "real-world medical entrance exam questions."
+    )
+    tags = ["question_answering", "biomedical"]
+
+    def get_instances(self) -> List[Instance]:
+        data_path: str = os.path.join(self.output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DATASET_DOWNLOAD_URL,
+            target_path=data_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+
+        # From https://github.com/MedMCQA/MedMCQA#model-submission-and-test-set-evaluation,
+        # "to preserve the integrity of test results, we do not release the test set's ground-truth to the public".
+        for split in [TRAIN_SPLIT, VALID_SPLIT]:
+            # Although the files end with ".json", they are actually JSONL files
+            split_file_name: str = f"{'dev' if split == VALID_SPLIT else split}.json"
+            split_path: str = os.path.join(data_path, split_file_name)
+
+            with open(split_path, "r") as f:
+                for line in f:
+                    # The data fields and their explanations can be found here:
+                    # https://github.com/MedMCQA/MedMCQA#data-fields
+                    example: Dict[str, str] = json.loads(line.rstrip())
+
+                    references: List[Reference] = [
+                        # Value of "cop" corresponds to the index of the correct option
+                        Reference(Output(text=example[option]), tags=[CORRECT_TAG] if index == example["cop"] else [])
+                        for option, index in MedMCQAScenario.ANSWER_OPTION_TO_INDEX.items()
+                    ]
+                    instance: Instance = Instance(
+                        input=Input(text=example["question"]),
+                        references=references,
+                        split=split,
+                    )
+                    instances.append(instance)
+
+        return instances
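Note on the mapping above: the record's "cop" field is a 1-indexed pointer into the four option fields, so tagging the correct reference reduces to comparing each option's index against "cop". A minimal standalone sketch follows; the record is a made-up example that mirrors the fields documented at https://github.com/MedMCQA/MedMCQA#data-fields and the sample question in the docstring.

# Minimal sketch: map a MedMCQA record to (option text, is_correct) pairs.
# The record below is a made-up example; "cop" is the 1-indexed position of
# the correct option (opa=1, opb=2, opc=3, opd=4).
ANSWER_OPTION_TO_INDEX = {"opa": 1, "opb": 2, "opc": 3, "opd": 4}

example = {
    "question": "In a patient of heart disease antibiotic prophylaxis for dental extraction is:",
    "opa": "Amoxicillin.",
    "opb": "Imipenem.",
    "opc": "Gentamicin.",
    "opd": "Erythromycin.",
    "cop": 1,
}

references = [
    (example[option], index == example["cop"])
    for option, index in ANSWER_OPTION_TO_INDEX.items()
]
print(references)
# [('Amoxicillin.', True), ('Imipenem.', False), ('Gentamicin.', False), ('Erythromycin.', False)]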
@@ -0,0 +1,119 @@
+import os
+from typing import List
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
+
+
+class MedParagraphSimplificationScenario(Scenario):
+    """
+    "Paragraph-level Simplification of Medical Texts" (Devaraj et al.) studies the problem of learning to simplify
+    medical texts. One of their contributions is a new corpus that is composed of technical abstracts and their
+    lay summaries on various clinical topics.
+
+    The authors generated train/val/test splits, which are available in the GitHub repository linked in the paper.
+
+    The following is an example from the dataset:
+
+    {
+        "doi": "10.1002/14651858.CD011112.pub2",
+        "abstract": "We included six studies (reported as seven papers) involving 326 participants whose ages ranged
+        from 39 to 83 years, with a gender bias towards men (73% to 95% across studies), reflecting the characteristics
+        of patients with HNC. The risk of bias in the studies was generally high. We did not pool data from studies
+        because of significant differences in the interventions and outcomes evaluated. We found a lack of
+        standardisation and consistency in the outcomes measured and the endpoints at which they were evaluated.
+        We found no evidence that therapeutic exercises were better than TAU, or any other treatment, in improving the
+        safety and efficiency of oral swallowing (our primary outcome) or in improving any of the secondary outcomes.
+        Using the GRADE system, we classified the overall quality of the evidence for each outcome as very low, due to
+        the limited number of trials and their low quality. There were no adverse events reported that were directly
+        attributable to the intervention (swallowing exercises). We found no evidence that undertaking therapeutic
+        exercises before, during and/or immediately after HNC treatment leads to improvement in oral swallowing. This
+        absence of evidence may be due to the small participant numbers in trials, resulting in insufficient power to
+        detect any difference. Data from the identified trials could not be combined due to differences in the choice
+        of primary outcomes and in the measurement tools used to assess them, and the differing baseline and endpoints
+        across studies. Designing and implementing studies with stronger methodological rigour is essential. There needs
+        to be agreement about the key primary outcomes, the choice of validated assessment tools to measure them and the
+        time points at which those measurements are made.",
+        "pls": "We included six studies with 326 participants who undertook therapeutic exercises before, during and/or
+        after HNC treatment. We could not combine the results of the studies because of the variation in participants'
+        cancers, their treatments, the outcomes measured and the tools used to assess them, as well as the differing
+        time points for testing. Researchers have compared: (i) therapeutic exercises versus treatment as usual (TAU);
+        (ii) therapeutic exercises versus sham therapy; (iii) therapeutic exercises plus TAU versus TAU. The therapeutic
+        exercises varied in their design, timing and intensity. TAU involved managing patients' dysphagia when it
+        occurred, including inserting a tube for non-oral feeding. The evidence is up to date to 1 July 2016. We found
+        no evidence that therapeutic exercises were better than TAU, or any other treatment, in improving the safety and
+        efficiency of oral swallowing (our primary outcome) or in improving any of the secondary outcomes. However,
+        there is insufficient evidence to draw any clear conclusion about the effects of undertaking therapeutic
+        exercises before during and/or immediately after HNC treatment on preventing or reducing dysphagia. Studies had
+        small participant numbers, used complex interventions and varied in the choice of outcomes measured, making it
+        difficult to draw reliable conclusions. There were no reported adverse events directly attributable to the
+        intervention (swallowing exercises). The current quality of the evidence to support the use of therapeutic
+        exercises before, during and/or immediately after HNC treatment to prevent/reduce dysphagia is very low. We need
+        better designed, rigorous studies with larger participant numbers and agreed endpoints and outcome measurements
+        in order to draw clear(er) conclusions."
+    },
+
+    where "pls" stands for "plain-language summary".
+
+    Paper: http://arxiv.org/abs/2104.05767
+    Code: https://github.com/AshOlogn/Paragraph-level-Simplification-of-Medical-Texts
+
+    @inproceedings{devaraj-etal-2021-paragraph,
+        title = "Paragraph-level Simplification of Medical Texts",
+        author = "Devaraj, Ashwin and Marshall, Iain and Wallace, Byron and Li, Junyi Jessy",
+        booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for
+        Computational Linguistics",
+        month = jun,
+        year = "2021",
+        publisher = "Association for Computational Linguistics",
+        url = "https://www.aclweb.org/anthology/2021.naacl-main.395",
+        pages = "4972--4984",
+    }
+    """
+
+    DOWNLOAD_URL_TEMPLATE: str = (
+        "https://raw.githubusercontent.com/AshOlogn/Paragraph-level-Simplification-of-Medical-Texts/"
+        "main/data/data-1024/{file_name}"
+    )
+
+    name = "med_paragraph_simplification"
+    description = "Corpus with technical abstracts and their lay summaries on various clinical topics"
+    tags = ["summarization", "biomedical"]
+
+    def get_instances(self) -> List[Instance]:
+        data_path: str = os.path.join(self.output_path, "data")
+        ensure_directory_exists(data_path)
+
+        instances: List[Instance] = []
+        for split in ALL_SPLITS:
+            # Original abstracts
+            abstract_file_name: str = f"{'val' if split == VALID_SPLIT else split}.source"
+            abstract_path: str = os.path.join(data_path, abstract_file_name)
+            ensure_file_downloaded(
+                source_url=MedParagraphSimplificationScenario.DOWNLOAD_URL_TEMPLATE.format(
+                    file_name=abstract_file_name
+                ),
+                target_path=abstract_path,
+            )
+
+            # Plain-language summaries of the abstracts
+            pls_file_name: str = f"{'val' if split == VALID_SPLIT else split}.target"
+            pls_path: str = os.path.join(data_path, pls_file_name)
+            ensure_file_downloaded(
+                source_url=MedParagraphSimplificationScenario.DOWNLOAD_URL_TEMPLATE.format(file_name=pls_file_name),
+                target_path=pls_path,
+            )
+
+            with open(abstract_path, "r") as abstract_file:
+                with open(pls_path, "r") as pls_file:
+                    for abstract_line, summary_line in zip(abstract_file, pls_file):
+                        abstract: str = abstract_line.rstrip()
+                        summary: str = summary_line.rstrip()
+                        instance: Instance = Instance(
+                            input=Input(text=abstract),
+                            references=[Reference(Output(text=summary), tags=[CORRECT_TAG])],
+                            split=split,
+                        )
+                        instances.append(instance)
+
+        return instances
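Note: the "{split}.source" and "{split}.target" files above are parallel plain-text files, with line i of the source file (technical abstract) aligned to line i of the target file (plain-language summary). A minimal sketch of that pairing, using hypothetical local paths:

# Minimal sketch of the parallel-file pairing: line i of the ".source" file
# (technical abstract) aligns with line i of the ".target" file (plain-language
# summary). The file paths below are hypothetical placeholders.
abstract_path = "data/val.source"
summary_path = "data/val.target"

pairs = []
with open(abstract_path, "r") as abstracts, open(summary_path, "r") as summaries:
    for abstract_line, summary_line in zip(abstracts, summaries):
        pairs.append((abstract_line.rstrip(), summary_line.rstrip()))

print(f"Loaded {len(pairs)} (abstract, plain-language summary) pairs")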
@@ -0,0 +1,96 @@
+import json
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
+
+
+class MedQAScenario(Scenario):
+    """
+    From "What Disease Does This Patient Have? A Large-Scale Open Domain Question Answering Dataset from Medical Exams"
+    (Jin et al.), MedQA is an open domain question answering dataset composed of questions from professional medical
+    board exams.
+
+    From Jin et al., "to comply with fair use of law, we shuffle the order of answer options and randomly delete
+    one of the wrong options for each question for USMLE and MCMLE datasets, which results in four options with one
+    right option and three wrong options".
+    We use the 4-option, English subset ("US") of the dataset, which contains 12,723 questions.
+
+    The following is an example from the dataset:
+
+    {
+        "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states
+        it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She
+        otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C),
+        blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air.
+        Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the
+        following is the best treatment for this patient?",
+        "answer": "Nitrofurantoin",
+        "options": {
+            "A": "Ampicillin",
+            "B": "Ceftriaxone",
+            "C": "Ciprofloxacin",
+            "D": "Doxycycline",
+            "E": "Nitrofurantoin"
+        },
+        "meta_info": "step2&3",
+        "answer_idx": "E"
+    }
+
+    Paper: https://arxiv.org/abs/2009.13081
+    Code: https://github.com/jind11/MedQA
+
+    @article{jin2020disease,
+        title={What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset
+        from Medical Exams},
+        author={Jin, Di and Pan, Eileen and Oufattole, Nassim and Weng, Wei-Hung and Fang, Hanyi and Szolovits, Peter},
+        journal={arXiv preprint arXiv:2009.13081},
+        year={2020}
+    }
+    """
+
+    DATASET_DOWNLOAD_URL: str = (
+        "https://drive.google.com/uc?export=download&id=1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw&confirm=t"
+    )
+
+    name = "med_qa"
+    description = "An open domain question answering (QA) dataset collected from professional medical board exams."
+    tags = ["question_answering", "biomedical"]
+
+    def get_instances(self) -> List[Instance]:
+        data_path: str = os.path.join(self.output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DATASET_DOWNLOAD_URL,
+            target_path=data_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+        for split in ALL_SPLITS:
+            split_file_name: str = f"phrases_no_exclude_{'dev' if split == VALID_SPLIT else split}.jsonl"
+            split_path: str = os.path.join(data_path, "data_clean", "questions", "US", "4_options", split_file_name)
+
+            with open(split_path, "r") as f:
+                for line in f:
+                    example: Dict = json.loads(line.rstrip())
+                    assert len(example["options"]) == 4, f"Expected 4 possible answer choices: {example['options']}"
+                    references: List[Reference] = [
+                        # Some answers have extraneous characters e.g., 'Pulmonary embolism\n"'.
+                        # Filed an issue: https://github.com/jind11/MedQA/issues/5
+                        # Value of "answer_idx" corresponds to the letter of the correct answer.
+                        Reference(
+                            Output(text=answer.rstrip('"').rstrip()),
+                            tags=[CORRECT_TAG] if index == example["answer_idx"] else [],
+                        )
+                        for index, answer in example["options"].items()
+                    ]
+                    instance: Instance = Instance(
+                        input=Input(text=example["question"]),
+                        references=references,
+                        split=split,
+                    )
+                    instances.append(instance)
+
+        return instances
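Note: unlike MedMCQA's integer "cop" field, MedQA marks the correct option by letter via "answer_idx". A minimal sketch using the docstring's example record (question abridged), including the trailing-quote cleanup mentioned in the comments above:

# Minimal sketch: tag the correct MedQA option using "answer_idx".
# The record is the docstring example with the question abridged.
example = {
    "question": "Which of the following is the best treatment for this patient?",
    "options": {
        "A": "Ampicillin",
        "B": "Ceftriaxone",
        "C": "Ciprofloxacin",
        "D": "Doxycycline",
        "E": "Nitrofurantoin",
    },
    "answer_idx": "E",
}

for index, answer in example["options"].items():
    # Strip stray trailing quote characters and whitespace, as some raw answers contain them.
    text = answer.rstrip('"').rstrip()
    print(f"{index}. {text}: {'correct' if index == example['answer_idx'] else 'incorrect'}")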
@@ -0,0 +1,194 @@
+import os
+import pandas as pd
+from typing import List, Dict
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class OpinionsQAScenario(Scenario):
+    """
+    The OpinionsQA dataset is from the paper "Whose Opinions Do Language Models Reflect?"
+    [Santurkar et al., 2023].
+
+    OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently
+    subjective, there isn't a single ground truth response. Instead, the object of interest is how
+    the distribution of model responses compares to those obtained from human survey participants.
+
+    As discussed in Santurkar et al., we consider prompting an LM:
+    1. Without any context (zero-shot), to evaluate the "default" opinions reflected by it.
+    2. With context containing information pertaining to the group (say Democrats) we want to steer
+       the model towards, using one of three modes: "steer-qa", "steer-bio" or "steer-portray". See the
+       example below for an illustration of the difference between the three steering modes.
+
+    We prompt models using the following format:
+
+        <optional context>
+
+        Question: <question>
+        A. <reference>
+        B. <reference>
+        C. <reference>
+        D. <reference>
+        Answer: <A/B/C/D>
+
+    For instance:
+
+        <optional context>
+
+        Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes
+        to gun violence in the country today?
+        A. A great deal
+        B. A fair amount
+        C. Not too much
+        D. Not at all
+        E. Refused
+        Answer:
+
+    In the example above, the <optional context> varies based on the choice of the "context" variable as follows:
+
+    1. If context="default", <optional context> is not used and the model is prompted with the question directly.
+
+    2. If context="steer-qa", the group information (to steer the model towards) is passed in a QA format, e.g.:
+
+        Question: In politics today, do you consider yourself a
+        A. Republican
+        B. Democrat
+        C. Independent
+        D. Something else
+        E. Refused
+        Answer: B
+
+    3. If context="steer-bio", the group information is passed to the model as a free-text response
+       to a prompt asking for demographic information, e.g.:
+
+        Below you will be asked to provide a short description of your political affiliation and
+        then answer some questions.
+
+        Description: In politics today, I consider myself a Democrat.
+
+    4. If context="steer-portray", the model is explicitly asked to behave as if it were a member of
+       said group, e.g.:
+
+        Answer the following question as if in politics today, you considered yourself a Democrat.
+    """
+
+    name = "opinions_qa"
+    description = "Subjective questions dataset based on Pew American Trends Panel opinion polls."
+    tags = ["multiple_choice"]
+
+    """ Information needed to download the dataset """
+    CODALAB_URI_TEMPLATE: str = (
+        "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}"
+    )
+    CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669"
+    FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv"
+    PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92]
+
+    def __init__(self, survey_type: str, context: str):
+        super().__init__()
+        assert context in ["default", "steer-qa", "steer-bio", "steer-portray"]
+
+        self.survey_type: str = survey_type
+        self.context: str = context
+
+    def download_data(self):
+        self.output_path: str = os.path.join(self.output_path, "data")
+        if not os.path.exists(self.output_path):
+            os.makedirs(self.output_path)
+
+        DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
+        DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
+        DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
+
+        for filename in DOWNLOAD_FILENAMES:
+            data_path: str = os.path.join(self.output_path, filename)
+
+            source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
+            ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
+
+    def read_survey_questions(self, csv_path):
+        df = pd.read_csv(csv_path, sep="\t")
+        df["options"] = df.apply(lambda x: eval(x["options"]), axis=1)
+        return df
+
+    def get_instances(self) -> List[Instance]:
+        self.download_data()
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"]
+        csv_dict = {
+            "dev": os.path.join(self.output_path, f"{self.context}.csv"),
+            "test": os.path.join(self.output_path, f"{self.survey_type}.csv"),
+        }
+
+        bios_df = None
+        if self.context in ["steer-bio", "steer-portray"]:
+            bios_path = os.path.join(self.output_path, f"{self.context}.csv")
+            bios_df = pd.read_csv(bios_path, sep="\t")
+
+        for split in all_splits:
+            csv_path: str = csv_dict[split]
+            assert os.path.exists(csv_path)
+
+            question_df = self.read_survey_questions(csv_path)
+
+            for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
+                # OpinionsQA test questions have no correct answer, so we set it to None by default
+                # for all test instances.
+                # In the case where context = steer-qa, we add demographic information in the form of an
+                # in-context question-answer pair as shown in the example above.
+                correct_answer = None if split == "test" else question_df["correct"][qidx]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(
+                        Output(text=answer),
+                        tags=[CORRECT_TAG] if (answer == correct_answer and split != "test") else [],
+                    )
+
+                if bios_df is None:
+                    # context = "default" or "steer-qa"
+                    instance = Instance(
+                        Input(text=question),
+                        references=list(map(answer_to_reference, answers)),
+                        split=splits[split],
+                    )
+                    instances.append(instance)
+                else:
+                    # context = "steer-bio" or "steer-portray"
+                    for bio in bios_df["question"].values:
+                        context = PassageQuestionInput(passage=bio, question=question + "\n")
+                        instance = Instance(
+                            context,
+                            references=list(map(answer_to_reference, answers)),
+                            split=splits[split],
+                        )
+                        instances.append(instance)
+
+        return instances
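Note: within HELM the final prompt is assembled by the adapter rather than by this scenario, so the following is only a hypothetical illustration of the "steer-qa" format described in the docstring, where a steering question-answer pair is prepended to the survey question:

# Hypothetical illustration of the "steer-qa" prompt format described in the
# docstring: a steering QA pair is prepended as context before the survey question.
steering_context = (
    "Question: In politics today, do you consider yourself a\n"
    "A. Republican\n"
    "B. Democrat\n"
    "C. Independent\n"
    "D. Something else\n"
    "E. Refused\n"
    "Answer: B\n"
)

survey_question = (
    "Question: How much, if at all, do you think the ease with which people can "
    "legally obtain guns contributes to gun violence in the country today?\n"
    "A. A great deal\n"
    "B. A fair amount\n"
    "C. Not too much\n"
    "D. Not at all\n"
    "E. Refused\n"
    "Answer:"
)

prompt = steering_context + "\n" + survey_question
print(prompt)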
@@ -147,6 +147,11 @@ class Instance:
                 return reference
         return None
 
+    @property
+    def all_correct_references(self) -> List[Reference]:
+        """Return all correct references."""
+        return [reference for reference in self.references if reference.is_correct]
+
     def render_lines(self) -> List[str]:
         info = [f"input: {format_text(self.input.text)}"]
         if self.sub_split:
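Note: a quick usage sketch of the new Instance.all_correct_references property. The instance below is made up; the import path follows helm/benchmark/scenarios/scenario.py from the file list above.

from helm.benchmark.scenarios.scenario import Instance, Input, Output, Reference, CORRECT_TAG, TEST_SPLIT

# A made-up multiple-choice instance with two references tagged as correct.
instance = Instance(
    input=Input(text="Which of the following are prime numbers?"),
    references=[
        Reference(Output(text="4"), tags=[]),
        Reference(Output(text="5"), tags=[CORRECT_TAG]),
        Reference(Output(text="7"), tags=[CORRECT_TAG]),
    ],
    split=TEST_SPLIT,
)

# The new property returns every reference tagged as correct, in order.
print([reference.output.text for reference in instance.all_correct_references])
# ['5', '7']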
@@ -78,7 +78,7 @@ class ThePileScenario(Scenario):
         # Download the raw data
         data_path = os.path.join(self.output_path, "data")
         ensure_file_downloaded(
-            source_url="https://mystic.the-eye.eu/public/AI/pile/test.jsonl.zst",
+            source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
             target_path=data_path,
             unpack=True,
         )
@@ -0,0 +1,96 @@
+from typing import List, Any
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+MAX_TRAIN_INSTANCES = 20_000  # This is arbitrary, but 20,000 training examples should be enough.
+
+
+class WMT14Scenario(Scenario):
+    """
+    The 2014 Workshop on Statistical Machine Translation:
+    https://aclanthology.org/W14-3302.pdf
+
+    The scenario consists of 5 subsets, each of which is a parallel corpus between English and another language. The
+    non-English languages include Czech, German, French, Hindi, and Russian.
+
+    For each language pair, the validation and test sets each include around 3,000 examples, while the training set is
+    usually much larger. We therefore randomly downsample the training set to speed up data processing.
+
+    Task prompt structure:
+
+        Translate {source_language} to {target_language}:
+        {Hypothesis} = {Reference}
+
+    Example from WMT14 Fr-En:
+
+        Hypothesis: Assemblée générale
+        Reference: General Assembly
+    """
+
+    name = "WMT_14"
+    description = "Scenario for the 2014 Workshop on Statistical Machine Translation"
+    tags = ["machine_translation"]
+
+    def __init__(self, source_language, target_language):
+        super().__init__()
+        valid_languages = set(["cs", "de", "fr", "hi", "ru", "en"])
+        self.source_language = source_language
+        self.target_language = target_language
+        if self.source_language not in valid_languages or self.target_language not in valid_languages:
+            raise ValueError("WMT14 only includes the following languages: cs, de, fr, hi, ru, en.")
+        if self.source_language == self.target_language:
+            raise ValueError("The source language and the target language should be different.")
+        if self.source_language != "en" and self.target_language != "en":
+            raise ValueError("One of the languages should be English.")
+
+    def _deduplicate(self, dataset: List):
+        """
+        Remove instances in the dataset with the same label.
+        """
+        deduplicated_dataset = []
+        seen_labels = set()
+        for example in dataset:
+            if example[self.target_language] not in seen_labels:
+                seen_labels.add(example[self.target_language])
+                deduplicated_dataset.append(example)
+        return deduplicated_dataset
+
+    def get_instances(self) -> List[Instance]:
+        hlog("Loading the HuggingFace dataset. The first time could take several minutes.")
+        subset_name = f"{self.source_language if self.source_language != 'en' else self.target_language}-en"
+        hf_dataset: Any = load_dataset("wmt14", subset_name)
+        splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+
+        instances: List[Instance] = []
+        hlog("Generating instances")
+        # Some training sets are too large, so we only take a random subset.
+        hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
+        hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
+        for example in hf_dataset["train"]["translation"]:
+            source_sentence: str = example[self.source_language]
+            target_sentence: str = example[self.target_language]
+            instances.append(
+                Instance(
+                    input=Input(text=source_sentence),
+                    references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                    split="train",
+                )
+            )
+
+        # No special handling needed for validation or test.
+        for split_name in ["validation", "test"]:
+            split = splits[split_name]
+            for example in hf_dataset[split_name]:
+                source_sentence = example["translation"][self.source_language]
+                target_sentence = example["translation"][self.target_language]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+        return instances
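Note: _deduplicate above keeps only the first training example for each distinct target-side sentence, so repeated references do not dominate the downsampled training pool. A minimal standalone sketch with made-up fr-en pairs:

# Minimal sketch of the target-side deduplication used above, with made-up
# fr-en examples. Only the first example for each English target is kept.
examples = [
    {"fr": "Assemblée générale", "en": "General Assembly"},
    {"fr": "L'Assemblée générale", "en": "General Assembly"},  # duplicate target, dropped
    {"fr": "Merci beaucoup", "en": "Thank you very much"},
]

seen_targets = set()
deduplicated = []
for example in examples:
    if example["en"] not in seen_targets:
        seen_targets.add(example["en"])
        deduplicated.append(example)

print([example["fr"] for example in deduplicated])
# ['Assemblée générale', 'Merci beaucoup']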
@@ -136,3 +136,17 @@ tbody .table-sort-column {
   background-color: #f5f5f5;
   white-space: pre-wrap;
 }
+
+.plot {
+  margin: 15px;
+}
+
+.plot img {
+  margin: 10px;
+}
+
+.plot-caption {
+  color: #555;
+  font-style: italic;
+  margin: 5px;
+}