crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,458 @@
1
+ import ast
2
+ import random
3
+ from pathlib import Path
4
+ from typing import List, Any
5
+
6
+ import datasets
7
+ from datasets import load_dataset
8
+
9
+ from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Output, Input
10
+
11
+
12
+ class TaskType:
13
+ SLTC = "SingleLabelTextClassification"
14
+ MLTC = "MultiLabelTextClassification"
15
+ NER = "NamedEntityRecognition"
16
+ QA = "QuestionAnswering"
17
+
18
+
19
+ BRAZILIAN_COURT_DECISIONS_JUDGMENT = "brazilian_court_decisions_judgment"
20
+ BRAZILIAN_COURT_DECISIONS_UNANIMITY = "brazilian_court_decisions_unanimity"
21
+ GERMAN_ARGUMENT_MINING = "german_argument_mining"
22
+ GREEK_LEGAL_CODE_CHAPTER = "greek_legal_code_chapter"
23
+ GREEK_LEGAL_CODE_SUBJECT = "greek_legal_code_subject"
24
+ GREEK_LEGAL_CODE_VOLUME = "greek_legal_code_volume"
25
+ SWISS_JUDGMENT_PREDICTION = "swiss_judgment_prediction"
26
+ ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS = "online_terms_of_service_unfairness_levels"
27
+ ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS = "online_terms_of_service_clause_topics"
28
+ COVID19_EMERGENCY_EVENT = "covid19_emergency_event"
29
+ MULTI_EURLEX_LEVEL_1 = "multi_eurlex_level_1"
30
+ MULTI_EURLEX_LEVEL_2 = "multi_eurlex_level_2"
31
+ MULTI_EURLEX_LEVEL_3 = "multi_eurlex_level_3"
32
+ GREEK_LEGAL_NER = "greek_legal_ner"
33
+ LEGALNERO = "legalnero"
34
+ LENER_BR = "lener_br"
35
+ MAPA_COARSE = "mapa_coarse"
36
+ MAPA_FINE = "mapa_fine"
37
# Maps each subset name to its TaskType; drives the per-task branching in
# LEXTREMEScenario.get_instances_for_subset.
TASK_CODE_MAPPING = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: TaskType.SLTC,
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: TaskType.SLTC,
    GERMAN_ARGUMENT_MINING: TaskType.SLTC,
    GREEK_LEGAL_CODE_CHAPTER: TaskType.SLTC,
    GREEK_LEGAL_CODE_SUBJECT: TaskType.SLTC,
    GREEK_LEGAL_CODE_VOLUME: TaskType.SLTC,
    SWISS_JUDGMENT_PREDICTION: TaskType.SLTC,
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: TaskType.SLTC,
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: TaskType.MLTC,
    COVID19_EMERGENCY_EVENT: TaskType.MLTC,
    MULTI_EURLEX_LEVEL_1: TaskType.MLTC,
    MULTI_EURLEX_LEVEL_2: TaskType.MLTC,
    MULTI_EURLEX_LEVEL_3: TaskType.MLTC,
    GREEK_LEGAL_NER: TaskType.NER,
    LEGALNERO: TaskType.NER,
    LENER_BR: TaskType.NER,
    MAPA_COARSE: TaskType.NER,
    MAPA_FINE: TaskType.NER,
}
57
+
58
+
59
def get_lextreme_task_type(subset):
    """Return the TaskType string registered for ``subset`` in TASK_CODE_MAPPING.

    Raises KeyError for an unknown subset name.
    """
    task_type = TASK_CODE_MAPPING[subset]
    return task_type
61
+
62
+
63
# Per-subset cap on in-context training examples; the comments record the
# approximate per-example token budget that motivated each cap.
TASK_MAX_TRAIN_INSTANCES_MAPPING = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: 4,  # ~ max 1024 tokens
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: 4,  # ~ max 1024 tokens
    GERMAN_ARGUMENT_MINING: 5,  # ~ max 256 tokens
    GREEK_LEGAL_CODE_CHAPTER: 1,  # ~ max 4096 tokens
    GREEK_LEGAL_CODE_SUBJECT: 1,  # ~ max 4096 tokens
    GREEK_LEGAL_CODE_VOLUME: 1,  # ~ max 4096 tokens
    SWISS_JUDGMENT_PREDICTION: 2,  # ~ max 2048 tokens
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: 5,  # ~ max 256 tokens
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: 5,  # ~ max 256 tokens
    COVID19_EMERGENCY_EVENT: 5,  # ~ max 256 tokens
    MULTI_EURLEX_LEVEL_1: 1,  # ~ max 4096 tokens
    MULTI_EURLEX_LEVEL_2: 1,  # ~ max 4096 tokens
    MULTI_EURLEX_LEVEL_3: 1,  # ~ max 4096 tokens
    GREEK_LEGAL_NER: 5,  # ~ max 512 tokens
    LEGALNERO: 5,  # ~ max 512 tokens
    LENER_BR: 5,  # ~ max 512 tokens
    MAPA_COARSE: 5,  # ~ max 512 tokens
    MAPA_FINE: 5,  # ~ max 512 tokens
}
83
+
84
+
85
def get_lextreme_max_train_instances(subset):
    """Look up the in-context training-example cap for ``subset``.

    Raises KeyError for an unknown subset name.
    """
    max_train_instances = TASK_MAX_TRAIN_INSTANCES_MAPPING[subset]
    return max_train_instances
87
+
88
+
89
# Per-subset cap on generated (completion) tokens; the comments record how each
# budget was derived (expected answer length, or NER label-sequence statistics).
TASK_MAX_TOKENS_MAPPING = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: 5,  # one word
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: 5,  # one word
    GERMAN_ARGUMENT_MINING: 5,  # one word
    GREEK_LEGAL_CODE_CHAPTER: 20,  # few non-ASCII words
    GREEK_LEGAL_CODE_SUBJECT: 20,  # few non-ASCII words
    GREEK_LEGAL_CODE_VOLUME: 20,  # few non-ASCII words
    SWISS_JUDGMENT_PREDICTION: 5,  # one word
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: 10,  # two words
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: 10,  # max two words
    COVID19_EMERGENCY_EVENT: 10,  # max two words
    MULTI_EURLEX_LEVEL_1: 10,  # max two words
    MULTI_EURLEX_LEVEL_2: 10,  # max two words
    MULTI_EURLEX_LEVEL_3: 10,  # max two words
    GREEK_LEGAL_NER: 430,  # num NER labels: max 2593, 99% 215, 95% 101 ==> 215 * 2 = 430
    LEGALNERO: 788,  # num NER labels: max 737, 99% 394, 95% 103 ==> 394 * 2 = 788
    LENER_BR: 338,  # num NER labels: max 654, 99% 169, 95% 100 ==> 169 * 2 = 338
    MAPA_COARSE: 274,  # num NER labels: max 367, 99% 137, 95% 83 ==> 137 * 2 = 274
    MAPA_FINE: 274,  # num NER labels: max 367, 99% 137, 95% 83 ==> 137 * 2 = 274
}
109
+
110
+
111
def get_lextreme_max_tokens(subset):
    """Look up the completion-token budget for ``subset``.

    Raises KeyError for an unknown subset name.
    """
    max_tokens = TASK_MAX_TOKENS_MAPPING[subset]
    return max_tokens
113
+
114
+
115
# Per-subset natural-language task instructions shown to the model.
# MLTC subsets end with the same "reply n/a / comma-separated" convention;
# NER subsets enumerate the full BIO tag set for that subset.
INSTRUCTIONS = {
    BRAZILIAN_COURT_DECISIONS_JUDGMENT: "In this task, you are given the case description "
    "from a decision heard at the State Supreme Court of Alagoas (Brazil). "
    "Predict the judgment of the case "
    "(no: The appeal was denied, "
    "partial: For partially favourable decisions, "
    "yes: For fully favourable decisions)",
    BRAZILIAN_COURT_DECISIONS_UNANIMITY: "In this task, you are given the case description "
    "from a decision heard at the State Supreme Court of Alagoas (Brazil). "
    "Predict the unanimity of the case (unanimity, not-unanimity, not_determined)",
    GERMAN_ARGUMENT_MINING: "In this task, you are given sentences from German court decisions. "
    "Predict the major component of German Urteilsstil "
    "(conclusion: Overall result, "
    "definition: Abstract legal facts and consequences, "
    "subsumption: Determination sentence / Concrete facts, "
    "other: Anything else)",
    GREEK_LEGAL_CODE_CHAPTER: "In this task, you are given a Greek legislative document. "
    "Predict the chapter level category of the "
    "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
    GREEK_LEGAL_CODE_SUBJECT: "In this task, you are given a Greek legislative document. "
    "Predict the subject level category of the "
    "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
    GREEK_LEGAL_CODE_VOLUME: "In this task, you are given a Greek legislative document. "
    "Predict the volume level category of the "
    "'Permanent Greek Legislation Code - Raptarchis (Ραπτάρχης)' the document belongs to.",
    SWISS_JUDGMENT_PREDICTION: "In this task, you are given the facts description "
    "from a decision heard at the Swiss Federal Supreme Court. "
    "Predict the judgment of the case (approval: The appeal was approved, or dismissal: The appeal was denied)",
    ONLINE_TERMS_OF_SERVICE_UNFAIRNESS_LEVELS: "In this task, you are given a sentence "
    "from a Terms of Service (ToS) document. "
    "Predict the unfairness level of the sentence (potentially_unfair, clearly_unfair, clearly_fair, untagged)",
    ONLINE_TERMS_OF_SERVICE_CLAUSE_TOPICS: "In this task, you are given a sentence "
    "from a Terms of Service (ToS) document. "
    "Predict the clause topics of the sentence out of the following: "
    "0: Arbitration, "
    "1: Unilateral change, "
    "2: Content removal, "
    "3: Jurisdiction, "
    "4: Choice of law, "
    "5: Limitation of liability, "
    "6: Unilateral termination, "
    "7: Contract by using, "
    "8: Privacy included. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    COVID19_EMERGENCY_EVENT: "In this task, you are given a sentence from a European legislative document. "
    "Predict the applicable measurements against COVID-19 out of the following: "
    "0: State of Emergency, "
    "1: Restrictions of fundamental rights and civil liberties, "
    "2: Restrictions of daily liberties, "
    "3: Closures / lockdown, "
    "4: Suspension of international cooperation and commitments, "
    "5: Police mobilization, "
    "6: Army mobilization, "
    "7: Government oversight. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    MULTI_EURLEX_LEVEL_1: "In this task, you are given a document from an EU law. "
    "Predict the level 1 concept in the EUROVOC taxonomy. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    MULTI_EURLEX_LEVEL_2: "In this task, you are given a document from an EU law. "
    "Predict the level 2 concept in the EUROVOC taxonomy. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    MULTI_EURLEX_LEVEL_3: "In this task, you are given a document from an EU law. "
    "Predict the level 3 concept in the EUROVOC taxonomy. "
    "If there is no label reply n/a, if there are multiple labels specify all of them separated by a comma.",
    GREEK_LEGAL_NER: "In this task, you are given a sentence from Greek legislation. "
    "Predict the named entity type for each token out of the following: "
    "O, B-ORG, I-ORG, B-GPE, I-GPE, B-LEG-REFS, I-LEG-REFS, B-PUBLIC-DOCS, I-PUBLIC-DOCS, B-PERSON, I-PERSON, "
    "B-FACILITY, I-FACILITY, B-LOCATION-UNK, I-LOCATION-UNK, B-LOCATION-NAT, I-LOCATION-NAT",
    LEGALNERO: "In this task, you are given a sentence from Romanian legislation. "
    "Predict the named entity type for each token out of the following: "
    "O, B-TIME, I-TIME, B-LEGAL, I-LEGAL, B-ORG, I-ORG, B-LOC, I-LOC, B-PER, I-PER",
    LENER_BR: "In this task, you are given a sentence "
    "from Brazilian legal documents (court decisions and legislation). "
    "Predict the named entity type for each token out of the following: "
    "O, B-ORGANIZACAO, I-ORGANIZACAO, B-PESSOA, I-PESSOA, B-TEMPO, I-TEMPO, B-LOCAL, I-LOCAL, "
    "B-LEGISLACAO, I-LEGISLACAO, B-JURISPRUDENCIA, I-JURISPRUDENCIA",
    MAPA_COARSE: "In this task, you are given a sentence from the EUR-Lex database. "
    "Predict the coarse grained named entity type for each token out of the following: "
    "O, B-ORGANISATION, I-ORGANISATION, B-ADDRESS, I-ADDRESS, B-DATE, I-DATE, "
    "B-PERSON, I-PERSON, B-AMOUNT, I-AMOUNT, B-TIME, I-TIME",
    MAPA_FINE: "In this task, you are given a sentence from the EUR-Lex database. "
    "Predict the fine grained named entity type for each token out of the following: "
    "O, B-BUILDING, I-BUILDING, B-CITY, I-CITY, B-COUNTRY, I-COUNTRY, B-PLACE, I-PLACE, B-TERRITORY, I-TERRITORY, "
    "I-UNIT, B-UNIT, B-VALUE, I-VALUE, B-YEAR, I-YEAR, B-STANDARD ABBREVIATION, I-STANDARD ABBREVIATION, "
    "B-MONTH, I-MONTH, B-DAY, I-DAY, B-AGE, I-AGE, B-ETHNIC CATEGORY, I-ETHNIC CATEGORY, B-FAMILY NAME, I-FAMILY NAME, "
    "B-INITIAL NAME, I-INITIAL NAME, B-MARITAL STATUS, I-MARITAL STATUS, B-PROFESSION, I-PROFESSION, B-ROLE, I-ROLE, "
    "B-NATIONALITY, I-NATIONALITY, B-TITLE, I-TITLE, B-URL, I-URL, B-TYPE, I-TYPE",
}
203
+
204
+
205
def get_lextreme_instructions(subset):
    """Look up the natural-language task instructions for ``subset``.

    Raises KeyError for an unknown subset name.
    """
    instructions = INSTRUCTIONS[subset]
    return instructions
207
+
208
+
209
class LEXTREMEScenario(Scenario):
    """
    The dataset consists of 11 diverse multilingual legal NLU tasks.
    6 tasks have one single configuration and 5 tasks have two or three configurations.
    This leads to a total of 18 tasks (8 single-label text classification tasks,
    5 multi-label text classification tasks and 5 token-classification tasks).
    Find more information on the dataset here: https://huggingface.co/datasets/joelito/lextreme

    We prompt models using the following format (example for german_argument_mining)

        <sentence>
        Urteilsstil:

    Target completion:
        <sentence> (<sentence>:conclusion, subsumption, definition or other)

    Using an example from the training dataset, we have

    ```
    Die Klage ist hinsichtlich der begehrten „Umzugkosten“ und hinsichtlich der begehrten
    „Übernahme der durch den Rechtsstreit gegen das Jobcenter verursachten tatsächlichen Kosten“ insgesamt unzulässig.

    Urteilsstil:
    Target completion:
    conclusion
    ```

    """

    name = "lextreme"
    description = "Multilingual Legal Text Classification and Named Entity Recognition dataset."
    tags = ["single_label_text_classification", "multi_label_text_classification", "named_entity_recognition"]

    # Mapping from HELM splits to HF splits
    splits_mapping = {
        TRAIN_SPLIT: datasets.Split.TRAIN,
        VALID_SPLIT: datasets.Split.VALIDATION,
        TEST_SPLIT: datasets.Split.TEST,
    }

    # Hugging Face Hub dataset identifier passed to `load_dataset`.
    dataset_name = "joelito/lextreme"
    # Cap on the number of wrong references attached to each instance.
    max_number_of_wrong_answers = 30
    delimiter = '" "'  # we choose quotes and whitespace as a delimiter because this is what worked for gpt3

    # BIO tag inventories for the NER subsets; index order must match the
    # integer label ids in the HF dataset, since labels are decoded by index.
    ner_class_mapping = {
        LENER_BR: [
            "O",
            "B-ORGANIZACAO",
            "I-ORGANIZACAO",
            "B-PESSOA",
            "I-PESSOA",
            "B-TEMPO",
            "I-TEMPO",
            "B-LOCAL",
            "I-LOCAL",
            "B-LEGISLACAO",
            "I-LEGISLACAO",
            "B-JURISPRUDENCIA",
            "I-JURISPRUDENCIA",
        ],
        LEGALNERO: [
            "O",
            "B-TIME",
            "I-TIME",
            "B-LEGAL",
            "I-LEGAL",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
            "B-PER",
            "I-PER",
        ],
        GREEK_LEGAL_NER: [
            "O",
            "B-ORG",
            "I-ORG",
            "B-GPE",
            "I-GPE",
            "B-LEG-REFS",
            "I-LEG-REFS",
            "B-PUBLIC-DOCS",
            "I-PUBLIC-DOCS",
            "B-PERSON",
            "I-PERSON",
            "B-FACILITY",
            "I-FACILITY",
            "B-LOCATION-UNK",
            "I-LOCATION-UNK",
            "B-LOCATION-NAT",
            "I-LOCATION-NAT",
        ],
        MAPA_COARSE: [
            "O",
            "B-ORGANISATION",
            "I-ORGANISATION",
            "B-ADDRESS",
            "I-ADDRESS",
            "B-DATE",
            "I-DATE",
            "B-PERSON",
            "I-PERSON",
            "B-AMOUNT",
            "I-AMOUNT",
            "B-TIME",
            "I-TIME",
        ],
        MAPA_FINE: [
            "O",
            "B-BUILDING",
            "I-BUILDING",
            "B-CITY",
            "I-CITY",
            "B-COUNTRY",
            "I-COUNTRY",
            "B-PLACE",
            "I-PLACE",
            "B-TERRITORY",
            "I-TERRITORY",
            "I-UNIT",
            "B-UNIT",
            "B-VALUE",
            "I-VALUE",
            "B-YEAR",
            "I-YEAR",
            "B-STANDARD ABBREVIATION",
            "I-STANDARD ABBREVIATION",
            "B-MONTH",
            "I-MONTH",
            "B-DAY",
            "I-DAY",
            "B-AGE",
            "I-AGE",
            "B-ETHNIC CATEGORY",
            "I-ETHNIC CATEGORY",
            "B-FAMILY NAME",
            "I-FAMILY NAME",
            "B-INITIAL NAME",
            "I-INITIAL NAME",
            "B-MARITAL STATUS",
            "I-MARITAL STATUS",
            "B-PROFESSION",
            "I-PROFESSION",
            "B-ROLE",
            "I-ROLE",
            "B-NATIONALITY",
            "I-NATIONALITY",
            "B-TITLE",
            "I-TITLE",
            "B-URL",
            "I-URL",
            "B-TYPE",
            "I-TYPE",
        ],
    }

    def __init__(self, subset: str):
        """Create the scenario for one subset name from TASK_CODE_MAPPING, or "all"."""
        super().__init__()
        assert subset in list(TASK_CODE_MAPPING.keys()) + ["all"], f"Unknown subset: {subset}"
        self.subsets = [subset] if subset != "all" else list(TASK_CODE_MAPPING.keys())
        # Fixed seed so wrong-reference shuffling and multi_eurlex language
        # choice are reproducible across runs.
        self.random: random.Random = random.Random(42)

    def get_instances_for_subset(self, config: str) -> List[Instance]:
        """Build all train/valid/test `Instance`s for one HF configuration ``config``."""
        task_code = TASK_CODE_MAPPING[config]
        # Load dataset
        cache_dir = str(Path(self.output_path) / "data")
        dataset: Any = load_dataset(self.dataset_name, config, cache_dir=cache_dir)

        # Determine the label inventory, which depends on the task type.
        if task_code == TaskType.SLTC:
            # The HF ClassLabel feature carries the label names directly.
            class_label = dataset["train"].features["label"]
            label_classes = class_label.names
        elif task_code == TaskType.MLTC:
            # construct the label classes
            label_classes = set()
            for split in self.splits_mapping.values():
                for example in dataset[split]:
                    label_classes |= set(example["label"])  # add all new labels to the set
            label_classes = sorted(list(map(str, label_classes)))  # convert everything to a string
        elif task_code == TaskType.NER:
            label_classes = self.ner_class_mapping[config]

        def generate_instance(example, split: str):
            # Convert one HF example into a HELM Instance with correct and wrong references.
            # get correct labels
            if task_code == TaskType.SLTC:
                correct_label = class_label.int2str(example["label"])  # get label name for correct label
                correct_labels = correct_label if isinstance(correct_label, list) else [correct_label]
            elif task_code == TaskType.MLTC:
                correct_labels = list(map(str, example["label"]))  # here we don't have any mapping to label names
            elif task_code == TaskType.NER:
                correct_labels = [label_classes[label] for label in example["label"]]

            # construct wrong references
            wrong_references = []
            if task_code in [TaskType.SLTC, TaskType.MLTC]:
                for label_name in label_classes:
                    if label_name not in correct_labels:
                        wrong_reference = Reference(output=Output(label_name), tags=[])  # Wrong output
                        wrong_references.append(wrong_reference)
            elif task_code == TaskType.NER:
                if len(set(correct_labels)) > 1:  # make sure that the correct labels are not only 'O's
                    for label_name in label_classes:
                        if label_name not in correct_labels and label_name != "O":
                            # just replace the non-'O' labels with the new label_name for a fake example
                            new_labels = [label_name if label != "O" else label for label in correct_labels]
                            wrong_reference = Reference(
                                output=Output(construct_ner_sequence(new_labels)), tags=[]
                            )  # Wrong output
                            wrong_references.append(wrong_reference)

            wrong_references = reduce_wrong_reference_count(wrong_references)

            # construct correct references and input
            if task_code in [TaskType.SLTC, TaskType.MLTC]:
                input_text = example["input"]
                if "multi_eurlex" in config:
                    # multi_eurlex stores a stringified dict of language -> document text.
                    input_text = ast.literal_eval(input_text)
                    assert isinstance(input_text, dict)
                    languages = list(input_text.keys())
                    input_text = input_text[self.random.choice(languages)]  # just choose a random language
                correct_references = [
                    Reference(output=Output(correct_label), tags=[CORRECT_TAG]) for correct_label in correct_labels
                ]  # for MLTC we have multiple correct ones
            elif task_code == TaskType.NER:
                input_text = construct_ner_sequence(example["input"])
                correct_references = [
                    Reference(output=Output(construct_ner_sequence(correct_labels)), tags=[CORRECT_TAG])
                ]
            return Instance(input=Input(input_text), references=wrong_references + correct_references, split=split)

        def construct_ner_sequence(ner_list):
            # Join tokens/labels into one quote-delimited string (see `delimiter`).
            return '"' + self.delimiter.join(ner_list) + '"'

        def reduce_wrong_reference_count(wrong_references):
            # Shuffle (seeded) and truncate to at most `max_number_of_wrong_answers`.
            self.random.shuffle(wrong_references)  # shuffle wrong references
            if len(wrong_references) > self.max_number_of_wrong_answers:
                # if there are too many wrong references, only take a subset
                wrong_references = wrong_references[: self.max_number_of_wrong_answers]
            return wrong_references

        def generate_instances(split: str):
            # Materialize all instances for one HELM split.
            split_dataset = dataset[self.splits_mapping[split]]
            return [generate_instance(example, split) for example in split_dataset]

        return generate_instances(TRAIN_SPLIT) + generate_instances(VALID_SPLIT) + generate_instances(TEST_SPLIT)

    def get_instances(self) -> List[Instance]:
        """Concatenate the instances of every configured subset, in subset order."""
        instances = []
        for subset in self.subsets:
            instances.extend(self.get_instances_for_subset(subset))
        return instances
@@ -0,0 +1,86 @@
1
+ import os
2
+ from typing import List
3
+
4
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
5
+ from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
6
+
7
+
8
class MeQSumScenario(Scenario):
    """
    From "On the Summarization of Consumer Health Questions" (Abacha et al.), MeQSum is a corpus of 1,000 summarized
    consumer health questions.

    The following is an example from the dataset:

    Question:
    SUBJECT: inversion of long arm chromasome7 MESSAGE: My son has been diagnosed with inversion of long arm
    chromasome 7 and down syndrome . please could you give me information on the chromasome 7 please because
    our doctors have not yet mentioned it

    Summary:
    Where can I find information on chromosome 7?

    @Inproceedings{MeQSum,
    author = {Asma {Ben Abacha} and Dina Demner-Fushman},
    title = {On the Summarization of Consumer Health Questions},
    booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, ACL 2019,
    Florence, Italy, July 28th - August 2},
    year = {2019},
    abstract = {Question understanding is one of the main challenges in question answering. In real world applications,
    users often submit natural language questions that are longer than needed and include peripheral information that
    increases the complexity of the question, leading to substantially more false positives in answer retrieval. In this
    paper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of
    1,000 summarized consumer health questions. We explore data augmentation methods and evaluate state-of-the-art
    neural abstractive models on this new task. In particular, we show that semantic augmentation from question datasets
    improves the overall performance, and that pointer-generator networks outperform sequence-to-sequence attentional
    models on this task, with a ROUGE-1 score of 44.16%. We also present a detailed error analysis and discuss
    directions for improvement that are specific to question summarization.}}
    """

    SOURCE_URL_TEMPLATE: str = (
        "https://worksheets.codalab.org/rest/bundles/0xd98a53314314445b96b4d703bb2d8c8c/contents/blob/{file_name}"
    )

    name = "me_q_sum"
    description = "MeQSum is a corpus of 1,000 summarized consumer health questions."
    tags = ["summarization", "biomedical"]

    def get_instances(self) -> List[Instance]:
        """Build `Instance`s pairing each consumer health question with its summary."""
        data_path: str = os.path.join(self.output_path, "data")
        ensure_directory_exists(data_path)

        def fetch_lines(file_name: str) -> List[str]:
            # Download the split file from CodaLab (cached on disk) and return its lines.
            target: str = os.path.join(data_path, file_name)
            ensure_file_downloaded(
                source_url=MeQSumScenario.SOURCE_URL_TEMPLATE.format(file_name=file_name),
                target_path=target,
                unpack=False,
            )
            with open(target) as handle:
                return handle.read().splitlines()

        instances: List[Instance] = []
        for split in ALL_SPLITS:
            # The files on disk use "val" for the validation split.
            dataset_split: str = "val" if split == VALID_SPLIT else split

            # Questions live in "<split>.source"; their summaries in "<split>.target".
            questions: List[str] = fetch_lines(f"{dataset_split}.source")
            summaries: List[str] = fetch_lines(f"{dataset_split}.target")

            instances.extend(
                Instance(
                    input=Input(text=question),
                    references=[Reference(output=Output(text=summary), tags=[CORRECT_TAG])],
                    split=split,
                )
                for question, summary in zip(questions, summaries)
            )

        return instances
@@ -0,0 +1,132 @@
1
+ import json
2
+ import os
3
+ from typing import List
4
+
5
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
6
+ from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, Input, Output
7
+
8
+
9
+ class MedDialogScenario(Scenario):
10
+ """
11
+ "The MedDialog dataset (English) contains conversations between doctors and patients.
12
+ It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added.
13
+ The raw dialogues are from healthcaremagic.com and icliniq.com. All copyrights of the data belong
14
+ to healthcaremagic.com and icliniq.com."
15
+
16
+ The following is an example from the healthcaremagic.com subset:
17
+
18
+ Patient: I get cramps on top of my left forearm and hand and it causes my hand and fingers to draw up and it
19
+ hurts. It mainly does this when I bend my arm. I ve been told that I have a slight pinch in a nerve in my neck.
20
+ Could this be a cause? I don t think so. Doctor: Hi there. It may sound difficult to believe it ,but the nerves
21
+ which supply your forearms and hand, start at the level of spinal cord and on their way towards the forearm and
22
+ hand regions which they supply, the course of these nerves pass through difference fascial and muscular planes
23
+ that can make them susceptible to entrapment neuropathies. Its a group of conditions where a nerve gets
24
+ compressed between a muscle and a bone, or between the fibers of a muscle that it pierces or passes through.
25
+ Also, the compression can happen when the nerves are travelling around a blood vessel which can mechanically put
26
+ pressure on them. Usually patients who would be having such a problem present with a dull aching pain over the
27
+ arm and forearm. If it is not too severe and does not cause any neurological deficits then conservative management
28
+ with Pregabalin and Vitamin B complex tablets, activity modifications and physiotherapy can be started which
29
+ will provide relief. Avoid the activities which exaggerate your problem.
30
+
31
+ Could painful forearms be related to pinched nerve in neck?
32
+
33
+
34
+ The following is an example from the icliniq.com subset:
35
+
36
+ Patient: Hello doctor, We are looking for a second opinion on my friend's MRI scan of both the knee joints as he
37
+ is experiencing excruciating pain just above the patella. He has a sudden onset of severe pain on both the knee
38
+ joints about two weeks ago. Previously he had a similar episode about two to three months ago and it subsided
39
+ after resting and painkillers. Doctor: Hi. I viewed the right and left knee MRI images. (attachment removed to
40
+ protect patient identity). Left knee: The MRI, left knee joint shows a complex tear in the posterior horn of the
41
+ medial meniscus area and mild left knee joint effusion. There is some fluid between the semimembranous and medial
42
+ head of gastrocnemius muscles. There is a small area of focal cartilage defect in the upper pole of the patella
43
+ with mild edematous fat. The anterior and posterior cruciate ligaments are normal. The medial and lateral
44
+ collateral ligaments are normal. Right knee: The right knee joint shows mild increased signal intensity in the
45
+ posterior horn of the medial meniscus area and minimal knee joint effusion. There is minimal fluid in the back
46
+ of the lower thigh and not significant. There is a suspicious strain in the left anterior cruciate ligament
47
+ interiorly but largely the attachments are normal. The posterior cruciate ligament is normal. There are subtle
48
+ changes in the upper pole area of the right patella and mild edema. There is mild edema around the bilateral
49
+ distal quadriceps tendons, but there is no obvious tear of the tendons.
50
+
51
+ My friend has excruciating knee pain. Please interpret his MRI report
52
+
53
+
54
+ Paper: https://arxiv.org/abs/2004.03329
55
+ Code: https://github.com/UCSD-AI4H/Medical-Dialogue-System
56
+
57
+ @article{chen2020meddiag,
58
+ title={MedDialog: a large-scale medical dialogue dataset},
59
+ author={Chen, Shu and Ju, Zeqian and Dong, Xiangyu and Fang, Hongchao and Wang, Sicheng and Yang, Yue and Zeng,
60
+ Jiaqi and Zhang, Ruisi and Zhang, Ruoyu and Zhou, Meng and Zhu, Penghui and Xie, Pengtao},
61
+ journal={arXiv preprint arXiv:2004.03329},
62
+ year={2020}
63
+ }
64
+
65
+ We used the data preprocessing from "BioBART: Pretraining and Evaluation o A Biomedical Generative Language Model"
66
+ (Yuan et al.) and generated the following splits:
67
+
68
+ |Dataset | Train | Valid | Test |
69
+ |--------------- |------------|---------|--------|
70
+ |HealthCareMagic | 181,122 | 22,641 | 22,642 |
71
+ |iCliniq | 24,851 | 3,105 | 3,108 |
72
+
73
+ Yuan et al. described, "HealthCareMagic's summaries are more abstractive and are written in a formal style,
74
+ unlike iCliniq's patient-written summaries."
75
+
76
+ Paper: https://arxiv.org/abs/2204.03905
77
+ Code: https://github.com/GanjinZero/BioBART
78
+
79
+ @misc{https://doi.org/10.48550/arxiv.2204.03905,
80
+ doi = {10.48550/ARXIV.2204.03905},
81
+ url = {https://arxiv.org/abs/2204.03905},
82
+ author = {Yuan, Hongyi and Yuan, Zheng and Gan, Ruyi and Zhang, Jiaxing and Xie, Yutao and Yu, Sheng},
83
+ keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences,
84
+ FOS: Computer and information sciences},
85
+ title = {BioBART: Pretraining and Evaluation of A Biomedical Generative Language Model},
86
+ publisher = {arXiv},
87
+ year = {2022},
88
+ copyright = {arXiv.org perpetual, non-exclusive license}
89
+ }
90
+ """
91
+
92
+ name = "med_dialog"
93
+ description = (
94
+ "The MedDialog dataset (English) contains conversations between doctors and patients. "
95
+ "It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added. "
96
+ "The raw dialogues are from healthcaremagic.com and icliniq.com."
97
+ )
98
+ tags = ["dialogue", "biomedical"]
99
+
100
+ def __init__(self, subset: str):
101
+ super().__init__()
102
+ assert subset in ["healthcaremagic", "icliniq"], f"Invalid subset specified for {self.name}: {subset}."
103
+ self.subset: str = subset
104
+
105
+ def get_instances(self) -> List[Instance]:
106
+ data_path: str = os.path.join(self.output_path, self.subset)
107
+ ensure_directory_exists(data_path)
108
+
109
+ instances: List[Instance] = []
110
+
111
+ for split in ALL_SPLITS:
112
+ split_file_name: str = f"{split}.json"
113
+ split_path: str = os.path.join(data_path, split_file_name)
114
+ ensure_file_downloaded(
115
+ source_url="https://worksheets.codalab.org/rest/bundles/0x82f0c47f6d3e4462ae9ef8ea39eebe64/"
116
+ f"contents/blob/{self.subset}/{split_file_name}",
117
+ target_path=split_path,
118
+ unpack=False,
119
+ )
120
+
121
+ with open(split_path, "r") as f:
122
+ examples: List = json.load(f)["data"]
123
+ for example in examples:
124
+ instances.append(
125
+ Instance(
126
+ input=Input(text=example["src"]),
127
+ references=[Reference(Output(text=example["tgt"]), tags=[CORRECT_TAG])],
128
+ split=split,
129
+ )
130
+ )
131
+
132
+ return instances