crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

This version of crfm-helm might be problematic.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,80 @@
+from typing import Any, List
+import re
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class HEALTHQA_BR_Scenario(Scenario):
+    """
+    HealthQA-BR is a large-scale benchmark designed to evaluate the clinical knowledge of Large Language Models (LLMs)
+    within the Brazilian Unified Health System (SUS) context. It comprises 5,632 multiple-choice questions sourced from
+    nationwide licensing exams and residency tests, reflecting real challenges faced by Brazil's public health sector.
+    Unlike benchmarks focused on the U.S. medical landscape, HealthQA-BR targets the Brazilian healthcare ecosystem,
+    covering a wide range of medical specialties and interdisciplinary professions such as nursing, dentistry,
+    psychology, social work, pharmacy, and physiotherapy. This comprehensive approach enables a detailed assessment
+    of AI models’ ability to collaborate effectively in the team-based patient care typical of SUS.
+    """
+
+    name = "healthqa_br"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("Larxel/healthqa-br", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question_choices = example["question"]
+            answer = example["answer"].strip().upper()
+
+            # Separate the question statement from the alternatives
+            question_text, choices_text = self.split_question_and_choices(question_choices)
+
+            # Extract alternatives from text choices_text
+            pattern = r"'([A-Z])':\s*'([^']+)'"
+            matches = re.findall(pattern, choices_text)
+            answers_dict = {label: text for label, text in matches}
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer_text = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if correct_answer_text == answer else [])
+
+            instance = Instance(
+                input=Input(text=question_text),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
+
+    def split_question_and_choices(self, full_text: str):
+        # Search for the first occurrence of the alternative pattern
+        match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
+        if match:
+            # Everything before the alternatives
+            question_part = full_text[: match.start()].strip()
+            # All of the alternatives (from match to end)
+            choices_part = full_text[match.start() :].strip()
+        else:
+            # If you don't find a pattern, consider everything as a question, and no alternative.
+            question_part = full_text.strip()
+            choices_part = ""
+
+        return question_part, choices_part
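For readers skimming the diff: the scenario above expects each raw HealthQA-BR record to hold the question statement followed by alternatives written as `'A': '...'` lines, and it recovers both with the two regexes shown. A minimal sketch of that parsing on a made-up record (the sample text and values are illustrative, not taken from the dataset):

import re

# Hypothetical raw record in the format assumed by HEALTHQA_BR_Scenario above.
raw = "Qual conduta inicial deve ser adotada?\n'A': 'Observar'\n'B': 'Iniciar antibioticoterapia'"

# Same split as split_question_and_choices: statement before the first "'X': '...'" line.
match = re.search(r"\n'[A-Z]':\s*'.+", raw)
question = raw[: match.start()].strip() if match else raw.strip()
choices = raw[match.start() :].strip() if match else ""

# Same extraction as get_instances: option letter -> option text.
options = dict(re.findall(r"'([A-Z])':\s*'([^']+)'", choices))
print(question)  # Qual conduta inicial deve ser adotada?
print(options)   # {'A': 'Observar', 'B': 'Iniciar antibioticoterapia'}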
@@ -0,0 +1,90 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnMCScenario(Scenario):
+    """InfiniteBench En.MC
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
+    InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_mc"
+    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_choice_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            assert len(row["answer"]) == 1
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            references = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
+                for option in row["options"]
+            ]
+            instance = Instance(
+                id=id,
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
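The En.MC scenario keeps only rows whose whitespace-delimited word count, summed over the context, the question, and all options, fits within max_num_words. A minimal sketch of that budget check on a made-up row (illustrative values only; real contexts are book-length):

import re


def count_words(text: str) -> int:
    # Same whitespace-based count used in the scenario's filter above.
    return len(re.split(r"\s+", text.strip()))


# Hypothetical row in the En.MC format assumed above.
example = {
    "context": "It was the best of times, it was the worst of times ...",
    "input": "Who narrates the story?",
    "options": ["Ada", "Ben", "Cal", "Dee"],
}
max_num_words = 10_000  # illustrative budget; the scenario receives this via its constructor
total = (
    count_words(example["context"])
    + count_words(example["input"])
    + sum(count_words(option) for option in example["options"])
)
print(total, total <= max_num_words)  # the row is kept only if it fits the budget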
@@ -26,7 +26,7 @@ class InfiniteBenchEnQAScenario(Scenario):
     """
 
     name = "infinite_bench_en_qa"
-    description = "∞Bench En.QA is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["question_answering"]
 
     def __init__(self, max_num_words: int):
@@ -0,0 +1,288 @@
+"""Cultural alignment evaluation scenario based on Vietnam World Values Survey responses."""
+
+import os
+import json
+import random
+from typing import List
+from datasets import load_dataset
+from huggingface_hub import snapshot_download
+
+from helm.common.hierarchical_logger import hlog, hwarn
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+SUPPORTED_LANGUAGES = ["en", "vi"]
+
+
+class CulturalValueUnderstandingWVSScenario(Scenario):
+    """Cultural values understanding evaluation based on Vietnam World Values Survey responses."""
+
+    name = "cultural_value_understanding_wvs"
+    description = "Evaluates model understanding of cultural values from WVS Wave 7"
+    tags = ["cultural_value_understanding"]
+
+    def __init__(
+        self,
+        language: str,
+        personas_filename: str = "vn_personas_with_filtered_responses.json",
+        questions_filename: str = "wvs_questions_dump.en.json",
+        repo_id: str = "stair-lab/cultural-alignment-dataset",
+        num_personas: int = 10,
+        num_question_variants: int = 2,
+        include_few_shot_examples: bool = False,
+    ):
+        super().__init__()
+        self.language = language
+        self.personas_filename = personas_filename
+        self.questions_filename = questions_filename
+        self.repo_id = repo_id
+        self.num_personas = 10 if isinstance(num_personas, str) else num_personas
+        self.num_question_variants = min(num_question_variants, 4)
+        self.include_few_shot_examples = include_few_shot_examples
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_few_shot_examples(self) -> str:
+        """Return a few-shot learning examples formatted as a string."""
+        examples = """Here are some examples:
+
+Example 1:
+You are a 35 year old male from Vietnam. You have higher education and consider yourself middle class. You are married.
+
+How important is family in your life?
+
+1. Very important
+2. Rather important
+3. Not very important
+4. Not at all important
+
+Please select the option number that best represents your view.
+1
+
+Example 2:
+You are a 28 year old female from Vietnam. You have middle education and consider yourself working class. You are single.
+
+Would you say that most people can be trusted or that you need to be very careful in dealing with people?
+
+1. Most people can be trusted
+2. Need to be very careful
+
+Please select the option number that best represents your view.
+2
+
+Example 3:
+You are a 50 year old male from Vietnam. You have lower education and consider yourself lower class. You are married.
+
+Do you think that homosexuality is justifiable?
+
+1. Never justifiable
+2. Rarely justifiable
+3. Sometimes justifiable
+4. Always justifiable
+
+Please select the option number that best represents your view.
+1
+
+Now answer the following question:
+"""  # noqa: E501
+        return examples
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Generate test instances from Vietnam personas and WVS questions."""
+        instances: List[Instance] = []
+
+        # Download files from Hugging Face Hub
+        repo_local_path = snapshot_download(
+            repo_id=self.repo_id, repo_type="dataset", revision="fe54b6f5d75cfca5377707cd7199e39f517e3a1f"
+        )
+
+        # Load the downloaded files
+        with open(os.path.join(repo_local_path, self.personas_filename), "r", encoding="utf-8") as f:
+            personas = json.load(f)
+
+        with open(os.path.join(repo_local_path, self.questions_filename), "r", encoding="utf-8") as f:
+            questions = json.load(f)
+
+        # Get few-shot examples
+        few_shot_examples = self.get_few_shot_examples() if self.include_few_shot_examples else ""
+
+        # Sample personas
+        sampled_personas = random.sample(personas, min(self.num_personas, len(personas)))
+
+        # Create instances for each persona and question
+        for persona in sampled_personas:
+            # Get demographic info for persona description
+            persona_desc = (
+                f"You are a {persona.get('age', 'adult')} year old {persona.get('sex', 'person')} from Vietnam. "
+            )
+            persona_desc += f"You have {persona.get('education', 'some')} education and consider yourself {persona.get('social_class', 'middle class')}. "  # noqa: E501
+            persona_desc += f"You are {persona.get('marital_status', 'single')}."
+
+            # Process each question this persona answered
+            for qid, human_response in persona.get("responses", {}).items():
+                # Skip if no human response or if it's 0 (which might be a "Don't know" response)
+                if human_response is None:
+                    continue
+
+                # Convert human_response to int (if possible)
+                try:
+                    human_response_int = int(human_response)
+                except (ValueError, TypeError):
+                    # Skip if human_response can't be converted to int
+                    continue
+
+                # Get question info
+                question_data = questions.get(qid, {})
+                if not question_data:
+                    continue
+
+                # Get options directly from question data
+                q_options = question_data.get("options", [])
+                if not q_options:
+                    continue
+
+                # Skip if human_response is out of range
+                if human_response_int < 0 or human_response_int > len(q_options):
+                    continue
+
+                # Special handling for "Don't know" or zero responses
+                if human_response_int == 0:
+                    # Some questions might encode "Don't know" as 0
+                    # Skip for now, or you could add special handling
+                    continue
+
+                # Use the predefined question variations
+                question_variants = question_data.get("questions", [])
+                if not question_variants:
+                    question_variants = [f"Question {qid}: {question_data.get('description', '')}"]
+
+                # Use the specified number of variants
+                variants_to_use = min(self.num_question_variants, len(question_variants))
+                selected_variants = question_variants[:variants_to_use]
+
+                # Create instances for each selected question variant
+                for q_text in selected_variants:
+                    # Format the prompt with or without few-shot examples
+                    if self.include_few_shot_examples:
+                        prompt = f"{few_shot_examples}{persona_desc}\n\n{q_text}\n\n"
+                    else:
+                        prompt = f"{persona_desc}\n\n{q_text}\n\n"
+
+                    # Add options from question data - with numbers, not letters
+                    for i, opt in enumerate(q_options, 1):
+                        prompt += f"{i}. {opt}\n"
+
+                    prompt += "\nPlease select the option number that best represents your view. Return only the option number. Do not return anything else."  # noqa: E501
+
+                    # Create a reference with just the human response number
+                    # We don't create multiple references, just use the actual human response
+                    reference = Reference(Output(text=str(human_response_int)), tags=[CORRECT_TAG])
+
+                    # Create the instance
+                    instance = Instance(
+                        input=Input(text=prompt),
+                        references=[reference],
+                        split=TEST_SPLIT,
+                    )
+                    instances.append(instance)
+
+        if not instances:
+            hwarn("No valid instances were created. Check the input data and parameters.")
+        else:
+            hlog(f"Created {len(instances)} instances for cultural value understanding scenario.")
+
+        return instances
+
+
+class SocialNormApplicationNormADScenario(Scenario):
+    """Social norms application evaluation based on Vietnam World Values Survey responses."""
+
+    name = "social_norm_application_normad"
+    description = "Evaluates model understanding of social norms from WVS Wave 7"
+    tags = ["social_norm_application"]
+
+    def __init__(
+        self,
+        language: str,
+    ):
+        super().__init__()
+        self.language = language
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = load_dataset(
+            "akhilayerukola/NormAd", split="train", revision="311049f5610e932792672e4e13e7df62b6290336"
+        )
+
+        instances: List[Instance] = []
+
+        for item in dataset:
+            input_text = (
+                f"Assumming that below norms and story are from {item['Country'].capitalize()}:\n"
+                f"{item['Background']}\n\n{item['Story']}\n"
+                "Return one of the following options: 'yes', 'no', or 'neutral'. Do not return anything else.\n\n"
+            )
+
+            instance = Instance(
+                input=Input(text=input_text),
+                references=[Reference(Output(text=item["Gold Label"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
+
+
+class SocialNormExplanationNormADScenario(Scenario):
+    """Social norms explanation evaluation based on Vietnam World Values Survey responses."""
+
+    name = "social_norm_explanation_normad"
+    description = "Evaluates model understanding of social norms from WVS Wave 7"
+    tags = ["social_norm_explanation"]
+
+    def __init__(
+        self,
+        language: str,
+    ):
+        super().__init__()
+        self.language = language
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = load_dataset(
+            "akhilayerukola/NormAd", split="train", revision="311049f5610e932792672e4e13e7df62b6290336"
+        )
+
+        instances: List[Instance] = []
+
+        for item in dataset:
+            if not item["Explanation"]:
+                continue
+
+            input_text = (
+                f"Assumming that below norms and story are from {item['Country'].capitalize()}:\n"
+                f"{item['Background']}\n\n{item['Story']}\n"
+                f"The answer is {item['Gold Label']}. "
+                "Briefly explain the reasoning behind this answer in one or two sentences.\n\n"
+            )
+
+            instance = Instance(
+                input=Input(text=input_text),
+                references=[Reference(Output(text=item["Explanation"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
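Each instance produced by CulturalValueUnderstandingWVSScenario above is a persona description, a question variant, numbered options, and a fixed answer-format instruction, with the survey respondent's numeric answer as the single correct reference. A minimal sketch of that prompt assembly with hypothetical persona and question data (not taken from the dataset):

# Hypothetical persona and WVS-style question, mirroring the prompt format
# assembled in CulturalValueUnderstandingWVSScenario.get_instances above.
persona_desc = (
    "You are a 35 year old male from Vietnam. "
    "You have higher education and consider yourself middle class. You are married."
)
q_text = "How important is family in your life?"
q_options = ["Very important", "Rather important", "Not very important", "Not at all important"]
human_response_int = 1  # the survey respondent's answer becomes the single correct reference

prompt = f"{persona_desc}\n\n{q_text}\n\n"
for i, opt in enumerate(q_options, 1):
    prompt += f"{i}. {opt}\n"
prompt += (
    "\nPlease select the option number that best represents your view. "
    "Return only the option number. Do not return anything else."
)
print(prompt)
print("Expected answer:", human_response_int)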
@@ -18,13 +18,14 @@ from helm.benchmark.scenarios.scenario import (
 
 
 def remove_boxed(string: str) -> Optional[str]:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
-    Extract the text within a \\boxed{...} environment.
+    Extract the text within a \boxed{...} environment.
 
     Example:
-    >>> remove_boxed(\\boxed{\\frac{2}{3}})
-    \\frac{2}{3}
+    >>> from helm.benchmark.scenarios.math_scenario import *  # NOQA
+    >>> remove_boxed(r'\boxed{\frac{2}{3}}')
+    '\\frac{2}{3}'
     """
     left = "\\boxed{"
     try:
@@ -68,17 +69,17 @@ def last_boxed_only_string(string: str) -> Optional[str]:
 
 
 def _fix_fracs(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat fractions.
 
     Examples:
-    >>> _fix_fracs("\\frac1b")
-    \frac{1}{b}
-    >>> _fix_fracs("\\frac12")
-    \frac{1}{2}
-    >>> _fix_fracs("\\frac1{72}")
-    \frac{1}{72}
+    >>> _fix_fracs(r"\frac1b")
+    '\\frac{1}{b}'
+    >>> _fix_fracs(r"\frac12")
+    '\\frac{1}{2}'
+    >>> _fix_fracs(r"\frac1{72}")
+    '\\frac{1}{72}'
     """
     substrs = string.split("\\frac")
     new_str = substrs[0]
@@ -112,13 +113,13 @@ def _fix_fracs(string: str) -> str:
 
 
 def _fix_a_slash_b(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat fractions formatted as a/b to \\frac{a}{b}.
 
     Example:
-    >>> _fix_a_slash_b("2/3")
-    \frac{2}{3}
+    >>> _fix_a_slash_b(r"2/3")
+    '\\frac{2}{3}'
     """
     if len(string.split("/")) != 2:
         return string
@@ -149,13 +150,13 @@ def _remove_right_units(string: str) -> str:
 
 
 def _fix_sqrt(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat square roots.
 
     Example:
-    >>> _fix_sqrt("\\sqrt3")
-    \sqrt{3}
+    >>> _fix_sqrt("\\sqrt3")
+    '\\sqrt{3}'
     """
     if "\\sqrt" not in string:
        return string
@@ -210,7 +211,7 @@ def _strip_string(string: str) -> str:
 
     # remove percentage
     string = string.replace("\\%", "")
-    string = string.replace("\%", "")
+    string = string.replace(r"\%", "")
 
     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
     string = string.replace(" .", " 0.")
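The hunks above switch the Hendrycks MATH helpers to raw docstrings and rewrite the doctest outputs in repr form, so the backslash-heavy LaTeX examples are no longer mangled by Python string escapes. A minimal illustration of why that matters, separate from the diff (the demo function is hypothetical):

# In a regular docstring, "\f" and "\b" are escape sequences, so text like
# "\frac{1}{b}" is silently corrupted; a raw docstring (r"""...""") keeps the
# backslashes literal, and doctest compares against the repr of the result.
def demo() -> str:
    r"""
    >>> demo()
    '\\frac{1}{2}'
    """
    return "\\frac{1}{2}"


if __name__ == "__main__":
    import doctest

    doctest.testmod()  # passes: the expected output matches the repr of the returned string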
@@ -391,13 +392,13 @@ class MATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("What is $\left(\\frac{7}{8}\\right)^3 \cdot \left(\\frac{7}{8}\\right)^{-3}$?", "1"),
+                    ("What is $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$?", "1"),
                     (
                         "In how many ways can 4 books be selected from a shelf of 6 books"
                         + " if the order in which the books are selected does not matter?",
                         "15",
                     ),
-                    ("Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$", "\sqrt{59}"),
+                    ("Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "The faces of an octahedral die are labeled with digits $1$ through $8$."
                         + " What is the probability, expressed as a common fraction,"