crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/medalign_scenario_helper.py
CHANGED

@@ -2,22 +2,13 @@
 # type: ignore
 # fmt: off
 
-import ast
-import datetime
 import transformers
-import langchain
-import langchain.prompts
-import lxml.etree
 import os
 import pandas as pd
-import re
 import tiktoken
 
-from langchain_community.retrievers import BM25Retriever
 from tqdm import tqdm
-from typing import Any, Dict, Optional,
-from langchain.schema import Document
-import langchain_community
+from typing import Any, Dict, Optional, Callable
 
 from helm.common.general import check_file_exists
 
@@ -167,102 +158,13 @@ def get_tokenizer(tokenizer_name: str) -> Callable:
     return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)
 
 
-def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
-    """
-    Retrieve and filter relevant EHR visits based on a query and target length.
-
-    This function retrieves electronic health record (EHR) visit strings, sorts them
-    by relevance using the BM25Retriever, and constructs a list of final documents
-    that fit within a specified character length. The final list ensures that the
-    most important visit isn't cut off and is sorted chronologically.
-
-    Parameters:
-        ehr_visit_strs (list of str): List of EHR visit strings.
-        query (str): Query string to retrieve relevant visits.
-        target_length (int): Maximum total token count for the final list of documents.
-        tokenizer (Callable): Tokenizer that converts text to tokens (used for tracking context length)
-
-    Returns:
-        list[str]: List of EHR visit strings sorted chronologically and constrained by the target length.
-    """
-    ehr_visits=re.split(r'(?=</encounter>\n)',ehr_visit_strs)
-    langchain_docs = [
-        langchain.schema.Document(page_content=doc) for doc in ehr_visits #broken since ehr_visit_strs is one string of all visits
-    ]
-    # `k` is the number of documents to retrieve
-    # We retrieve everything and just use the BM25Retriever to sort the documents
-    retriever = langchain_community.retrievers.BM25Retriever.from_documents(
-        langchain_docs, k=len(langchain_docs)
-    )
-
-    # Invoking the retriever means the most relevant documents are sorted first
-    sorted_docs = retriever.invoke(query)
-
-    # Define the regex pattern to find the start time
-    # pattern = r'start="([\d/]+ [\d:]+)"'
-    pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
-
-    docs = []
-    dts = []
-
-    # Find the startime of the document
-    for doc in sorted_docs:
-        doc_content = doc.page_content
-        start_dt_match = re.search(pattern, doc_content)
-        if start_dt_match:
-            start_dt = start_dt_match.group(1)
-            parsed = False
-            # Try different date formats
-            for fmt in (
-                "%m/%d/%y %I:%M %p",
-                "%m/%d/%Y %I:%M %p",
-                "%m/%d/%y %H:%M",
-                "%m/%d/%Y %H:%M",
-            ):
-                try:
-                    dts.append(datetime.datetime.strptime(start_dt, fmt))
-                    parsed = True
-                    break
-                except ValueError:
-                    continue
-            if not parsed:
-                print(f"Error parsing date: {start_dt}")
-                continue
-        else:
-            print(f"Start time not found., {doc_content}")
-            dts.append(datetime.datetime.min)
-        docs.append(doc_content)
-
-    final_docs = []
-    current_length = 0
-
-    # Add documents until we exceed the allocated context length
-    for i in range(len(docs)):
-        doc_content = docs[i]
-        doc_length = len(tokenizer.encode(doc_content))
-        final_docs.append((dts[i], doc_content))
-        current_length += doc_length
-        if current_length > target_length:
-            break
-
-    # Sort final_docs chronologically
-    final_docs.sort(key=lambda x: x[0])
-
-    # Extract only the document content for the final output
-    final_docs_content = [doc_content for _, doc_content in final_docs]
-
-    return final_docs_content
-
-
-
 def pack_and_trim_prompts(
     instructions: Dict[int, Dict[str, str]],
     ehrs: Dict[int, str],
-
+    prompt_string: str,
     context_length: int,
     generation_length: int,
     tokenizer: Any,
-    use_RAG: bool = True,
     verbose: bool = False,
     include_ehr: bool = True,
 ) -> Dict[int, str]:
@@ -276,26 +178,15 @@ def pack_and_trim_prompts(
         patient_id = int(instructions[instruction_id]["patient_id"])
         relevant_ehr = ehrs[patient_id]
 
-        # Calculate how many tokens of EHR we can include in the prompt
         num_tokens_instruction = len(tokenizer.encode(instruction))
-        num_tokens_prompt_template = len(tokenizer.encode(
+        num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
         if include_ehr:
             target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
         else:
             target_ehr_length = 0
         if target_ehr_length <= 0:
-            prompt_with_truncated_ehr =
+            prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
         else:
-            if use_RAG:
-                # Return a list of the most relevant visit strings
-                most_relevant_visits = retrieve_most_relevant_visits(
-                    ehr_visit_strs=relevant_ehr,
-                    query=instruction,
-                    target_length=target_ehr_length,
-                    tokenizer=tokenizer,
-                )
-                relevant_ehr = "\n".join(most_relevant_visits)
-
             # Do a first pass with a fast tokenizer
             fast_tokenizer = tiktoken.get_encoding("cl100k_base")
             fast_encoded = fast_tokenizer.encode(relevant_ehr)
@@ -307,13 +198,17 @@ def pack_and_trim_prompts(
                 encoded_ehr = tokenizer.encode(fast_truncated_ehr)
                 truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
                 truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
-                prompt_with_truncated_ehr =
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
+            else:
+                # If the fast encoding is still too long, just use the full EHR up to allowed length
+                truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
+                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
 
-
+        prompts_map[instruction_id] = prompt_with_truncated_ehr
 
-
-
-
+        if verbose:
+            print(prompt_with_truncated_ehr)
+            print("~" * 20)
     return prompts_map
 
 
@@ -322,7 +217,6 @@ def preprocess_prompts(
     generation_length,
     path_to_instructions,
     path_to_ehrs,
-    use_RAG,
     include_ehr,
     tokenizer,
     codes_only=False,
@@ -347,16 +241,18 @@ def preprocess_prompts(
 
     # CONSTRUCT & TRUNCATE PROMPTS #
     print("Constructing prompts using instructions and EHRs...")
-    prompt_string=
-
+    prompt_string = (
+        "Instruction: Answer the following question based on the EHR:\n\n"
+        "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
+    )
+
     filled_prompts = pack_and_trim_prompts(
         instructions=instructions,
         ehrs=ehrs,
-
+        prompt_string=prompt_string,
         context_length=target_context_length,
         generation_length=generation_length,
         tokenizer=tokenizer,
-        use_RAG=use_RAG,
         verbose=False,
         include_ehr=include_ehr,
     )
@@ -415,7 +311,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
     path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
     path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
     check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
-    use_RAG = False
    include_ehr = True
     tokenizer = "tiktoken"
 
@@ -424,7 +319,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
         generation_length=generation_length,
         path_to_instructions=path_to_instructions,
         path_to_ehrs=path_to_ehrs,
-        use_RAG=use_RAG,
         include_ehr=include_ehr,
         tokenizer=tokenizer,
     )
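Taken together, these hunks remove the unused (and, per the deleted comment, broken) BM25/RAG retrieval path and instead thread an explicit prompt_string template through pack_and_trim_prompts. A minimal standalone sketch of the resulting truncate-and-fill flow; only the template text comes from the diff, while the sample EHR, question, and budget values are invented:

import tiktoken

prompt_string = (
    "Instruction: Answer the following question based on the EHR:\n\n"
    "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
)

tokenizer = tiktoken.get_encoding("cl100k_base")
ehr = "<encounter>...</encounter>\n" * 1000  # invented stand-in for a long EHR
question = "What medications was the patient discharged on?"

context_length, generation_length = 4096, 256
# Token budget left for the EHR after the template, the question, and room to
# generate -- this mirrors target_ehr_length in the diff.
budget = (
    context_length
    - generation_length
    - len(tokenizer.encode(prompt_string))
    - len(tokenizer.encode(question))
)

# Keep the most recent `budget` tokens, as the diff's [-target_ehr_length:] slice does.
truncated_ehr = tokenizer.decode(tokenizer.encode(ehr)[-budget:])
prompt = prompt_string.format(question=question, ehr=truncated_ehr)
print(len(tokenizer.encode(prompt)))  # stays near the context budget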
helm/benchmark/scenarios/melt_scenarios.py
CHANGED

@@ -439,13 +439,13 @@ class MELTMATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("Kết quả của
+                    ("Kết quả của $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
                     (
                         "Có bao nhiêu cách chọn 4 quyển sách từ một kệ sách có 6 quyển,"
                         + " nếu thứ tự các cuốn sách được chọn không quan trọng?",
                         "15",
                     ),
-                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "
+                    ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "Các mặt của khối xúc xắc bát diện được dán nhãn bằng các số từ $1$ đến $8$."
                         + " Xác suất tung một cặp xúc xắc bát diện để được tổng số bằng $15$ là bao nhiêu?"
helm/benchmark/scenarios/mimic_bhc_scenario.py
CHANGED

@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (
 
 
 class MIMICBHCScenario(Scenario):
-    """
+    r"""
     MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
     course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
 
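The new r prefix only matters if the docstring contains backslashes (the lines shown in this hunk do not, so presumably later lines of the docstring do). A minimal illustration of the warning it avoids, not taken from the diff:

def escaped():
    """Matches \d+ tokens."""  # "\d" is an invalid escape: SyntaxWarning on Python 3.12+


def raw():
    r"""Matches \d+ tokens."""  # raw docstring, no warning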
helm/benchmark/scenarios/mmmlu_scenario.py
NEW

@@ -0,0 +1,85 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MMMLUScenario(Scenario):
+    """Multilingual Massive Multitask Language Understanding (MMMLU) by OpenAI
+
+    The MMLU is a widely recognized benchmark of general knowledge attained
+    by AI models. It covers a broad range of topics from 57 different categories,
+    covering elementary-level knowledge up to advanced professional subjects like
+    law, physics, history, and computer science.
+
+    MMMLU is a translation of MMLU’s test set into 14 languages using professional
+    human translators. Relying on human translators for this evaluation increases
+    confidence in the accuracy of the translations, especially for low-resource
+    languages like Yoruba.
+
+    The Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+
+    The MMMLU dataset is from here:
+
+    - https://huggingface.co/datasets/openai/MMMLU
+    """
+
+    name = "mmmlu"
+    description = "Multilingual Massive Multitask Language Understanding"
+    tags = ["knowledge", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+
+    def __init__(self, locale: str, subject: str):
+        super().__init__()
+        self.locale: str = locale
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/MMMLU",
+            self.locale,
+            revision="325a01dc3e173cac1578df94120499aaca2e2504",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            if self.subject != "all" and row["Subject"] != self.subject:
+                continue
+            input = Input(text=row["Question"])
+            references: List[Reference] = []
+            for option in self.OPTIONS:
+                references.append(
+                    Reference(
+                        output=Output(text=row[option]),
+                        tags=[CORRECT_TAG] if option == row["Answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
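A quick sketch of exercising the new scenario directly. The constructor arguments come from the code above; the locale value and output path are illustrative ("FR_FR" is one of the dataset's configs on Hugging Face):

from helm.benchmark.scenarios.mmmlu_scenario import MMMLUScenario

scenario = MMMLUScenario(locale="FR_FR", subject="all")  # "all" keeps every subject
instances = scenario.get_instances(output_path="./mmmlu_data")
print(len(instances), instances[0].input.text[:80])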
helm/benchmark/scenarios/seahelm_scenario.py
CHANGED

@@ -1750,7 +1750,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}
+        passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
            question=question.format(row["question_translated"]),
            text_noun=text_noun,
            text=row["text"],
@@ -1898,7 +1898,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}
+        passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
            question=question.format(row["question_translated"]),
            text_noun=text_noun,
            text=row["text"],
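Both hunks complete the same previously truncated template. Rendered with placeholder values (invented here), the completed format string produces:

passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
    question="Is the following statement true?",
    text_noun="Text",
    text="All of the students passed the exam.",
    instruction="Answer Yes or No.",
)
# -> "Is the following statement true?\nText: All of the students passed the exam.\nAnswer Yes or No."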
helm/benchmark/scenarios/test_alghafa_scenario.py
NEW

@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.alghafa_scenario import AlGhafaScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_alghafa_scenario_get_instances():
+    scenario = AlGhafaScenario(subset="mcq_exams_test_ar")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 562
+    assert actual_instances[0].id == "id0_test"
+    assert actual_instances[0].input == Input(
+        text=(
+            'قال علي بن أبي طالب رضي الله عنه عن عمر بن الخطاب رضي الله عنه " إن كنا لنرى إن في القرآن كلاماً من كلامه ورأياً من رأيه " دلت هذه العبارة على سمة وصفة من صفات عمر بن الخطاب رضي الله عنه هي'  # noqa: E501
+        )
+    )
+    assert len(actual_instances[0].references) == 4
+    assert actual_instances[0].references[0].output.text == "الشجاعة"
+    assert actual_instances[0].references[0].tags == []
+    assert actual_instances[0].references[1].output.text == "نزل القرآن الكريم موافقاً لرأيه في عدة مواضع"
+    assert actual_instances[0].references[1].tags == [CORRECT_TAG]
+    assert actual_instances[0].references[2].output.text == "الشدة في الحق مع اللين والرحمة ."
+    assert actual_instances[0].references[2].tags == []
+    assert actual_instances[0].references[3].output.text == "التواضع"
+    assert actual_instances[0].references[3].tags == []
+    assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_aratrust_scenario.py
NEW

@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_aratrust_get_instances():
+    scenario = AraTrustScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 522
+    assert actual_instances[0].id == "id0"
+    assert actual_instances[0].input == Input(
+        text=("حيوان لا يبيض ولا يلد ما هو؟\n\nأ) أنثى الحيوان\n ب) ذكر الحيوان\n ج) كل ما سبق\n")
+    )
+    assert len(actual_instances[0].references) == 1
+    assert actual_instances[0].references[0].output.text == "ب"
+    assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+    assert actual_instances[0].split == "test"
helm/benchmark/scenarios/test_bluex_scenario.py
NEW

@@ -0,0 +1,59 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_bluex_scenario():
+    scenario = BLUEX_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    assert len(instances) > 100
+
+    assert instances[100].split == TEST_SPLIT
+
+    assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
+
+    assert len(instances[0].input.text) == 1011
+
+    assert instances[0].references == [
+        Reference(
+            output=Output(
+                text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
+                'outra coisa".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
+                "o presente."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
+                'linhas tortas".'
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
+                'desgraça...".'
+            ),
+            tags=[CORRECT_TAG],
+        ),
+    ]
+
+    assert instances[0].references[4].is_correct
helm/benchmark/scenarios/test_exams_multilingual_scenario.py
NEW

@@ -0,0 +1,29 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
+
+
+@pytest.mark.scenarios
+def test_exam_multilingual_scenario_get_instances():
+    scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 393
+    assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
+    assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
+    assert len(actual_instances[0].references) == 4
+    assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
+    assert actual_instances[0].references[0].tags == []
+    assert (
+        actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
+    )
+    assert actual_instances[0].references[1].tags == []
+    assert (
+        actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
+    )
+    assert actual_instances[0].references[2].tags == []
+    assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
+    assert actual_instances[0].references[3].tags == [CORRECT_TAG]
+    assert actual_instances[0].split == TRAIN_SPLIT
helm/benchmark/scenarios/test_healtha_br_scenario.py
NEW

@@ -0,0 +1,57 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_healthqa_br_instance():
+    scenario = HEALTHQA_BR_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    instance = instances[35]
+
+    assert instance.split == TEST_SPLIT
+
+    assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+    assert instance.references == [
+        Reference(
+            output=Output(
+                text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                "tentativa de redução do volume."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                "imediata do cirurgião."
+            ),
+            tags=[CORRECT_TAG],
+        ),
+        Reference(
+            output=Output(
+                text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                "abdominal."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+            tags=[],
+        ),
+    ]
+
+    correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+    assert len(correct_refs) == 1
+
+    assert instance.references[1].is_correct
helm/benchmark/slurm_jobs.py
CHANGED

@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:
 
 
 class SlurmJobState:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     # Non-exhaustive list of Slurm job states.
     # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 
@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
     except subprocess.CalledProcessError as e:
         # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
         raise Exception(f"{str(e)} output: {e.output}")
-    search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+    search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
     if not search_result:
         raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
     return search_result.group(1)
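The pattern's behavior is unchanged; the raw-string prefix only silences Python's invalid-escape-sequence warning for "\w". A standalone illustration (the sample scontrol output is invented):

import re

# Plain "JobState=(\w+)" triggers a DeprecationWarning (a SyntaxWarning since
# Python 3.12); the raw string is the idiomatic spelling of the same pattern.
match = re.search(r"JobState=(\w+)", "JobId=42 JobState=RUNNING Reason=None")
assert match is not None and match.group(1) == "RUNNING"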
helm/benchmark/slurm_runner.py
CHANGED

@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
     FAILURE_SLURM_JOB_STATES,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging
 
 from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 
@@ -343,7 +343,14 @@ def main():
         help="Path to the RunSpec JSON file",
         required=True,
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
+    setup_default_logging(args.log_config)
 
     # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
     with open(args.slurm_runner_spec_path, "r") as f: