crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of crfm-helm might be problematic.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -2,22 +2,13 @@
  # type: ignore
  # fmt: off

- import ast
- import datetime
  import transformers
- import langchain
- import langchain.prompts
- import lxml.etree
  import os
  import pandas as pd
- import re
  import tiktoken

- from langchain_community.retrievers import BM25Retriever
  from tqdm import tqdm
- from typing import Any, Dict, Optional, Union, Callable
- from langchain.schema import Document
- import langchain_community
+ from typing import Any, Dict, Optional, Callable

  from helm.common.general import check_file_exists

@@ -167,102 +158,13 @@ def get_tokenizer(tokenizer_name: str) -> Callable:
  return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)


- def retrieve_most_relevant_visits(ehr_visit_strs, query, target_length, tokenizer):
- """
- Retrieve and filter relevant EHR visits based on a query and target length.
-
- This function retrieves electronic health record (EHR) visit strings, sorts them
- by relevance using the BM25Retriever, and constructs a list of final documents
- that fit within a specified character length. The final list ensures that the
- most important visit isn't cut off and is sorted chronologically.
-
- Parameters:
- ehr_visit_strs (list of str): List of EHR visit strings.
- query (str): Query string to retrieve relevant visits.
- target_length (int): Maximum total token count for the final list of documents.
- tokenizer (Callable): Tokenizer that converts text to tokens (used for tracking context length)
-
- Returns:
- list[str]: List of EHR visit strings sorted chronologically and constrained by the target length.
- """
- ehr_visits=re.split(r'(?=</encounter>\n)',ehr_visit_strs)
- langchain_docs = [
- langchain.schema.Document(page_content=doc) for doc in ehr_visits #broken since ehr_visit_strs is one string of all visits
- ]
- # `k` is the number of documents to retrieve
- # We retrieve everything and just use the BM25Retriever to sort the documents
- retriever = langchain_community.retrievers.BM25Retriever.from_documents(
- langchain_docs, k=len(langchain_docs)
- )
-
- # Invoking the retriever means the most relevant documents are sorted first
- sorted_docs = retriever.invoke(query)
-
- # Define the regex pattern to find the start time
- # pattern = r'start="([\d/]+ [\d:]+)"'
- pattern = r'start="([\d/]+ [\d:]+ ?[APM]{0,2})"'
-
- docs = []
- dts = []
-
- # Find the startime of the document
- for doc in sorted_docs:
- doc_content = doc.page_content
- start_dt_match = re.search(pattern, doc_content)
- if start_dt_match:
- start_dt = start_dt_match.group(1)
- parsed = False
- # Try different date formats
- for fmt in (
- "%m/%d/%y %I:%M %p",
- "%m/%d/%Y %I:%M %p",
- "%m/%d/%y %H:%M",
- "%m/%d/%Y %H:%M",
- ):
- try:
- dts.append(datetime.datetime.strptime(start_dt, fmt))
- parsed = True
- break
- except ValueError:
- continue
- if not parsed:
- print(f"Error parsing date: {start_dt}")
- continue
- else:
- print(f"Start time not found., {doc_content}")
- dts.append(datetime.datetime.min)
- docs.append(doc_content)
-
- final_docs = []
- current_length = 0
-
- # Add documents until we exceed the allocated context length
- for i in range(len(docs)):
- doc_content = docs[i]
- doc_length = len(tokenizer.encode(doc_content))
- final_docs.append((dts[i], doc_content))
- current_length += doc_length
- if current_length > target_length:
- break
-
- # Sort final_docs chronologically
- final_docs.sort(key=lambda x: x[0])
-
- # Extract only the document content for the final output
- final_docs_content = [doc_content for _, doc_content in final_docs]
-
- return final_docs_content
-
-
-
  def pack_and_trim_prompts(
  instructions: Dict[int, Dict[str, str]],
  ehrs: Dict[int, str],
- prompt_template: langchain.prompts.PromptTemplate,
+ prompt_string: str,
  context_length: int,
  generation_length: int,
  tokenizer: Any,
- use_RAG: bool = True,
  verbose: bool = False,
  include_ehr: bool = True,
  ) -> Dict[int, str]:
@@ -276,26 +178,15 @@ def pack_and_trim_prompts(
  patient_id = int(instructions[instruction_id]["patient_id"])
  relevant_ehr = ehrs[patient_id]

- # Calculate how many tokens of EHR we can include in the prompt
  num_tokens_instruction = len(tokenizer.encode(instruction))
- num_tokens_prompt_template = len(tokenizer.encode(prompt_template.template))
+ num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
  if include_ehr:
  target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
  else:
  target_ehr_length = 0
  if target_ehr_length <= 0:
- prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr="")
+ prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
  else:
- if use_RAG:
- # Return a list of the most relevant visit strings
- most_relevant_visits = retrieve_most_relevant_visits(
- ehr_visit_strs=relevant_ehr,
- query=instruction,
- target_length=target_ehr_length,
- tokenizer=tokenizer,
- )
- relevant_ehr = "\n".join(most_relevant_visits)
-
  # Do a first pass with a fast tokenizer
  fast_tokenizer = tiktoken.get_encoding("cl100k_base")
  fast_encoded = fast_tokenizer.encode(relevant_ehr)
@@ -307,13 +198,17 @@
  encoded_ehr = tokenizer.encode(fast_truncated_ehr)
  truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
  truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
- prompt_with_truncated_ehr = prompt_template.format(question=instruction, ehr=truncated_ehr)
+ prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
+ else:
+ # If the fast encoding is still too long, just use the full EHR up to allowed length
+ truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
+ prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)

- prompts_map[instruction_id] = prompt_with_truncated_ehr
+ prompts_map[instruction_id] = prompt_with_truncated_ehr

- if verbose:
- print(prompt_with_truncated_ehr)
- print("~" * 20)
+ if verbose:
+ print(prompt_with_truncated_ehr)
+ print("~" * 20)
  return prompts_map


@@ -322,7 +217,6 @@ def preprocess_prompts(
  generation_length,
  path_to_instructions,
  path_to_ehrs,
- use_RAG,
  include_ehr,
  tokenizer,
  codes_only=False,
@@ -347,16 +241,18 @@

  # CONSTRUCT & TRUNCATE PROMPTS #
  print("Constructing prompts using instructions and EHRs...")
- prompt_string="Instruction: Answer the following question based on the EHR:\n\nEHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
- prompt_template = langchain.prompts.PromptTemplate.from_template(prompt_string)
+ prompt_string = (
+ "Instruction: Answer the following question based on the EHR:\n\n"
+ "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
+ )
+
  filled_prompts = pack_and_trim_prompts(
  instructions=instructions,
  ehrs=ehrs,
- prompt_template=prompt_template,
+ prompt_string=prompt_string,
  context_length=target_context_length,
  generation_length=generation_length,
  tokenizer=tokenizer,
- use_RAG=use_RAG,
  verbose=False,
  include_ehr=include_ehr,
  )
@@ -415,7 +311,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
  path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
  path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
  check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
- use_RAG = False
  include_ehr = True
  tokenizer = "tiktoken"

@@ -424,7 +319,6 @@ def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
  generation_length=generation_length,
  path_to_instructions=path_to_instructions,
  path_to_ehrs=path_to_ehrs,
- use_RAG=use_RAG,
  include_ehr=include_ehr,
  tokenizer=tokenizer,
  )
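
The medalign_scenario_helper.py hunks above drop the LangChain/BM25 retrieval path (use_RAG) and format prompts with a plain template string instead of langchain.prompts.PromptTemplate. A minimal sketch of the new pattern, using illustrative names (build_prompt is not a HELM function):

    import tiktoken

    # Template mirroring the prompt_string introduced in the diff; {ehr} and
    # {question} are filled with str.format rather than a PromptTemplate.
    PROMPT = (
        "Instruction: Answer the following question based on the EHR:\n\n"
        "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
    )

    def build_prompt(ehr: str, question: str, context_length: int, generation_length: int) -> str:
        tokenizer = tiktoken.get_encoding("cl100k_base")
        # Token budget for the EHR: total context minus generation room, template, and question.
        budget = (
            context_length
            - generation_length
            - len(tokenizer.encode(PROMPT))
            - len(tokenizer.encode(question))
        )
        if budget <= 0:
            return PROMPT.format(question=question, ehr="")
        # Keep the most recent tokens of the EHR, as pack_and_trim_prompts does.
        truncated_ehr = tokenizer.decode(tokenizer.encode(ehr)[-budget:])
        return PROMPT.format(question=question, ehr=truncated_ehr)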
@@ -439,13 +439,13 @@ class MELTMATHScenario(Scenario):
  for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
  if split == TRAIN_SPLIT and self.use_official_examples:
  train_instances = [
- ("Kết quả của $\left(\\frac{7}{8}\\right)^3 \cdot \left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
+ ("Kết quả của $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$ là gì?", "1"),
  (
  "Có bao nhiêu cách chọn 4 quyển sách từ một kệ sách có 6 quyển,"
  + " nếu thứ tự các cuốn sách được chọn không quan trọng?",
  "15",
  ),
- ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\sqrt{59}"),
+ ("Tìm khoảng cách giữa các điểm $(2,1,-4)$ và $(5,8,-3).$", "\\sqrt{59}"),
  (
  "Các mặt của khối xúc xắc bát diện được dán nhãn bằng các số từ $1$ đến $8$."
  + " Xác suất tung một cặp xúc xắc bát diện để được tổng số bằng $15$ là bao nhiêu?"
@@ -14,7 +14,7 @@ from helm.benchmark.scenarios.scenario import (


  class MIMICBHCScenario(Scenario):
- """
+ r"""
  MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
  course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).

@@ -0,0 +1,85 @@
+ import os
+ from typing import List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+ Scenario,
+ Instance,
+ Reference,
+ TEST_SPLIT,
+ CORRECT_TAG,
+ Input,
+ Output,
+ )
+
+
+ class MMMLUScenario(Scenario):
+ """Multilingual Massive Multitask Language Understanding (MMMLU) by OpenAI
+
+ The MMLU is a widely recognized benchmark of general knowledge attained
+ by AI models. It covers a broad range of topics from 57 different categories,
+ covering elementary-level knowledge up to advanced professional subjects like
+ law, physics, history, and computer science.
+
+ MMMLU is a translation of MMLU’s test set into 14 languages using professional
+ human translators. Relying on human translators for this evaluation increases
+ confidence in the accuracy of the translations, especially for low-resource
+ languages like Yoruba.
+
+ The Massive Multitask Language Understanding benchmark from this paper:
+
+ - https://arxiv.org/pdf/2009.03300.pdf
+
+ The MMMLU dataset is from here:
+
+ - https://huggingface.co/datasets/openai/MMMLU
+ """
+
+ name = "mmmlu"
+ description = "Multilingual Massive Multitask Language Understanding"
+ tags = ["knowledge", "multiple_choice"]
+
+ OPTIONS = ["A", "B", "C", "D"]
+
+ def __init__(self, locale: str, subject: str):
+ super().__init__()
+ self.locale: str = locale
+ self.subject: str = subject
+
+ def get_instances(self, output_path: str) -> List[Instance]:
+ cache_dir = os.path.join(output_path, "data")
+ ensure_directory_exists(cache_dir)
+ dataset = datasets.load_dataset(
+ "openai/MMMLU",
+ self.locale,
+ revision="325a01dc3e173cac1578df94120499aaca2e2504",
+ cache_dir=cache_dir,
+ split="test",
+ )
+ assert isinstance(dataset, datasets.Dataset)
+
+ # Read all instances
+ instances: List[Instance] = []
+ for row_index, row in enumerate(dataset):
+ if self.subject != "all" and row["Subject"] != self.subject:
+ continue
+ input = Input(text=row["Question"])
+ references: List[Reference] = []
+ for option in self.OPTIONS:
+ references.append(
+ Reference(
+ output=Output(text=row[option]),
+ tags=[CORRECT_TAG] if option == row["Answer"] else [],
+ )
+ )
+ instance = Instance(
+ id=f"id{row_index}",
+ input=input,
+ references=references,
+ split=TEST_SPLIT,
+ )
+ instances.append(instance)
+
+ return instances
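
The new MMMLUScenario above filters the openai/MMMLU test split by its Subject column and turns each row into a four-option multiple-choice Instance. A hedged usage sketch; the locale must be one of the dataset's config names (e.g. "FR_FR"; check the dataset card), and the output path is illustrative:

    from helm.benchmark.scenarios.mmmlu_scenario import MMMLUScenario

    # "all" keeps every subject; any other string filters rows by the dataset's
    # "Subject" column, per get_instances above.
    scenario = MMMLUScenario(locale="FR_FR", subject="all")
    instances = scenario.get_instances(output_path="./mmmlu_data")
    print(len(instances), [ref.output.text for ref in instances[0].references])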
@@ -1750,7 +1750,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
  text_noun = self.prompt_components["text_noun"]
  instruction = self.prompt_components["single_instruction"]

- passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+ passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
  question=question.format(row["question_translated"]),
  text_noun=text_noun,
  text=row["text"],
@@ -1898,7 +1898,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
  text_noun = self.prompt_components["text_noun"]
  instruction = self.prompt_components["single_instruction"]

- passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+ passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
  question=question.format(row["question_translated"]),
  text_noun=text_noun,
  text=row["text"],
@@ -0,0 +1,29 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.alghafa_scenario import AlGhafaScenario
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+ @pytest.mark.scenarios
+ def test_alghafa_scenario_get_instances():
+ scenario = AlGhafaScenario(subset="mcq_exams_test_ar")
+ with TemporaryDirectory() as tmpdir:
+ actual_instances = scenario.get_instances(tmpdir)
+ assert len(actual_instances) == 562
+ assert actual_instances[0].id == "id0_test"
+ assert actual_instances[0].input == Input(
+ text=(
+ 'قال علي بن أبي طالب رضي الله عنه عن عمر بن الخطاب رضي الله عنه " إن كنا لنرى إن في القرآن كلاماً من كلامه ورأياً من رأيه " دلت هذه العبارة على سمة وصفة من صفات عمر بن الخطاب رضي الله عنه هي' # noqa: E501
+ )
+ )
+ assert len(actual_instances[0].references) == 4
+ assert actual_instances[0].references[0].output.text == "الشجاعة"
+ assert actual_instances[0].references[0].tags == []
+ assert actual_instances[0].references[1].output.text == "نزل القرآن الكريم موافقاً لرأيه في عدة مواضع"
+ assert actual_instances[0].references[1].tags == [CORRECT_TAG]
+ assert actual_instances[0].references[2].output.text == "الشدة في الحق مع اللين والرحمة ."
+ assert actual_instances[0].references[2].tags == []
+ assert actual_instances[0].references[3].output.text == "التواضع"
+ assert actual_instances[0].references[3].tags == []
+ assert actual_instances[0].split == "test"
@@ -0,0 +1,21 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.aratrust_scenario import AraTrustScenario
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+ @pytest.mark.scenarios
+ def test_aratrust_get_instances():
+ scenario = AraTrustScenario()
+ with TemporaryDirectory() as tmpdir:
+ actual_instances = scenario.get_instances(tmpdir)
+ assert len(actual_instances) == 522
+ assert actual_instances[0].id == "id0"
+ assert actual_instances[0].input == Input(
+ text=("حيوان لا يبيض ولا يلد ما هو؟\n\nأ) أنثى الحيوان\n ب) ذكر الحيوان\n ج) كل ما سبق\n")
+ )
+ assert len(actual_instances[0].references) == 1
+ assert actual_instances[0].references[0].output.text == "ب"
+ assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+ assert actual_instances[0].split == "test"
@@ -0,0 +1,59 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.bluex_scenario import BLUEX_Scenario
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+ @pytest.mark.scenarios
+ def test_bluex_scenario():
+ scenario = BLUEX_Scenario()
+ with TemporaryDirectory() as tmpdir:
+ instances = scenario.get_instances(tmpdir)
+
+ assert len(instances) > 100
+
+ assert instances[100].split == TEST_SPLIT
+
+ assert instances[0].input.text.startswith("Rubião fitava a enseada, - eram oito horas da manhã Quem o visse")
+
+ assert len(instances[0].input.text) == 1011
+
+ assert instances[0].references == [
+ Reference(
+ output=Output(
+ text='a contemplação das paisagens naturais, como se lê em "ele admirava aquele pedaço de água quieta".'
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(
+ text='a presença de um narrador-personagem, como se lê em "em verdade vos digo que pensava em '
+ 'outra coisa".'
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(
+ text='a sobriedade do protagonista ao avaliar o seu percurso, como se lê em "Cotejava o passado com '
+ "o presente."
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(
+ text='o sentido místico e fatalista que rege os destinos, como se lê em "Deus escreve direito por '
+ 'linhas tortas".'
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(
+ text='a reversibilidade entre o cômico e o trágico, como se lê em "de modo que o que parecia uma '
+ 'desgraça...".'
+ ),
+ tags=[CORRECT_TAG],
+ ),
+ ]
+
+ assert instances[0].references[4].is_correct
@@ -0,0 +1,29 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, TRAIN_SPLIT, Input
+
+
+ @pytest.mark.scenarios
+ def test_exam_multilingual_scenario_get_instances():
+ scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="Physics")
+ with TemporaryDirectory() as tmpdir:
+ actual_instances = scenario.get_instances(tmpdir)
+ assert len(actual_instances) == 393
+ assert actual_instances[0].id == "4c05bbb8-7729-11ea-9116-54bef70b159e"
+ assert actual_instances[0].input == Input(text="Наелектризирането по индукция се обяснява с: ")
+ assert len(actual_instances[0].references) == 4
+ assert actual_instances[0].references[0].output.text == "преразпределение на положителните йони в тялото"
+ assert actual_instances[0].references[0].tags == []
+ assert (
+ actual_instances[0].references[1].output.text == "предаване на електрони от неутрално на наелектризирано тяло"
+ )
+ assert actual_instances[0].references[1].tags == []
+ assert (
+ actual_instances[0].references[2].output.text == "предаване на електрони от наелектризирано на неутрално тяло"
+ )
+ assert actual_instances[0].references[2].tags == []
+ assert actual_instances[0].references[3].output.text == "преразпределение на свободните електрони в тялото"
+ assert actual_instances[0].references[3].tags == [CORRECT_TAG]
+ assert actual_instances[0].split == TRAIN_SPLIT
@@ -0,0 +1,57 @@
+ import pytest
+ from tempfile import TemporaryDirectory
+
+ from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+ @pytest.mark.scenarios
+ def test_healthqa_br_instance():
+ scenario = HEALTHQA_BR_Scenario()
+ with TemporaryDirectory() as tmpdir:
+ instances = scenario.get_instances(tmpdir)
+
+ instance = instances[35]
+
+ assert instance.split == TEST_SPLIT
+
+ assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+ assert instance.references == [
+ Reference(
+ output=Output(
+ text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+ "tentativa de redução do volume."
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(
+ text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+ "imediata do cirurgião."
+ ),
+ tags=[CORRECT_TAG],
+ ),
+ Reference(
+ output=Output(
+ text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+ "abdominal."
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(
+ text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+ ),
+ tags=[],
+ ),
+ Reference(
+ output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+ tags=[],
+ ),
+ ]
+
+ correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+ assert len(correct_refs) == 1
+
+ assert instance.references[1].is_correct
@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:


  class SlurmJobState:
- # TODO: Convert to StrEnum after upgrading to Python 3.11
  # Non-exhaustive list of Slurm job states.
  # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES

@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
  except subprocess.CalledProcessError as e:
  # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
  raise Exception(f"{str(e)} output: {e.output}")
- search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+ search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
  if not search_result:
  raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
  return search_result.group(1)
@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
  FAILURE_SLURM_JOB_STATES,
  )
  from helm.common.general import ensure_directory_exists
- from helm.common.hierarchical_logger import hlog, htrack_block
+ from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging

  from helm.benchmark.runner_config_registry import RUNNER_CONFIG

@@ -343,7 +343,14 @@ def main():
  help="Path to the RunSpec JSON file",
  required=True,
  )
+ parser.add_argument(
+ "--log-config",
+ type=str,
+ default=None,
+ help="PATH to a YAML file to customize logging",
+ )
  args = parser.parse_args()
+ setup_default_logging(args.log_config)

  # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
  with open(args.slurm_runner_spec_path, "r") as f:
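
The slurm_runner.py hunks above add a --log-config flag and call the new setup_default_logging helper before the RunSpec is deserialized. The diff does not show what the YAML is expected to contain; below is a rough sketch of the same wiring pattern using only the standard library and PyYAML (setup_logging is a stand-in, not HELM's function, and the dictConfig schema is an assumption):

    import argparse
    import logging
    import logging.config

    import yaml  # assumes PyYAML is installed


    def setup_logging(log_config_path=None):
        # Stand-in for helm.common.hierarchical_logger.setup_default_logging;
        # expects a logging.config.dictConfig-style mapping in the YAML file and
        # falls back to basicConfig when no config path is given.
        if log_config_path:
            with open(log_config_path) as f:
                logging.config.dictConfig(yaml.safe_load(f))
        else:
            logging.basicConfig(level=logging.INFO)


    parser = argparse.ArgumentParser()
    parser.add_argument("--log-config", type=str, default=None, help="Path to a YAML file to customize logging")
    args = parser.parse_args()
    setup_logging(args.log_config)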