crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
@@ -37,6 +37,7 @@ def get_medcalc_bench_spec() -> RunSpec:
37
37
  output_noun="Answer only the requested quantity without units. No explanation needed",
38
38
  max_tokens=10,
39
39
  max_train_instances=0,
40
+ stop_sequences=[],
40
41
  )
41
42
 
42
43
  metric_specs = [
@@ -56,9 +57,13 @@ def get_medcalc_bench_spec() -> RunSpec:
56
57
 
57
58
 
58
59
  @run_spec_function("clear")
59
- def get_clear_spec(condition: str) -> RunSpec:
60
+ def get_clear_spec(condition: str, data_path: str) -> RunSpec:
60
61
  scenario_spec = ScenarioSpec(
61
- class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario", args={"condition": condition}
62
+ class_name="helm.benchmark.scenarios.clear_scenario.CLEARScenario",
63
+ args={
64
+ "condition": condition,
65
+ "data_path": data_path,
66
+ },
62
67
  )
63
68
 
64
69
  condition_display = condition.replace("_", " ")
@@ -157,6 +162,7 @@ def get_medec_run_spec() -> RunSpec:
157
162
  output_noun="Answer",
158
163
  max_tokens=256,
159
164
  max_train_instances=0,
165
+ stop_sequences=[],
160
166
  )
161
167
 
162
168
  # Define the metrics
@@ -178,10 +184,14 @@ def get_medec_run_spec() -> RunSpec:
178
184
 
179
185
 
180
186
  @run_spec_function("ehrshot")
181
- def get_ehrshot_spec(subject: str, max_length: int = 100000) -> RunSpec:
187
+ def get_ehrshot_spec(subject: str, data_path: str, max_length: int = 100000) -> RunSpec:
182
188
  scenario_spec = ScenarioSpec(
183
189
  class_name="helm.benchmark.scenarios.ehrshot_scenario.EHRSHOTScenario",
184
- args={"subject": subject, "max_length": max_length},
190
+ args={
191
+ "subject": subject,
192
+ "max_length": max_length,
193
+ "data_path": data_path,
194
+ },
185
195
  )
186
196
 
187
197
  adapter_spec = get_multiple_choice_adapter_spec(
@@ -320,9 +330,13 @@ def get_medbullets_freetext_run_spec() -> RunSpec:
320
330
 
321
331
 
322
332
  @run_spec_function("medalign")
323
- def get_medalign_spec(max_length: int = 40000) -> RunSpec:
333
+ def get_medalign_spec(data_path: str, max_length: int = 40000) -> RunSpec:
324
334
  scenario_spec = ScenarioSpec(
325
- class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario", args={"max_length": max_length}
335
+ class_name="helm.benchmark.scenarios.medalign_scenario.MedalignScenario",
336
+ args={
337
+ "max_length": max_length,
338
+ "data_path": data_path,
339
+ },
326
340
  )
327
341
 
328
342
  adapter_spec = get_generation_adapter_spec(
@@ -358,8 +372,11 @@ def get_medalign_spec(max_length: int = 40000) -> RunSpec:
358
372
 
359
373
 
360
374
  @run_spec_function("shc_ptbm_med")
361
- def get_shc_ptbm_spec() -> RunSpec:
362
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_ptbm_scenario.SHCPTBMMedScenario", args={})
375
+ def get_shc_ptbm_spec(data_path: str) -> RunSpec:
376
+ scenario_spec = ScenarioSpec(
377
+ class_name="helm.benchmark.scenarios.shc_ptbm_scenario.SHCPTBMMedScenario",
378
+ args={"data_path": data_path},
379
+ )
363
380
 
364
381
  adapter_spec = get_multiple_choice_adapter_spec(
365
382
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -378,8 +395,11 @@ def get_shc_ptbm_spec() -> RunSpec:
378
395
 
379
396
 
380
397
  @run_spec_function("shc_sei_med")
381
- def get_shc_sei_spec() -> RunSpec:
382
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_sei_scenario.SHCSEIMedScenario", args={})
398
+ def get_shc_sei_spec(data_path: str) -> RunSpec:
399
+ scenario_spec = ScenarioSpec(
400
+ class_name="helm.benchmark.scenarios.shc_sei_scenario.SHCSEIMedScenario",
401
+ args={"data_path": data_path},
402
+ )
383
403
 
384
404
  adapter_spec = get_multiple_choice_adapter_spec(
385
405
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -398,8 +418,13 @@ def get_shc_sei_spec() -> RunSpec:
398
418
 
399
419
 
400
420
  @run_spec_function("dischargeme")
401
- def get_dischargeme_spec() -> RunSpec:
402
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario")
421
+ def get_dischargeme_spec(data_path: str) -> RunSpec:
422
+ scenario_spec = ScenarioSpec(
423
+ class_name="helm.benchmark.scenarios.dischargeme_scenario.DischargeMeScenario",
424
+ args={
425
+ "data_path": data_path,
426
+ },
427
+ )
403
428
 
404
429
  adapter_spec = get_generation_adapter_spec(
405
430
  instructions=(
@@ -534,8 +559,11 @@ def get_mtsamples_procedures_spec() -> RunSpec:
534
559
 
535
560
 
536
561
  @run_spec_function("mimic_rrs")
537
- def get_mimic_rrs_spec() -> RunSpec:
538
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario", args={})
562
+ def get_mimic_rrs_spec(data_path: str) -> RunSpec:
563
+ scenario_spec = ScenarioSpec(
564
+ class_name="helm.benchmark.scenarios.mimic_rrs_scenario.MIMICRRSScenario",
565
+ args={"data_path": data_path},
566
+ )
539
567
 
540
568
  adapter_spec = get_generation_adapter_spec(
541
569
  instructions=(
@@ -572,8 +600,11 @@ def get_mimic_rrs_spec() -> RunSpec:
572
600
 
573
601
 
574
602
  @run_spec_function("mimic_bhc")
575
- def get_mimic_bhc_spec() -> RunSpec:
576
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario", args={})
603
+ def get_mimic_bhc_spec(data_path: str) -> RunSpec:
604
+ scenario_spec = ScenarioSpec(
605
+ class_name="helm.benchmark.scenarios.mimic_bhc_scenario.MIMICBHCScenario",
606
+ args={"data_path": data_path},
607
+ )
577
608
 
578
609
  adapter_spec = get_generation_adapter_spec(
579
610
  instructions=("Summarize the clinical note into a brief hospital course."),
@@ -585,23 +616,29 @@ def get_mimic_bhc_spec() -> RunSpec:
585
616
  max_train_instances=0,
586
617
  stop_sequences=[],
587
618
  )
619
+ annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.mimic_bhc_annotator.MIMICBHCAnnotator")]
620
+
588
621
  metric_args = {
589
622
  "task": "mimic_bhc",
590
623
  "device": get_torch_device_name(),
591
624
  "bertscore_model": "distilbert-base-uncased",
592
625
  "rescale_with_baseline": False,
593
626
  }
627
+ metric_specs = get_summarization_metric_specs(metric_args) + [
628
+ MetricSpec(class_name="helm.benchmark.metrics.mimic_bhc_metrics.MIMICBHCMetric", args={})
629
+ ]
594
630
  return RunSpec(
595
631
  name="mimic_bhc",
632
+ annotators=annotator_specs,
596
633
  scenario_spec=scenario_spec,
597
634
  adapter_spec=adapter_spec,
598
- metric_specs=get_summarization_metric_specs(metric_args),
635
+ metric_specs=metric_specs,
599
636
  groups=["mimic_bhc"],
600
637
  )
601
638
 
602
639
 
603
640
  @run_spec_function("chw_care_plan")
604
- def get_chw_care_plan_run_spec() -> RunSpec:
641
+ def get_chw_care_plan_run_spec(data_path: str) -> RunSpec:
605
642
  """
606
643
  RunSpec for the chw_care_plan dataset.
607
644
  This configuration evaluates the model's ability to summarize
@@ -609,7 +646,7 @@ def get_chw_care_plan_run_spec() -> RunSpec:
609
646
  """
610
647
  scenario_spec = ScenarioSpec(
611
648
  class_name="helm.benchmark.scenarios.chw_care_plan_scenario.CHWCarePlanScenario",
612
- args={},
649
+ args={"data_path": data_path},
613
650
  )
614
651
 
615
652
  adapter_spec = get_generation_adapter_spec(
@@ -681,10 +718,10 @@ def get_medication_qa_spec() -> RunSpec:
681
718
 
682
719
 
683
720
  @run_spec_function("starr_patient_instructions")
684
- def get_starr_patient_instructions_run_spec() -> RunSpec:
721
+ def get_starr_patient_instructions_run_spec(data_path: str) -> RunSpec:
685
722
  scenario_spec = ScenarioSpec(
686
723
  class_name="helm.benchmark.scenarios.starr_patient_instructions_scenario.StarrPatientInstructionsScenario",
687
- args={},
724
+ args={"data_path": data_path},
688
725
  )
689
726
 
690
727
  adapter_spec = get_generation_adapter_spec(
@@ -748,6 +785,7 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
748
785
  output_noun="Summary",
749
786
  max_tokens=80,
750
787
  max_train_instances=0,
788
+ stop_sequences=[],
751
789
  )
752
790
  annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.med_dialog_annotator.MedDialogAnnotator")]
753
791
 
@@ -771,8 +809,11 @@ def get_med_dialog_spec(subset: str) -> RunSpec:
771
809
 
772
810
 
773
811
  @run_spec_function("shc_conf_med")
774
- def get_shc_conf_spec() -> RunSpec:
775
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_conf_scenario.SHCCONFMedScenario", args={})
812
+ def get_shc_conf_spec(data_path: str) -> RunSpec:
813
+ scenario_spec = ScenarioSpec(
814
+ class_name="helm.benchmark.scenarios.shc_conf_scenario.SHCCONFMedScenario",
815
+ args={"data_path": data_path},
816
+ )
776
817
 
777
818
  adapter_spec = get_multiple_choice_adapter_spec(
778
819
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -824,13 +865,16 @@ def get_medi_qa_spec() -> RunSpec:
824
865
 
825
866
 
826
867
  @run_spec_function("mental_health")
827
- def get_mental_health_spec() -> RunSpec:
868
+ def get_mental_health_spec(data_path: str) -> RunSpec:
828
869
  """
829
870
  Returns the run specification for the mental health counseling scenario.
830
871
  This scenario evaluates a model's ability to generate appropriate counseling responses
831
872
  in mental health conversations.
832
873
  """
833
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.mental_health_scenario.MentalHealthScenario")
874
+ scenario_spec = ScenarioSpec(
875
+ class_name="helm.benchmark.scenarios.mental_health_scenario.MentalHealthScenario",
876
+ args={"data_path": data_path},
877
+ )
834
878
 
835
879
  adapter_spec = get_generation_adapter_spec(
836
880
  instructions=(
@@ -840,6 +884,7 @@ def get_mental_health_spec() -> RunSpec:
840
884
  newline_after_input_noun=False,
841
885
  output_noun="Counselor response",
842
886
  max_tokens=512,
887
+ stop_sequences=[],
843
888
  )
844
889
  annotator_specs = [
845
890
  AnnotatorSpec(class_name="helm.benchmark.annotation.mental_health_annotator.MentalHealthAnnotator")
@@ -871,7 +916,11 @@ def get_pubmed_qa_spec() -> RunSpec:
871
916
 
872
917
  adapter_spec = get_multiple_choice_adapter_spec(
873
918
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
874
- instructions="Answer A for yes, B for no or C for maybe.",
919
+ instructions=(
920
+ "Answer A for yes, B for no or C for maybe. "
921
+ "Do not include any explanation or additional text. "
922
+ "Output only the letter on a single line."
923
+ ),
875
924
  input_noun="Question",
876
925
  output_noun="Answer",
877
926
  max_train_instances=0,
@@ -937,8 +986,11 @@ def get_ehr_sql_run_spec() -> RunSpec:
937
986
 
938
987
 
939
988
  @run_spec_function("shc_bmt_med")
940
- def get_shc_bmt_spec() -> RunSpec:
941
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario", args={})
989
+ def get_shc_bmt_spec(data_path: str) -> RunSpec:
990
+ scenario_spec = ScenarioSpec(
991
+ class_name="helm.benchmark.scenarios.shc_bmt_scenario.SHCBMTMedScenario",
992
+ args={"data_path": data_path},
993
+ )
942
994
 
943
995
  adapter_spec = get_multiple_choice_adapter_spec(
944
996
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1002,6 +1054,7 @@ No letter or word, just the integer value.
1002
1054
  Your Judgment""" # noqa: E501
1003
1055
  ),
1004
1056
  max_train_instances=0,
1057
+ stop_sequences=[],
1005
1058
  )
1006
1059
 
1007
1060
  return RunSpec(
@@ -1014,17 +1067,17 @@ Your Judgment""" # noqa: E501
1014
1067
 
1015
1068
 
1016
1069
  @run_spec_function("n2c2_ct_matching")
1017
- def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:
1070
+ def get_n2c2_ct_matching_spec(data_path: str, subject: str) -> RunSpec:
1018
1071
  scenario_spec = ScenarioSpec(
1019
1072
  class_name="helm.benchmark.scenarios.n2c2_ct_matching_scenario.N2C2CTMatchingScenario",
1020
- args={"subject": subject},
1073
+ args={"data_path": data_path, "subject": subject},
1021
1074
  )
1022
1075
 
1023
1076
  adapter_spec = get_multiple_choice_adapter_spec(
1024
1077
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
1025
1078
  instructions="Answer A for yes, B for no.",
1026
1079
  input_noun="",
1027
- output_noun="Answer A for yes, B for no",
1080
+ output_noun="Answer A for yes, B for no. Do not add any other text, punctuation, or symbols",
1028
1081
  max_train_instances=0,
1029
1082
  )
1030
1083
 
@@ -1038,8 +1091,10 @@ def get_n2c2_ct_matching_spec(subject: str) -> RunSpec:
1038
1091
 
1039
1092
 
1040
1093
  @run_spec_function("shc_gip_med")
1041
- def get_shc_gip_spec() -> RunSpec:
1042
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_gip_scenario.SHCGIPMedScenario", args={})
1094
+ def get_shc_gip_spec(data_path: str) -> RunSpec:
1095
+ scenario_spec = ScenarioSpec(
1096
+ class_name="helm.benchmark.scenarios.shc_gip_scenario.SHCGIPMedScenario", args={"data_path": data_path}
1097
+ )
1043
1098
 
1044
1099
  adapter_spec = get_multiple_choice_adapter_spec(
1045
1100
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1058,11 +1113,11 @@ def get_shc_gip_spec() -> RunSpec:
1058
1113
 
1059
1114
 
1060
1115
  @run_spec_function("mimiciv_billing_code")
1061
- def get_mimiciv_billing_code_spec() -> RunSpec:
1116
+ def get_mimiciv_billing_code_spec(data_path: str) -> RunSpec:
1062
1117
  scenario_spec = ScenarioSpec(
1063
1118
  class_name="helm.benchmark.scenarios.mimiciv_billing_code_scenario.MIMICIVBillingCodeScenario",
1064
1119
  args={
1065
- "data_file": "/share/pi/nigam/data/medhelm/mimiciv_billing_codes/mimiciv_icd10.feather",
1120
+ "data_path": data_path,
1066
1121
  },
1067
1122
  )
1068
1123
  adapter_spec = get_generation_adapter_spec(
@@ -1094,9 +1149,9 @@ def get_mimiciv_billing_code_spec() -> RunSpec:
1094
1149
 
1095
1150
 
1096
1151
  @run_spec_function("shc_sequoia_med")
1097
- def get_shc_sequoia_spec() -> RunSpec:
1152
+ def get_shc_sequoia_spec(data_path: str) -> RunSpec:
1098
1153
  scenario_spec = ScenarioSpec(
1099
- class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={}
1154
+ class_name="helm.benchmark.scenarios.shc_sequoia_scenario.SHCSequoiaMedScenario", args={"data_path": data_path}
1100
1155
  )
1101
1156
 
1102
1157
  adapter_spec = get_multiple_choice_adapter_spec(
@@ -1116,8 +1171,10 @@ def get_shc_sequoia_spec() -> RunSpec:
1116
1171
 
1117
1172
 
1118
1173
  @run_spec_function("shc_cdi_med")
1119
- def get_shc_cdi_spec() -> RunSpec:
1120
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCCDIMedScenario", args={})
1174
+ def get_shc_cdi_spec(data_path: str) -> RunSpec:
1175
+ scenario_spec = ScenarioSpec(
1176
+ class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCCDIMedScenario", args={"data_path": data_path}
1177
+ )
1121
1178
 
1122
1179
  adapter_spec = get_multiple_choice_adapter_spec(
1123
1180
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1136,8 +1193,10 @@ def get_shc_cdi_spec() -> RunSpec:
1136
1193
 
1137
1194
 
1138
1195
  @run_spec_function("shc_ent_med")
1139
- def get_shc_ent_spec() -> RunSpec:
1140
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.shc_ent_scenario.SHCENTMedScenario", args={})
1196
+ def get_shc_ent_spec(data_path: str) -> RunSpec:
1197
+ scenario_spec = ScenarioSpec(
1198
+ class_name="helm.benchmark.scenarios.shc_ent_scenario.SHCENTMedScenario", args={"data_path": data_path}
1199
+ )
1141
1200
 
1142
1201
  adapter_spec = get_multiple_choice_adapter_spec(
1143
1202
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -1153,3 +1212,49 @@ def get_shc_ent_spec() -> RunSpec:
1153
1212
  metric_specs=get_exact_match_metric_specs(),
1154
1213
  groups=["shc_ent_med"],
1155
1214
  )
1215
+
1216
+
1217
+ @run_spec_function("shc_privacy_med")
1218
+ def get_shc_privacy_spec(data_path: str) -> RunSpec:
1219
+ scenario_spec = ScenarioSpec(
1220
+ class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPRIVACYMedScenario",
1221
+ args={"data_path": data_path},
1222
+ )
1223
+
1224
+ adapter_spec = get_multiple_choice_adapter_spec(
1225
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
1226
+ instructions="Answer A or B.",
1227
+ input_noun="",
1228
+ output_noun="",
1229
+ )
1230
+
1231
+ return RunSpec(
1232
+ name="shc_privacy_med",
1233
+ scenario_spec=scenario_spec,
1234
+ adapter_spec=adapter_spec,
1235
+ metric_specs=get_exact_match_metric_specs(),
1236
+ groups=["shc_privacy_med"],
1237
+ )
1238
+
1239
+
1240
+ @run_spec_function("shc_proxy_med")
1241
+ def get_shc_proxy_spec(data_path: str) -> RunSpec:
1242
+ scenario_spec = ScenarioSpec(
1243
+ class_name="helm.benchmark.scenarios.shc_cdi_scenario.SHCPROXYMedScenario",
1244
+ args={"data_path": data_path},
1245
+ )
1246
+
1247
+ adapter_spec = get_multiple_choice_adapter_spec(
1248
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
1249
+ instructions="Answer A or B.",
1250
+ input_noun="",
1251
+ output_noun="",
1252
+ )
1253
+
1254
+ return RunSpec(
1255
+ name="shc_proxy_med",
1256
+ scenario_spec=scenario_spec,
1257
+ adapter_spec=adapter_spec,
1258
+ metric_specs=get_exact_match_metric_specs(),
1259
+ groups=["shc_proxy_med"],
1260
+ )