medhelm 0.5.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1022):
  1. medhelm-0.5.13/LICENSE +28 -0
  2. medhelm-0.5.13/MANIFEST.in +10 -0
  3. medhelm-0.5.13/PKG-INFO +417 -0
  4. medhelm-0.5.13/README.md +155 -0
  5. medhelm-0.5.13/pyproject.toml +537 -0
  6. medhelm-0.5.13/setup.cfg +4 -0
  7. medhelm-0.5.13/src/helm/__init__.py +0 -0
  8. medhelm-0.5.13/src/helm/benchmark/__init__.py +0 -0
  9. medhelm-0.5.13/src/helm/benchmark/adaptation/__init__.py +0 -0
  10. medhelm-0.5.13/src/helm/benchmark/adaptation/adapter_spec.py +151 -0
  11. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/__init__.py +0 -0
  12. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/adapter.py +30 -0
  13. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/adapter_factory.py +68 -0
  14. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +127 -0
  15. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  16. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  17. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/generation_adapter.py +62 -0
  18. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +325 -0
  19. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py +294 -0
  20. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/__init__.py +0 -0
  21. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +53 -0
  22. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +142 -0
  23. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +62 -0
  24. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  25. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +244 -0
  26. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +83 -0
  27. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +47 -0
  28. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +114 -0
  29. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  30. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +62 -0
  31. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/test_adapter.py +21 -0
  32. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py +280 -0
  33. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +162 -0
  34. medhelm-0.5.13/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +293 -0
  35. medhelm-0.5.13/src/helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  36. medhelm-0.5.13/src/helm/benchmark/adaptation/prompt.py +71 -0
  37. medhelm-0.5.13/src/helm/benchmark/adaptation/request_state.py +80 -0
  38. medhelm-0.5.13/src/helm/benchmark/adaptation/scenario_state.py +44 -0
  39. medhelm-0.5.13/src/helm/benchmark/annotation/__init__.py +0 -0
  40. medhelm-0.5.13/src/helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  41. medhelm-0.5.13/src/helm/benchmark/annotation/air_bench_annotator.py +79 -0
  42. medhelm-0.5.13/src/helm/benchmark/annotation/alrage_annotator.py +90 -0
  43. medhelm-0.5.13/src/helm/benchmark/annotation/annotator.py +48 -0
  44. medhelm-0.5.13/src/helm/benchmark/annotation/annotator_factory.py +50 -0
  45. medhelm-0.5.13/src/helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  46. medhelm-0.5.13/src/helm/benchmark/annotation/arabic_content_generation_annotator.py +89 -0
  47. medhelm-0.5.13/src/helm/benchmark/annotation/arabic_content_generation_relative_annotator.py +81 -0
  48. medhelm-0.5.13/src/helm/benchmark/annotation/arabic_content_generation_similarity_annotator.py +90 -0
  49. medhelm-0.5.13/src/helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  50. medhelm-0.5.13/src/helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  51. medhelm-0.5.13/src/helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  52. medhelm-0.5.13/src/helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  53. medhelm-0.5.13/src/helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  54. medhelm-0.5.13/src/helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  55. medhelm-0.5.13/src/helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  56. medhelm-0.5.13/src/helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  57. medhelm-0.5.13/src/helm/benchmark/annotation/financebench_annotator.py +79 -0
  58. medhelm-0.5.13/src/helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  59. medhelm-0.5.13/src/helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +134 -0
  60. medhelm-0.5.13/src/helm/benchmark/annotation/image2struct/__init__.py +0 -0
  61. medhelm-0.5.13/src/helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  62. medhelm-0.5.13/src/helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  63. medhelm-0.5.13/src/helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  64. medhelm-0.5.13/src/helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +131 -0
  65. medhelm-0.5.13/src/helm/benchmark/annotation/live_qa_annotator.py +76 -0
  66. medhelm-0.5.13/src/helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  67. medhelm-0.5.13/src/helm/benchmark/annotation/medalign_annotator.py +89 -0
  68. medhelm-0.5.13/src/helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  69. medhelm-0.5.13/src/helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  70. medhelm-0.5.13/src/helm/benchmark/annotation/mental_health_annotator.py +87 -0
  71. medhelm-0.5.13/src/helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  72. medhelm-0.5.13/src/helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  73. medhelm-0.5.13/src/helm/benchmark/annotation/model_as_judge.py +309 -0
  74. medhelm-0.5.13/src/helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  75. medhelm-0.5.13/src/helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  76. medhelm-0.5.13/src/helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  77. medhelm-0.5.13/src/helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  78. medhelm-0.5.13/src/helm/benchmark/annotation/omni_math_annotator.py +134 -0
  79. medhelm-0.5.13/src/helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  80. medhelm-0.5.13/src/helm/benchmark/annotation/spider_annotator.py +18 -0
  81. medhelm-0.5.13/src/helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  82. medhelm-0.5.13/src/helm/benchmark/annotation/test_annotator_factory.py +26 -0
  83. medhelm-0.5.13/src/helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  84. medhelm-0.5.13/src/helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  85. medhelm-0.5.13/src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  86. medhelm-0.5.13/src/helm/benchmark/annotation/wildbench_annotator.py +122 -0
  87. medhelm-0.5.13/src/helm/benchmark/annotation/xstest_annotator.py +100 -0
  88. medhelm-0.5.13/src/helm/benchmark/annotation_executor.py +144 -0
  89. medhelm-0.5.13/src/helm/benchmark/augmentations/__init__.py +0 -0
  90. medhelm-0.5.13/src/helm/benchmark/augmentations/cleva_perturbation.py +753 -0
  91. medhelm-0.5.13/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py +168 -0
  92. medhelm-0.5.13/src/helm/benchmark/augmentations/contrast_sets_perturbation.py +83 -0
  93. medhelm-0.5.13/src/helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  94. medhelm-0.5.13/src/helm/benchmark/augmentations/data_augmenter.py +105 -0
  95. medhelm-0.5.13/src/helm/benchmark/augmentations/dialect_perturbation.py +147 -0
  96. medhelm-0.5.13/src/helm/benchmark/augmentations/extra_space_perturbation.py +28 -0
  97. medhelm-0.5.13/src/helm/benchmark/augmentations/filler_words_perturbation.py +92 -0
  98. medhelm-0.5.13/src/helm/benchmark/augmentations/gender_perturbation.py +220 -0
  99. medhelm-0.5.13/src/helm/benchmark/augmentations/lowercase_perturbation.py +19 -0
  100. medhelm-0.5.13/src/helm/benchmark/augmentations/mild_mix_perturbation.py +53 -0
  101. medhelm-0.5.13/src/helm/benchmark/augmentations/misspelling_perturbation.py +60 -0
  102. medhelm-0.5.13/src/helm/benchmark/augmentations/person_name_perturbation.py +328 -0
  103. medhelm-0.5.13/src/helm/benchmark/augmentations/perturbation.py +101 -0
  104. medhelm-0.5.13/src/helm/benchmark/augmentations/perturbation_description.py +30 -0
  105. medhelm-0.5.13/src/helm/benchmark/augmentations/space_perturbation.py +29 -0
  106. medhelm-0.5.13/src/helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  107. medhelm-0.5.13/src/helm/benchmark/augmentations/synonym_perturbation.py +110 -0
  108. medhelm-0.5.13/src/helm/benchmark/augmentations/test_perturbation.py +308 -0
  109. medhelm-0.5.13/src/helm/benchmark/augmentations/translate_perturbation.py +31 -0
  110. medhelm-0.5.13/src/helm/benchmark/augmentations/typos_perturbation.py +85 -0
  111. medhelm-0.5.13/src/helm/benchmark/config_registry.py +38 -0
  112. medhelm-0.5.13/src/helm/benchmark/data_preprocessor.py +45 -0
  113. medhelm-0.5.13/src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json +200 -0
  114. medhelm-0.5.13/src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json +610 -0
  115. medhelm-0.5.13/src/helm/benchmark/efficiency_data/training_efficiency.json +118 -0
  116. medhelm-0.5.13/src/helm/benchmark/executor.py +122 -0
  117. medhelm-0.5.13/src/helm/benchmark/huggingface_registration.py +107 -0
  118. medhelm-0.5.13/src/helm/benchmark/metrics/__init__.py +0 -0
  119. medhelm-0.5.13/src/helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  120. medhelm-0.5.13/src/helm/benchmark/metrics/alrage_metric.py +35 -0
  121. medhelm-0.5.13/src/helm/benchmark/metrics/annotation_metrics.py +108 -0
  122. medhelm-0.5.13/src/helm/benchmark/metrics/basic_metrics.py +715 -0
  123. medhelm-0.5.13/src/helm/benchmark/metrics/bbq_metrics.py +159 -0
  124. medhelm-0.5.13/src/helm/benchmark/metrics/bias_metrics.py +238 -0
  125. medhelm-0.5.13/src/helm/benchmark/metrics/bias_word_lists.py +860 -0
  126. medhelm-0.5.13/src/helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  127. medhelm-0.5.13/src/helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  128. medhelm-0.5.13/src/helm/benchmark/metrics/classification_metrics.py +188 -0
  129. medhelm-0.5.13/src/helm/benchmark/metrics/cleva_accuracy_metrics.py +57 -0
  130. medhelm-0.5.13/src/helm/benchmark/metrics/cleva_harms_metrics.py +237 -0
  131. medhelm-0.5.13/src/helm/benchmark/metrics/cleva_metrics_helper.py +11 -0
  132. medhelm-0.5.13/src/helm/benchmark/metrics/code_metrics.py +122 -0
  133. medhelm-0.5.13/src/helm/benchmark/metrics/code_metrics_helper.py +676 -0
  134. medhelm-0.5.13/src/helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  135. medhelm-0.5.13/src/helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  136. medhelm-0.5.13/src/helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  137. medhelm-0.5.13/src/helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  138. medhelm-0.5.13/src/helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  139. medhelm-0.5.13/src/helm/benchmark/metrics/comet_metric.py +125 -0
  140. medhelm-0.5.13/src/helm/benchmark/metrics/common_metric_specs.py +174 -0
  141. medhelm-0.5.13/src/helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  142. medhelm-0.5.13/src/helm/benchmark/metrics/copyright_metrics.py +180 -0
  143. medhelm-0.5.13/src/helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  144. medhelm-0.5.13/src/helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  145. medhelm-0.5.13/src/helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  146. medhelm-0.5.13/src/helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  147. medhelm-0.5.13/src/helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  148. medhelm-0.5.13/src/helm/benchmark/metrics/disinformation_metrics.py +199 -0
  149. medhelm-0.5.13/src/helm/benchmark/metrics/dry_run_metrics.py +124 -0
  150. medhelm-0.5.13/src/helm/benchmark/metrics/efficiency_metrics.py +287 -0
  151. medhelm-0.5.13/src/helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  152. medhelm-0.5.13/src/helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  153. medhelm-0.5.13/src/helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  154. medhelm-0.5.13/src/helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  155. medhelm-0.5.13/src/helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  156. medhelm-0.5.13/src/helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  157. medhelm-0.5.13/src/helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  158. medhelm-0.5.13/src/helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  159. medhelm-0.5.13/src/helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  160. medhelm-0.5.13/src/helm/benchmark/metrics/ifeval/__init__.py +0 -0
  161. medhelm-0.5.13/src/helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  162. medhelm-0.5.13/src/helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  163. medhelm-0.5.13/src/helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  164. medhelm-0.5.13/src/helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  165. medhelm-0.5.13/src/helm/benchmark/metrics/ifeval_metrics.py +67 -0
  166. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/__init__.py +0 -0
  167. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  168. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  169. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  170. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  171. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  172. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/detectors/__init__.py +0 -0
  173. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  174. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  175. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  176. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  177. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  178. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  179. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +51 -0
  180. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  181. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  182. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  183. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  184. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  185. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  186. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  187. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  188. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  189. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  190. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  191. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  192. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  193. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  194. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  195. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  196. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  197. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +22 -0
  198. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  199. medhelm-0.5.13/src/helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  200. medhelm-0.5.13/src/helm/benchmark/metrics/instruction_following_critique_metrics.py +231 -0
  201. medhelm-0.5.13/src/helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  202. medhelm-0.5.13/src/helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  203. medhelm-0.5.13/src/helm/benchmark/metrics/live_qa_metrics.py +35 -0
  204. medhelm-0.5.13/src/helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  205. medhelm-0.5.13/src/helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  206. medhelm-0.5.13/src/helm/benchmark/metrics/lmkt_metrics.py +47 -0
  207. medhelm-0.5.13/src/helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  208. medhelm-0.5.13/src/helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  209. medhelm-0.5.13/src/helm/benchmark/metrics/medec_metrics.py +124 -0
  210. medhelm-0.5.13/src/helm/benchmark/metrics/melt_bias_metric.py +234 -0
  211. medhelm-0.5.13/src/helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  212. medhelm-0.5.13/src/helm/benchmark/metrics/melt_metric_specs.py +43 -0
  213. medhelm-0.5.13/src/helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  214. medhelm-0.5.13/src/helm/benchmark/metrics/metric.py +349 -0
  215. medhelm-0.5.13/src/helm/benchmark/metrics/metric_name.py +42 -0
  216. medhelm-0.5.13/src/helm/benchmark/metrics/metric_service.py +38 -0
  217. medhelm-0.5.13/src/helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  218. medhelm-0.5.13/src/helm/benchmark/metrics/nltk_helper.py +32 -0
  219. medhelm-0.5.13/src/helm/benchmark/metrics/omni_math_metrics.py +44 -0
  220. medhelm-0.5.13/src/helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  221. medhelm-0.5.13/src/helm/benchmark/metrics/output_processing_metric.py +60 -0
  222. medhelm-0.5.13/src/helm/benchmark/metrics/output_processors.py +15 -0
  223. medhelm-0.5.13/src/helm/benchmark/metrics/paraphrase_generation_metrics.py +46 -0
  224. medhelm-0.5.13/src/helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  225. medhelm-0.5.13/src/helm/benchmark/metrics/ranking_metrics.py +379 -0
  226. medhelm-0.5.13/src/helm/benchmark/metrics/reference_metric.py +148 -0
  227. medhelm-0.5.13/src/helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  228. medhelm-0.5.13/src/helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  229. medhelm-0.5.13/src/helm/benchmark/metrics/safety_metrics.py +91 -0
  230. medhelm-0.5.13/src/helm/benchmark/metrics/seahelm_metrics.py +201 -0
  231. medhelm-0.5.13/src/helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  232. medhelm-0.5.13/src/helm/benchmark/metrics/spider_metrics.py +7 -0
  233. medhelm-0.5.13/src/helm/benchmark/metrics/statistic.py +108 -0
  234. medhelm-0.5.13/src/helm/benchmark/metrics/summac/__init__.py +0 -0
  235. medhelm-0.5.13/src/helm/benchmark/metrics/summac/model_summac.py +485 -0
  236. medhelm-0.5.13/src/helm/benchmark/metrics/summac/utils_misc.py +52 -0
  237. medhelm-0.5.13/src/helm/benchmark/metrics/summarization_critique_metrics.py +105 -0
  238. medhelm-0.5.13/src/helm/benchmark/metrics/summarization_metrics.py +544 -0
  239. medhelm-0.5.13/src/helm/benchmark/metrics/test_bias_metrics.py +142 -0
  240. medhelm-0.5.13/src/helm/benchmark/metrics/test_classification_metrics.py +225 -0
  241. medhelm-0.5.13/src/helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  242. medhelm-0.5.13/src/helm/benchmark/metrics/test_evaluate_reference_metrics.py +45 -0
  243. medhelm-0.5.13/src/helm/benchmark/metrics/test_metric.py +26 -0
  244. medhelm-0.5.13/src/helm/benchmark/metrics/test_statistic.py +43 -0
  245. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/__init__.py +0 -0
  246. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +16 -0
  247. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +43 -0
  248. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +12 -0
  249. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/free_token_cost_estimator.py +11 -0
  250. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +32 -0
  251. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +26 -0
  252. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +26 -0
  253. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +61 -0
  254. medhelm-0.5.13/src/helm/benchmark/metrics/tokens/token_cost_estimator.py +15 -0
  255. medhelm-0.5.13/src/helm/benchmark/metrics/toxicity_metrics.py +121 -0
  256. medhelm-0.5.13/src/helm/benchmark/metrics/toxicity_utils.py +23 -0
  257. medhelm-0.5.13/src/helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  258. medhelm-0.5.13/src/helm/benchmark/metrics/unitxt_metrics.py +107 -0
  259. medhelm-0.5.13/src/helm/benchmark/metrics/vision_language/__init__.py +0 -0
  260. medhelm-0.5.13/src/helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  261. medhelm-0.5.13/src/helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  262. medhelm-0.5.13/src/helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  263. medhelm-0.5.13/src/helm/benchmark/metrics/wildbench_metrics.py +54 -0
  264. medhelm-0.5.13/src/helm/benchmark/model_deployment_registry.py +279 -0
  265. medhelm-0.5.13/src/helm/benchmark/model_metadata_registry.py +237 -0
  266. medhelm-0.5.13/src/helm/benchmark/multi_gpu_runner.py +133 -0
  267. medhelm-0.5.13/src/helm/benchmark/presentation/__init__.py +0 -0
  268. medhelm-0.5.13/src/helm/benchmark/presentation/contamination.py +85 -0
  269. medhelm-0.5.13/src/helm/benchmark/presentation/create_plots.py +655 -0
  270. medhelm-0.5.13/src/helm/benchmark/presentation/run_display.py +329 -0
  271. medhelm-0.5.13/src/helm/benchmark/presentation/run_entry.py +40 -0
  272. medhelm-0.5.13/src/helm/benchmark/presentation/schema.py +645 -0
  273. medhelm-0.5.13/src/helm/benchmark/presentation/summarize.py +1554 -0
  274. medhelm-0.5.13/src/helm/benchmark/presentation/table.py +85 -0
  275. medhelm-0.5.13/src/helm/benchmark/presentation/taxonomy_info.py +20 -0
  276. medhelm-0.5.13/src/helm/benchmark/presentation/test_contamination.py +11 -0
  277. medhelm-0.5.13/src/helm/benchmark/presentation/test_create_plots.py +36 -0
  278. medhelm-0.5.13/src/helm/benchmark/presentation/test_run_entry.py +22 -0
  279. medhelm-0.5.13/src/helm/benchmark/presentation/test_schema.py +31 -0
  280. medhelm-0.5.13/src/helm/benchmark/presentation/test_schema_validation.py +435 -0
  281. medhelm-0.5.13/src/helm/benchmark/presentation/test_summarize.py +183 -0
  282. medhelm-0.5.13/src/helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  283. medhelm-0.5.13/src/helm/benchmark/reeval_run.py +202 -0
  284. medhelm-0.5.13/src/helm/benchmark/reeval_runner.py +355 -0
  285. medhelm-0.5.13/src/helm/benchmark/run.py +478 -0
  286. medhelm-0.5.13/src/helm/benchmark/run_expander.py +1638 -0
  287. medhelm-0.5.13/src/helm/benchmark/run_spec.py +93 -0
  288. medhelm-0.5.13/src/helm/benchmark/run_spec_factory.py +187 -0
  289. medhelm-0.5.13/src/helm/benchmark/run_specs/__init__.py +0 -0
  290. medhelm-0.5.13/src/helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  291. medhelm-0.5.13/src/helm/benchmark/run_specs/arabic_enterprise_run_specs.py +57 -0
  292. medhelm-0.5.13/src/helm/benchmark/run_specs/arabic_run_specs.py +201 -0
  293. medhelm-0.5.13/src/helm/benchmark/run_specs/audio_run_specs.py +657 -0
  294. medhelm-0.5.13/src/helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  295. medhelm-0.5.13/src/helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  296. medhelm-0.5.13/src/helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  297. medhelm-0.5.13/src/helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  298. medhelm-0.5.13/src/helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  299. medhelm-0.5.13/src/helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  300. medhelm-0.5.13/src/helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  301. medhelm-0.5.13/src/helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  302. medhelm-0.5.13/src/helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  303. medhelm-0.5.13/src/helm/benchmark/run_specs/finance_run_specs.py +114 -0
  304. medhelm-0.5.13/src/helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  305. medhelm-0.5.13/src/helm/benchmark/run_specs/heim_run_specs.py +625 -0
  306. medhelm-0.5.13/src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  307. medhelm-0.5.13/src/helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  308. medhelm-0.5.13/src/helm/benchmark/run_specs/lite_run_specs.py +307 -0
  309. medhelm-0.5.13/src/helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  310. medhelm-0.5.13/src/helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  311. medhelm-0.5.13/src/helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  312. medhelm-0.5.13/src/helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  313. medhelm-0.5.13/src/helm/benchmark/run_specs/medhelm_run_specs.py +1578 -0
  314. medhelm-0.5.13/src/helm/benchmark/run_specs/melt_run_specs.py +783 -0
  315. medhelm-0.5.13/src/helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  316. medhelm-0.5.13/src/helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  317. medhelm-0.5.13/src/helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  318. medhelm-0.5.13/src/helm/benchmark/run_specs/safety_run_specs.py +191 -0
  319. medhelm-0.5.13/src/helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  320. medhelm-0.5.13/src/helm/benchmark/run_specs/simple_run_specs.py +104 -0
  321. medhelm-0.5.13/src/helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  322. medhelm-0.5.13/src/helm/benchmark/run_specs/sql_run_specs.py +54 -0
  323. medhelm-0.5.13/src/helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  324. medhelm-0.5.13/src/helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  325. medhelm-0.5.13/src/helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  326. medhelm-0.5.13/src/helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  327. medhelm-0.5.13/src/helm/benchmark/runner.py +354 -0
  328. medhelm-0.5.13/src/helm/benchmark/runner_config_registry.py +21 -0
  329. medhelm-0.5.13/src/helm/benchmark/scenarios/__init__.py +0 -0
  330. medhelm-0.5.13/src/helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  331. medhelm-0.5.13/src/helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  332. medhelm-0.5.13/src/helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  333. medhelm-0.5.13/src/helm/benchmark/scenarios/alrage_scenario.py +54 -0
  334. medhelm-0.5.13/src/helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +112 -0
  335. medhelm-0.5.13/src/helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  336. medhelm-0.5.13/src/helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  337. medhelm-0.5.13/src/helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  338. medhelm-0.5.13/src/helm/benchmark/scenarios/aratrust_scenario.py +119 -0
  339. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  340. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  341. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  342. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  343. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  344. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  345. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  346. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  347. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  348. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  349. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  350. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  351. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  352. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  353. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  354. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  355. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  356. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  357. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  358. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  359. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  360. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  361. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  362. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  363. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  364. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  365. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  366. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  367. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  368. medhelm-0.5.13/src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  369. medhelm-0.5.13/src/helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  370. medhelm-0.5.13/src/helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  371. medhelm-0.5.13/src/helm/benchmark/scenarios/babi_qa_scenario.py +156 -0
  372. medhelm-0.5.13/src/helm/benchmark/scenarios/banking77_scenario.py +77 -0
  373. medhelm-0.5.13/src/helm/benchmark/scenarios/bbq_scenario.py +254 -0
  374. medhelm-0.5.13/src/helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  375. medhelm-0.5.13/src/helm/benchmark/scenarios/big_bench_scenario.py +173 -0
  376. medhelm-0.5.13/src/helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  377. medhelm-0.5.13/src/helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  378. medhelm-0.5.13/src/helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  379. medhelm-0.5.13/src/helm/benchmark/scenarios/blimp_scenario.py +149 -0
  380. medhelm-0.5.13/src/helm/benchmark/scenarios/bluex_scenario.py +70 -0
  381. medhelm-0.5.13/src/helm/benchmark/scenarios/bold_scenario.py +135 -0
  382. medhelm-0.5.13/src/helm/benchmark/scenarios/boolq_scenario.py +214 -0
  383. medhelm-0.5.13/src/helm/benchmark/scenarios/casehold_scenario.py +79 -0
  384. medhelm-0.5.13/src/helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  385. medhelm-0.5.13/src/helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  386. medhelm-0.5.13/src/helm/benchmark/scenarios/civil_comments_scenario.py +141 -0
  387. medhelm-0.5.13/src/helm/benchmark/scenarios/clear_scenario.py +180 -0
  388. medhelm-0.5.13/src/helm/benchmark/scenarios/cleva_scenario.py +2085 -0
  389. medhelm-0.5.13/src/helm/benchmark/scenarios/code_scenario.py +361 -0
  390. medhelm-0.5.13/src/helm/benchmark/scenarios/code_scenario_apps_pinned_file_order.py +43 -0
  391. medhelm-0.5.13/src/helm/benchmark/scenarios/code_scenario_helper.py +195 -0
  392. medhelm-0.5.13/src/helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  393. medhelm-0.5.13/src/helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  394. medhelm-0.5.13/src/helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  395. medhelm-0.5.13/src/helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  396. medhelm-0.5.13/src/helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  397. medhelm-0.5.13/src/helm/benchmark/scenarios/commonsense_scenario.py +272 -0
  398. medhelm-0.5.13/src/helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  399. medhelm-0.5.13/src/helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  400. medhelm-0.5.13/src/helm/benchmark/scenarios/copyright_scenario.py +117 -0
  401. medhelm-0.5.13/src/helm/benchmark/scenarios/covid_dialog_scenario.py +93 -0
  402. medhelm-0.5.13/src/helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  403. medhelm-0.5.13/src/helm/benchmark/scenarios/custom_mcqa_scenario.py +72 -0
  404. medhelm-0.5.13/src/helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  405. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  406. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  407. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  408. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  409. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  410. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  411. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  412. medhelm-0.5.13/src/helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  413. medhelm-0.5.13/src/helm/benchmark/scenarios/dialogue_scenarios.py +153 -0
  414. medhelm-0.5.13/src/helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  415. medhelm-0.5.13/src/helm/benchmark/scenarios/disinformation_scenario.py +199 -0
  416. medhelm-0.5.13/src/helm/benchmark/scenarios/dyck_language_scenario.py +251 -0
  417. medhelm-0.5.13/src/helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  418. medhelm-0.5.13/src/helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  419. medhelm-0.5.13/src/helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  420. medhelm-0.5.13/src/helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  421. medhelm-0.5.13/src/helm/benchmark/scenarios/entity_data_imputation_scenario.py +176 -0
  422. medhelm-0.5.13/src/helm/benchmark/scenarios/entity_matching_scenario.py +171 -0
  423. medhelm-0.5.13/src/helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py +60 -0
  424. medhelm-0.5.13/src/helm/benchmark/scenarios/ewok_scenario.py +116 -0
  425. medhelm-0.5.13/src/helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  426. medhelm-0.5.13/src/helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  427. medhelm-0.5.13/src/helm/benchmark/scenarios/financebench_scenario.py +74 -0
  428. medhelm-0.5.13/src/helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  429. medhelm-0.5.13/src/helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  430. medhelm-0.5.13/src/helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  431. medhelm-0.5.13/src/helm/benchmark/scenarios/grammar.py +183 -0
  432. medhelm-0.5.13/src/helm/benchmark/scenarios/grammar_scenario.py +62 -0
  433. medhelm-0.5.13/src/helm/benchmark/scenarios/gsm_scenario.py +88 -0
  434. medhelm-0.5.13/src/helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  435. medhelm-0.5.13/src/helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  436. medhelm-0.5.13/src/helm/benchmark/scenarios/headqa_scenario.py +158 -0
  437. medhelm-0.5.13/src/helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  438. medhelm-0.5.13/src/helm/benchmark/scenarios/ice_scenario.py +489 -0
  439. medhelm-0.5.13/src/helm/benchmark/scenarios/ice_scenario_pinned_file_order.py +43 -0
  440. medhelm-0.5.13/src/helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  441. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  442. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  443. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  444. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  445. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  446. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  447. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  448. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  449. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  450. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  451. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  452. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  453. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  454. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  455. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  456. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  457. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  458. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  459. medhelm-0.5.13/src/helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  460. medhelm-0.5.13/src/helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  461. medhelm-0.5.13/src/helm/benchmark/scenarios/imdb_scenario.py +160 -0
  462. medhelm-0.5.13/src/helm/benchmark/scenarios/imdb_scenario_pinned_file_order.py +40 -0
  463. medhelm-0.5.13/src/helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  464. medhelm-0.5.13/src/helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  465. medhelm-0.5.13/src/helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  466. medhelm-0.5.13/src/helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +62 -0
  467. medhelm-0.5.13/src/helm/benchmark/scenarios/koala_scenario.py +61 -0
  468. medhelm-0.5.13/src/helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  469. medhelm-0.5.13/src/helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  470. medhelm-0.5.13/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  471. medhelm-0.5.13/src/helm/benchmark/scenarios/legal_summarization_scenario.py +257 -0
  472. medhelm-0.5.13/src/helm/benchmark/scenarios/legal_support_scenario.py +117 -0
  473. medhelm-0.5.13/src/helm/benchmark/scenarios/legalbench_scenario.py +165 -0
  474. medhelm-0.5.13/src/helm/benchmark/scenarios/lex_glue_scenario.py +274 -0
  475. medhelm-0.5.13/src/helm/benchmark/scenarios/lextreme_scenario.py +479 -0
  476. medhelm-0.5.13/src/helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  477. medhelm-0.5.13/src/helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  478. medhelm-0.5.13/src/helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  479. medhelm-0.5.13/src/helm/benchmark/scenarios/lsat_qa_scenario.py +159 -0
  480. medhelm-0.5.13/src/helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  481. medhelm-0.5.13/src/helm/benchmark/scenarios/math_scenario.py +485 -0
  482. medhelm-0.5.13/src/helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  483. medhelm-0.5.13/src/helm/benchmark/scenarios/me_q_sum_scenario.py +95 -0
  484. medhelm-0.5.13/src/helm/benchmark/scenarios/med_dialog_scenario.py +166 -0
  485. medhelm-0.5.13/src/helm/benchmark/scenarios/med_mcqa_scenario.py +125 -0
  486. medhelm-0.5.13/src/helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +128 -0
  487. medhelm-0.5.13/src/helm/benchmark/scenarios/med_qa_scenario.py +125 -0
  488. medhelm-0.5.13/src/helm/benchmark/scenarios/medalign_scenario.py +117 -0
  489. medhelm-0.5.13/src/helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  490. medhelm-0.5.13/src/helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  491. medhelm-0.5.13/src/helm/benchmark/scenarios/medcalc_bench_scenario.py +150 -0
  492. medhelm-0.5.13/src/helm/benchmark/scenarios/medec_scenario.py +148 -0
  493. medhelm-0.5.13/src/helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  494. medhelm-0.5.13/src/helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  495. medhelm-0.5.13/src/helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  496. medhelm-0.5.13/src/helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  497. medhelm-0.5.13/src/helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  498. medhelm-0.5.13/src/helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  499. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  500. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  501. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  502. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_scenarios.py +793 -0
  503. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  504. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  505. medhelm-0.5.13/src/helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  506. medhelm-0.5.13/src/helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  507. medhelm-0.5.13/src/helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  508. medhelm-0.5.13/src/helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  509. medhelm-0.5.13/src/helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  510. medhelm-0.5.13/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  511. medhelm-0.5.13/src/helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  512. medhelm-0.5.13/src/helm/benchmark/scenarios/mmlu_scenario.py +147 -0
  513. medhelm-0.5.13/src/helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  514. medhelm-0.5.13/src/helm/benchmark/scenarios/msmarco_scenario.py +689 -0
  515. medhelm-0.5.13/src/helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  516. medhelm-0.5.13/src/helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  517. medhelm-0.5.13/src/helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  518. medhelm-0.5.13/src/helm/benchmark/scenarios/narrativeqa_scenario.py +176 -0
  519. medhelm-0.5.13/src/helm/benchmark/scenarios/natural_qa_scenario.py +358 -0
  520. medhelm-0.5.13/src/helm/benchmark/scenarios/newsqa_scenario.py +173 -0
  521. medhelm-0.5.13/src/helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  522. medhelm-0.5.13/src/helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  523. medhelm-0.5.13/src/helm/benchmark/scenarios/open_assistant_scenario.py +150 -0
  524. medhelm-0.5.13/src/helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  525. medhelm-0.5.13/src/helm/benchmark/scenarios/opinions_qa_scenario.py +190 -0
  526. medhelm-0.5.13/src/helm/benchmark/scenarios/pubmed_qa_scenario.py +210 -0
  527. medhelm-0.5.13/src/helm/benchmark/scenarios/quac_scenario.py +208 -0
  528. medhelm-0.5.13/src/helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  529. medhelm-0.5.13/src/helm/benchmark/scenarios/raft_scenario.py +161 -0
  530. medhelm-0.5.13/src/helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +72 -0
  531. medhelm-0.5.13/src/helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  532. medhelm-0.5.13/src/helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  533. medhelm-0.5.13/src/helm/benchmark/scenarios/scenario.py +302 -0
  534. medhelm-0.5.13/src/helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  535. medhelm-0.5.13/src/helm/benchmark/scenarios/self_instruct_scenario.py +76 -0
  536. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  537. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  538. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  539. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  540. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  541. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  542. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  543. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  544. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  545. medhelm-0.5.13/src/helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  546. medhelm-0.5.13/src/helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  547. medhelm-0.5.13/src/helm/benchmark/scenarios/simple_scenarios.py +173 -0
  548. medhelm-0.5.13/src/helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  549. medhelm-0.5.13/src/helm/benchmark/scenarios/spider_scenario.py +95 -0
  550. medhelm-0.5.13/src/helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  551. medhelm-0.5.13/src/helm/benchmark/scenarios/summarization_scenario.py +210 -0
  552. medhelm-0.5.13/src/helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  553. medhelm-0.5.13/src/helm/benchmark/scenarios/synthetic_efficiency_scenario.py +110 -0
  554. medhelm-0.5.13/src/helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +407 -0
  555. medhelm-0.5.13/src/helm/benchmark/scenarios/synthetic_reasoning_scenario.py +220 -0
  556. medhelm-0.5.13/src/helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  557. medhelm-0.5.13/src/helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  558. medhelm-0.5.13/src/helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  559. medhelm-0.5.13/src/helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  560. medhelm-0.5.13/src/helm/benchmark/scenarios/test_aratrust_scenario.py +19 -0
  561. medhelm-0.5.13/src/helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  562. medhelm-0.5.13/src/helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  563. medhelm-0.5.13/src/helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  564. medhelm-0.5.13/src/helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  565. medhelm-0.5.13/src/helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  566. medhelm-0.5.13/src/helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  567. medhelm-0.5.13/src/helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  568. medhelm-0.5.13/src/helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  569. medhelm-0.5.13/src/helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  570. medhelm-0.5.13/src/helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  571. medhelm-0.5.13/src/helm/benchmark/scenarios/test_grammar.py +50 -0
  572. medhelm-0.5.13/src/helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  573. medhelm-0.5.13/src/helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  574. medhelm-0.5.13/src/helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  575. medhelm-0.5.13/src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  576. medhelm-0.5.13/src/helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  577. medhelm-0.5.13/src/helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  578. medhelm-0.5.13/src/helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  579. medhelm-0.5.13/src/helm/benchmark/scenarios/test_math_scenario.py +17 -0
  580. medhelm-0.5.13/src/helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  581. medhelm-0.5.13/src/helm/benchmark/scenarios/test_medcalc_bench_scenario.py +44 -0
  582. medhelm-0.5.13/src/helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  583. medhelm-0.5.13/src/helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  584. medhelm-0.5.13/src/helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  585. medhelm-0.5.13/src/helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  586. medhelm-0.5.13/src/helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  587. medhelm-0.5.13/src/helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  588. medhelm-0.5.13/src/helm/benchmark/scenarios/test_scenario.py +58 -0
  589. medhelm-0.5.13/src/helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  590. medhelm-0.5.13/src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  591. medhelm-0.5.13/src/helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  592. medhelm-0.5.13/src/helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  593. medhelm-0.5.13/src/helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  594. medhelm-0.5.13/src/helm/benchmark/scenarios/the_pile_scenario.py +159 -0
  595. medhelm-0.5.13/src/helm/benchmark/scenarios/truthful_qa_scenario.py +170 -0
  596. medhelm-0.5.13/src/helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  597. medhelm-0.5.13/src/helm/benchmark/scenarios/twitter_aae_scenario.py +77 -0
  598. medhelm-0.5.13/src/helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  599. medhelm-0.5.13/src/helm/benchmark/scenarios/verifiability_judgment_scenario.py +152 -0
  600. medhelm-0.5.13/src/helm/benchmark/scenarios/vicuna_scenario.py +69 -0
  601. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/__init__.py +0 -0
  602. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  603. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  604. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  605. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  606. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  607. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  608. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  609. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  610. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  611. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  612. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  613. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  614. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  615. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  616. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  617. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  618. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  619. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  620. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  621. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  622. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  623. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  624. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  625. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  626. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  627. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  628. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  629. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  630. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  631. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  632. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  633. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  634. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  635. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  636. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  637. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  638. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  639. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  640. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  641. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  642. medhelm-0.5.13/src/helm/benchmark/scenarios/vision_language/vqa_scenario.py +127 -0
  643. medhelm-0.5.13/src/helm/benchmark/scenarios/wikifact_scenario.py +205 -0
  644. medhelm-0.5.13/src/helm/benchmark/scenarios/wikitext_103_scenario.py +87 -0
  645. medhelm-0.5.13/src/helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  646. medhelm-0.5.13/src/helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  647. medhelm-0.5.13/src/helm/benchmark/scenarios/wmt_14_scenario.py +127 -0
  648. medhelm-0.5.13/src/helm/benchmark/scenarios/xstest_scenario.py +35 -0
  649. medhelm-0.5.13/src/helm/benchmark/server.py +264 -0
  650. medhelm-0.5.13/src/helm/benchmark/slurm_jobs.py +101 -0
  651. medhelm-0.5.13/src/helm/benchmark/slurm_runner.py +370 -0
  652. medhelm-0.5.13/src/helm/benchmark/static/contamination.yaml +95 -0
  653. medhelm-0.5.13/src/helm/benchmark/static/schema_air_bench.yaml +3149 -0
  654. medhelm-0.5.13/src/helm/benchmark/static/schema_arabic.yaml +271 -0
  655. medhelm-0.5.13/src/helm/benchmark/static/schema_audio.yaml +763 -0
  656. medhelm-0.5.13/src/helm/benchmark/static/schema_autobencher.yaml +150 -0
  657. medhelm-0.5.13/src/helm/benchmark/static/schema_capabilities.yaml +254 -0
  658. medhelm-0.5.13/src/helm/benchmark/static/schema_classic.yaml +2994 -0
  659. medhelm-0.5.13/src/helm/benchmark/static/schema_cleva.yaml +768 -0
  660. medhelm-0.5.13/src/helm/benchmark/static/schema_czech_bank.yaml +148 -0
  661. medhelm-0.5.13/src/helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  662. medhelm-0.5.13/src/helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  663. medhelm-0.5.13/src/helm/benchmark/static/schema_enterprise.yaml +319 -0
  664. medhelm-0.5.13/src/helm/benchmark/static/schema_ewok.yaml +367 -0
  665. medhelm-0.5.13/src/helm/benchmark/static/schema_finance.yaml +191 -0
  666. medhelm-0.5.13/src/helm/benchmark/static/schema_heim.yaml +1389 -0
  667. medhelm-0.5.13/src/helm/benchmark/static/schema_image2struct.yaml +588 -0
  668. medhelm-0.5.13/src/helm/benchmark/static/schema_instruction_following.yaml +161 -0
  669. medhelm-0.5.13/src/helm/benchmark/static/schema_legal.yaml +566 -0
  670. medhelm-0.5.13/src/helm/benchmark/static/schema_lite.yaml +766 -0
  671. medhelm-0.5.13/src/helm/benchmark/static/schema_long_context.yaml +282 -0
  672. medhelm-0.5.13/src/helm/benchmark/static/schema_medhelm.yaml +1176 -0
  673. medhelm-0.5.13/src/helm/benchmark/static/schema_melt.yaml +1257 -0
  674. medhelm-0.5.13/src/helm/benchmark/static/schema_mmlu.yaml +1449 -0
  675. medhelm-0.5.13/src/helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1046 -0
  676. medhelm-0.5.13/src/helm/benchmark/static/schema_safety.yaml +283 -0
  677. medhelm-0.5.13/src/helm/benchmark/static/schema_seahelm.yaml +723 -0
  678. medhelm-0.5.13/src/helm/benchmark/static/schema_slp.yaml +219 -0
  679. medhelm-0.5.13/src/helm/benchmark/static/schema_slphelm.yaml +162 -0
  680. medhelm-0.5.13/src/helm/benchmark/static/schema_social_audio.yaml +224 -0
  681. medhelm-0.5.13/src/helm/benchmark/static/schema_sql.yaml +171 -0
  682. medhelm-0.5.13/src/helm/benchmark/static/schema_thai.yaml +243 -0
  683. medhelm-0.5.13/src/helm/benchmark/static/schema_torr.yaml +474 -0
  684. medhelm-0.5.13/src/helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  685. medhelm-0.5.13/src/helm/benchmark/static/schema_unitxt.yaml +370 -0
  686. medhelm-0.5.13/src/helm/benchmark/static/schema_vhelm.yaml +933 -0
  687. medhelm-0.5.13/src/helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  688. medhelm-0.5.13/src/helm/benchmark/static/schema_video.yaml +219 -0
  689. medhelm-0.5.13/src/helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  690. medhelm-0.5.13/src/helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  691. medhelm-0.5.13/src/helm/benchmark/static_build/assets/crfm-logo-Du4T1uWZ.png +0 -0
  692. medhelm-0.5.13/src/helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  693. medhelm-0.5.13/src/helm/benchmark/static_build/assets/helm-logo-simple-DzOhNN41.png +0 -0
  694. medhelm-0.5.13/src/helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  695. medhelm-0.5.13/src/helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  696. medhelm-0.5.13/src/helm/benchmark/static_build/assets/index-1l-PnIC-.js +2 -0
  697. medhelm-0.5.13/src/helm/benchmark/static_build/assets/index-D30r2Sub.css +1 -0
  698. medhelm-0.5.13/src/helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  699. medhelm-0.5.13/src/helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  700. medhelm-0.5.13/src/helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  701. medhelm-0.5.13/src/helm/benchmark/static_build/assets/overview-CVXNopt8.png +0 -0
  702. medhelm-0.5.13/src/helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  703. medhelm-0.5.13/src/helm/benchmark/static_build/assets/react-BhnNyHuP.js +59 -0
  704. medhelm-0.5.13/src/helm/benchmark/static_build/assets/react-markdown-kIsf0iIR.js +27 -0
  705. medhelm-0.5.13/src/helm/benchmark/static_build/assets/recharts-Bmm96ixf.js +73 -0
  706. medhelm-0.5.13/src/helm/benchmark/static_build/assets/tremor-DyW3D1Ox.js +33 -0
  707. medhelm-0.5.13/src/helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  708. medhelm-0.5.13/src/helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  709. medhelm-0.5.13/src/helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  710. medhelm-0.5.13/src/helm/benchmark/static_build/config.js +4 -0
  711. medhelm-0.5.13/src/helm/benchmark/static_build/index.html +20 -0
  712. medhelm-0.5.13/src/helm/benchmark/test_data_preprocessor.py +46 -0
  713. medhelm-0.5.13/src/helm/benchmark/test_plugins.py +223 -0
  714. medhelm-0.5.13/src/helm/benchmark/test_run_expander.py +29 -0
  715. medhelm-0.5.13/src/helm/benchmark/tokenizer_config_registry.py +85 -0
  716. medhelm-0.5.13/src/helm/benchmark/window_services/__init__.py +0 -0
  717. medhelm-0.5.13/src/helm/benchmark/window_services/default_window_service.py +6 -0
  718. medhelm-0.5.13/src/helm/benchmark/window_services/encoder_decoder_window_service.py +44 -0
  719. medhelm-0.5.13/src/helm/benchmark/window_services/ice_window_service.py +20 -0
  720. medhelm-0.5.13/src/helm/benchmark/window_services/image_generation/__init__.py +0 -0
  721. medhelm-0.5.13/src/helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  722. medhelm-0.5.13/src/helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  723. medhelm-0.5.13/src/helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  724. medhelm-0.5.13/src/helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  725. medhelm-0.5.13/src/helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  726. medhelm-0.5.13/src/helm/benchmark/window_services/local_window_service.py +116 -0
  727. medhelm-0.5.13/src/helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  728. medhelm-0.5.13/src/helm/benchmark/window_services/no_tokenizer_window_service.py +54 -0
  729. medhelm-0.5.13/src/helm/benchmark/window_services/test_anthropic_window_service.py +164 -0
  730. medhelm-0.5.13/src/helm/benchmark/window_services/test_bloom_window_service.py +159 -0
  731. medhelm-0.5.13/src/helm/benchmark/window_services/test_flan_t5_window_service.py +13 -0
  732. medhelm-0.5.13/src/helm/benchmark/window_services/test_gpt2_window_service.py +62 -0
  733. medhelm-0.5.13/src/helm/benchmark/window_services/test_gpt4_window_service.py +31 -0
  734. medhelm-0.5.13/src/helm/benchmark/window_services/test_gptj_window_service.py +55 -0
  735. medhelm-0.5.13/src/helm/benchmark/window_services/test_gptneox_window_service.py +159 -0
  736. medhelm-0.5.13/src/helm/benchmark/window_services/test_openai_window_service.py +54 -0
  737. medhelm-0.5.13/src/helm/benchmark/window_services/test_opt_window_service.py +152 -0
  738. medhelm-0.5.13/src/helm/benchmark/window_services/test_palmyra_window_service.py +161 -0
  739. medhelm-0.5.13/src/helm/benchmark/window_services/test_t0pp_window_service.py +166 -0
  740. medhelm-0.5.13/src/helm/benchmark/window_services/test_t511b_window_service.py +166 -0
  741. medhelm-0.5.13/src/helm/benchmark/window_services/test_ul2_window_service.py +166 -0
  742. medhelm-0.5.13/src/helm/benchmark/window_services/test_utils.py +232 -0
  743. medhelm-0.5.13/src/helm/benchmark/window_services/test_yalm_window_service.py +152 -0
  744. medhelm-0.5.13/src/helm/benchmark/window_services/tokenizer_service.py +25 -0
  745. medhelm-0.5.13/src/helm/benchmark/window_services/window_service.py +154 -0
  746. medhelm-0.5.13/src/helm/benchmark/window_services/window_service_factory.py +71 -0
  747. medhelm-0.5.13/src/helm/benchmark/window_services/yalm_window_service.py +20 -0
  748. medhelm-0.5.13/src/helm/clients/__init__.py +0 -0
  749. medhelm-0.5.13/src/helm/clients/ai21_client.py +200 -0
  750. medhelm-0.5.13/src/helm/clients/ai21_utils.py +17 -0
  751. medhelm-0.5.13/src/helm/clients/aleph_alpha_client.py +114 -0
  752. medhelm-0.5.13/src/helm/clients/anthropic_client.py +775 -0
  753. medhelm-0.5.13/src/helm/clients/audio_language/__init__.py +0 -0
  754. medhelm-0.5.13/src/helm/clients/audio_language/diva_llama_client.py +122 -0
  755. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/arguments.py +61 -0
  756. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/constants.py +9 -0
  757. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/conversation.py +213 -0
  758. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  759. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  760. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  761. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  762. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  763. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  764. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  765. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  766. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  767. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  768. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  769. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  770. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  771. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni/utils.py +202 -0
  772. medhelm-0.5.13/src/helm/clients/audio_language/llama_omni_client.py +199 -0
  773. medhelm-0.5.13/src/helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  774. medhelm-0.5.13/src/helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  775. medhelm-0.5.13/src/helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  776. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  777. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  778. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  779. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  780. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  781. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  782. medhelm-0.5.13/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  783. medhelm-0.5.13/src/helm/clients/audio_language/test.py +62 -0
  784. medhelm-0.5.13/src/helm/clients/auto_client.py +228 -0
  785. medhelm-0.5.13/src/helm/clients/azure_openai_client.py +55 -0
  786. medhelm-0.5.13/src/helm/clients/bedrock_client.py +381 -0
  787. medhelm-0.5.13/src/helm/clients/bedrock_utils.py +105 -0
  788. medhelm-0.5.13/src/helm/clients/client.py +222 -0
  789. medhelm-0.5.13/src/helm/clients/clip_score_client.py +49 -0
  790. medhelm-0.5.13/src/helm/clients/clip_scorers/__init__.py +0 -0
  791. medhelm-0.5.13/src/helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  792. medhelm-0.5.13/src/helm/clients/clip_scorers/clip_scorer.py +50 -0
  793. medhelm-0.5.13/src/helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  794. medhelm-0.5.13/src/helm/clients/cohere_client.py +252 -0
  795. medhelm-0.5.13/src/helm/clients/cohere_utils.py +12 -0
  796. medhelm-0.5.13/src/helm/clients/dspy_client.py +135 -0
  797. medhelm-0.5.13/src/helm/clients/gcs_client.py +82 -0
  798. medhelm-0.5.13/src/helm/clients/google_client.py +78 -0
  799. medhelm-0.5.13/src/helm/clients/google_genai_client.py +281 -0
  800. medhelm-0.5.13/src/helm/clients/google_translate_client.py +35 -0
  801. medhelm-0.5.13/src/helm/clients/grok_client.py +36 -0
  802. medhelm-0.5.13/src/helm/clients/http_model_client.py +82 -0
  803. medhelm-0.5.13/src/helm/clients/huggingface_client.py +385 -0
  804. medhelm-0.5.13/src/helm/clients/huggingface_inference_providers_client.py +121 -0
  805. medhelm-0.5.13/src/helm/clients/huggingface_pipeline_client.py +140 -0
  806. medhelm-0.5.13/src/helm/clients/ibm_client.py +269 -0
  807. medhelm-0.5.13/src/helm/clients/image_generation/__init__.py +0 -0
  808. medhelm-0.5.13/src/helm/clients/image_generation/adobe_vision_client.py +80 -0
  809. medhelm-0.5.13/src/helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  810. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/__init__.py +0 -0
  811. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  812. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  813. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  814. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  815. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  816. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  817. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  818. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  819. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  820. medhelm-0.5.13/src/helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  821. medhelm-0.5.13/src/helm/clients/image_generation/cogview2_client.py +192 -0
  822. medhelm-0.5.13/src/helm/clients/image_generation/dalle2_client.py +194 -0
  823. medhelm-0.5.13/src/helm/clients/image_generation/dalle3_client.py +108 -0
  824. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  825. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/data.py +442 -0
  826. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  827. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  828. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  829. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  830. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  831. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  832. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  833. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  834. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  835. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  836. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  837. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  838. medhelm-0.5.13/src/helm/clients/image_generation/dalle_mini_client.py +191 -0
  839. medhelm-0.5.13/src/helm/clients/image_generation/deep_floyd_client.py +80 -0
  840. medhelm-0.5.13/src/helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  841. medhelm-0.5.13/src/helm/clients/image_generation/image_generation_client_utils.py +9 -0
  842. medhelm-0.5.13/src/helm/clients/image_generation/lexica_client.py +88 -0
  843. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/__init__.py +0 -0
  844. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  845. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  846. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  847. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  848. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  849. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  850. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  851. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  852. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  853. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/utils/config.py +129 -0
  854. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  855. medhelm-0.5.13/src/helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  856. medhelm-0.5.13/src/helm/clients/image_generation/mindalle_client.py +116 -0
  857. medhelm-0.5.13/src/helm/clients/image_generation/nudity_check_client.py +64 -0
  858. medhelm-0.5.13/src/helm/clients/image_generation/together_image_generation_client.py +113 -0
  859. medhelm-0.5.13/src/helm/clients/lit_gpt_client.py +169 -0
  860. medhelm-0.5.13/src/helm/clients/lit_gpt_generate.py +78 -0
  861. medhelm-0.5.13/src/helm/clients/litellm_client.py +140 -0
  862. medhelm-0.5.13/src/helm/clients/megatron_client.py +114 -0
  863. medhelm-0.5.13/src/helm/clients/mistral_client.py +190 -0
  864. medhelm-0.5.13/src/helm/clients/moderation_api_client.py +111 -0
  865. medhelm-0.5.13/src/helm/clients/nvidia_nim_client.py +32 -0
  866. medhelm-0.5.13/src/helm/clients/open_lm_client.py +43 -0
  867. medhelm-0.5.13/src/helm/clients/openai_client.py +604 -0
  868. medhelm-0.5.13/src/helm/clients/openai_responses_client.py +211 -0
  869. medhelm-0.5.13/src/helm/clients/openrouter_client.py +31 -0
  870. medhelm-0.5.13/src/helm/clients/palmyra_client.py +163 -0
  871. medhelm-0.5.13/src/helm/clients/perspective_api_client.py +148 -0
  872. medhelm-0.5.13/src/helm/clients/reka_client.py +190 -0
  873. medhelm-0.5.13/src/helm/clients/simple_client.py +64 -0
  874. medhelm-0.5.13/src/helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  875. medhelm-0.5.13/src/helm/clients/stanfordhealthcare_claude_client.py +31 -0
  876. medhelm-0.5.13/src/helm/clients/stanfordhealthcare_google_client.py +43 -0
  877. medhelm-0.5.13/src/helm/clients/stanfordhealthcare_http_model_client.py +103 -0
  878. medhelm-0.5.13/src/helm/clients/stanfordhealthcare_openai_client.py +62 -0
  879. medhelm-0.5.13/src/helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  880. medhelm-0.5.13/src/helm/clients/test_auto_client.py +76 -0
  881. medhelm-0.5.13/src/helm/clients/test_client.py +98 -0
  882. medhelm-0.5.13/src/helm/clients/test_huggingface_client.py +86 -0
  883. medhelm-0.5.13/src/helm/clients/test_openrouter_client.py +69 -0
  884. medhelm-0.5.13/src/helm/clients/test_simple_client.py +19 -0
  885. medhelm-0.5.13/src/helm/clients/test_together_client.py +184 -0
  886. medhelm-0.5.13/src/helm/clients/together_client.py +601 -0
  887. medhelm-0.5.13/src/helm/clients/toxicity_classifier_client.py +12 -0
  888. medhelm-0.5.13/src/helm/clients/upstage_client.py +23 -0
  889. medhelm-0.5.13/src/helm/clients/vertexai_client.py +488 -0
  890. medhelm-0.5.13/src/helm/clients/vision_language/__init__.py +0 -0
  891. medhelm-0.5.13/src/helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  892. medhelm-0.5.13/src/helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  893. medhelm-0.5.13/src/helm/clients/vision_language/idefics_client.py +168 -0
  894. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  895. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  896. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  897. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  898. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  899. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  900. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  901. medhelm-0.5.13/src/helm/clients/vision_language/open_flamingo_client.py +155 -0
  902. medhelm-0.5.13/src/helm/clients/vision_language/paligemma_client.py +147 -0
  903. medhelm-0.5.13/src/helm/clients/vision_language/palmyra_vision_client.py +101 -0
  904. medhelm-0.5.13/src/helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  905. medhelm-0.5.13/src/helm/clients/vision_language/qwen_vlm_client.py +174 -0
  906. medhelm-0.5.13/src/helm/clients/vllm_client.py +80 -0
  907. medhelm-0.5.13/src/helm/clients/vllm_granite_thinking_client.py +56 -0
  908. medhelm-0.5.13/src/helm/clients/writer_client.py +105 -0
  909. medhelm-0.5.13/src/helm/clients/yi_client.py +28 -0
  910. medhelm-0.5.13/src/helm/common/__init__.py +0 -0
  911. medhelm-0.5.13/src/helm/common/audio_utils.py +111 -0
  912. medhelm-0.5.13/src/helm/common/authentication.py +8 -0
  913. medhelm-0.5.13/src/helm/common/cache.py +201 -0
  914. medhelm-0.5.13/src/helm/common/cache_backend_config.py +47 -0
  915. medhelm-0.5.13/src/helm/common/clip_score_request.py +41 -0
  916. medhelm-0.5.13/src/helm/common/codec.py +163 -0
  917. medhelm-0.5.13/src/helm/common/concurrency.py +32 -0
  918. medhelm-0.5.13/src/helm/common/context.py +80 -0
  919. medhelm-0.5.13/src/helm/common/credentials_utils.py +28 -0
  920. medhelm-0.5.13/src/helm/common/critique_request.py +99 -0
  921. medhelm-0.5.13/src/helm/common/file_caches/__init__.py +0 -0
  922. medhelm-0.5.13/src/helm/common/file_caches/file_cache.py +16 -0
  923. medhelm-0.5.13/src/helm/common/file_caches/local_file_cache.py +61 -0
  924. medhelm-0.5.13/src/helm/common/file_caches/test_local_file_cache.py +25 -0
  925. medhelm-0.5.13/src/helm/common/file_upload_request.py +27 -0
  926. medhelm-0.5.13/src/helm/common/general.py +358 -0
  927. medhelm-0.5.13/src/helm/common/gpu_utils.py +18 -0
  928. medhelm-0.5.13/src/helm/common/hierarchical_logger.py +218 -0
  929. medhelm-0.5.13/src/helm/common/image_generation_parameters.py +25 -0
  930. medhelm-0.5.13/src/helm/common/images_utils.py +102 -0
  931. medhelm-0.5.13/src/helm/common/key_value_store.py +113 -0
  932. medhelm-0.5.13/src/helm/common/local_context.py +140 -0
  933. medhelm-0.5.13/src/helm/common/media_object.py +137 -0
  934. medhelm-0.5.13/src/helm/common/moderations_api_request.py +71 -0
  935. medhelm-0.5.13/src/helm/common/mongo_key_value_store.py +89 -0
  936. medhelm-0.5.13/src/helm/common/multimodal_request_utils.py +57 -0
  937. medhelm-0.5.13/src/helm/common/nudity_check_request.py +29 -0
  938. medhelm-0.5.13/src/helm/common/object_spec.py +136 -0
  939. medhelm-0.5.13/src/helm/common/optional_dependencies.py +16 -0
  940. medhelm-0.5.13/src/helm/common/perspective_api_request.py +84 -0
  941. medhelm-0.5.13/src/helm/common/reeval_parameters.py +12 -0
  942. medhelm-0.5.13/src/helm/common/remote_context.py +61 -0
  943. medhelm-0.5.13/src/helm/common/request.py +267 -0
  944. medhelm-0.5.13/src/helm/common/response_format.py +18 -0
  945. medhelm-0.5.13/src/helm/common/test_cache.py +116 -0
  946. medhelm-0.5.13/src/helm/common/test_codec.py +144 -0
  947. medhelm-0.5.13/src/helm/common/test_general.py +70 -0
  948. medhelm-0.5.13/src/helm/common/test_logging.py +94 -0
  949. medhelm-0.5.13/src/helm/common/test_media_object.py +37 -0
  950. medhelm-0.5.13/src/helm/common/tokenization_request.py +133 -0
  951. medhelm-0.5.13/src/helm/config/__init__.py +0 -0
  952. medhelm-0.5.13/src/helm/config/model_deployments.yaml +5352 -0
  953. medhelm-0.5.13/src/helm/config/model_metadata.yaml +5476 -0
  954. medhelm-0.5.13/src/helm/config/tokenizer_configs.yaml +1366 -0
  955. medhelm-0.5.13/src/helm/proxy/__init__.py +0 -0
  956. medhelm-0.5.13/src/helm/proxy/accounts.py +394 -0
  957. medhelm-0.5.13/src/helm/proxy/cli.py +214 -0
  958. medhelm-0.5.13/src/helm/proxy/critique/__init__.py +0 -0
  959. medhelm-0.5.13/src/helm/proxy/critique/critique_client.py +42 -0
  960. medhelm-0.5.13/src/helm/proxy/critique/mechanical_turk_critique_client.py +13 -0
  961. medhelm-0.5.13/src/helm/proxy/critique/mechanical_turk_critique_exporter.py +205 -0
  962. medhelm-0.5.13/src/helm/proxy/critique/mechanical_turk_critique_importer.py +128 -0
  963. medhelm-0.5.13/src/helm/proxy/critique/mechanical_turk_utils.py +45 -0
  964. medhelm-0.5.13/src/helm/proxy/critique/model_critique_client.py +255 -0
  965. medhelm-0.5.13/src/helm/proxy/critique/scale_critique_client.py +351 -0
  966. medhelm-0.5.13/src/helm/proxy/critique/surge_ai_critique_client.py +181 -0
  967. medhelm-0.5.13/src/helm/proxy/example_queries.py +168 -0
  968. medhelm-0.5.13/src/helm/proxy/query.py +23 -0
  969. medhelm-0.5.13/src/helm/proxy/retry.py +98 -0
  970. medhelm-0.5.13/src/helm/proxy/server.py +312 -0
  971. medhelm-0.5.13/src/helm/proxy/services/__init__.py +0 -0
  972. medhelm-0.5.13/src/helm/proxy/services/remote_service.py +193 -0
  973. medhelm-0.5.13/src/helm/proxy/services/server_service.py +172 -0
  974. medhelm-0.5.13/src/helm/proxy/services/service.py +175 -0
  975. medhelm-0.5.13/src/helm/proxy/services/test_remote_service.py +170 -0
  976. medhelm-0.5.13/src/helm/proxy/services/test_service.py +243 -0
  977. medhelm-0.5.13/src/helm/proxy/static/general.js +122 -0
  978. medhelm-0.5.13/src/helm/proxy/static/help.html +99 -0
  979. medhelm-0.5.13/src/helm/proxy/static/index.css +62 -0
  980. medhelm-0.5.13/src/helm/proxy/static/index.html +40 -0
  981. medhelm-0.5.13/src/helm/proxy/static/index.js +462 -0
  982. medhelm-0.5.13/src/helm/proxy/static/info-icon.png +0 -0
  983. medhelm-0.5.13/src/helm/proxy/test_accounts.py +32 -0
  984. medhelm-0.5.13/src/helm/proxy/test_retry.py +35 -0
  985. medhelm-0.5.13/src/helm/proxy/token_counters/__init__.py +0 -0
  986. medhelm-0.5.13/src/helm/proxy/token_counters/auto_token_counter.py +42 -0
  987. medhelm-0.5.13/src/helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  988. medhelm-0.5.13/src/helm/proxy/token_counters/token_counter.py +13 -0
  989. medhelm-0.5.13/src/helm/py.typed +0 -0
  990. medhelm-0.5.13/src/helm/tokenizers/__init__.py +0 -0
  991. medhelm-0.5.13/src/helm/tokenizers/ai21_tokenizer.py +52 -0
  992. medhelm-0.5.13/src/helm/tokenizers/aleph_alpha_tokenizer.py +87 -0
  993. medhelm-0.5.13/src/helm/tokenizers/auto_tokenizer.py +93 -0
  994. medhelm-0.5.13/src/helm/tokenizers/caching_tokenizer.py +155 -0
  995. medhelm-0.5.13/src/helm/tokenizers/cohere_tokenizer.py +54 -0
  996. medhelm-0.5.13/src/helm/tokenizers/grok_tokenizer.py +55 -0
  997. medhelm-0.5.13/src/helm/tokenizers/http_model_tokenizer.py +90 -0
  998. medhelm-0.5.13/src/helm/tokenizers/huggingface_tokenizer.py +169 -0
  999. medhelm-0.5.13/src/helm/tokenizers/lit_gpt_tokenizer.py +39 -0
  1000. medhelm-0.5.13/src/helm/tokenizers/simple_tokenizer.py +33 -0
  1001. medhelm-0.5.13/src/helm/tokenizers/test_ai21_tokenizer.py +48 -0
  1002. medhelm-0.5.13/src/helm/tokenizers/test_anthropic_tokenizer.py +86 -0
  1003. medhelm-0.5.13/src/helm/tokenizers/test_cohere_tokenizer.py +39 -0
  1004. medhelm-0.5.13/src/helm/tokenizers/test_grok_tokenizer.py +33 -0
  1005. medhelm-0.5.13/src/helm/tokenizers/test_huggingface_tokenizer.py +140 -0
  1006. medhelm-0.5.13/src/helm/tokenizers/test_simple_tokenizer.py +33 -0
  1007. medhelm-0.5.13/src/helm/tokenizers/test_yalm_tokenizer.py +55 -0
  1008. medhelm-0.5.13/src/helm/tokenizers/tiktoken_tokenizer.py +28 -0
  1009. medhelm-0.5.13/src/helm/tokenizers/tokenizer.py +55 -0
  1010. medhelm-0.5.13/src/helm/tokenizers/vertexai_tokenizer.py +97 -0
  1011. medhelm-0.5.13/src/helm/tokenizers/yalm_tokenizer.py +33 -0
  1012. medhelm-0.5.13/src/helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  1013. medhelm-0.5.13/src/helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +25 -0
  1014. medhelm-0.5.13/src/helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  1015. medhelm-0.5.13/src/helm/tokenizers/yalm_tokenizer_data/yalm_tokenizer.py +204 -0
  1016. medhelm-0.5.13/src/medhelm.egg-info/PKG-INFO +417 -0
  1017. medhelm-0.5.13/src/medhelm.egg-info/SOURCES.txt +1020 -0
  1018. medhelm-0.5.13/src/medhelm.egg-info/dependency_links.txt +1 -0
  1019. medhelm-0.5.13/src/medhelm.egg-info/entry_points.txt +8 -0
  1020. medhelm-0.5.13/src/medhelm.egg-info/not-zip-safe +1 -0
  1021. medhelm-0.5.13/src/medhelm.egg-info/requires.txt +291 -0
  1022. medhelm-0.5.13/src/medhelm.egg-info/top_level.txt +1 -0
medhelm-0.5.13/LICENSE ADDED
@@ -0,0 +1,28 @@
1
+ Copyright 2026 © Pacific AI Inc.
2
+
3
+ This Software ("Software" or "Product") including code, design, documentation, configuration, models, tests, and related assets is owned by Pacific AI Inc. All rights reserved.
4
+
5
+ Pacific AI Inc. ("we") is the only owner of the copyright for this Software.
6
+
7
+ Unless otherwise specified in a separate Software License Agreement, Services Agreement, or End User License Agreement that you have executed directly with Pacific AI Inc.:
8
+
9
+ * You are NOT granted any license or right to use the Software in any way.
10
+ * You are NOT granted any license or right to retain a copy of this Software.
11
+ * You are NOT granted any license or right to change, modify, adapt, or translate the Software.
12
+ * You are NOT granted any license or right to sell, assign, rent, exchange, lend, lease, sublease, or redistribute the Software.
13
+ * You are NOT granted any license or rights to bundle, repackage, or include the Software with any software in any way.
14
+ * The Software is Confidential and Proprietary. You are NOT allowed to distribute copies of the Software to others by any means whatsoever.
15
+ * The Software does NOT come with any warranty, express or implied.
16
+ * It is NOT legal to create derivative works based on the Software.
17
+ * It is NOT legal to claim any title in the Software or any of its derivatives.
18
+ * It is NOT legal to reverse engineer, disassemble or decompile the Software.
19
+ * It is NOT legal to make or retain a copy of the Software.
20
+ * We have no liability whatsoever for use of the Software.
21
+ * You may not make any public statements about this Software or Pacific AI without explicit written permission from Pacific AI.
22
+ * You must retain a copy of this notice without changes along with every copy of the Software, even if you have a license for it.
23
+
24
+ Unless required by applicable law or agreed to in writing, Pacific AI provides the Software on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Software and assume any risks associated with Your exercise of permissions under this license.
25
+
26
+ In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall Pacific AI be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this license or out of the use or inability to use the Software (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if advised of the possibility of such damages.
27
+
28
+ Unless required by applicable law or agreed to in writing, Software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
medhelm-0.5.13/MANIFEST.in ADDED
@@ -0,0 +1,10 @@
1
+ recursive-include src/helm/ py.typed
2
+ recursive-include src/helm/tokenizers/ *.sp
3
+ recursive-include src/helm/benchmark/ *.json
4
+ recursive-include src/helm/benchmark/ *.yaml
5
+ recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
6
+ recursive-include src/helm/benchmark/static_build/ *.css *.html *.js *.png *.yaml
7
+ recursive-include src/helm/config/ *.yaml
8
+ recursive-include src/helm/benchmark/annotation/omni_math/ *.txt
9
+ recursive-include src/helm/benchmark/annotation/wildbench/ *.md
10
+ recursive-include src/helm/proxy/static/ *.css *.html *.js *.png
medhelm-0.5.13/PKG-INFO ADDED
@@ -0,0 +1,417 @@
1
+ Metadata-Version: 2.4
2
+ Name: medhelm
3
+ Version: 0.5.13
4
+ Summary: Holistic evaluation of language models for medical applications (HELM for medicine)
5
+ Author-email: Pacific AI <david@pacific.ai>
6
+ License: Apache License 2.0
7
+ Project-URL: Homepage, https://github.com/PacificAI/medhelm
8
+ Project-URL: Documentation, https://medhelm.org
9
+ Keywords: language,models,benchmarking,medical,healthcare,evaluation
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: cattrs~=22.2
17
+ Requires-Dist: colorlog~=6.9
18
+ Requires-Dist: dacite~=1.6
19
+ Requires-Dist: Mako~=1.2
20
+ Requires-Dist: numpy<3,>=1.26
21
+ Requires-Dist: pandas~=2.0
22
+ Requires-Dist: pyhocon~=0.3.59
23
+ Requires-Dist: ubelt~=1.3
24
+ Requires-Dist: retrying~=1.3
25
+ Requires-Dist: spacy~=3.5
26
+ Requires-Dist: tqdm~=4.64
27
+ Requires-Dist: zstandard~=0.18.0
28
+ Requires-Dist: sqlitedict<3.0,>=2.1.0
29
+ Requires-Dist: bottle~=0.12.23
30
+ Requires-Dist: datasets~=3.1
31
+ Requires-Dist: pyarrow>=11.0.0
32
+ Requires-Dist: pyarrow-hotfix~=0.6
33
+ Requires-Dist: nltk!=3.9.0,~=3.7
34
+ Requires-Dist: rouge-score~=0.1.2
35
+ Requires-Dist: scipy>=1.10
36
+ Requires-Dist: uncertainty-calibration~=0.1.4
37
+ Requires-Dist: scikit-learn>=1.1
38
+ Requires-Dist: transformers~=4.53
39
+ Requires-Dist: torch<3.0.0,>=1.13.1
40
+ Requires-Dist: torchvision<3.0.0,>=0.14.1
41
+ Provides-Extra: proxy-server
42
+ Requires-Dist: gunicorn>=20.1; extra == "proxy-server"
43
+ Provides-Extra: human-evaluation
44
+ Requires-Dist: scaleapi~=2.13; extra == "human-evaluation"
45
+ Requires-Dist: surge-api~=1.1; extra == "human-evaluation"
46
+ Provides-Extra: dspy
47
+ Requires-Dist: dspy~=3.0; extra == "dspy"
48
+ Provides-Extra: scenarios
49
+ Requires-Dist: gdown~=5.1; extra == "scenarios"
50
+ Requires-Dist: xlrd~=2.0; extra == "scenarios"
51
+ Provides-Extra: metrics
52
+ Requires-Dist: google-api-python-client~=2.64; extra == "metrics"
53
+ Requires-Dist: numba~=0.56; extra == "metrics"
54
+ Requires-Dist: sacrebleu~=2.2; extra == "metrics"
55
+ Requires-Dist: langdetect~=1.0; extra == "metrics"
56
+ Requires-Dist: immutabledict~=4.2; extra == "metrics"
57
+ Requires-Dist: gradio_client~=1.3; extra == "metrics"
58
+ Provides-Extra: ranking
59
+ Requires-Dist: pytrec_eval==0.5; extra == "ranking"
60
+ Provides-Extra: summarization
61
+ Requires-Dist: summ-eval~=0.892; extra == "summarization"
62
+ Requires-Dist: bert-score~=0.3; extra == "summarization"
63
+ Requires-Dist: rouge-score~=0.1.2; extra == "summarization"
64
+ Requires-Dist: nltk!=3.9.0,~=3.7; extra == "summarization"
65
+ Requires-Dist: sentencepiece~=0.2.0; extra == "summarization"
66
+ Requires-Dist: protobuf; extra == "summarization"
67
+ Provides-Extra: plots
68
+ Requires-Dist: colorcet~=3.0; extra == "plots"
69
+ Requires-Dist: matplotlib>=3.6.0; extra == "plots"
70
+ Requires-Dist: seaborn>=0.11.0; extra == "plots"
71
+ Provides-Extra: decodingtrust
72
+ Requires-Dist: fairlearn~=0.9.0; extra == "decodingtrust"
73
+ Provides-Extra: slurm
74
+ Requires-Dist: simple-slurm~=0.2.6; extra == "slurm"
75
+ Provides-Extra: cleva
76
+ Requires-Dist: unidecode~=1.3; extra == "cleva"
77
+ Requires-Dist: pypinyin~=0.49.0; extra == "cleva"
78
+ Requires-Dist: jieba~=0.42.1; extra == "cleva"
79
+ Requires-Dist: opencc~=1.1; extra == "cleva"
80
+ Requires-Dist: langdetect~=1.0; extra == "cleva"
81
+ Provides-Extra: images
82
+ Requires-Dist: medhelm[accelerate]; extra == "images"
83
+ Requires-Dist: pillow>=10.2; extra == "images"
84
+ Provides-Extra: mongo
85
+ Requires-Dist: pymongo~=4.2; extra == "mongo"
86
+ Provides-Extra: unitxt
87
+ Requires-Dist: evaluate~=0.4.1; extra == "unitxt"
88
+ Provides-Extra: seahelm
89
+ Requires-Dist: pythainlp==5.0.0; extra == "seahelm"
90
+ Requires-Dist: pyonmttok==1.37.0; extra == "seahelm"
91
+ Requires-Dist: sacrebleu~=2.2; extra == "seahelm"
92
+ Requires-Dist: python-crfsuite~=0.9.11; extra == "seahelm"
93
+ Provides-Extra: accelerate
94
+ Requires-Dist: accelerate~=0.25; extra == "accelerate"
95
+ Provides-Extra: aleph-alpha
96
+ Requires-Dist: aleph-alpha-client~=2.14; extra == "aleph-alpha"
97
+ Requires-Dist: tokenizers>=0.13.3; extra == "aleph-alpha"
98
+ Provides-Extra: allenai
99
+ Requires-Dist: ai2-olmo~=0.2; extra == "allenai"
100
+ Provides-Extra: amazon
101
+ Requires-Dist: boto3~=1.34; extra == "amazon"
102
+ Requires-Dist: awscli~=1.33; extra == "amazon"
103
+ Requires-Dist: botocore~=1.34; extra == "amazon"
104
+ Provides-Extra: anthropic
105
+ Requires-Dist: anthropic~=0.41; extra == "anthropic"
106
+ Requires-Dist: websocket-client~=1.3; extra == "anthropic"
107
+ Provides-Extra: cohere
108
+ Requires-Dist: cohere~=5.3; extra == "cohere"
109
+ Provides-Extra: writer
110
+ Requires-Dist: writerai~=4.0; extra == "writer"
111
+ Provides-Extra: mistral
112
+ Requires-Dist: mistralai~=1.1; extra == "mistral"
113
+ Provides-Extra: openai
114
+ Requires-Dist: openai~=2.8; extra == "openai"
115
+ Requires-Dist: tiktoken~=0.7; extra == "openai"
116
+ Requires-Dist: pydantic~=2.0; extra == "openai"
117
+ Provides-Extra: google
118
+ Requires-Dist: google-cloud-aiplatform~=1.48; extra == "google"
119
+ Requires-Dist: google-genai~=1.48; extra == "google"
120
+ Provides-Extra: together
121
+ Requires-Dist: together~=1.1; extra == "together"
122
+ Provides-Extra: yandex
123
+ Requires-Dist: sentencepiece~=0.2.0; extra == "yandex"
124
+ Provides-Extra: models
125
+ Requires-Dist: medhelm[ai21]; extra == "models"
126
+ Requires-Dist: medhelm[accelerate]; extra == "models"
127
+ Requires-Dist: medhelm[aleph-alpha]; extra == "models"
128
+ Requires-Dist: medhelm[allenai]; extra == "models"
129
+ Requires-Dist: medhelm[amazon]; extra == "models"
130
+ Requires-Dist: medhelm[anthropic]; extra == "models"
131
+ Requires-Dist: medhelm[cohere]; extra == "models"
132
+ Requires-Dist: medhelm[google]; extra == "models"
133
+ Requires-Dist: medhelm[mistral]; extra == "models"
134
+ Requires-Dist: medhelm[openai]; extra == "models"
135
+ Requires-Dist: medhelm[reka]; extra == "models"
136
+ Requires-Dist: medhelm[together]; extra == "models"
137
+ Requires-Dist: medhelm[yandex]; extra == "models"
138
+ Requires-Dist: medhelm[writer]; extra == "models"
139
+ Provides-Extra: reka
140
+ Requires-Dist: reka-api~=2.0; extra == "reka"
141
+ Provides-Extra: vlm
142
+ Requires-Dist: medhelm[openai]; extra == "vlm"
143
+ Requires-Dist: einops~=0.7.0; extra == "vlm"
144
+ Requires-Dist: einops-exts~=0.0.4; extra == "vlm"
145
+ Requires-Dist: open-clip-torch~=2.24; extra == "vlm"
146
+ Requires-Dist: torch~=2.1; extra == "vlm"
147
+ Requires-Dist: transformers_stream_generator~=0.0.4; extra == "vlm"
148
+ Requires-Dist: scipy~=1.10; extra == "vlm"
149
+ Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "vlm"
150
+ Requires-Dist: medhelm[reka]; extra == "vlm"
151
+ Requires-Dist: medhelm[images]; extra == "vlm"
152
+ Requires-Dist: medhelm[image2struct]; extra == "vlm"
153
+ Requires-Dist: pycocoevalcap~=1.2; extra == "vlm"
154
+ Requires-Dist: qwen-vl-utils~=0.0.8; extra == "vlm"
155
+ Provides-Extra: ibm-enterprise-scenarios
156
+ Requires-Dist: openpyxl~=3.1; extra == "ibm-enterprise-scenarios"
157
+ Provides-Extra: ibm
158
+ Requires-Dist: ibm-watsonx-ai~=1.2; extra == "ibm"
159
+ Provides-Extra: image2struct
160
+ Requires-Dist: medhelm[images]; extra == "image2struct"
161
+ Requires-Dist: latex~=0.7.0; extra == "image2struct"
162
+ Requires-Dist: pdf2image~=1.16; extra == "image2struct"
163
+ Requires-Dist: selenium~=4.17; extra == "image2struct"
164
+ Requires-Dist: html2text~=2024.2.26; extra == "image2struct"
165
+ Requires-Dist: opencv-python-headless<=4.11.0.86,>=4.7.0.68; extra == "image2struct"
166
+ Requires-Dist: lpips~=0.1.4; extra == "image2struct"
167
+ Requires-Dist: imagehash~=4.3; extra == "image2struct"
168
+ Provides-Extra: heim
169
+ Requires-Dist: gdown~=5.1; extra == "heim"
170
+ Requires-Dist: diffusers~=0.34.0; extra == "heim"
171
+ Requires-Dist: icetk~=0.0.4; extra == "heim"
172
+ Requires-Dist: jax~=0.6.2; python_version >= "3.10" and extra == "heim"
173
+ Requires-Dist: jax~=0.4.30; python_version < "3.10" and extra == "heim"
174
+ Requires-Dist: jaxlib~=0.6.2; python_version >= "3.10" and extra == "heim"
175
+ Requires-Dist: jaxlib~=0.4.30; python_version < "3.10" and extra == "heim"
176
+ Requires-Dist: medhelm[openai]; extra == "heim"
177
+ Requires-Dist: einops~=0.7.0; extra == "heim"
178
+ Requires-Dist: omegaconf~=2.3; extra == "heim"
179
+ Requires-Dist: pytorch-lightning~=2.0; extra == "heim"
180
+ Requires-Dist: flax~=0.10.7; python_version >= "3.10" and extra == "heim"
181
+ Requires-Dist: flax~=0.8.5; python_version < "3.10" and extra == "heim"
182
+ Requires-Dist: ftfy~=6.1; extra == "heim"
183
+ Requires-Dist: Unidecode~=1.3; extra == "heim"
184
+ Requires-Dist: wandb~=0.16; extra == "heim"
185
+ Requires-Dist: google-cloud-translate~=3.11; extra == "heim"
186
+ Requires-Dist: autokeras~=1.0; extra == "heim"
187
+ Requires-Dist: clip-anytorch~=2.5; extra == "heim"
188
+ Requires-Dist: google-cloud-storage~=2.9; extra == "heim"
189
+ Requires-Dist: lpips~=0.1.4; extra == "heim"
190
+ Requires-Dist: multilingual-clip~=1.0; extra == "heim"
191
+ Requires-Dist: NudeNet~=2.0; extra == "heim"
192
+ Requires-Dist: numpy>=1.26; extra == "heim"
193
+ Requires-Dist: opencv-python<4.8.2.0,>=4.7.0.68; python_version >= "3.10" and extra == "heim"
194
+ Requires-Dist: opencv-python-headless<=4.11.0.86,>=4.7.0.68; python_version < "3.10" and extra == "heim"
195
+ Requires-Dist: pytorch-fid~=0.3.0; extra == "heim"
196
+ Requires-Dist: tensorflow~=2.11; extra == "heim"
197
+ Requires-Dist: timm~=0.6.12; extra == "heim"
198
+ Requires-Dist: torch-fidelity~=0.3.0; extra == "heim"
199
+ Requires-Dist: torchmetrics~=0.11.1; extra == "heim"
200
+ Requires-Dist: scikit-image==0.*,>=0.22; extra == "heim"
201
+ Requires-Dist: medhelm[images]; extra == "heim"
202
+ Provides-Extra: medhelm
203
+ Requires-Dist: accelerate~=0.25; extra == "medhelm"
204
+ Requires-Dist: medhelm[openai]; extra == "medhelm"
205
+ Requires-Dist: medhelm[yandex]; extra == "medhelm"
206
+ Requires-Dist: medhelm[scenarios]; extra == "medhelm"
207
+ Requires-Dist: bert_score~=0.3.13; extra == "medhelm"
208
+ Requires-Dist: lxml~=5.3; extra == "medhelm"
209
+ Requires-Dist: openpyxl~=3.1; extra == "medhelm"
210
+ Requires-Dist: python-docx~=1.1; extra == "medhelm"
211
+ Provides-Extra: gated
212
+ Requires-Dist: gdown~=5.1; extra == "gated"
213
+ Provides-Extra: audiolm
214
+ Requires-Dist: medhelm[openai]; extra == "audiolm"
215
+ Requires-Dist: medhelm[google]; extra == "audiolm"
216
+ Requires-Dist: pydub~=0.25.1; extra == "audiolm"
217
+ Requires-Dist: ffmpeg-python~=0.2.0; extra == "audiolm"
218
+ Requires-Dist: soundfile~=0.12; extra == "audiolm"
219
+ Requires-Dist: librosa~=0.10; extra == "audiolm"
220
+ Requires-Dist: einops~=0.7.0; extra == "audiolm"
221
+ Requires-Dist: openai-whisper==20240930; extra == "audiolm"
222
+ Requires-Dist: transformers_stream_generator~=0.0.4; extra == "audiolm"
223
+ Requires-Dist: av~=14.3; extra == "audiolm"
224
+ Requires-Dist: scipy~=1.10; extra == "audiolm"
225
+ Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "audiolm"
226
+ Requires-Dist: flash-attn~=2.7; extra == "audiolm"
227
+ Requires-Dist: pycocoevalcap~=1.2; extra == "audiolm"
228
+ Requires-Dist: jiwer~=3.0; extra == "audiolm"
229
+ Requires-Dist: rapidfuzz~=3.10; extra == "audiolm"
230
+ Requires-Dist: jieba~=0.42.1; extra == "audiolm"
231
+ Provides-Extra: codeinsights
232
+ Requires-Dist: clang~=20.1; extra == "codeinsights"
233
+ Requires-Dist: Levenshtein~=0.27; extra == "codeinsights"
234
+ Provides-Extra: lmkt
235
+ Requires-Dist: sentence_transformers~=4.1; extra == "lmkt"
236
+ Provides-Extra: all
237
+ Requires-Dist: medhelm[proxy-server]; extra == "all"
238
+ Requires-Dist: medhelm[scenarios]; extra == "all"
239
+ Requires-Dist: medhelm[metrics]; extra == "all"
240
+ Requires-Dist: medhelm[plots]; extra == "all"
241
+ Requires-Dist: medhelm[decodingtrust]; extra == "all"
242
+ Requires-Dist: medhelm[slurm]; extra == "all"
243
+ Requires-Dist: medhelm[cleva]; extra == "all"
244
+ Requires-Dist: medhelm[images]; extra == "all"
245
+ Requires-Dist: medhelm[models]; extra == "all"
246
+ Requires-Dist: medhelm[mongo]; extra == "all"
247
+ Requires-Dist: medhelm[heim]; extra == "all"
248
+ Requires-Dist: medhelm[vlm]; extra == "all"
249
+ Requires-Dist: medhelm[codeinsights]; extra == "all"
250
+ Requires-Dist: medhelm[lmkt]; extra == "all"
251
+ Provides-Extra: ci
252
+ Requires-Dist: medhelm[metrics]; extra == "ci"
253
+ Requires-Dist: medhelm[openai]; extra == "ci"
254
+ Requires-Dist: medhelm[plots]; extra == "ci"
255
+ Requires-Dist: medhelm[together]; extra == "ci"
256
+ Requires-Dist: medhelm[yandex]; extra == "ci"
257
+ Requires-Dist: medhelm[cohere]; extra == "ci"
258
+ Requires-Dist: medhelm[proxy-server]; extra == "ci"
259
+ Provides-Extra: litellm
260
+ Requires-Dist: litellm>=1.80.0; extra == "litellm"
261
+ Dynamic: license-file
262
+
263
+ # Holistic Evaluation of Language Models (HELM)
264
+
265
+ [comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
266
+ <img src="https://github.com/stanford-crfm/helm/raw/v0.5.4/helm-frontend/src/assets/helm-logo.png" alt="HELM logo" width="480"/>
267
+
268
+ <a href="https://github.com/PacificAI/medhelm">
269
+ <img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/PacificAI/medhelm">
270
+ </a>
271
+ <a href="https://github.com/PacificAI/medhelm/blob/main/LICENSE">
272
+ <img alt="License" src="https://img.shields.io/github/license/PacificAI/medhelm?color=blue" />
273
+ </a>
274
+ <a href="https://pypi.org/project/medhelm/">
275
+ <img alt="PyPI" src="https://img.shields.io/pypi/v/medhelm?color=blue" />
276
+ </a>
277
+
278
+ **Holistic Evaluation of Language Models (HELM)** is an open source Python framework created by the [Center for Research on Foundation Models (CRFM) at Stanford](https://crfm.stanford.edu/) for holistic, reproducible and transparent evaluation of foundation models, including large language models (LLMs) and multimodal models. This framework includes the following features:
279
+
280
+ - Datasets and benchmarks in a standardized format (e.g. MMLU-Pro, GPQA, IFEval, WildBench)
281
+ - Models from various providers accessible through a unified interface (e.g. OpenAI models, Anthropic Claude, Google Gemini)
282
+ - Metrics for measuring various aspects beyond accuracy (e.g. efficiency, bias, toxicity)
283
+ - Web UI for inspecting individual prompts and responses
284
+ - Web leaderboard for comparing results across models and benchmarks
285
+
286
+ ## Documentation
287
+
288
+ Documentation: **[medhelm.org](https://medhelm.org)**
289
+
290
+ ## Install & run (MedHELM library)
291
+
292
+ MedHELM uses the HELM core engine and adds medical benchmarks. Install from PyPI:
293
+
294
+ ### Standard (recommended to start)
295
+
296
+ Scenarios: **PubMedQA**, **MedCalc-Bench**, **MedicationQA**, **MedHallu**.
297
+
298
+ ```sh
299
+ pip install medhelm
300
+ # or with uv:
301
+ uv pip install medhelm
302
+ ```
303
+
304
+ Run a benchmark:
305
+
306
+ ```sh
307
+ uv run medhelm-run --run-entries "pubmed_qa:model=huggingface/qwen2.5-7b" --suite my_med_test --max-eval-instances 10
308
+ uv run helm-summarize --suite my_med_test
309
+ uv run helm-server --suite my_med_test
310
+ ```
311
+
312
+ Then open http://localhost:8000/ in your browser.
313
+
314
+ ### Clinical NLP tier (`[summarization]`)
315
+
316
+ Adds heavy libraries (bert-score, rouge-score, nltk). **Install can take 2–3 minutes.**
317
+
318
+ Scenarios: **DischargeMe** (hospital course summaries), **ACI-Bench** (clinical transcripts), **Patient-Edu** (simplifying medical jargon).
319
+
320
+ ```sh
321
+ pip install "medhelm[summarization]"
322
+ # or: uv pip install "medhelm[summarization]"
323
+ ```
324
+
325
+ Example:
326
+
327
+ ```sh
328
+ uv run medhelm-run --run-entries "discharge_summaries:model=huggingface/qwen2.5-7b" --suite med_summaries --max-eval-instances 5
329
+ uv run helm-summarize --suite med_summaries
330
+ uv run helm-server --suite med_summaries
331
+ ```
332
+
333
+ ### Gated / licensing tier (`[gated]`)
334
+
335
+ Adds **gdown** for scenarios that download data from Google Drive. Installation may take longer than the standard tier.
336
+
337
+ Scenarios: **MedQA** (USMLE/Board exams), **MedMCQA** (AIIMS/NEET exams).
338
+
339
+ ```sh
340
+ pip install "medhelm[gated]"
341
+ # or: uv pip install "medhelm[gated]"
342
+ ```
343
+
344
+ Example:
345
+
346
+ ```sh
347
+ uv run medhelm-run --run-entries "med_qa:model=huggingface/qwen2.5-7b" --suite board_exams --max-eval-instances 10
348
+ uv run helm-summarize --suite board_exams
349
+ uv run helm-server --suite board_exams
350
+ ```
351
+
352
+ ### Classic HELM commands
353
+
354
+ You can still use `helm-run`, `helm-summarize`, and `helm-server`; `medhelm-run` is an alias for `helm-run`.
355
+
356
+ ```sh
357
+ helm-run --run-entries mmlu:subject=philosophy,model=openai/gpt2 --suite my-suite --max-eval-instances 10
358
+ helm-summarize --suite my-suite
359
+ helm-server --suite my-suite
360
+ ```
361
+
362
+ ## Quick Start (summary)
363
+
364
+ <!--quick-start-begin-->
365
+
366
+ | Tier | Install | Scenarios |
367
+ |------|--------|-----------|
368
+ | **Standard** | `pip install medhelm` or `uv pip install medhelm` | PubMedQA, MedCalc-Bench, MedicationQA, MedHallu |
369
+ | **Summarization** | `pip install "medhelm[summarization]"` | DischargeMe, ACI-Bench, Patient-Edu (2–3 min install) |
370
+ | **Gated** | `pip install "medhelm[gated]"` | MedQA, MedMCQA (Drive) |
371
+
372
+ Run: `uv run medhelm-run --run-entries "<scenario>:model=<model>" --suite <name> --max-eval-instances <n>` then `helm-summarize` and `helm-server`. See [medhelm.org](https://medhelm.org) for full docs.
373
+
374
+ <!--quick-start-end-->
375
+
376
+ ## Leaderboards
377
+
378
+ We maintain official leaderboards with results from evaluating recent models on notable benchmarks using this framework. Our current flagship leaderboards are:
379
+
380
+ - [HELM Capabilities](https://crfm.stanford.edu/helm/capabilities/latest/)
381
+ - [HELM Safety](https://crfm.stanford.edu/helm/safety/latest/)
382
+ - [Holistic Evaluation of Vision-Language Models (VHELM)](https://crfm.stanford.edu/helm/vhelm/latest/)
383
+
384
+ We also maintain leaderboards for a diverse range of domains (e.g. medicine, finance) and aspects (e.g. multi-linguality, world knowledge, regulation compliance). Refer to the [HELM website](https://crfm.stanford.edu/helm/) for a full list of leaderboards.
385
+
386
+ ## Papers
387
+
388
+ The HELM framework was used in the following papers for evaluating models.
389
+
390
+ - **Holistic Evaluation of Language Models** - [paper](https://openreview.net/forum?id=iO4LZibEqW), [leaderboard](https://crfm.stanford.edu/helm/classic/latest/)
391
+ - **Holistic Evaluation of Vision-Language Models (VHELM)** - [paper](https://arxiv.org/abs/2410.07112), [leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/vhelm/)
392
+ - **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/)
393
+ - **Image2Struct: Benchmarking Structure Extraction for Vision-Language Models** - [paper](https://arxiv.org/abs/2410.22456)
394
+ - **Enterprise Benchmarks for Large Language Model Evaluation** - [paper](https://arxiv.org/abs/2410.12857), [documentation](https://crfm-helm.readthedocs.io/en/latest/enterprise_benchmark/)
395
+ - **The Mighty ToRR: A Benchmark for Table Reasoning and Robustness** - [paper](https://arxiv.org/abs/2502.19412), [leaderboard](https://crfm.stanford.edu/helm/torr/latest/)
396
+ - **Reliable and Efficient Amortized Model-based Evaluation** - [paper](https://arxiv.org/abs/2503.13335), [documentation](https://crfm-helm.readthedocs.io/en/latest/reeval/)
397
+ - **MedHELM** - paper in progress, [leaderboard](https://crfm.stanford.edu/helm/medhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/medhelm/)
398
+ - **Holistic Evaluation of Audio-Language Models** - [paper](https://arxiv.org/abs/2508.21376), [leaderboard](https://crfm.stanford.edu/helm/audio/latest/)
399
+
400
+ The HELM framework can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [Reproducing Leaderboards](https://medhelm.org/reproducing_leaderboards/) documentation on medhelm.org.
401
+
402
+ ## Citation
403
+
404
+ If you use this software in your research, please cite the [Holistic Evaluation of Language Models paper](https://openreview.net/forum?id=iO4LZibEqW) as below.
405
+
406
+ ```bibtex
407
+ @article{
408
+ liang2023holistic,
409
+ title={Holistic Evaluation of Language Models},
410
+ author={Percy Liang and Rishi Bommasani and Tony Lee and Dimitris Tsipras and Dilara Soylu and Michihiro Yasunaga and Yian Zhang and Deepak Narayanan and Yuhuai Wu and Ananya Kumar and Benjamin Newman and Binhang Yuan and Bobby Yan and Ce Zhang and Christian Alexander Cosgrove and Christopher D Manning and Christopher Re and Diana Acosta-Navas and Drew Arad Hudson and Eric Zelikman and Esin Durmus and Faisal Ladhak and Frieda Rong and Hongyu Ren and Huaxiu Yao and Jue WANG and Keshav Santhanam and Laurel Orr and Lucia Zheng and Mert Yuksekgonul and Mirac Suzgun and Nathan Kim and Neel Guha and Niladri S. Chatterji and Omar Khattab and Peter Henderson and Qian Huang and Ryan Andrew Chi and Sang Michael Xie and Shibani Santurkar and Surya Ganguli and Tatsunori Hashimoto and Thomas Icard and Tianyi Zhang and Vishrav Chaudhary and William Wang and Xuechen Li and Yifan Mai and Yuhui Zhang and Yuta Koreeda},
411
+ journal={Transactions on Machine Learning Research},
412
+ issn={2835-8856},
413
+ year={2023},
414
+ url={https://openreview.net/forum?id=iO4LZibEqW},
415
+ note={Featured Certification, Expert Certification}
416
+ }
417
+ ```
@@ -0,0 +1,155 @@
1
+ # Holistic Evaluation of Language Models (HELM)
2
+
3
+ [comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
4
+ <img src="https://github.com/stanford-crfm/helm/raw/v0.5.4/helm-frontend/src/assets/helm-logo.png" alt="HELM logo" width="480"/>
5
+
6
+ <a href="https://github.com/PacificAI/medhelm">
7
+ <img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/PacificAI/medhelm">
8
+ </a>
9
+ <a href="https://github.com/PacificAI/medhelm/blob/main/LICENSE">
10
+ <img alt="License" src="https://img.shields.io/github/license/PacificAI/medhelm?color=blue" />
11
+ </a>
12
+ <a href="https://pypi.org/project/medhelm/">
13
+ <img alt="PyPI" src="https://img.shields.io/pypi/v/medhelm?color=blue" />
14
+ </a>
15
+
16
+ **Holistic Evaluation of Language Models (HELM)** is an open source Python framework created by the [Center for Research on Foundation Models (CRFM) at Stanford](https://crfm.stanford.edu/) for holistic, reproducible and transparent evaluation of foundation models, including large language models (LLMs) and multimodal models. This framework includes the following features:
17
+
18
+ - Datasets and benchmarks in a standardized format (e.g. MMLU-Pro, GPQA, IFEval, WildBench)
19
+ - Models from various providers accessible through a unified interface (e.g. OpenAI models, Anthropic Claude, Google Gemini)
20
+ - Metrics for measuring various aspects beyond accuracy (e.g. efficiency, bias, toxicity)
21
+ - Web UI for inspecting individual prompts and responses
22
+ - Web leaderboard for comparing results across models and benchmarks
23
+
24
+ ## Documentation
25
+
26
+ Documentation: **[medhelm.org](https://medhelm.org)**
27
+
28
+ ## Install & run (MedHELM library)
29
+
30
+ MedHELM uses the HELM core engine and adds medical benchmarks. Install from PyPI:
31
+
32
+ ### Standard (recommended to start)
33
+
34
+ Scenarios: **PubMedQA**, **MedCalc-Bench**, **MedicationQA**, **MedHallu**.
35
+
36
+ ```sh
37
+ pip install medhelm
38
+ # or with uv:
39
+ uv pip install medhelm
40
+ ```
41
+
42
+ Run a benchmark:
43
+
44
+ ```sh
45
+ uv run medhelm-run --run-entries "pubmed_qa:model=huggingface/qwen2.5-7b" --suite my_med_test --max-eval-instances 10
46
+ uv run helm-summarize --suite my_med_test
47
+ uv run helm-server --suite my_med_test
48
+ ```
49
+
50
+ Then open http://localhost:8000/ in your browser.
51
+
52
+ ### Clinical NLP tier (`[summarization]`)
53
+
54
+ Adds heavy libraries (bert-score, rouge-score, nltk). **Install can take 2–3 minutes.**
55
+
56
+ Scenarios: **DischargeMe** (hospital course summaries), **ACI-Bench** (clinical transcripts), **Patient-Edu** (simplifying medical jargon).
57
+
58
+ ```sh
59
+ pip install "medhelm[summarization]"
60
+ # or: uv pip install "medhelm[summarization]"
61
+ ```
62
+
63
+ Example:
64
+
65
+ ```sh
66
+ uv run medhelm-run --run-entries "discharge_summaries:model=huggingface/qwen2.5-7b" --suite med_summaries --max-eval-instances 5
67
+ uv run helm-summarize --suite med_summaries
68
+ uv run helm-server --suite med_summaries
69
+ ```
70
+
71
+ ### Gated / licensing tier (`[gated]`)
72
+
73
+ Adds **gdown** for scenarios that download data from Google Drive. Installation may take longer than the standard tier.
74
+
75
+ Scenarios: **MedQA** (USMLE/Board exams), **MedMCQA** (AIIMS/NEET exams).
76
+
77
+ ```sh
78
+ pip install "medhelm[gated]"
79
+ # or: uv pip install "medhelm[gated]"
80
+ ```
81
+
82
+ Example:
83
+
84
+ ```sh
85
+ uv run medhelm-run --run-entries "med_qa:model=huggingface/qwen2.5-7b" --suite board_exams --max-eval-instances 10
86
+ uv run helm-summarize --suite board_exams
87
+ uv run helm-server --suite board_exams
88
+ ```
89
+
90
+ ### Classic HELM commands
91
+
92
+ You can still use `helm-run`, `helm-summarize`, and `helm-server`; `medhelm-run` is an alias for `helm-run`.
93
+
94
+ ```sh
95
+ helm-run --run-entries mmlu:subject=philosophy,model=openai/gpt2 --suite my-suite --max-eval-instances 10
96
+ helm-summarize --suite my-suite
97
+ helm-server --suite my-suite
98
+ ```
99
+
100
+ ## Quick Start (summary)
101
+
102
+ <!--quick-start-begin-->
103
+
104
+ | Tier | Install | Scenarios |
105
+ |------|--------|-----------|
106
+ | **Standard** | `pip install medhelm` or `uv pip install medhelm` | PubMedQA, MedCalc-Bench, MedicationQA, MedHallu |
107
+ | **Summarization** | `pip install "medhelm[summarization]"` | DischargeMe, ACI-Bench, Patient-Edu (2–3 min install) |
108
+ | **Gated** | `pip install "medhelm[gated]"` | MedQA, MedMCQA (Drive) |
109
+
110
+ Run: `uv run medhelm-run --run-entries "<scenario>:model=<model>" --suite <name> --max-eval-instances <n>` then `helm-summarize` and `helm-server`. See [medhelm.org](https://medhelm.org) for full docs.
111
+
112
+ <!--quick-start-end-->
113
+
114
+ ## Leaderboards
115
+
116
+ We maintain official leaderboards with results from evaluating recent models on notable benchmarks using this framework. Our current flagship leaderboards are:
117
+
118
+ - [HELM Capabilities](https://crfm.stanford.edu/helm/capabilities/latest/)
119
+ - [HELM Safety](https://crfm.stanford.edu/helm/safety/latest/)
120
+ - [Holistic Evaluation of Vision-Language Models (VHELM)](https://crfm.stanford.edu/helm/vhelm/latest/)
121
+
122
+ We also maintain leaderboards for a diverse range of domains (e.g. medicine, finance) and aspects (e.g. multi-linguality, world knowledge, regulation compliance). Refer to the [HELM website](https://crfm.stanford.edu/helm/) for a full list of leaderboards.
123
+
124
+ ## Papers
125
+
126
+ The HELM framework was used in the following papers for evaluating models.
127
+
128
+ - **Holistic Evaluation of Language Models** - [paper](https://openreview.net/forum?id=iO4LZibEqW), [leaderboard](https://crfm.stanford.edu/helm/classic/latest/)
129
+ - **Holistic Evaluation of Vision-Language Models (VHELM)** - [paper](https://arxiv.org/abs/2410.07112), [leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/vhelm/)
130
+ - **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/)
131
+ - **Image2Struct: Benchmarking Structure Extraction for Vision-Language Models** - [paper](https://arxiv.org/abs/2410.22456)
132
+ - **Enterprise Benchmarks for Large Language Model Evaluation** - [paper](https://arxiv.org/abs/2410.12857), [documentation](https://crfm-helm.readthedocs.io/en/latest/enterprise_benchmark/)
133
+ - **The Mighty ToRR: A Benchmark for Table Reasoning and Robustness** - [paper](https://arxiv.org/abs/2502.19412), [leaderboard](https://crfm.stanford.edu/helm/torr/latest/)
134
+ - **Reliable and Efficient Amortized Model-based Evaluation** - [paper](https://arxiv.org/abs/2503.13335), [documentation](https://crfm-helm.readthedocs.io/en/latest/reeval/)
135
+ - **MedHELM** - paper in progress, [leaderboard](https://crfm.stanford.edu/helm/medhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/medhelm/)
136
+ - **Holistic Evaluation of Audio-Language Models** - [paper](https://arxiv.org/abs/2508.21376), [leaderboard](https://crfm.stanford.edu/helm/audio/latest/)
137
+
138
+ The HELM framework can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [Reproducing Leaderboards](https://medhelm.org/reproducing_leaderboards/) documentation on medhelm.org.
139
+
140
+ ## Citation
141
+
142
+ If you use this software in your research, please cite the [Holistic Evaluation of Language Models paper](https://openreview.net/forum?id=iO4LZibEqW) as below.
143
+
144
+ ```bibtex
145
+ @article{
146
+ liang2023holistic,
147
+ title={Holistic Evaluation of Language Models},
148
+ author={Percy Liang and Rishi Bommasani and Tony Lee and Dimitris Tsipras and Dilara Soylu and Michihiro Yasunaga and Yian Zhang and Deepak Narayanan and Yuhuai Wu and Ananya Kumar and Benjamin Newman and Binhang Yuan and Bobby Yan and Ce Zhang and Christian Alexander Cosgrove and Christopher D Manning and Christopher Re and Diana Acosta-Navas and Drew Arad Hudson and Eric Zelikman and Esin Durmus and Faisal Ladhak and Frieda Rong and Hongyu Ren and Huaxiu Yao and Jue WANG and Keshav Santhanam and Laurel Orr and Lucia Zheng and Mert Yuksekgonul and Mirac Suzgun and Nathan Kim and Neel Guha and Niladri S. Chatterji and Omar Khattab and Peter Henderson and Qian Huang and Ryan Andrew Chi and Sang Michael Xie and Shibani Santurkar and Surya Ganguli and Tatsunori Hashimoto and Thomas Icard and Tianyi Zhang and Vishrav Chaudhary and William Wang and Xuechen Li and Yifan Mai and Yuhui Zhang and Yuta Koreeda},
149
+ journal={Transactions on Machine Learning Research},
150
+ issn={2835-8856},
151
+ year={2023},
152
+ url={https://openreview.net/forum?id=iO4LZibEqW},
153
+ note={Featured Certification, Expert Certification}
154
+ }
155
+ ```