crfm-helm: crfm_helm-0.5.5-py3-none-any.whl → crfm_helm-0.5.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crfm-helm
3
- Version: 0.5.5
3
+ Version: 0.5.6
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -17,10 +17,11 @@ Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: cattrs~=22.2
20
+ Requires-Dist: colorlog~=6.9
20
21
  Requires-Dist: dacite~=1.6
21
22
  Requires-Dist: importlib-resources~=5.10
22
23
  Requires-Dist: Mako~=1.2
23
- Requires-Dist: numpy<3,~=1.26
24
+ Requires-Dist: numpy<3,>=1.26
24
25
  Requires-Dist: pandas~=2.0
25
26
  Requires-Dist: pyhocon~=0.3.59
26
27
  Requires-Dist: retrying~=1.3
@@ -34,9 +35,9 @@ Requires-Dist: pyarrow>=11.0.0
34
35
  Requires-Dist: pyarrow-hotfix~=0.6
35
36
  Requires-Dist: nltk!=3.9.0,~=3.7
36
37
  Requires-Dist: rouge-score~=0.1.2
37
- Requires-Dist: scipy~=1.10
38
+ Requires-Dist: scipy>=1.10
38
39
  Requires-Dist: uncertainty-calibration~=0.1.4
39
- Requires-Dist: scikit-learn~=1.1
40
+ Requires-Dist: scikit-learn>=1.1
40
41
  Requires-Dist: transformers~=4.40
41
42
  Requires-Dist: torch<3.0.0,>=1.13.1
42
43
  Requires-Dist: torchvision<3.0.0,>=0.14.1
@@ -63,8 +64,8 @@ Requires-Dist: summ-eval~=0.892; extra == "summarization"
63
64
  Requires-Dist: bert-score~=0.3; extra == "summarization"
64
65
  Provides-Extra: plots
65
66
  Requires-Dist: colorcet~=3.0.1; extra == "plots"
66
- Requires-Dist: matplotlib~=3.6.0; extra == "plots"
67
- Requires-Dist: seaborn~=0.11.0; extra == "plots"
67
+ Requires-Dist: matplotlib>=3.6.0; extra == "plots"
68
+ Requires-Dist: seaborn>=0.11.0; extra == "plots"
68
69
  Provides-Extra: decodingtrust
69
70
  Requires-Dist: fairlearn~=0.9.0; extra == "decodingtrust"
70
71
  Provides-Extra: slurm
@@ -99,15 +100,17 @@ Requires-Dist: boto3~=1.34.131; extra == "amazon"
99
100
  Requires-Dist: awscli~=1.33.13; extra == "amazon"
100
101
  Requires-Dist: botocore~=1.34.131; extra == "amazon"
101
102
  Provides-Extra: anthropic
102
- Requires-Dist: anthropic~=0.39; extra == "anthropic"
103
+ Requires-Dist: anthropic~=0.48; extra == "anthropic"
103
104
  Requires-Dist: websocket-client~=1.3.2; extra == "anthropic"
104
105
  Requires-Dist: httpx<0.28.0; extra == "anthropic"
105
106
  Provides-Extra: cohere
106
107
  Requires-Dist: cohere~=5.3; extra == "cohere"
108
+ Provides-Extra: writer
109
+ Requires-Dist: writerai~=4.0; extra == "writer"
107
110
  Provides-Extra: mistral
108
111
  Requires-Dist: mistralai~=1.1; extra == "mistral"
109
112
  Provides-Extra: openai
110
- Requires-Dist: openai~=1.64; extra == "openai"
113
+ Requires-Dist: openai~=1.70; extra == "openai"
111
114
  Requires-Dist: tiktoken~=0.7; extra == "openai"
112
115
  Requires-Dist: pydantic~=2.0; extra == "openai"
113
116
  Provides-Extra: google
@@ -130,6 +133,7 @@ Requires-Dist: crfm-helm[openai]; extra == "models"
130
133
  Requires-Dist: crfm-helm[reka]; extra == "models"
131
134
  Requires-Dist: crfm-helm[together]; extra == "models"
132
135
  Requires-Dist: crfm-helm[yandex]; extra == "models"
136
+ Requires-Dist: crfm-helm[writer]; extra == "models"
133
137
  Requires-Dist: crfm-helm[ibm-enterprise-scenarios]; extra == "models"
134
138
  Provides-Extra: reka
135
139
  Requires-Dist: reka-api~=2.0.0; extra == "reka"
@@ -191,10 +195,18 @@ Requires-Dist: torchmetrics~=0.11.1; extra == "heim"
191
195
  Requires-Dist: scikit-image!=0.23.*,==0.*,>=0.22; extra == "heim"
192
196
  Requires-Dist: crfm-helm[images]; extra == "heim"
193
197
  Provides-Extra: medhelm
198
+ Requires-Dist: crfm-helm[accelerate]; extra == "medhelm"
199
+ Requires-Dist: crfm-helm[openai]; extra == "medhelm"
194
200
  Requires-Dist: crfm-helm[summarization]; extra == "medhelm"
195
- Requires-Dist: python-docx~=1.1.2; extra == "medhelm"
201
+ Requires-Dist: crfm-helm[yandex]; extra == "medhelm"
202
+ Requires-Dist: bert_score~=0.3.13; extra == "medhelm"
196
203
  Requires-Dist: langchain~=0.3.9; extra == "medhelm"
204
+ Requires-Dist: langchain-community~=0.3.8; extra == "medhelm"
197
205
  Requires-Dist: lxml~=5.3.0; extra == "medhelm"
206
+ Requires-Dist: openpyxl~=3.1; extra == "medhelm"
207
+ Requires-Dist: python-docx~=1.1.2; extra == "medhelm"
208
+ Requires-Dist: torch~=2.2.2; extra == "medhelm"
209
+ Requires-Dist: torchvision~=0.17.2; extra == "medhelm"
198
210
  Provides-Extra: audiolm
199
211
  Requires-Dist: crfm-helm[openai]; extra == "audiolm"
200
212
  Requires-Dist: crfm-helm[google]; extra == "audiolm"
@@ -204,10 +216,12 @@ Requires-Dist: soundfile~=0.12; extra == "audiolm"
204
216
  Requires-Dist: librosa~=0.10; extra == "audiolm"
205
217
  Requires-Dist: einops~=0.7.0; extra == "audiolm"
206
218
  Requires-Dist: openai-whisper==20240930; extra == "audiolm"
207
- Requires-Dist: transformers~=4.45.1; extra == "audiolm"
219
+ Requires-Dist: transformers~=4.48.0; extra == "audiolm"
208
220
  Requires-Dist: transformers_stream_generator~=0.0.4; extra == "audiolm"
221
+ Requires-Dist: av~=14.3.0; extra == "audiolm"
209
222
  Requires-Dist: scipy~=1.10; extra == "audiolm"
210
223
  Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "audiolm"
224
+ Requires-Dist: flash-attn~=2.7.3; extra == "audiolm"
211
225
  Requires-Dist: pycocoevalcap~=1.2; extra == "audiolm"
212
226
  Requires-Dist: jiwer~=3.0; extra == "audiolm"
213
227
  Requires-Dist: rapidfuzz~=3.10; extra == "audiolm"
@@ -226,12 +240,11 @@ Requires-Dist: crfm-helm[models]; extra == "all"
226
240
  Requires-Dist: crfm-helm[mongo]; extra == "all"
227
241
  Requires-Dist: crfm-helm[heim]; extra == "all"
228
242
  Requires-Dist: crfm-helm[vlm]; extra == "all"
229
- Requires-Dist: crfm-helm[audiolm]; extra == "all"
230
243
  Provides-Extra: dev
231
244
  Requires-Dist: pytest~=7.2.0; extra == "dev"
232
245
  Requires-Dist: pre-commit~=2.20.0; extra == "dev"
233
246
  Requires-Dist: black==24.3.0; extra == "dev"
234
- Requires-Dist: mypy==1.5.1; extra == "dev"
247
+ Requires-Dist: mypy==1.16.0; extra == "dev"
235
248
  Requires-Dist: flake8==5.0.4; extra == "dev"
236
249
  Dynamic: license-file
237
250
 
@@ -318,8 +331,9 @@ The HELM framework was used in the following papers for evaluating models.
318
331
  - **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/)
319
332
  - **Image2Struct: Benchmarking Structure Extraction for Vision-Language Models** - [paper](https://arxiv.org/abs/2410.22456)
320
333
  - **Enterprise Benchmarks for Large Language Model Evaluation** - [paper](https://arxiv.org/abs/2410.12857), [documentation](https://crfm-helm.readthedocs.io/en/latest/enterprise_benchmark/)
321
- - **The Mighty ToRR: A Benchmark for Table Reasoning and Robustness** - [paper](https://arxiv.org/abs/2502.19412)
334
+ - **The Mighty ToRR: A Benchmark for Table Reasoning and Robustness** - [paper](https://arxiv.org/abs/2502.19412), [leaderboard](https://crfm.stanford.edu/helm/torr/latest/)
322
335
  - **Reliable and Efficient Amortized Model-based Evaluation** - [paper](https://arxiv.org/abs/2503.13335), [documentation](https://crfm-helm.readthedocs.io/en/latest/reeval/)
336
+ - **MedHELM** - paper in progress, [leaderboard](https://crfm.stanford.edu/helm/medhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/medhelm/)
323
337
 
324
338
  The HELM framework can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [main Reproducing Leaderboards documentation](https://crfm-helm.readthedocs.io/en/latest/reproducing_leaderboards/).
325
339