crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -28,8 +28,7 @@ Requires-Dist: bottle ~=0.12.23
28
28
  Requires-Dist: datasets ~=2.17
29
29
  Requires-Dist: pyarrow >=11.0.0
30
30
  Requires-Dist: pyarrow-hotfix ~=0.6
31
- Requires-Dist: nltk ~=3.7
32
- Requires-Dist: pyext ~=0.7
31
+ Requires-Dist: nltk <3.8.2,~=3.7
33
32
  Requires-Dist: rouge-score ~=0.1.2
34
33
  Requires-Dist: scipy ~=1.10
35
34
  Requires-Dist: uncertainty-calibration ~=0.1.4
@@ -37,7 +36,8 @@ Requires-Dist: scikit-learn ~=1.1
37
36
  Requires-Dist: transformers ~=4.40
38
37
  Requires-Dist: torch <3.0.0,>=1.13.1
39
38
  Requires-Dist: torchvision <3.0.0,>=0.14.1
40
- Requires-Dist: google-api-python-client ~=2.64
39
+ Provides-Extra: accelerate
40
+ Requires-Dist: accelerate ~=0.25 ; extra == 'accelerate'
41
41
  Provides-Extra: aleph-alpha
42
42
  Requires-Dist: aleph-alpha-client ~=2.14.0 ; extra == 'aleph-alpha'
43
43
  Requires-Dist: tokenizers >=0.13.3 ; extra == 'aleph-alpha'
@@ -55,6 +55,7 @@ Requires-Dist: crfm-helm[models] ; extra == 'all'
55
55
  Requires-Dist: crfm-helm[mongo] ; extra == 'all'
56
56
  Requires-Dist: crfm-helm[heim] ; extra == 'all'
57
57
  Requires-Dist: crfm-helm[vlm] ; extra == 'all'
58
+ Requires-Dist: crfm-helm[bhasa] ; extra == 'all'
58
59
  Provides-Extra: allenai
59
60
  Requires-Dist: ai2-olmo ~=0.2 ; extra == 'allenai'
60
61
  Provides-Extra: amazon
@@ -64,12 +65,18 @@ Requires-Dist: botocore ~=1.31.57 ; extra == 'amazon'
64
65
  Provides-Extra: anthropic
65
66
  Requires-Dist: anthropic ~=0.17 ; extra == 'anthropic'
66
67
  Requires-Dist: websocket-client ~=1.3.2 ; extra == 'anthropic'
68
+ Provides-Extra: bhasa
69
+ Requires-Dist: pythainlp ==5.0.0 ; extra == 'bhasa'
70
+ Requires-Dist: pyonmttok ==1.37.0 ; extra == 'bhasa'
71
+ Requires-Dist: sacrebleu ~=2.2.1 ; extra == 'bhasa'
67
72
  Provides-Extra: cleva
68
73
  Requires-Dist: unidecode ==1.3.6 ; extra == 'cleva'
69
74
  Requires-Dist: pypinyin ==0.49.0 ; extra == 'cleva'
70
75
  Requires-Dist: jieba ==0.42.1 ; extra == 'cleva'
71
76
  Requires-Dist: opencc ==1.1.6 ; extra == 'cleva'
72
77
  Requires-Dist: langdetect ==1.0.9 ; extra == 'cleva'
78
+ Provides-Extra: cohere
79
+ Requires-Dist: cohere ~=5.3 ; extra == 'cohere'
73
80
  Provides-Extra: decodingtrust
74
81
  Requires-Dist: fairlearn ~=0.9.0 ; extra == 'decodingtrust'
75
82
  Provides-Extra: dev
@@ -79,10 +86,11 @@ Requires-Dist: black ==24.3.0 ; extra == 'dev'
79
86
  Requires-Dist: mypy ==1.5.1 ; extra == 'dev'
80
87
  Requires-Dist: flake8 ==5.0.4 ; extra == 'dev'
81
88
  Provides-Extra: google
82
- Requires-Dist: google-cloud-aiplatform ~=1.44 ; extra == 'google'
89
+ Requires-Dist: google-cloud-aiplatform ~=1.48 ; extra == 'google'
83
90
  Provides-Extra: heim
84
- Requires-Dist: gdown ~=4.4.0 ; extra == 'heim'
91
+ Requires-Dist: gdown ~=5.1 ; extra == 'heim'
85
92
  Requires-Dist: diffusers ~=0.24.0 ; extra == 'heim'
93
+ Requires-Dist: icetk ~=0.0.4 ; extra == 'heim'
86
94
  Requires-Dist: jax ~=0.4.13 ; extra == 'heim'
87
95
  Requires-Dist: jaxlib ~=0.4.13 ; extra == 'heim'
88
96
  Requires-Dist: crfm-helm[openai] ; extra == 'heim'
@@ -106,53 +114,63 @@ Requires-Dist: tensorflow ~=2.11.1 ; extra == 'heim'
106
114
  Requires-Dist: timm ~=0.6.12 ; extra == 'heim'
107
115
  Requires-Dist: torch-fidelity ~=0.3.0 ; extra == 'heim'
108
116
  Requires-Dist: torchmetrics ~=0.11.1 ; extra == 'heim'
117
+ Requires-Dist: scikit-image ~=0.21.0 ; extra == 'heim'
109
118
  Requires-Dist: crfm-helm[images] ; extra == 'heim'
110
119
  Provides-Extra: human-evaluation
111
120
  Requires-Dist: scaleapi ~=2.13.0 ; extra == 'human-evaluation'
112
121
  Requires-Dist: surge-api ~=1.1.0 ; extra == 'human-evaluation'
113
- Provides-Extra: image2structure
114
- Requires-Dist: crfm-helm[images] ; extra == 'image2structure'
115
- Requires-Dist: latex ~=0.7.0 ; extra == 'image2structure'
116
- Requires-Dist: pdf2image ~=1.16.3 ; extra == 'image2structure'
117
- Requires-Dist: selenium ~=4.17.2 ; extra == 'image2structure'
118
- Requires-Dist: html2text ~=2024.2.26 ; extra == 'image2structure'
119
- Requires-Dist: opencv-python ~=4.7.0.68 ; extra == 'image2structure'
120
- Requires-Dist: lpips ~=0.1.4 ; extra == 'image2structure'
121
- Requires-Dist: imagehash ~=4.3.1 ; extra == 'image2structure'
122
+ Provides-Extra: image2struct
123
+ Requires-Dist: crfm-helm[images] ; extra == 'image2struct'
124
+ Requires-Dist: latex ~=0.7.0 ; extra == 'image2struct'
125
+ Requires-Dist: pdf2image ~=1.16.3 ; extra == 'image2struct'
126
+ Requires-Dist: selenium ~=4.17.2 ; extra == 'image2struct'
127
+ Requires-Dist: html2text ~=2024.2.26 ; extra == 'image2struct'
128
+ Requires-Dist: opencv-python ~=4.7.0.68 ; extra == 'image2struct'
129
+ Requires-Dist: lpips ~=0.1.4 ; extra == 'image2struct'
130
+ Requires-Dist: imagehash ~=4.3.1 ; extra == 'image2struct'
122
131
  Provides-Extra: images
123
- Requires-Dist: accelerate ~=0.25.0 ; extra == 'images'
132
+ Requires-Dist: crfm-helm[accelerate] ; extra == 'images'
124
133
  Requires-Dist: pillow ~=10.2 ; extra == 'images'
125
134
  Provides-Extra: metrics
135
+ Requires-Dist: google-api-python-client ~=2.64 ; extra == 'metrics'
126
136
  Requires-Dist: numba ~=0.56.4 ; extra == 'metrics'
127
137
  Requires-Dist: pytrec-eval ==0.5 ; extra == 'metrics'
128
138
  Requires-Dist: sacrebleu ~=2.2.1 ; extra == 'metrics'
129
139
  Provides-Extra: mistral
130
140
  Requires-Dist: mistralai ~=0.0.11 ; extra == 'mistral'
131
141
  Provides-Extra: models
142
+ Requires-Dist: crfm-helm[ai21] ; extra == 'models'
143
+ Requires-Dist: crfm-helm[accelerate] ; extra == 'models'
132
144
  Requires-Dist: crfm-helm[aleph-alpha] ; extra == 'models'
133
145
  Requires-Dist: crfm-helm[allenai] ; extra == 'models'
134
146
  Requires-Dist: crfm-helm[amazon] ; extra == 'models'
135
147
  Requires-Dist: crfm-helm[anthropic] ; extra == 'models'
148
+ Requires-Dist: crfm-helm[cohere] ; extra == 'models'
136
149
  Requires-Dist: crfm-helm[google] ; extra == 'models'
137
150
  Requires-Dist: crfm-helm[mistral] ; extra == 'models'
138
151
  Requires-Dist: crfm-helm[openai] ; extra == 'models'
152
+ Requires-Dist: crfm-helm[reka] ; extra == 'models'
139
153
  Requires-Dist: crfm-helm[together] ; extra == 'models'
140
- Requires-Dist: crfm-helm[tsinghua] ; extra == 'models'
141
154
  Requires-Dist: crfm-helm[yandex] ; extra == 'models'
155
+ Requires-Dist: crfm-helm[openvino] ; extra == 'models'
142
156
  Provides-Extra: mongo
143
157
  Requires-Dist: pymongo ~=4.2 ; extra == 'mongo'
144
158
  Provides-Extra: openai
145
159
  Requires-Dist: openai ~=1.0 ; extra == 'openai'
146
- Requires-Dist: tiktoken ~=0.3.3 ; extra == 'openai'
160
+ Requires-Dist: tiktoken ~=0.7 ; extra == 'openai'
147
161
  Requires-Dist: pydantic ~=2.0 ; extra == 'openai'
162
+ Provides-Extra: openvino
163
+ Requires-Dist: optimum[openvino] ~=1.19 ; extra == 'openvino'
148
164
  Provides-Extra: plots
149
165
  Requires-Dist: colorcet ~=3.0.1 ; extra == 'plots'
150
166
  Requires-Dist: matplotlib ~=3.6.0 ; extra == 'plots'
151
167
  Requires-Dist: seaborn ~=0.11.0 ; extra == 'plots'
152
168
  Provides-Extra: proxy-server
153
169
  Requires-Dist: gunicorn ~=20.1.0 ; extra == 'proxy-server'
170
+ Provides-Extra: reka
171
+ Requires-Dist: reka-api ~=2.0.0 ; extra == 'reka'
154
172
  Provides-Extra: scenarios
155
- Requires-Dist: gdown ~=4.4.0 ; extra == 'scenarios'
173
+ Requires-Dist: gdown ~=5.1 ; extra == 'scenarios'
156
174
  Requires-Dist: sympy ~=1.11.1 ; extra == 'scenarios'
157
175
  Requires-Dist: xlrd ~=2.0.1 ; extra == 'scenarios'
158
176
  Provides-Extra: slurm
@@ -161,21 +179,20 @@ Provides-Extra: summarization
161
179
  Requires-Dist: summ-eval ~=0.892 ; extra == 'summarization'
162
180
  Provides-Extra: together
163
181
  Requires-Dist: together ~=1.1 ; extra == 'together'
164
- Provides-Extra: tsinghua
165
- Requires-Dist: icetk ~=0.0.4 ; extra == 'tsinghua'
166
182
  Provides-Extra: unitxt
167
183
  Requires-Dist: evaluate ~=0.4.1 ; extra == 'unitxt'
168
184
  Provides-Extra: vlm
169
185
  Requires-Dist: crfm-helm[openai] ; extra == 'vlm'
170
186
  Requires-Dist: einops ~=0.7.0 ; extra == 'vlm'
171
187
  Requires-Dist: einops-exts ~=0.0.4 ; extra == 'vlm'
172
- Requires-Dist: open-clip-torch ~=2.24.0 ; extra == 'vlm'
173
- Requires-Dist: torch ~=2.1.2 ; extra == 'vlm'
188
+ Requires-Dist: open-clip-torch ~=2.24 ; extra == 'vlm'
189
+ Requires-Dist: torch ~=2.1 ; extra == 'vlm'
174
190
  Requires-Dist: transformers-stream-generator ~=0.0.4 ; extra == 'vlm'
175
191
  Requires-Dist: scipy ~=1.10 ; extra == 'vlm'
176
192
  Requires-Dist: torchvision <3.0.0,>=0.14.1 ; extra == 'vlm'
193
+ Requires-Dist: crfm-helm[reka] ; extra == 'vlm'
177
194
  Requires-Dist: crfm-helm[images] ; extra == 'vlm'
178
- Requires-Dist: crfm-helm[image2structure] ; extra == 'vlm'
195
+ Requires-Dist: crfm-helm[image2struct] ; extra == 'vlm'
179
196
  Requires-Dist: pycocoevalcap ~=1.2 ; extra == 'vlm'
180
197
  Provides-Extra: yandex
181
198
  Requires-Dist: sentencepiece ~=0.1.97 ; extra == 'yandex'
@@ -199,39 +216,6 @@ Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic
199
216
 
200
217
  To get started, refer to [the documentation on Read the Docs](https://crfm-helm.readthedocs.io/) for how to install and run the package.
201
218
 
202
- ## Directory Structure
203
-
204
- The directory structure for this repo is as follows
205
-
206
- ```
207
- ├── docs # MD used to generate readthedocs
208
-
209
- ├── scripts # Python utility scripts for HELM
210
- │ ├── cache
211
- │ ├── data_overlap # Calculate train test overlap
212
- │ │ ├── common
213
- │ │ ├── scenarios
214
- │ │ └── test
215
- │ ├── efficiency
216
- │ ├── fact_completion
217
- │ ├── offline_eval
218
- │ └── scale
219
- └── src
220
- ├── helm # Benchmarking Scripts for HELM
221
- │ │
222
- │ ├── benchmark # Main Python code for running HELM
223
- │ │ │
224
- │ │ └── static # Current JS (Jquery) code for rendering front-end
225
- │ │ │
226
- │ │ └── ...
227
- │ │
228
- │ ├── common # Additional Python code for running HELM
229
- │ │
230
- │ └── proxy # Python code for external web requests
231
-
232
- └── helm-frontend # New React Front-end
233
- ```
234
-
235
219
  # Holistic Evaluation of Text-To-Image Models
236
220
 
237
221
  <img src="https://github.com/stanford-crfm/helm/raw/heim/src/helm/benchmark/static/heim/images/heim-logo.png" alt="" width="800"/>