crfm-helm 0.5.1-py3-none-any.whl → 0.5.3-py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
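To check which of the two releases is installed in a local environment before reading the diff, the Python standard library can be queried. This is a minimal illustrative sketch, not part of the diff, and assumes crfm-helm is already installed:

from importlib.metadata import version

# Prints the installed crfm-helm version, e.g. "0.5.1" or "0.5.3".
print(version("crfm-helm"))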

Note: the diff service flags this release of crfm-helm as potentially problematic.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
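The listing above also renames a number of modules from image2structure to image2struct; code that imports the old module paths must follow the rename. A minimal sketch (the module paths come from the listing; the import is illustrative only):

# Module path in 0.5.1 (removed in 0.5.3):
#   helm.benchmark.annotation.image2structure.latex_compiler_annotator
# Module path in 0.5.3:
import helm.benchmark.annotation.image2struct.latex_compiler_annotator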
helm/benchmark/run_specs/call_center_run_specs.py (new file)
@@ -0,0 +1,152 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("call_center_summarization")
+def get_call_center_summarization_spec(subset: str = "summarization") -> RunSpec:
+    from helm.benchmark.annotation.call_center_annotator import CallCenterSummarizationAnnotator
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
+        args={"subset": subset},
+    )
+
+    instructions = "Summarize the call transcript in under 10 sentences."
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="### Call Transcript\n",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        temperature=0.0,
+        max_tokens=512,
+        num_outputs=1,
+    )
+
+    annotator_specs = annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator")
+    ]
+    annotation_metric_specs = [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
+            args={
+                "annotator_name": CallCenterSummarizationAnnotator.name,
+                "key": criterion,
+                "min_score": 1,
+                "max_score": 5,
+            },
+        )
+        for criterion in CallCenterSummarizationAnnotator.CRITERIA
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + annotation_metric_specs
+
+    group = "call_center_summarization" if subset == "summarization" else f"call_center_summarization_{subset}"
+
+    return RunSpec(
+        name="call_center_summarization",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=[group],
+    )
+
+
+@run_spec_function("call_center_summarization_pairwise_comparison")
+def get_call_center_summarization_pairwise_comparison_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
+    )
+
+    instructions = "Summarize the call transcript in under 10 sentences."
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="### Call Transcript\n",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        temperature=0.0,
+        max_tokens=512,
+        num_outputs=1,
+    )
+
+    annotator_specs = annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator"  # noqa: E501
+        )
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+            args={"annotator_name": "call_center_summarization_pairwise_comparison", "key": "score"},
+        )
+    ]
+
+    return RunSpec(
+        name="call_center_summarization_pairwise_comparison",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["call_center_summarization_pairwise_comparison"],
+    )
+
+
+@run_spec_function("call_center_summarization_key_points_recall")
+def get_call_center_summarization_key_points_recall_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
+    )
+
+    instructions = "Summarize the call transcript in under 10 sentences."
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="### Call Transcript\n",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        temperature=0.0,
+        max_tokens=512,
+        num_outputs=1,
+    )
+
+    annotator_specs = annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator"
+        )
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+            args={"annotator_name": "call_center_summarization_key_points_recall", "key": "score"},
+        )
+    ]
+
+    return RunSpec(
+        name="call_center_summarization_key_points_recall",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["call_center_summarization_key_points_recall"],
+    )
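For context, each function decorated with @run_spec_function above can also be called directly to build its RunSpec; a minimal sketch, assuming the crfm-helm 0.5.3 wheel and its dependencies are installed (illustrative only, not part of the diff):

from helm.benchmark.run_specs.call_center_run_specs import get_call_center_summarization_spec

# Build the RunSpec for the default "summarization" subset and inspect what it wires together.
run_spec = get_call_center_summarization_spec(subset="summarization")
print(run_spec.name)    # call_center_summarization
print(run_spec.groups)  # ["call_center_summarization"]
print([annotator.class_name for annotator in run_spec.annotators])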
helm/benchmark/run_specs/classic_run_specs.py
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_ranking_binary_adapter_spec,
     get_summarization_adapter_spec,
 )
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:
 
 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@ def get_live_qa_spec() -> RunSpec:
         max_train_instances=0,
         max_tokens=512,
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+    ]
 
     return RunSpec(
         name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_summarization_metric_specs(
-            {"task": "live_qa", "device": get_torch_device_name()},
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["live_qa"],
     )
 
 
 @run_spec_function("medication_qa")
 def get_medication_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@ def get_medication_qa_spec() -> RunSpec:
         max_tokens=512,
     )
 
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
     return RunSpec(
        name="medication_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_summarization_metric_specs(
-            {"task": "medication_qa", "device": get_torch_device_name()},
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["medication_qa"],
     )
 
@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["thai_exam"],
+        groups=["thai_exam", f"thai_exam_{exam}"],
     )
helm/benchmark/run_specs/decodingtrust_run_specs.py
@@ -56,7 +56,7 @@ def get_decodingtrust_stereotype_bias_spec(task: str) -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_stereotype_bias_metric_specs(),
-        groups=["decodingtrust", "stereotype_bias"],
+        groups=["decodingtrust_stereotype_bias", "stereotype_bias"],
     )
 
 
@@ -74,7 +74,7 @@ def get_decodingtrust_adv_robustness_spec(task: str) -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["decodingtrust", "adv_robustness"],
+        groups=["decodingtrust_adv_robustness", "adv_robustness"],
     )
 
 
@@ -92,7 +92,7 @@ def get_decodingtrust_adv_demonstration_spec(perspective: str, data: str, demo_n
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["decodingtrust", "adv_demonstration"],
+        groups=["decodingtrust_adv_demonstration", "adv_demonstration"],
     )
 
 
@@ -137,7 +137,7 @@ def get_decodingtrust_ood_robustness_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() if ood_type == "style" else get_ood_knowledge_metric_specs(),
-        groups=["decodingtrust", "ood_robustness"],
+        groups=["decodingtrust_ood_robustness", "ood_robustness"],
     )
 
 
@@ -163,7 +163,7 @@ def get_decodingtrust_fairness_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_fairness_metric_specs() + get_exact_match_metric_specs(),
-        groups=["decodingtrust", "fairness"],
+        groups=["decodingtrust_fairness", "fairness"],
     )
 
 
@@ -192,7 +192,7 @@ def get_decodingtrust_privacy_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_privacy_metric_specs(),
-        groups=["decodingtrust", "privacy"],
+        groups=["decodingtrust_privacy", "privacy"],
     )
 
 
@@ -280,7 +280,7 @@ def get_decodingtrust_machine_ethics_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["decodingtrust", "machine_ethics"],
+        groups=["decodingtrust_machine_ethics", "machine_ethics"],
     )
 
 
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
         name="decodingtrust_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
-        groups=["decodingtrust", "toxicity_prompts"],
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
+        groups=["decodingtrust_toxicity_prompts", "toxicity_prompts"],
     )
helm/benchmark/run_specs/experimental_run_specs.py (new file)
@@ -0,0 +1,85 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("ci_mcqa")
+def get_ci_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "Give a letter answer among the options given. "
+            "For example, if the options are A, B, C, D, E, and F, "
+            "your answer should consist of the single letter that corresponds to the correct answer."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="ci_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["CIMCQA"],
+    )
+
+
+@run_spec_function("ewok")
+def get_ewok_spec(domain: str = "all") -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.ewok_scenario.EWoKScenario", args={"domain": domain}
+    )
+
+    instructions = """# INSTRUCTIONS
+
+In this study, you will see multiple examples. In each example, you will be given two contexts and a scenario. Your task is to read the two contexts and the subsequent scenario, and pick the context that makes more sense considering the scenario that follows. The contexts will be numbered "1" or "2". You must answer using "1" or "2" in your response.
+"""  # noqa: E501
+    input_prefix = """# TEST EXAMPLE
+
+## Scenario
+\""""
+    input_suffix = """\"
+
+## Contexts
+"""
+    output_prefix = """
+## Task
+Which context makes more sense given the scenario? Please answer using either "1" or "2".
+
+## Response
+"""
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=instructions,
+        input_prefix=input_prefix,
+        input_suffix=input_suffix,
+        reference_prefix='1. "',
+        reference_suffix='"\n',
+        output_prefix=output_prefix,
+        output_suffix="\n",
+        max_train_instances=2,
+        num_outputs=1,
+        max_tokens=2,
+        temperature=0.0,
+        stop_sequences=["\n\n"],
+        sample_train=True,
+    )
+
+    return RunSpec(
+        name=f"ewok:domain={domain}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["ewok", f"ewok_{domain}"],
+    )
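As with the other new run spec modules, the EWoK function above can be invoked directly to inspect the prompt framing it defines; a minimal sketch, assuming the 0.5.3 wheel is installed (illustrative only, not part of the diff):

from helm.benchmark.run_specs.experimental_run_specs import get_ewok_spec

# Build the EWoK RunSpec with the default domain and look at the adapter configuration.
run_spec = get_ewok_spec()
print(run_spec.name)                     # ewok:domain=all
print(run_spec.groups)                   # ["ewok", "ewok_all"]
print(run_spec.adapter_spec.max_tokens)  # 2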
helm/benchmark/run_specs/finance_run_specs.py (new file)
@@ -0,0 +1,110 @@
+"""Run spec functions for the HELM Finance leaderboard.
+
+Website: https://crfm.stanford.edu/helm/finance/"""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.runner import get_benchmark_output_path
+from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
+
+
+@run_spec_function("fin_qa")
+def get_fin_qa_spec() -> RunSpec:
+    from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+    )
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+    ]
+    return RunSpec(
+        name="fin_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["fin_qa"],
+    )
+
+
+@run_spec_function("financebench")
+def get_financebench_spec() -> RunSpec:
+    instructions = (
+        "Answer only the last question using the given evidence. "
+        "Respond with only a single paragraph, sentence or sentence fragment.\n"
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.financebench_scenario.FinanceBenchScenario", args={}
+    )
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="\n",
+        input_suffix="\n",
+        output_prefix="\nAnswer: ",
+        output_suffix="\n",
+        instance_prefix="\n###\n",
+        num_outputs=1,
+        max_tokens=300,
+        temperature=0.0,
+        stop_sequences=["###"],
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.financebench_annotator.FinanceBenchAnnotator")
+    ]
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLabelMetric",
+            args={
+                "annotator_name": "financebench",
+                "key": "label",
+                "labels": ["correct_answer", "incorrect_answer", "failure_to_answer"],
+            },
+        )
+    ]
+    return RunSpec(
+        name="financebench",
+        scenario_spec=scenario_spec,
+        annotators=annotator_specs,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["financebench"],
+    )
+
+
+@run_spec_function("banking77")
+def get_banking77_spec() -> RunSpec:
+    from helm.benchmark.scenarios.raft_scenario import get_raft_instructions
+    from helm.benchmark.scenarios.banking77_scenario import Banking77Scenario
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.banking77_scenario.Banking77Scenario", args={})
+
+    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), Banking77Scenario.name)
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_raft_instructions("banking_77", scenario_cache_path),
+        input_noun=None,
+        output_noun="Label",
+        max_tokens=30,  # at most ~50 characters per label
+    )
+
+    # Not using get_classification_metric_specs() / ClassificationMetric because BANKING77 has too many classes,
+    # so F1 scores don't make sense. The original paper uses accuracy instead.
+    metric_specs = get_exact_match_metric_specs()
+    return RunSpec(
+        name="banking77",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["banking77"],
+    )
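The finance run specs follow the same pattern; for example, the FinQA spec can be built directly to confirm the adapter and metric wiring shown above. A minimal sketch, assuming the 0.5.3 wheel and its dependencies are installed (illustrative only):

from helm.benchmark.run_specs.finance_run_specs import get_fin_qa_spec

# Build the FinQA RunSpec: a generation adapter with output_noun="Program",
# plus the FinQAMetric added on top of the basic metrics.
run_spec = get_fin_qa_spec()
print(run_spec.name)    # fin_qa
print(run_spec.groups)  # ["fin_qa"]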
helm/benchmark/run_specs/safety_run_specs.py (new file)
@@ -0,0 +1,154 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+from helm.benchmark.metrics.metric import MetricSpec
+
+
+@run_spec_function("harm_bench")
+def get_harm_bench_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.harm_bench_scenario.HarmBenchScenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="harm_bench",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["harm_bench"],
+    )
+
+
+@run_spec_function("simple_safety_tests")
+def get_simple_safety_tests_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.simple_safety_tests_scenario.SimpleSafetyTestsScenario"
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.simple_safety_tests_annotator.SimpleSafetyTestsAnnotator")
+    ]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="simple_safety_tests",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["simple_safety_tests"],
+    )
+
+
+@run_spec_function("xstest")
+def get_xstest_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.xstest_scenario.XSTestScenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.xstest_annotator.XSTestAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="xstest",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["xstest"],
+    )
+
+
+@run_spec_function("anthropic_red_team")
+def get_anthropic_red_team_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.anthropic_red_team_scenario.AnthropicRedTeamScenario"
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.anthropic_red_team_annotator.AnthropicRedTeamAnnotator")
+    ]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="anthropic_red_team",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["anthropic_red_team"],
+    )
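All four safety run specs above share the same pass-through AdapterSpec (empty instructions and prefixes, zero-shot, temperature 0.0) and the same three safety metrics; only the scenario and the annotator differ. A minimal sketch that builds one of them, assuming the 0.5.3 wheel is installed (illustrative only, not part of the diff):

from helm.benchmark.run_specs.safety_run_specs import get_xstest_spec

# Build the XSTest RunSpec and confirm the annotator and safety metrics it uses.
run_spec = get_xstest_spec()
print(run_spec.name)  # xstest
print([annotator.class_name for annotator in run_spec.annotators])
print([metric.class_name for metric in run_spec.metric_specs])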