crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as a potentially problematic release.

Files changed (236):
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
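A recurring change in the file list above is the rename of the image2structure packages to image2struct, which moves several importable module paths. The sketch below is illustrative only; the dotted package paths come from the file list, but the script itself is not part of HELM. It checks which of the two paths resolves in a given installation:

    # Illustrative check of the image2structure -> image2struct package rename.
    # Only the dotted package paths come from the file list above; this helper
    # script is hypothetical and not part of HELM.
    import importlib

    OLD_PATH = "helm.benchmark.scenarios.vision_language.image2structure"
    NEW_PATH = "helm.benchmark.scenarios.vision_language.image2struct"

    for path in (OLD_PATH, NEW_PATH):
        try:
            importlib.import_module(path)
            print(f"{path}: importable")
        except ImportError as exc:
            print(f"{path}: not importable ({exc})")

With crfm-helm 0.5.3 installed, only the new image2struct path should resolve.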
@@ -1,66 +1,8 @@
  ---
  ############################################################
- adapter:
- - name: method
- description: The high-level strategy for converting instances into a prompt for the language model.
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this in the future.
+ adapter: []
  ############################################################
  metrics:
  # Infrastructure metrics:
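The adapter block removed in the hunk above documented HELM's prompt-construction fields; the same names appear as fields on AdapterSpec in helm/benchmark/adaptation/adapter_spec.py, which this release also touches. A minimal sketch of building an AdapterSpec from a few of those fields follows (field names are taken from the removed block; the exact constructor signature and defaults are an assumption, not confirmed by this diff):

    # Sketch only: field names come from the removed adapter block above and
    # correspond to AdapterSpec fields; the constructor call is an assumption.
    from helm.benchmark.adaptation.adapter_spec import AdapterSpec

    spec = AdapterSpec(
        method="generation",          # high-level prompting strategy
        instructions="Answer the question.",
        input_prefix="Question: ",
        output_prefix="Answer: ",
        max_train_instances=5,        # in-context examples
        temperature=0.0,
        max_tokens=100,
        stop_sequences=["\n"],
    )
    print(spec.method, spec.max_tokens)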
The following hunk adds a new 255-line file; based on the file list above, this is helm/benchmark/static/schema_medical.yaml:

@@ -0,0 +1,255 @@
+ ---
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+ display_name: '# tokens'
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+ display_name: '# bytes'
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+ display_name: '# ref'
+ description: Number of references.
+ - name: num_train_trials
+ display_name: '# trials'
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+ display_name: 'cost'
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+ display_name: '# prompt tokens'
+ description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+ display_name: '# prompt chars'
+ description: Number of characters in the prompt.
+ - name: num_completion_tokens
+ display_name: '# completion tokens'
+ description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+ display_name: '# output tokens'
+ description: Actual number of output tokens.
+ - name: max_num_output_tokens
+ display_name: 'Max output tokens'
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+ display_name: '# requests'
+ description: Number of distinct API requests.
+ - name: num_instances
+ display_name: '# eval'
+ description: Number of evaluation instances.
+ - name: num_train_instances
+ display_name: '# train'
+ description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+ display_name: truncated
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+ display_name: finish b/c length
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+ display_name: finish b/c stop
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+ display_name: finish b/c endoftext
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+ display_name: finish b/c unknown
+ description: Fraction of instances where the the output was terminated for unknown reasons.
+ - name: num_completions
+ display_name: '# completions'
+ description: Number of completions.
+ - name: predicted_index
+ display_name: Predicted index
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+ # Accuracy metrics:
+ - name: exact_match
+ display_name: Exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
+ lower_is_better: false
+ - name: f1_score
+ display_name: F1
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
+ lower_is_better: false
+ - name: live_qa_score
+ display_name: Judge Score
+ description: LLM-as-judge score
+ lower_is_better: false
+ - name: medication_qa_score
+ display_name: Judge Score
+ description: LLM-as-judge score
+ lower_is_better: false
+
+ # Toxicity metrics
+ - name: expected_max_toxicity
+ display_name: Average max toxicity
+ short_display_name: Avg. max toxicity
+ lower_is_better: true
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: max_toxicity_probability
+ display_name: max toxicity
+ lower_is_better: true
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: toxic_frac
+ display_name: Toxic fraction
+ lower_is_better: true
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ hide_win_rates: true
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ - name: toxicity
+ display_name: Toxicity
+ metrics:
+ - name: toxic_frac
+ split: ${main_split}
+
+ ############################################################
+ run_groups:
+ - name: medical_scenarios
+ display_name: Medical Scenarios
+ description: Scenarios for the medical domain
+ category: All scenarios
+ subgroups:
+ - med_qa
+ - med_mcqa
+ - pubmed_qa
+ - mmlu
+ - live_qa
+ - medication_qa
+
+ - name: med_qa
+ display_name: MedQA
+ description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: med_mcqa
+ display_name: MedMCQA
+ description: AIIMS/NEET QA multiple choice questions with 4 choices.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: valid
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: pubmed_qa
+ display_name: PubMedQA
+ description: biomedical literature Q + Context + A yes/no/maybe + long answer questions
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: mmlu
+ display_name: MMLU (Massive Multitask Language Understanding)
+ short_display_name: MMLU
+ description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: "?"
+ who: "?"
+ when: "?"
+ language: English
+
+ - name: live_qa
+ display_name: LiveQA
+ description: Consumer health questions with librarian-generated reference answers.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: live_qa_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: medication_qa
+ display_name: MedicationQA
+ description: Consumer medication questions with reference answers.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: medication_qa_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
@@ -1,66 +1,8 @@
  ---
  ############################################################
- adapter:
- - name: method
- description: The high-level strategy for converting instances into a prompt for the language model.
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
  ############################################################
  metrics:
  # Infrastructure metrics:
The following hunk adds a new 247-line file; based on the file list above, this is helm/benchmark/static/schema_safety.yaml:

@@ -0,0 +1,247 @@
+ ---
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+ display_name: '# tokens'
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+ display_name: '# bytes'
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+ display_name: '# ref'
+ description: Number of references.
+ - name: num_train_trials
+ display_name: '# trials'
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+ display_name: 'cost'
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+ display_name: '# prompt tokens'
+ description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+ display_name: '# prompt chars'
+ description: Number of characters in the prompt.
+ - name: num_completion_tokens
+ display_name: '# completion tokens'
+ description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+ display_name: '# output tokens'
+ description: Actual number of output tokens.
+ - name: max_num_output_tokens
+ display_name: 'Max output tokens'
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+ display_name: '# requests'
+ description: Number of distinct API requests.
+ - name: num_instances
+ display_name: '# eval'
+ description: Number of evaluation instances.
+ - name: num_train_instances
+ display_name: '# train'
+ description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+ display_name: truncated
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+ display_name: finish b/c length
+ description: Fraction of instances where the the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+ display_name: finish b/c stop
+ description: Fraction of instances where the the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+ display_name: finish b/c endoftext
+ description: Fraction of instances where the the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+ display_name: finish b/c unknown
+ description: Fraction of instances where the the output was terminated for unknown reasons.
+ - name: num_completions
+ display_name: '# completions'
+ description: Number of completions.
+ - name: predicted_index
+ display_name: Predicted index
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+ # Accuracy metrics:
+ - name: exact_match
+ display_name: Exact match
+ short_display_name: EM
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
+ lower_is_better: false
+ - name: f1_score
+ display_name: F1
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
+ lower_is_better: false
+ - name: safety_score
+ display_name: LM Evaluated Safety score
+ description: LM Evaluated Safety score (mean of Claude Sonnet3.5 and GPT4o ratings)
+ lower_is_better: false
+ - name: bbq_accuracy
+ display_name: BBQ accuracy
+ description: BBQ accuracy
+ lower_is_better: false
+
+ # Toxicity metrics
+ - name: expected_max_toxicity
+ display_name: Average max toxicity
+ short_display_name: Avg. max toxicity
+ lower_is_better: true
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: max_toxicity_probability
+ display_name: max toxicity
+ lower_is_better: true
+ description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: toxic_frac
+ display_name: Toxic fraction
+ lower_is_better: true
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ hide_win_rates: true
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
+
+ - name: toxicity
+ display_name: Toxicity
+ metrics:
+ - name: toxic_frac
+ split: ${main_split}
+
+ - name: bbq_metrics
+ display_name: BBQ metrics
+ description: Metrics used for the BBQ bias benchmark.
+ metrics:
+ - name: bbq_metric_ambiguous_bias
+ split: ${main_split}
+ - name: bbq_metric_unambiguous_bias
+ split: ${main_split}
+
+ ############################################################
+ run_groups:
+ - name: safety_scenarios
+ display_name: Safety Scenarios
+ description: Scenarios for the model safety
+ category: All scenarios
+ subgroups:
+ - harm_bench
+ - simple_safety_tests
+ - bbq
+ - anthropic_red_team
+ - xstest
+
+ - name: harm_bench
+ display_name: HarmBench
+ description: HarmBench
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: simple_safety_tests
+ display_name: SimpleSafetyTests
+ description: SimpleSafetyTests
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: xstest
+ display_name: XSTest
+ description: XSTest
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: bbq
+ display_name: BBQ
+ description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ - bbq_metrics
+ environment:
+ main_name: bbq_accuracy
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: n/a
+ who: n/a
+ when: n/a
+ language: English
+
+ - name: anthropic_red_team
+ display_name: Anthropic Red Team
+ short_display_name: Anthropic Red Team
+ description: Anthropic Red Team
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: safety_score
+ main_split: test
+ taxonomy:
+ task: question answering
+ what: "?"
+ who: "?"
+ when: "?"
+ language: English
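Both new schema files above share the same layout: top-level metrics, perturbations, metric_groups, and run_groups sections, with the ${main_name} and ${main_split} placeholders in metric_groups resolved from each run group's environment block. A minimal sketch follows that loads the new safety schema and prints the main metric for each run group (it assumes PyYAML is installed and that the file is available locally; the script is illustrative and not a HELM API call):

    # Illustrative only: inspect one of the schema files added in 0.5.3.
    # The path and field names are taken from the diff above; the script itself
    # is hypothetical and not part of HELM.
    import yaml

    with open("helm/benchmark/static/schema_safety.yaml") as f:
        schema = yaml.safe_load(f)

    for group in schema.get("run_groups", []):
        env = group.get("environment", {})
        # e.g. "harm_bench: safety_score on test"
        print(f"{group['name']}: {env.get('main_name')} on {env.get('main_split')}")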