crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
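Selected file diffs follow. Judging by the added line counts, the first two hunks are the new helm/benchmark/static/schema_tables.yaml (+317) and helm/benchmark/static/schema_thai.yaml (+244) files listed above; the third hunk shows the adapter documentation block being stubbed out in one of the updated schema_*.yaml files.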
@@ -0,0 +1,317 @@
+ ---
+
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end-of-text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Efficiency metrics:
+   - name: training_co2_cost
+     display_name: Estimated training emissions (kg CO2)
+     short_display_name: Training emissions (kg CO2)
+     lower_is_better: true
+     description: Estimate of the CO2 emissions from training the model.
+   - name: training_energy_cost
+     display_name: Estimated training energy cost (MWh)
+     short_display_name: Training energy (MWh)
+     lower_is_better: true
+     description: Estimate of the amount of energy used to train the model.
+   - name: inference_runtime
+     display_name: Observed inference runtime (s)
+     short_display_name: Observed inference time (s)
+     lower_is_better: true
+     description: Average observed time to process a request to the model (via an API, and thus dependent on the particular deployment).
+   - name: inference_idealized_runtime
+     display_name: Idealized inference runtime (s)
+     short_display_name: Idealized inference time (s)
+     lower_is_better: true
+     description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+   - name: inference_denoised_runtime
+     display_name: Denoised inference runtime (s)
+     short_display_name: Denoised inference time (s)
+     lower_is_better: true
+     description: Average time to process a request to the model, minus performance contention, estimated using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+   - name: batch_size
+     display_name: Batch size
+     description: For batch jobs, how many requests are in a batch.
+
+   # Unitxt Metrics
+   - name: meteor
+     display_name: METEOR
+     short_display_name: METEOR
+     description: METEOR
+     lower_is_better: false
+   - name: f1
+     display_name: BERTScore F1
+     short_display_name: BERTScore F1
+     description: BERTScore F1
+     lower_is_better: false
+   - name: precision
+     display_name: Precision
+     short_display_name: Precision
+     description: Precision
+     lower_is_better: false
+   - name: recall
+     display_name: Recall
+     short_display_name: Recall
+     description: Recall
+     lower_is_better: false
+   - name: rouge1
+     display_name: ROUGE-1
+     short_display_name: ROUGE-1
+     description: ROUGE-1
+     lower_is_better: false
+   - name: rouge2
+     display_name: ROUGE-2
+     short_display_name: ROUGE-2
+     description: ROUGE-2
+     lower_is_better: false
+   - name: rougeL
+     display_name: ROUGE-L
+     short_display_name: ROUGE-L
+     description: ROUGE-L
+     lower_is_better: false
+   - name: rougeLsum
+     display_name: ROUGE-Lsum
+     short_display_name: ROUGE-Lsum
+     description: ROUGE-Lsum
+     lower_is_better: false
+   - name: bleu
+     display_name: BLEU
+     short_display_name: BLEU
+     description: BLEU
+     lower_is_better: false
+   - name: accuracy
+     display_name: Accuracy
+     short_display_name: Accuracy
+     description: Accuracy
+     lower_is_better: false
+   - name: f1_macro
+     display_name: Macro F1
+     short_display_name: Macro F1
+     description: Macro F1
+     lower_is_better: false
+   - name: f1_micro
+     display_name: Micro F1
+     short_display_name: Micro F1
+     description: Micro F1
+     lower_is_better: false
+   - name: unsorted_list_exact_match
+     display_name: Unsorted List Exact Match
+     short_display_name: Exact Match
+     description: Unsorted List Exact Match
+     lower_is_better: false
+
+   # FinQA Accuracy
+   - name: program_accuracy
+     display_name: Program Accuracy
+     short_display_name: Program Accuracy
+     description: Program Accuracy
+     lower_is_better: false
+   - name: execution_accuracy
+     display_name: Execution Accuracy
+     short_display_name: Execution Accuracy
+     description: Execution Accuracy
+     lower_is_better: false
+
+ perturbations: []
+
+ metric_groups:
+   - name: main_metrics
+     display_name: Main Metrics
+     metrics:
+       - name: ${main_name}
+         split: __all__
+
+   - name: generation_metrics
+     display_name: Other Generation Metrics
+     hide_win_rates: true
+     metrics:
+       - name: f1
+         split: __all__
+       - name: rouge1
+         split: __all__
+       - name: rouge2
+         split: __all__
+       - name: rougeL
+         split: __all__
+       - name: rougeLsum
+         split: __all__
+       - name: bleu
+         split: __all__
+
+   - name: classification_metrics
+     display_name: Classification Metrics
+     hide_win_rates: true
+     metrics:
+       - name: accuracy
+         split: __all__
+       - name: f1_macro
+         split: __all__
+       - name: f1_micro
+         split: __all__
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     hide_win_rates: true
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+ run_groups:
+   - name: table_scenarios
+     display_name: Table Scenarios
+     description: Scenarios that evaluate reasoning over tables.
+     category: All Scenarios
+     subgroups:
+       - unitxt_cards.numeric_nlg
+       - unitxt_cards.tab_fact
+       - unitxt_cards.wikitq
+       - unitxt_cards.fin_qa
+
+   - name: unitxt_cards.numeric_nlg
+     display_name: NumericNLG
+     short_display_name: NumericNLG
+     description: "NumericNLG is a dataset for numerical table-to-text generation, consisting of pairs of a table and a descriptive paragraph that requires richer numerical inference, drawn from scientific papers."
+     metric_groups:
+       - main_metrics
+       - generation_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: meteor
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: unitxt_cards.tab_fact
+     display_name: TabFact
+     short_display_name: TabFact
+     description: "TabFact is a large-scale dataset for the task of fact-checking on tables."
+     metric_groups:
+       - main_metrics
+       - classification_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: accuracy
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: unitxt_cards.wikitq
+     display_name: WikiTableQuestions
+     short_display_name: WikiTableQuestions
+     description: "WikiTableQuestions is a large-scale dataset for the task of question answering on semi-structured tables."
+     metric_groups:
+       - main_metrics
+       - classification_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: unsorted_list_exact_match
+       main_split: test
+     taxonomy:
+       task: "?"
+       what: "?"
+       who: "?"
+       when: "?"
+       language: English
+
+   - name: unitxt_cards.fin_qa
+     display_name: FinQA
+     description: The FinQA benchmark for numeric reasoning over financial data, with question-answering pairs written by financial experts over financial reports [(Chen et al., 2021)](https://arxiv.org/abs/2109.00122/).
+     metric_groups:
+       - main_metrics
+       - efficiency
+       - general_information
+     environment:
+       main_name: program_accuracy
+       main_split: test
+     taxonomy:
+       task: question answering with numeric reasoning
+       what: financial reports
+       who: financial experts
+       when: 1999 to 2019
+       language: English
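In schema files like the one above, the metric_groups are templates: `${main_name}` and `${main_split}` are placeholders that each run group fills in from its `environment` block, so `main_metrics` resolves to METEOR for `unitxt_cards.numeric_nlg` but to `program_accuracy` for `unitxt_cards.fin_qa`. A minimal sketch of the pattern, using a hypothetical run group name that is not part of this release:

```yaml
# Sketch of placeholder resolution; the run group below is hypothetical.
metric_groups:
  - name: main_metrics
    display_name: Main Metrics
    metrics:
      - name: ${main_name}        # resolved per run group
        split: __all__

run_groups:
  - name: unitxt_cards.example_table_task   # hypothetical
    display_name: Example Table Task
    metric_groups:
      - main_metrics
    environment:
      main_name: meteor           # ${main_name} -> meteor for this group
      main_split: test            # used by groups that reference ${main_split}
```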
@@ -0,0 +1,244 @@
+ ---
+ ############################################################
+ metrics:
+   # Infrastructure metrics:
+   - name: num_perplexity_tokens
+     display_name: '# tokens'
+     description: Average number of tokens in the predicted output (for language modeling, the input too).
+   - name: num_bytes
+     display_name: '# bytes'
+     description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+   - name: num_references
+     display_name: '# ref'
+     description: Number of references.
+   - name: num_train_trials
+     display_name: '# trials'
+     description: Number of trials, where in each trial we choose an independent, random set of training instances.
+   - name: estimated_num_tokens_cost
+     display_name: 'cost'
+     description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+   - name: num_prompt_tokens
+     display_name: '# prompt tokens'
+     description: Number of tokens in the prompt.
+   - name: num_prompt_characters
+     display_name: '# prompt chars'
+     description: Number of characters in the prompt.
+   - name: num_completion_tokens
+     display_name: '# completion tokens'
+     description: Actual number of completion tokens (over all completions).
+   - name: num_output_tokens
+     display_name: '# output tokens'
+     description: Actual number of output tokens.
+   - name: max_num_output_tokens
+     display_name: 'Max output tokens'
+     description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+   - name: num_requests
+     display_name: '# requests'
+     description: Number of distinct API requests.
+   - name: num_instances
+     display_name: '# eval'
+     description: Number of evaluation instances.
+   - name: num_train_instances
+     display_name: '# train'
+     description: Number of training instances (e.g., in-context examples).
+   - name: prompt_truncated
+     display_name: truncated
+     description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+   - name: finish_reason_length
+     display_name: finish b/c length
+     description: Fraction of instances where the output was terminated because of the max tokens limit.
+   - name: finish_reason_stop
+     display_name: finish b/c stop
+     description: Fraction of instances where the output was terminated because of the stop sequences.
+   - name: finish_reason_endoftext
+     display_name: finish b/c endoftext
+     description: Fraction of instances where the output was terminated because the end-of-text token was generated.
+   - name: finish_reason_unknown
+     display_name: finish b/c unknown
+     description: Fraction of instances where the output was terminated for unknown reasons.
+   - name: num_completions
+     display_name: '# completions'
+     description: Number of completions.
+   - name: predicted_index
+     display_name: Predicted index
+     description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+   # Accuracy metrics:
+   - name: exact_match
+     display_name: Exact match
+     short_display_name: EM
+     description: Fraction of instances that the predicted output matches a correct reference exactly.
+     lower_is_better: false
+
+ ############################################################
+ perturbations: []
+
+ ############################################################
+ metric_groups:
+   - name: accuracy
+     display_name: Accuracy
+     hide_win_rates: true
+     metrics:
+       - name: ${main_name}
+         split: ${main_split}
+
+   - name: efficiency
+     display_name: Efficiency
+     metrics:
+       - name: inference_runtime
+         split: ${main_split}
+
+   - name: general_information
+     display_name: General information
+     hide_win_rates: true
+     metrics:
+       - name: num_instances
+         split: ${main_split}
+       - name: num_train_instances
+         split: ${main_split}
+       - name: prompt_truncated
+         split: ${main_split}
+       - name: num_prompt_tokens
+         split: ${main_split}
+       - name: num_output_tokens
+         split: ${main_split}
+
+ ############################################################
+
+ run_groups:
+   - name: thai_scenarios
+     display_name: Thai Scenarios
+     description: Thai-language scenarios
+     category: All scenarios
+     subgroups:
+       - thai_exam
+       - thai_exam_onet
+       - thai_exam_ic
+       - thai_exam_tgat
+       - thai_exam_tpat1
+       - thai_exam_a_level
+
+   - name: thai_exam
+     display_name: ThaiExam
+     description: >
+       Macro-averaged accuracy on all ThaiExam examinations.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: n/a
+       who: n/a
+       when: "?"
+       language: Thai and English
+
+   - name: thai_exam_onet
+     display_name: ONET
+     description: >
+       The Ordinary National Educational Test (ONET) is an examination for students in Thailand.
+       We select the grade-12 ONET exam, which comprises 5 subjects (Thai, English, Mathematics,
+       Social Studies, and Science), each question having 5 choices, amounting to a total of
+       170 questions and their options.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: high school / medical school academic knowledge
+       who: n/a
+       when: "?"
+       language: Thai and English
+
+   - name: thai_exam_ic
+     display_name: IC
+     description: >
+       The Investment Consultant (IC) examination is a licensing test for investment professionals
+       in Thailand, developed by the Stock Exchange of Thailand (SET), with 4 choices per question.
+       We extracted questions for levels 1, 2, and 3, resulting in a total of 95 questions and their options.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: licensing for investment professionals
+       who: n/a
+       when: "?"
+       language: Thai
+
+   - name: thai_exam_tgat
+     display_name: TGAT
+     description: >
+       The Thai General Aptitude Test (TGAT) is a national high school examination in Thailand
+       that focuses on critical and logical thinking skills. We collected a total of 90 questions
+       and answers; the TGAT consists of four choices per question.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: high school level questions on reasoning
+       who: n/a
+       when: "?"
+       language: English
+
+   - name: thai_exam_tpat1
+     display_name: TPAT-1
+     description: >
+       The Thai Professional Aptitude Test 1 (TPAT-1) is a national high school examination in Thailand.
+       The exam assesses the professional aptitude that medical schools require of applicants.
+       This subset covers reasoning and medical ethics. We collected a total of 116 questions and answers;
+       the TPAT-1 consists of 5 choices per question.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: high school / medical school academic knowledge
+       who: n/a
+       when: "?"
+       language: Thai
+
+   - name: thai_exam_a_level
+     display_name: A-Level
+     description: >
+       An academic knowledge assessment examination (Applied Knowledge Level)
+       that covers general foundational subjects taught in schools.
+       The content assessed in this examination aligns with the curriculum guidelines
+       and emphasizes the practical application of knowledge in daily life.
+       We collected a total of 175 questions and answers.
+     metric_groups:
+       - accuracy
+       - efficiency
+       - general_information
+     environment:
+       main_name: exact_match
+       main_split: test
+     taxonomy:
+       task: question answering
+       what: high school academic knowledge
+       who: n/a
+       when: "?"
+       language: Thai and English
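Per its description, the thai_exam group reports macro-averaged accuracy across the ThaiExam examinations, and each exam is both a subgroup of thai_scenarios and its own run group. Extending the schema with another exam would follow the same shape; a hypothetical sketch (the name and description below are placeholders, not part of this release):

```yaml
# Hypothetical new exam subgroup, following the pattern above.
run_groups:
  - name: thai_scenarios
    subgroups:
      # ...existing exams...
      - thai_exam_example          # hypothetical new entry

  - name: thai_exam_example        # hypothetical
    display_name: Example Exam
    description: >
      Placeholder description of the new exam subset.
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: exact_match       # same headline metric as the other exams
      main_split: test
    taxonomy:
      task: question answering
      what: "?"
      who: n/a
      when: "?"
      language: Thai
```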
@@ -1,66 +1,8 @@
  ---
  ############################################################
- adapter:
-   - name: method
-     description: The high-level strategy for converting instances into a prompt for the language model.
-     values:
-       - name: generation
-         description: Given the input, the model generates the output free-form.
-       - name: multiple_choice_joint
-         description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
-       - name: multiple_choice_separate_original
-         description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
-       - name: multiple_choice_separate_calibrated
-         description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
-       - name: language_modeling
-         description: Given the input, the model assigns the sequence a probability.
-   - name: instructions
-     description: The description of the task that is included at the very beginning of the prompt.
-   - name: global_prefix
-     description: The string that is prepended to the prompt.
-   - name: global_suffix
-     description: The string that is appended to the prompt.
-   - name: instance_prefix
-     description: The string that is included before each instance (e.g., '\n\n').
-   - name: input_prefix
-     description: The string that is included before each input (e.g., 'Question:').
-   - name: input_suffix
-     description: The string that is included after each input (e.g., '\n').
-   - name: reference_prefix
-     description: The string that is included before each reference (for multiple-choice questions).
-   - name: reference_suffix
-     description: The string that is included after each reference (for multiple-choice questions).
-   - name: output_prefix
-     description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-   - name: output_suffix
-     description: The string that is included after the correct answer/predicted output (e.g., '\n').
-   - name: substitutions
-     description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-   - name: max_train_instances
-     description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-   - name: max_eval_instances
-     description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-   - name: num_outputs
-     description: Maximum number of possible outputs to generate by sampling multiple outputs.
-   - name: num_train_trials
-     description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-   - name: sample_train
-     description: If true, randomly sample N training examples; if false, select N consecutive training examples
-   - name: model
-     description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-   - name: model_deployment
-     description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-   - name: temperature
-     description: Temperature parameter used in generation.
-   - name: max_tokens
-     description: Maximum number of tokens to generate.
-   - name: stop_sequences
-     description: List of sequences, where we stop generation if we encounter any of them.
-   - name: random
-     description: Random seed (string), which guarantees reproducibility.
-   - name: multi_label
-     description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
+ # For backwards compatibility with older versions of HELM.
+ # TODO: Remove this after 2024-09-01.
+ adapter: []
  ############################################################
  metrics:
    # Infrastructure metrics:
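The last hunk replaces the whole adapter documentation block with an empty list instead of deleting the key, which, per the added comment, keeps the schema readable by older HELM versions that still expect the field. The net effect at the top of each affected schema_*.yaml file is a stub like this:

```yaml
---
############################################################
# For backwards compatibility with older versions of HELM.
# TODO: Remove this after 2024-09-01.
adapter: []
############################################################
metrics:
  # Infrastructure metrics:
  # ... (unchanged)
```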