crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (125) hide show
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,223 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+ # Infrastructure metrics:
5
+ - name: num_perplexity_tokens
6
+ display_name: '# tokens'
7
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
8
+ - name: num_bytes
9
+ display_name: '# bytes'
10
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
11
+
12
+ - name: num_references
13
+ display_name: '# ref'
14
+ description: Number of references.
15
+ - name: num_train_trials
16
+ display_name: '# trials'
17
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
18
+ - name: estimated_num_tokens_cost
19
+ display_name: 'cost'
20
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
21
+ - name: num_prompt_tokens
22
+ display_name: '# prompt tokens'
23
+ description: Number of tokens in the prompt.
24
+ - name: num_prompt_characters
25
+ display_name: '# prompt chars'
26
+ description: Number of characters in the prompt.
27
+ - name: num_completion_tokens
28
+ display_name: '# completion tokens'
29
+ description: Actual number of completion tokens (over all completions).
30
+ - name: num_output_tokens
31
+ display_name: '# output tokens'
32
+ description: Actual number of output tokens.
33
+ - name: max_num_output_tokens
34
+ display_name: 'Max output tokens'
35
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
36
+ - name: num_requests
37
+ display_name: '# requests'
38
+ description: Number of distinct API requests.
39
+ - name: num_instances
40
+ display_name: '# eval'
41
+ description: Number of evaluation instances.
42
+ - name: num_train_instances
43
+ display_name: '# train'
44
+ description: Number of training instances (e.g., in-context examples).
45
+ - name: prompt_truncated
46
+ display_name: truncated
47
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
48
+ - name: finish_reason_length
49
+ display_name: finish b/c length
50
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
51
+ - name: finish_reason_stop
52
+ display_name: finish b/c stop
53
+ description: Fraction of instances where the output was terminated because of the stop sequences.
54
+ - name: finish_reason_endoftext
55
+ display_name: finish b/c endoftext
56
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
57
+ - name: finish_reason_unknown
58
+ display_name: finish b/c unknown
59
+ description: Fraction of instances where the output was terminated for unknown reasons.
60
+ - name: num_completions
61
+ display_name: '# completions'
62
+ description: Number of completions.
63
+ - name: predicted_index
64
+ display_name: Predicted index
65
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
66
+
67
+ # Accuracy metrics:
68
+ - name: exact_match
69
+ display_name: Exact match
70
+ short_display_name: EM
71
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
72
+ lower_is_better: false
73
+
74
+ ############################################################
75
+ perturbations: []
76
+
77
+ ############################################################
78
+ metric_groups:
79
+ - name: accuracy
80
+ display_name: Accuracy
81
+ metrics:
82
+ - name: ${main_name}
83
+ split: ${main_split}
84
+
85
+ - name: efficiency
86
+ display_name: Efficiency
87
+ metrics:
88
+ - name: inference_runtime
89
+ split: ${main_split}
90
+
91
+ - name: general_information
92
+ display_name: General information
93
+ hide_win_rates: true
94
+ metrics:
95
+ - name: num_instances
96
+ split: ${main_split}
97
+ - name: num_train_instances
98
+ split: ${main_split}
99
+ - name: prompt_truncated
100
+ split: ${main_split}
101
+ - name: num_prompt_tokens
102
+ split: ${main_split}
103
+ - name: num_output_tokens
104
+ split: ${main_split}
105
+
106
+ ############################################################
107
+
108
+ run_groups:
109
+ - name: thai_scenarios
110
+ display_name: Thai Scenarios
111
+ description: Thai-language scenarios
112
+ category: All scenarios
113
+ subgroups:
114
+ - thai_exam_onet
115
+ - thai_exam_ic
116
+ - thai_exam_tgat
117
+ - thai_exam_tpat1
118
+ - thai_exam_a_level
119
+
120
+ - name: thai_exam_onet
121
+ display_name: ONET
122
+ description: >
123
+ The Ordinary National Educational Test (ONET) is an examination for students in Thailand.
124
+ We select the grade-12 ONET exam, which comprises 5 subjects and each question has 5 choices.
125
+ These subjects are Thai, English, Mathematics, Social Studies, and Science.
126
+ This amounts to a total of 170 questions and options.
127
+ metric_groups:
128
+ - accuracy
129
+ - efficiency
130
+ - general_information
131
+ environment:
132
+ main_name: exact_match
133
+ main_split: test
134
+ taxonomy:
135
+ task: question answering
136
+ what: high school / medical school academic knowledge
137
+ who: n/a
138
+ when: "?"
139
+ language: Thai and English
140
+
141
+ - name: thai_exam_ic
142
+ display_name: IC
143
+ description: >
144
+ The Investment Consultant (IC) examination is a licensing test for investment professionals in Thailand.
145
+ Developed by the Stock Exchange of Thailand (SET), it features 4 choices per question.
146
+ We extracted questions for levels 1, 2, and 3 resulting in a total of 95 questions and options.
147
+ metric_groups:
148
+ - accuracy
149
+ - efficiency
150
+ - general_information
151
+ environment:
152
+ main_name: exact_match
153
+ main_split: test
154
+ taxonomy:
155
+ task: question answering
156
+ what: licensing for investment professionals
157
+ who: n/a
158
+ when: "?"
159
+ language: Thai
160
+
161
+ - name: thai_exam_tgat
162
+ display_name: TGAT
163
+ description: >
164
+ The Thai General Aptitude Test (TGAT) is a national high school examination in Thailand.
165
+ It focuses on critical and logical thinking skills.
166
+ We collected a total of 90 questions and answers. The TGAT consists of four choices per question.
167
+ metric_groups:
168
+ - accuracy
169
+ - efficiency
170
+ - general_information
171
+ environment:
172
+ main_name: exact_match
173
+ main_split: test
174
+ taxonomy:
175
+ task: question answering
176
+ what: high school level questions on reasoning
177
+ who: n/a
178
+ when: "?"
179
+ language: English
180
+
181
+ - name: thai_exam_tpat1
182
+ display_name: TPAT-1
183
+ description: >
184
+ The Thai Professional Aptitude Test 1 (TPAT-1) is a national high school examination in Thailand.
185
+ The exam assesses the professional skills required of students in medical schools.
186
+ This subset contains reasoning and medical ethics. We collected a total of 116 questions and answers.
187
+ The TPAT-1 consists of 5 choices per question.
188
+ # NOTE: removed a leftover duplicate "description: TBD" key; it would override the TPAT-1 description above.
189
+ metric_groups:
190
+ - accuracy
191
+ - efficiency
192
+ - general_information
193
+ environment:
194
+ main_name: exact_match
195
+ main_split: test
196
+ taxonomy:
197
+ task: question answering
198
+ what: high school / medical school academic knowledge
199
+ who: n/a
200
+ when: "?"
201
+ language: Thai
202
+
203
+ - name: thai_exam_a_level
204
+ display_name: A-Level
205
+ description: >
206
+ An academic knowledge assessment examination (Applied Knowledge Level)
207
+ that covers general foundational subjects taught in schools.
208
+ The content assessed in this examination aligns with the curriculum guidelines
209
+ and emphasizes the practical application of knowledge in daily life.
210
+ We collected a total of 175 questions and answers.
211
+ metric_groups:
212
+ - accuracy
213
+ - efficiency
214
+ - general_information
215
+ environment:
216
+ main_name: exact_match
217
+ main_split: test
218
+ taxonomy:
219
+ task: question answering
220
+ what: high school academic knowledge
221
+ who: n/a
222
+ when: "?"
223
+ language: Thai and English
@@ -1,66 +1,8 @@
1
1
  ---
2
2
  ############################################################
3
- adapter:
4
- - name: method
5
- description: The high-level strategy for converting instances into a prompt for the language model.
6
- values:
7
- - name: generation
8
- description: Given the input, the model generates the output free-form.
9
- - name: multiple_choice_joint
10
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
11
- - name: multiple_choice_separate_original
12
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
13
- - name: multiple_choice_separate_calibrated
14
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
15
- - name: language_modeling
16
- description: Given the input, the model assigns the sequence a probability.
17
- - name: instructions
18
- description: The description of the task that is included at the very beginning of the prompt.
19
- - name: global_prefix
20
- description: The string that is prepended to the prompt.
21
- - name: global_suffix
22
- description: The string that is appended to the prompt.
23
- - name: instance_prefix
24
- description: The string that is included before each instance (e.g., '\n\n').
25
- - name: input_prefix
26
- description: The string that is included before each input (e.g., 'Question:').
27
- - name: input_suffix
28
- description: The string that is included after each input (e.g., '\n').
29
- - name: reference_prefix
30
- description: The string that is included before each reference (for multiple-choice questions).
31
- - name: reference_suffix
32
- description: The string that is included after each reference (for multiple-choice questions).
33
- - name: output_prefix
34
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
35
- - name: output_suffix
36
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
37
- - name: substitutions
38
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
39
- - name: max_train_instances
40
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
41
- - name: max_eval_instances
42
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
43
- - name: num_outputs
44
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
45
- - name: num_train_trials
46
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
47
- - name: sample_train
48
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
49
- - name: model
50
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
51
- - name: model_deployment
52
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
53
- - name: temperature
54
- description: Temperature parameter used in generation.
55
- - name: max_tokens
56
- description: Maximum number of tokens to generate.
57
- - name: stop_sequences
58
- description: List of sequences, where we stop generation if we encounter any of them.
59
- - name: random
60
- description: Random seed (string), which guarantees reproducibility.
61
- - name: multi_label
62
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
63
-
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
64
6
  ############################################################
65
7
  metrics:
66
8
  # Infrastructure metrics: