crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (184)
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  10. helm/benchmark/annotation/call_center_annotator.py +247 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +32 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +31 -44
  18. helm/benchmark/annotation/model_as_judge.py +45 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  20. helm/benchmark/annotation/xstest_annotator.py +110 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +57 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  30. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  31. helm/benchmark/model_metadata_registry.py +3 -3
  32. helm/benchmark/presentation/test_run_entry.py +1 -0
  33. helm/benchmark/run.py +15 -0
  34. helm/benchmark/run_expander.py +56 -30
  35. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  36. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  37. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  38. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  39. helm/benchmark/run_specs/finance_run_specs.py +78 -1
  40. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +92 -21
  42. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  43. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  44. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  45. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  46. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  47. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  48. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  49. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  50. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  51. helm/benchmark/scenarios/scenario.py +1 -1
  52. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  53. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  54. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  55. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  56. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  57. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  58. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  59. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  60. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  61. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  62. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  63. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  64. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  65. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  66. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  67. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  68. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  69. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  70. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  71. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  72. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  73. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  74. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  75. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  76. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  78. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  79. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  80. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  81. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  82. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  83. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  84. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  85. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  86. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  87. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  88. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  89. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  91. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  92. helm/benchmark/server.py +1 -6
  93. helm/benchmark/static/schema_air_bench.yaml +750 -750
  94. helm/benchmark/static/schema_bhasa.yaml +709 -0
  95. helm/benchmark/static/schema_call_center.yaml +232 -0
  96. helm/benchmark/static/schema_cleva.yaml +768 -0
  97. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  98. helm/benchmark/static/schema_ewok.yaml +367 -0
  99. helm/benchmark/static/schema_finance.yaml +55 -9
  100. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  101. helm/benchmark/static/schema_safety.yaml +247 -0
  102. helm/benchmark/static/schema_tables.yaml +124 -7
  103. helm/benchmark/static/schema_thai.yaml +21 -0
  104. helm/benchmark/static/schema_vhelm.yaml +96 -91
  105. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  106. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  107. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  108. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  109. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  110. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  111. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  112. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  113. helm/benchmark/static_build/index.html +2 -2
  114. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  115. helm/clients/ai21_client.py +71 -1
  116. helm/clients/anthropic_client.py +7 -19
  117. helm/clients/huggingface_client.py +38 -37
  118. helm/clients/nvidia_nim_client.py +35 -0
  119. helm/clients/openai_client.py +2 -3
  120. helm/clients/palmyra_client.py +25 -0
  121. helm/clients/perspective_api_client.py +11 -6
  122. helm/clients/test_client.py +4 -6
  123. helm/clients/vision_language/open_flamingo_client.py +1 -2
  124. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  125. helm/common/images_utils.py +6 -0
  126. helm/common/mongo_key_value_store.py +2 -1
  127. helm/common/request.py +16 -0
  128. helm/config/model_deployments.yaml +315 -332
  129. helm/config/model_metadata.yaml +384 -110
  130. helm/config/tokenizer_configs.yaml +116 -11
  131. helm/proxy/example_queries.py +14 -21
  132. helm/proxy/services/server_service.py +1 -2
  133. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  134. helm/tokenizers/ai21_tokenizer.py +51 -59
  135. helm/tokenizers/cohere_tokenizer.py +0 -75
  136. helm/tokenizers/huggingface_tokenizer.py +0 -1
  137. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  138. helm/benchmark/static/benchmarking.css +0 -156
  139. helm/benchmark/static/benchmarking.js +0 -1705
  140. helm/benchmark/static/config.js +0 -3
  141. helm/benchmark/static/general.js +0 -122
  142. helm/benchmark/static/images/crfm-logo.png +0 -0
  143. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  144. helm/benchmark/static/images/helm-logo.png +0 -0
  145. helm/benchmark/static/images/language-model-helm.png +0 -0
  146. helm/benchmark/static/images/organizations/ai21.png +0 -0
  147. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  148. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  149. helm/benchmark/static/images/organizations/cohere.png +0 -0
  150. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  151. helm/benchmark/static/images/organizations/google.png +0 -0
  152. helm/benchmark/static/images/organizations/meta.png +0 -0
  153. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  154. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  155. helm/benchmark/static/images/organizations/openai.png +0 -0
  156. helm/benchmark/static/images/organizations/together.png +0 -0
  157. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  158. helm/benchmark/static/images/organizations/yandex.png +0 -0
  159. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  160. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  161. helm/benchmark/static/index.html +0 -68
  162. helm/benchmark/static/info-icon.png +0 -0
  163. helm/benchmark/static/json-urls.js +0 -69
  164. helm/benchmark/static/plot-captions.js +0 -27
  165. helm/benchmark/static/utils.js +0 -285
  166. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  167. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  168. helm/benchmark/window_services/ai21_window_service.py +0 -247
  169. helm/benchmark/window_services/cohere_window_service.py +0 -101
  170. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  171. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  172. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  173. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  174. helm/tokenizers/ice_tokenizer.py +0 -30
  175. helm/tokenizers/test_ice_tokenizer.py +0 -57
  176. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  177. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  178. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  179. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  180. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  181. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  182. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  183. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  184. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/schema_cleva.yaml (new file)
@@ -0,0 +1,768 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+
+  # Calibration metrics:
+  - name: ece_10_bin
+    display_name: 10-bin expected calibration error
+    short_display_name: ECE (10-bin)
+    lower_is_better: true
+    description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+
+  # Classification metrics
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+
+  # CLEVA (Chinese) metrics:
+  # Accuracy metrics (Chinese)
+  - name: chinese_ibleu
+    display_name: Chinese iBLEU
+    short_display_name: iBLEU (Chinese)
+    description: A special BLEU score [(Sun and Zhou, 2008)](https://aclanthology.org/P12-2008.pdf) that balances the lexical similarity between references and hypotheses as well as the lexical diversity between raw inputs and hypotheses.
+    lower_is_better: false
+  - name: cleva_top1_accuracy
+    display_name: Chinese Top-1 Accuracy
+    short_display_name: Acc@Top-1 (Chinese)
+    description: A special accuracy [(Patel and Pavlick, 2022)](https://openreview.net/pdf?id=gJcEM8sxHK) that gives perfect precision as long as a substring of the answer appears in the most confident model prediction.
+    lower_is_better: false
+  - name: cleva_machine_translation_bleu
+    display_name: BLEU
+    short_display_name: BLEU
+    description: BLEU score based on [Post, (2018)](https://aclanthology.org/W18-6319/).
+    lower_is_better: false
+  - name: chinese_rouge_2
+    display_name: Chinese ROUGE-2 score
+    short_display_name: ROUGE-2 (Chinese)
+    description: ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese tokenizer that segments Chinese strings by character.
+    lower_is_better: false
+  - name: chinese_bleu_1
+    display_name: Chinese BLEU-1 score
+    short_display_name: BLEU-1 (Chinese)
+    description: BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a Chinese tokenizer that segments Chinese strings by character.
+    lower_is_better: false
+  - name: cleva_math_result_match
+    display_name: CLEVA Math Exact Match
+    short_display_name: EM (Math)
+    description: Exact match that considers only the last math expression (numbers and fractions) in the model's prediction.
+    lower_is_better: false
+  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
+
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: calibration
+    display_name: Calibration
+    metrics:
+      - name: ece_10_bin
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
+############################################################
+
+run_groups:
+  ### Chinese
+  - name: cleva_scenarios
+    display_name: Scenarios
+    description: Scenarios for evaluating Chinese language models
+    category: Targeted evaluations
+    subgroups:
+      # - cleva_bias
+      - cleva_classical_chinese_understanding
+      # - cleva_closed_book_question_answering
+      # - cleva_code_synthesis
+      - cleva_commonsense_reasoning
+      # - cleva_conceptual_generalization
+      # - cleva_copyright
+      - cleva_coreference_resolution
+      - cleva_cultural_knowledge
+      # - cleva_data_to_text_generation
+      # - cleva_deductive_reasoning
+      # - cleva_dialogue_generation
+      # - cleva_fact_checking
+      # - cleva_inductive_reasoning
+      # - cleva_instruction_following
+      # - cleva_intent_understanding
+      # - cleva_language_modeling
+      - cleva_mathematical_calculation
+      - cleva_mathematical_reasoning
+      # - cleva_opinion_mining
+      - cleva_paraphrase_generation
+      - cleva_paraphrase_identification
+      - cleva_pinyin_transliteration
+      - cleva_reading_comprehension
+      # - cleva_reasoning_primitive
+      - cleva_sentiment_analysis
+      # - cleva_subject_knowledge
+      - cleva_summarization
+      - cleva_text_classification
+      - cleva_toxicity_detection
+      - cleva_translation
+
+  ## CLEVA (Chinese) Scenarios
+  # Applications
+  # - name: cleva_closed_book_question_answering
+  #   display_name: Closed book question answering
+  #   description: Closed-book question answering task comprises three subtasks. One is for the medical domain, another for open-domain, and the last measures if a model generates truthful answers.
+  #   metric_groups:
+  #     - accuracy
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: question answering
+  #     what: medical, open-domain, or truthful qa
+  #     who: n/a
+  #     when: 2022 or before
+  #     language: Chinese
+
+  - name: cleva_summarization
+    display_name: Summarization
+    description: "Summarize a dialogue between a customer representative and a customer."
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: chinese_rouge_2
+      main_split: test
+    taxonomy:
+      task: summarization
+      what: e-commerce dialogues
+      who: customers and representatives
+      when: 2021 or before
+      language: Chinese
+
+  - name: cleva_text_classification
+    display_name: Text classification
+    description: This scenario has two subtasks. Classify if an utterance is humorous and identify news topic based on its title.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: text classification
+      what: news or chitchat
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  - name: cleva_translation
+    display_name: Translation
+    description: Scenario for measuring the translation quality between Chinese and English.
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: cleva_machine_translation_bleu
+      main_split: test
+    taxonomy:
+      task: translation
+      what: news
+      who: n/a
+      when: 2022 or before
+      language: Chinese, English
+
+  # - name: cleva_data_to_text_generation
+  #   display_name: Data to text generation
+  #   description: "Generate a product description based on structured data containing various product properties."
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - efficiency
+  #   environment:
+  #     main_name: chinese_bleu_1
+  #     main_split: test
+  #   taxonomy:
+  #     task: data-to-text generation
+  #     what: product description
+  #     who: n/a
+  #     when: 2010s
+  #     language: Chinese
+
+  # - name: cleva_dialogue_generation
+  #   display_name: Dialogue generation
+  #   description: "Task-oriented dialogue between a user and a system."
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - efficiency
+  #   environment:
+  #     main_name: chinese_bleu_1
+  #     main_split: test
+  #   taxonomy:
+  #     task: dialogue generation
+  #     what: task-oriented dialogue on hotel, restaurant, attraction, metro, and taxi domain
+  #     who: user and assistant
+  #     when: 2020 or before
+  #     language: Chinese
+
+  # - name: cleva_opinion_mining
+  #   display_name: Opinion mining
+  #   description: "Extract the target of an opinion."
+  #   metric_groups:
+  #     - accuracy
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: opinion target extraction
+  #     what: n/a
+  #     who: n/a
+  #     when: 2010s
+  #     language: Chinese
+
+  - name: cleva_paraphrase_generation
+    display_name: Paraphrase generation
+    description: Generate a paraphrase of a given sentence.
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: chinese_ibleu
+      main_split: test
+    taxonomy:
+      task: paraphrase generation
+      what: n/a
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  - name: cleva_paraphrase_identification
+    display_name: Paraphrase identification
+    description: Identify if two sentences, from a dialogue or from the finance domain, share the same meaning.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: paraphrase identification
+      what: financial questions or chitchat
+      who: n/a
+      when: 2020 or before
+      language: Chinese
+
+  - name: cleva_reading_comprehension
+    display_name: Reading comprehension
+    description: Answer a multiple-choice question based on a given paragraph.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: 2019 or before
+      language: Chinese
+
+  - name: cleva_sentiment_analysis
+    display_name: Sentiment analysis
+    description: Chinese sentiment analysis for product reviews.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: product reviews
+      who: customers
+      when: 2021 or before
+      language: Chinese
+
+  # Language
+  # - name: cleva_language_modeling
+  #   display_name: Language modeling
+  #   description: Scenario for measuring language model performance across various domains (wikipedia and news).
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: bits_per_byte
+  #     main_split: test
+  #   taxonomy:
+  #     task: language modeling
+  #     what: Wikipedia and news
+  #     who: n/a
+  #     when: 2010s
+  #     language: Chinese
+
+  - name: cleva_pinyin_transliteration
+    display_name: Pinyin transliteration
+    description: Scenario that asks the model to translate between Chinese and Pinyin.
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: chinese_bleu_1
+      main_split: test
+    taxonomy:
+      task: pinyin transliteration
+      what: n/a
+      who: automatically generated by algorithm
+      when: '2023'
+      language: Chinese, Pinyin
+
+  - name: cleva_classical_chinese_understanding
+    display_name: Classical Chinese understanding
+    description: Scenario for evaluating the understanding of classical Chinese by selecting the appropriate classical Chinese translation for a given modern Chinese sentence.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: translation
+      what: n/a
+      who: n/a
+      when: 2021 or before
+      language: Classical Chinese
+
+  - name: cleva_coreference_resolution
+    display_name: Coreference resolution
+    description: Scenario for testing models on solving coreference resolution problems (the winograd schema challenge).
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: contemporary Chinese literary works
+      who: n/a
+      when: 2020 or before
+      language: Chinese
+
+  # - name: cleva_intent_understanding
+  #   display_name: Intent understanding
+  #   description: Tests whether the model could capture the writing intention of the authors after reading an article.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: exam
+  #     who: n/a
+  #     when: 1990-2022
+  #     language: Chinese
+
+  # Knowledge
+  # - name: cleva_subject_knowledge
+  #   display_name: Subject knowledge
+  #   description: Scenario inspired by [Petroni et al. (2019)](https://aclanthology.org/D19-1250/) to extensively test factual knowledge in Chinese. It contains 13 subjects and a general domain.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: knowledge base completion
+  #     what: entity-relation-entity triples in natural language form
+  #     who: automatically generated from templates
+  #     when: 2022 or before
+  #     language: structured Chinese
+
+  - name: cleva_cultural_knowledge
+    display_name: Cultural knowledge
+    description: "Scenario for evaluating models' understanding of Chinese culture. It has a Chinese-idiom-focused subtask."
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Various passages containing Chinese idioms
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  # Reasoning
+  # - name: cleva_reasoning_primitive
+  #   display_name: Reasoning primitive
+  #   description: Scenario focused on primitive reasoning, including dyck language continuation, variable substitution, pattern induction, and pattern matching.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: automatically generated from templates
+  #     when: '2023'
+  #     language: synthetic
+
+  # - name: cleva_deductive_reasoning
+  #   display_name: Deductive reasoning
+  #   description: "Scenario that gauges model's ability to reason deductive arguments. It includes a modus tollens subtask."
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: natural language questions
+  #     who: n/a
+  #     when: '2023'
+  #     language: structured Chinese
+
+  # - name: cleva_inductive_reasoning
+  #   display_name: Inductive reasoning
+  #   description: "Scenario that tests models' ability to conclude rules from demonstrations and apply them to unseen test instances."
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: automatically generated by algorithm
+  #     when: '2023'
+  #     language: synthetic
+
+  # - name: cleva_code_synthesis
+  #   display_name: Code synthesis
+  #   description: Scenario for measuring functional correctness for synthesizing programs from Chinese docstrings.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: n/a
+  #     when: '2023'
+  #     language: synthetic
+
+  - name: cleva_commonsense_reasoning
+    display_name: Commonsense reasoning
+    description: "Scenario that tests models' commonsense reasoning ability. There are two subtasks: textual entailment and commonsense question answering."
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: n/a
+      who: n/a
+      when: 2023 or before
+      language: Chinese
+
+  - name: cleva_mathematical_reasoning
+    display_name: Mathematical reasoning
+    description: "Scenario that tests models' mathematical reasoning ability with chain-of-thought style reasoning. It contains a math word problem solving subtask."
+    metric_groups:
+      - accuracy
+      - general_information
+      - efficiency
+    environment:
+      main_name: cleva_math_result_match
+      main_split: test
+    taxonomy:
+      task: next-word prediction
+      what: exam
+      who: n/a
+      when: 2010s
+      language: Chinese
+
+  # - name: cleva_conceptual_generalization
+  #   display_name: Conceptual generalization
+  #   description: Scenario that assesses whether models could generalize physical relations to a synthetic grid world.
+  #   metric_groups:
+  #     - calibration
+  #     - efficiency
+  #     - accuracy
+  #     - general_information
+  #   environment:
+  #     main_name: cleva_top1_accuracy
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: n/a
+  #     who: automatically generated by algorithm
+  #     when: '2023'
+  #     language: synthetic
+
+  # Harms
+  - name: cleva_toxicity_detection
+    display_name: Toxicity detection
+    description: Ask models about the offensiveness of the given text.
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: toxicity classification
+      what: text from Chinese social media
+      who: web users
+      when: 2022 or before
+      language: Chinese
+
+  # - name: cleva_bias
+  #   display_name: Bias
+  #   description: Scenario that gauges bias of four demographic categories in dialogues, including race, gender, region, and occupation.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #     - classification_metrics
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: short Chinese dialogues from social media
+  #     who: web users
+  #     when: 2022 or before
+  #     language: Chinese
+
+  # - name: cleva_copyright
+  #   display_name: Copyright
+  #   description: Scenario that measures copyright and memorization behavior for Chinese books and code, based off of [Carlini et al. (2021)](https://www.usenix.org/biblio-11958).
+  #   metric_groups:
+  #     # - copyright_metrics
+  #     - general_information
+  #     - efficiency
+  #   environment:
+  #     main_split: test
+  #   taxonomy:
+  #     task: next-word prediction
+  #     what: books and code
+  #     who: n/a
+  #     when: 2023 or before
+  #     language: Chinese
+
+  # - name: cleva_fact_checking
+  #   display_name: Fact checking
+  #   description: Scenario that lets models identify whether the given fact is true to test their factuality.
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #     - classification_metrics
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: factual statements in natural language form
+  #     who: n/a
+  #     when: 2022 or before
+  #     language: Chinese
+
+  # Others
+  # - name: cleva_instruction_following
+  #   display_name: Instruction following
+  #   description: "Scenario that examines whether models could follow human instructions, mainly uncommon ones. It contains two subtasks: 'redefine' and 'pattern_matching_suppression'."
+  #   metric_groups:
+  #     - accuracy
+  #     - calibration
+  #     - efficiency
+  #     - general_information
+  #   environment:
+  #     main_name: exact_match
+  #     main_split: test
+  #   taxonomy:
+  #     task: multiple-choice question answering
+  #     what: natural language questions
+  #     who: automatically generated from templates
+  #     when: '2023'
+  #     language: synthetic
+
+  - name: cleva_mathematical_calculation
+    display_name: Mathematical calculation
+    description: "Scenario that evaluates the calculation ability of models. It has four subtasks: three-digit addition, three-digit subtraction, two-digit multiplication, and significant figures."
+    metric_groups:
+      - accuracy
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: next-word prediction
+      what: natural language math questions or pure math expressions
+      who: automatically generated from templates
+      when: '2023'
+      language: synthetic
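
Note on reading the schema above: each metric_groups entry uses the ${main_name} and ${main_split} placeholders, which are filled in per run group from that group's environment block (for example, cleva_summarization sets main_name: chinese_rouge_2 and main_split: test). A minimal Python sketch of that substitution, using the standard library's string.Template purely for illustration (this is not HELM's actual frontend code):

from string import Template

# Placeholder metric entry, as written in the `accuracy` metric group above.
metric_entry = {"name": "${main_name}", "split": "${main_split}"}

# `environment` block of the `cleva_summarization` run group above.
environment = {"main_name": "chinese_rouge_2", "main_split": "test"}

# Resolve the placeholders against the run group's environment.
resolved = {key: Template(value).substitute(environment) for key, value in metric_entry.items()}
print(resolved)  # {'name': 'chinese_rouge_2', 'split': 'test'}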