crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +32 -45
- helm/benchmark/annotation/medication_qa_annotator.py +31 -44
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +56 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +78 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +92 -21
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +124 -7
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +96 -91
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +2 -3
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/images_utils.py +6 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +315 -332
- helm/config/model_metadata.yaml +384 -110
- helm/config/tokenizer_configs.yaml +116 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +1 -2
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml}
@@ -84,9 +84,19 @@ metrics:
     description: Fraction of instances where the generated code compiles successfully.
     lower_is_better: false
   - name: fid_similarity
-    display_name:
-    short_display_name:
-    description:
+    display_name: CIS
+    short_display_name: CIS
+    description: The cosine similarity between the Inception feature vectors.
+    lower_is_better: false
+  - name: lpips_similarity
+    display_name: LPIPS
+    short_display_name: LPIPS
+    description: The LPIPS distance between the generated image and the target image.
+    lower_is_better: false
+  - name: ssim_similarity
+    display_name: SSIM
+    short_display_name: SSIM
+    description: The SSIM similarity between the generated image and the target image.
     lower_is_better: false
 
   # Accuracy metrics:
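(Aside: the three metrics added above are standard image-similarity measures. The snippet below is only a minimal sketch of how such scores can be computed with NumPy and scikit-image; it is not the package's own metric code, which lives under helm/benchmark/metrics/vision_language/, and the helper names here are invented for the example.)

# Illustrative only: cosine similarity between two feature vectors (the "CIS" idea)
# and SSIM between two images. Assumes numpy and scikit-image are installed.
import numpy as np
from skimage.metrics import structural_similarity

def cosine_similarity(features_a: np.ndarray, features_b: np.ndarray) -> float:
    """Cosine similarity between two 1-D feature vectors (e.g. Inception features)."""
    num = float(np.dot(features_a, features_b))
    denom = float(np.linalg.norm(features_a) * np.linalg.norm(features_b)) or 1.0
    return num / denom

def ssim_similarity(image_a: np.ndarray, image_b: np.ndarray) -> float:
    """SSIM between two RGB images of the same shape, with pixel values in [0, 255]."""
    return float(structural_similarity(image_a, image_b, channel_axis=2, data_range=255))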
@@ -165,6 +175,10 @@ metric_groups:
         split: ${main_split}
       - name: earth_mover_similarity
         split: ${main_split}
+      - name: lpips_similarity
+        split: ${main_split}
+      - name: ssim_similarity
+        split: ${main_split}
 
   - name: generation_text
     display_name: Generation (text)
@@ -175,7 +189,7 @@ metric_groups:
 ############################################################
 run_groups:
   - name: core_scenarios
-    display_name:
+    display_name: Image2Struct
     description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
     category: All scenarios
     subgroups:
@@ -183,13 +197,13 @@ run_groups:
       - image2webpage
       - image2musicsheet
 
-  - name:
-    display_name:
+  - name: image2struct_wild
+    display_name: Image2Struct (Wild)
     description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. These scenarios contain images that do not have a ground truth.
     category: All scenarios
     subgroups:
-      -
-      -
+      - image2latex_wild
+      - image2webpage_wild
 
   - name: image2latex
     display_name: Image2LaTeX
@@ -209,9 +223,9 @@ run_groups:
       when: "2024"
       language: English
 
-  - name:
-    display_name: I2LaTeX (
-    description: The
+  - name: image2latex_equation
+    display_name: I2LaTeX (Equation)
+    description: The Image2LaTeX benchmark subset for converting images of mathematical equations to LaTeX.
     metric_groups:
       - accuracy_simple
       - compilation
@@ -223,14 +237,14 @@ run_groups:
       main_split: valid
     taxonomy:
       task: image-to-text
-      what: mathematical equations
+      what: mathematical equations
       who: dataset authors
       when: "2024"
       language: English
 
-  - name:
-    display_name: I2LaTeX (
-    description: The
+  - name: image2latex_table
+    display_name: I2LaTeX (Table)
+    description: The Image2LaTeX benchmark subset for converting images of tables to LaTeX.
     metric_groups:
       - accuracy_simple
       - compilation
@@ -242,14 +256,13 @@ run_groups:
       main_split: valid
     taxonomy:
       task: image-to-text
-      what:
+      what: tables
       who: dataset authors
       when: "2024"
-      language: English
 
-  - name:
-    display_name: I2LaTeX (
-    description: The
+  - name: image2latex_algorithm
+    display_name: I2LaTeX (Algorithm)
+    description: The Image2LaTeX benchmark subset for converting images of algorithms to LaTeX.
     metric_groups:
       - accuracy_simple
       - compilation
@@ -261,12 +274,86 @@ run_groups:
       main_split: valid
     taxonomy:
       task: image-to-text
-      what:
+      what: algorithms
       who: dataset authors
       when: "2024"
-      language: English
 
-  - name:
+  - name: image2latex_plot
+    display_name: I2LaTeX (Tikz)
+    description: The Image2LaTeX benchmark subset for converting images of tikz to LaTeX.
+    metric_groups:
+      - accuracy_simple
+      - compilation
+      - generation_image
+      - generation_text
+      - general_information
+    environment:
+      main_name: earth_mover_similarity
+      main_split: valid
+    taxonomy:
+      task: image-to-text
+      what: tikz (plots)
+      who: dataset authors
+      when: "2024"
+
+  # - name: image2latex_easy
+  #   display_name: I2LaTeX (Easy)
+  #   description: The 1/3 easiest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - generation_text
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: mathematical equations, tables, algorithms, tikz
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
+
+  # - name: image2latex_medium
+  #   display_name: I2LaTeX (Medium)
+  #   description: The 1/3 examples with medium diffulty of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - generation_text
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: mathematical equations, tables, algorithms, tikz
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
+
+  # - name: image2latex_hard
+  #   display_name: I2LaTeX (Hard)
+  #   description: The 1/3 hardest examples of the Image2LaTeX benchmark according to a simple heuristic counting the number of characters in the ground truth LaTeX code.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - generation_text
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: mathematical equations, tables, algorithms, tikz
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
+
+  - name: image2latex_wild
     display_name: Image2LaTeX (Wild)
     description: Images of mathematical equations gathered from Wikipedia that do not have a LaTeX ground truth.
     metric_groups:
@@ -301,9 +388,9 @@ run_groups:
       when: "2024"
       language: English
 
-  - name:
-    display_name: I2webpage (
-    description: The
+  - name: image2webpage_css
+    display_name: I2webpage (CSS)
+    description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly CSS.
     metric_groups:
       - accuracy_simple
       - compilation
@@ -315,14 +402,13 @@ run_groups:
       main_split: valid
     taxonomy:
       task: image-to-text
-      what:
+      what: code (mostly CSS)
       who: dataset authors
       when: "2024"
-      language: English
 
-  - name:
-    display_name: I2webpage (
-    description: The
+  - name: image2webpage_html
+    display_name: I2webpage (HTML)
+    description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly HTML.
     metric_groups:
       - accuracy_simple
       - compilation
@@ -334,14 +420,13 @@ run_groups:
       main_split: valid
     taxonomy:
       task: image-to-text
-      what:
+      what: code (mostly HTML)
       who: dataset authors
       when: "2024"
-      language: English
 
-  - name:
-    display_name: I2webpage (
-    description: The
+  - name: image2webpage_javascript
+    display_name: I2webpage (Javascript)
+    description: The Image2webpage benchmark subset for converting images of webpages to code repo containing mostly Javascript.
     metric_groups:
       - accuracy_simple
       - compilation
@@ -353,12 +438,68 @@ run_groups:
       main_split: valid
     taxonomy:
       task: image-to-text
-      what:
+      what: code (mostly Javascript)
       who: dataset authors
       when: "2024"
-      language: English
 
-  - name:
+  # - name: image2webpage_easy
+  #   display_name: I2webpage (Easy)
+  #   description: The 1/3 easiest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - generation_text
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: css, html, javascript
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
+
+  # - name: image2webpage_medium
+  #   display_name: I2webpage (Medium)
+  #   description: The 1/3 examples with medium diffulty of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - generation_text
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: css, html, javascript
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
+
+  # - name: image2webpage_hard
+  #   display_name: I2webpage (Hard)
+  #   description: The 1/3 hardest examples of the Image2webpage benchmark according to a simple heuristic counting the number of characters in the ground truth HTML/CSS/Javascript code.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - generation_text
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: css, html, javascript
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
+
+  - name: image2webpage_wild
     display_name: Image2webpage (Wild)
     description: Images of webpages gathered from the internet by taking sceenshots and so on that do not have a HTML/CSS/Javascript ground truth.
     metric_groups:
@@ -392,56 +533,56 @@ run_groups:
       when: "2024"
       language: English
 
-  - name: image2musicsheet_easy
-  …
+  # - name: image2musicsheet_easy
+  #   display_name: I2musicsheet (Easy)
+  #   description: The 1/3 easiest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: music sheets
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
 
-  - name: image2musicsheet_medium
-  …
+  # - name: image2musicsheet_medium
+  #   display_name: I2musicsheet (Medium)
+  #   description: The 1/3 examples with medium diffulty of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: music sheets
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
 
-  - name: image2musicsheet_hard
-  …
+  # - name: image2musicsheet_hard
+  #   display_name: I2musicsheet (Hard)
+  #   description: The 1/3 hardest examples of the Image2musicsheet benchmark according to a simple heuristic counting the number of black pixels in the target image.
+  #   metric_groups:
+  #     - accuracy_simple
+  #     - compilation
+  #     - generation_image
+  #     - general_information
+  #   environment:
+  #     main_name: earth_mover_similarity
+  #     main_split: valid
+  #   taxonomy:
+  #     task: image-to-text
+  #     what: music sheets
+  #     who: dataset authors
+  #     when: "2024"
+  #     language: English
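(Aside: run groups like the ones above are plain YAML, so they are easy to inspect programmatically. Below is a minimal sketch using PyYAML against a local copy of schema_image2struct.yaml; this is not a HELM API, just standard YAML parsing, and the file path is an assumption for the example.)

# Illustrative only: list the run groups and their main metrics from a HELM schema file.
# Assumes PyYAML is installed and the schema file is available locally.
import yaml

with open("schema_image2struct.yaml") as f:
    schema = yaml.safe_load(f)

for group in schema.get("run_groups", []):
    env = group.get("environment", {})
    print(f"{group['name']}: main_name={env.get('main_name')}, main_split={env.get('main_split')}")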
helm/benchmark/static/schema_safety.yaml (new file)
@@ -0,0 +1,247 @@
+---
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: safety_score
+    display_name: LM Evaluated Safety score
+    description: LM Evaluated Safety score (mean of Claude Sonnet3.5 and GPT4o ratings)
+    lower_is_better: false
+  - name: bbq_accuracy
+    display_name: BBQ accuracy
+    description: BBQ accuracy
+    lower_is_better: false
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+  - name: toxicity
+    display_name: Toxicity
+    metrics:
+      - name: toxic_frac
+        split: ${main_split}
+
+  - name: bbq_metrics
+    display_name: BBQ metrics
+    description: Metrics used for the BBQ bias benchmark.
+    metrics:
+      - name: bbq_metric_ambiguous_bias
+        split: ${main_split}
+      - name: bbq_metric_unambiguous_bias
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: safety_scenarios
+    display_name: Safety Scenarios
+    description: Scenarios for the model safety
+    category: All scenarios
+    subgroups:
+      - harm_bench
+      - simple_safety_tests
+      - bbq
+      - anthropic_red_team
+      - xstest
+
+  - name: harm_bench
+    display_name: HarmBench
+    description: HarmBench
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: simple_safety_tests
+    display_name: SimpleSafetyTests
+    description: SimpleSafetyTests
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: xstest
+    display_name: XSTest
+    description: XSTest
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: bbq
+    display_name: BBQ
+    description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+      - bbq_metrics
+    environment:
+      main_name: bbq_accuracy
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: anthropic_red_team
+    display_name: Anthropic Red Team
+    short_display_name: Anthropic Red Team
+    description: Anthropic Red Team
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: "?"
+      who: "?"
+      when: "?"
+      language: English
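(Aside: the safety_score metric above is described as the mean of two LM-judge ratings, Claude 3.5 Sonnet and GPT-4o. The sketch below only illustrates that aggregation step; it is not the annotator or metric code shipped under helm/benchmark/annotation/ or helm/benchmark/metrics/safety_metrics.py, and the 1-5 rating scale and field names are assumptions made for this example.)

# Illustrative only: average two LM-judge safety ratings into a single score,
# mirroring the "mean of Claude Sonnet3.5 and GPT4o ratings" description above.
from statistics import mean
from typing import Mapping

def aggregate_safety_score(judge_ratings: Mapping[str, float], scale_max: float = 5.0) -> float:
    """Average the per-judge ratings and normalize to [0, 1] (higher means safer)."""
    if not judge_ratings:
        raise ValueError("need at least one judge rating")
    return mean(judge_ratings.values()) / scale_max

# Example: two judges rate the same completion.
score = aggregate_safety_score({"claude-3-5-sonnet": 4.0, "gpt-4o": 5.0})
print(f"safety_score = {score:.2f}")  # 0.90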