crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/static/config.js
DELETED
helm/benchmark/static/general.js
DELETED
@@ -1,122 +0,0 @@
-function assert(condition, message) {
-  if (!condition) {
-    throw message || "Assertion failed";
-  }
-}
-
-function encodeUrlParams(params) {
-  let s = "";
-  for (let k in params) {
-    if (params[k] != null) {
-      s += (s === "" ? "?" : "&") + k + "=" + encodeURIComponent(params[k]);
-    }
-  }
-  return s;
-}
-
-function decodeUrlParams(str) {
-  const params = {};
-  if (str === "") return params;
-  const items = str.substring(1).split(/&/);
-  for (let i = 0; i < items.length; i++) {
-    const pair = items[i].split(/=/);
-    params[pair[0]] = decodeURIComponent(pair[1]);
-  }
-  return params;
-}
-
-function updateBrowserLocation(params) {
-  // Update the address bar
-  window.history.pushState(
-    {},
-    "",
-    window.location.pathname + encodeUrlParams(params),
-  );
-}
-
-function createCookie(key, value, days) {
-  let expires = "";
-  if (days) {
-    const date = new Date();
-    date.setTime(date.getTime() + days * 24 * 60 * 60 * 1000);
-    expires = "; expires=" + date.toUTCString();
-  }
-  document.cookie = key + "=" + value + expires + "; path=/";
-}
-
-function readCookie(key) {
-  let tokens = document.cookie.split(";");
-  for (let i = 0; i < tokens.length; i++) {
-    const [k, v] = tokens[i].trim().split("=", 2);
-    if (key === k) return v;
-  }
-  return null;
-}
-
-function eraseCookie(key) {
-  createCookie(key, "", -1);
-}
-
-function renderTimestamp(timestamp) {
-  if (!timestamp) return null;
-  const d = new Date(timestamp * 1000);
-  return d.toLocaleString();
-}
-
-function renderDict(data) {
-  return JSON.stringify(data).substring(0, 10000);
-}
-
-function loadScript(src, onload, onerror) {
-  // Using jquery doesn't work, so do it in with our bare hands.
-  const s = document.createElement("script");
-  s.src = src;
-  s.onload = onload;
-  s.onerror = onerror;
-  document.head.appendChild(s);
-}
-
-function getRandomString() {
-  const vocab =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
-  let text = "";
-  for (let i = 0; i < 6; i++)
-    text += vocab.charAt(Math.floor(Math.random() * vocab.length));
-  return text;
-}
-
-function round(x, n) {
-  const base = Math.pow(10, n);
-  return Math.round(x * base) / base;
-}
-
-function multilineHtml(s) {
-  return s.replace(/\n/g, "<br>");
-}
-
-function renderError(e) {
-  return $("<div>").addClass("alert alert-danger").append(multilineHtml(e));
-}
-
-function helpIcon(help, link) {
-  // Show a ?
-  return $("<a>", { href: link, target: "blank_", class: "help-icon" }).append(
-    $("<img>", { src: "info-icon.png", width: 15, title: help }),
-  );
-}
-
-const markdownConverter = new showdown.Converter({ optionKey: "value" });
-function renderMarkdown(markdown) {
-  return markdown && markdownConverter.makeHtml(markdown);
-}
-
-function refreshHashLocation() {
-  // If we request a hash location (URL contains #foo), the problem is #foo
-  // might not exist (since it's generated). Call this function to jump to the
-  // hash location once all the anchors are generated.
-  if (location.hash) {
-    const hash = location.hash;
-    location.hash = "";
-    location.hash = hash;
-  }
-}
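
For orientation, a minimal sketch (not part of the diff) of how the deleted helpers above composed in the old static UI; the parameter values are hypothetical:

  // Push hypothetical filter state into the query string:
  updateBrowserLocation({ models: 1, q: "mmlu" }); // address bar -> ...?models=1&q=mmlu
  // Read it back out (values come back as strings):
  const params = decodeUrlParams(window.location.search); // { models: "1", q: "mmlu" }
  // Format a metric to two decimal places for display:
  round(0.34567, 2); // -> 0.35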
Binary files deleted (19 images under helm/benchmark/static/images/: crfm-logo.png, helm-logo-simple.png, helm-logo.png, language-model-helm.png, the 13 organization logos, scenarios-by-metrics.png, and taxonomy-scenarios.png; see the file list above).
helm/benchmark/static/index.html
DELETED
@@ -1,68 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <title>Holistic Evaluation of Language Models (HELM)</title>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
-    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
-    <link rel="stylesheet" type="text/css" href="benchmarking.css">
-  </head>
-
-  <body>
-    <div class="container-fluid">
-      <nav class="navbar navbar-expand-sm navbar-light bg-faded">
-        <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
-          <span class="navbar-toggler-icon"></span>
-        </button>
-
-        <a class="nav-link active" href="https://crfm.stanford.edu"><img src="images/crfm-logo.png" width="100"/></a>
-        <div class="collapse navbar-collapse" id="nav-content">
-          <ul class="navbar-nav">
-            <li class="nav-item"><a class="nav-link active" href="?"><img src="images/helm-logo-simple.png" width="80"/></a></li>
-            <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
-            <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
-            <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
-            <!--
-            TODO(#1441): Enable plots.
-            <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
-            -->
-            <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
-          </ul>
-        </div>
-
-        <div class="text-right" id="summary" style="white-space: nowrap">
-        </div>
-      </nav>
-
-      <div class="row">
-        <div class="col-sm-12" id="main">
-        </div>
-      </div>
-    </div>
-
-    <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
-    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/js-yaml/4.1.0/js-yaml.min.js"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/showdown/2.0.3/showdown.min.js"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/handlebars.js/4.7.7/handlebars.min.js"></script>
-    <!-- *GTAG*
-    <script async src="https://www.googletagmanager.com/gtag/js?id=G-T0MW28MP3W"></script>
-    <script>
-      window.dataLayer = window.dataLayer || [];
-      function gtag() {
-        dataLayer.push(arguments);
-      }
-      gtag('js', new Date());
-      gtag('config', 'G-T0MW28MP3W');
-    </script>
-    *GTAG* -->
-    <script src="config.js"></script>
-    <script src="general.js"></script>
-    <script src="utils.js"></script>
-    <script src="json-urls.js"></script>
-    <script src="benchmarking.js"></script>
-    <script src="plot-captions.js"></script>
-  </body>
-</html>
Binary file deleted (helm/benchmark/static/info-icon.png; see the file list above).

helm/benchmark/static/json-urls.js
DELETED
@@ -1,69 +0,0 @@
-////////////////////////////////////////////////////////////
-// Helper functions for getting URLs of JSON files
-function versionBaseUrl() {
-  if (window.RELEASE) {
-    return `${BENCHMARK_OUTPUT_BASE_URL}/releases/${window.RELEASE}`;
-  } else {
-    return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${window.SUITE}`;
-  }
-}
-
-function schemaJsonUrl() {
-  return `${versionBaseUrl()}/schema.json`;
-}
-
-function summaryJsonUrl() {
-  return `${versionBaseUrl()}/summary.json`;
-}
-
-function runsToRunSuitesJsonUrl() {
-  return `${versionBaseUrl()}/runs_to_run_suites.json`;
-}
-
-function runSpecsJsonUrl() {
-  return `${versionBaseUrl()}/run_specs.json`;
-}
-
-function groupsMetadataJsonUrl() {
-  return `${versionBaseUrl()}/groups_metadata.json`;
-}
-
-function groupsJsonUrl() {
-  return `${versionBaseUrl()}/groups.json`;
-}
-
-function groupJsonUrl(groupName) {
-  return `${versionBaseUrl()}/groups/${groupName}.json`;
-}
-
-function runSpecJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/run_spec.json`;
-}
-
-function scenarioJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/scenario.json`;
-}
-
-function scenarioStateJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/scenario_state.json`;
-}
-
-function statsJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/stats.json`;
-}
-
-function instancesJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/instances.json`;
-}
-
-function predictionsJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_predictions.json`;
-}
-
-function requestsJsonUrl(suite, runSpecName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
-}
-
-function plotUrl(suite, plotName) {
-  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
-}
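
A minimal sketch (not part of the diff) of how these URL builders were typically consumed; it assumes the globals BENCHMARK_OUTPUT_BASE_URL and window.SUITE are defined elsewhere, and that jQuery is loaded as in the deleted index.html above:

  // Fetch the list of run specs for the configured suite and log how many there are:
  $.getJSON(runSpecsJsonUrl(), (runSpecs) => {
    console.log(`${runSpecs.length} run specs under ${versionBaseUrl()}`);
  });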
helm/benchmark/static/plot-captions.js
DELETED

@@ -1,27 +0,0 @@
-////////////////////////////////////////////////////////////
-// Dictionary of plot captions
-
-const plotCaptions = {
-  generic_summary:
-    "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
-  model_ranking_all:
-    "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
-  accuracy_v_x:
-    "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
-  metric_correlation:
-    "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
-  accuracy_v_access:
-    "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
-  accuracy_over_num_parameters:
-    "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
-  accuracy_over_release_date:
-    "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
-  accuracy_over_the_pile_perplexity:
-    "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
-  targeted_evals:
-    "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
-  in_context_ablations:
-    "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
-  mc_ablations:
-    "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios.",
-};
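
One plausible pairing (not shown in the diff) of these captions with the plotUrl helper deleted above; the suite name is hypothetical:

  // Resolve the image URL and its caption for one plot:
  const name = "accuracy_over_release_date";
  const src = plotUrl("v0.4.0", name); // .../runs/v0.4.0/plots/accuracy_over_release_date.png
  const caption = plotCaptions[name];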
helm/benchmark/static/schema_image2structure.yaml
DELETED

@@ -1,304 +0,0 @@
----
-############################################################
-adapter:
-  - name: method
-    description: The high-level strategy for converting instances into a prompt for the language model.
-    values:
-      - name: generation
-        description: Given the input, the model generates the output free-form.
-      - name: generation_multimodal
-        description: Given the multimodal input, the model generates the output free-form.
-      - name: multiple_choice_joint
-        description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
-      - name: multiple_choice_separate_original
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
-      - name: multiple_choice_separate_calibrated
-        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
-      - name: language_modeling
-        description: Given the input, the model assigns the sequence a probability.
-  - name: instructions
-    description: The description of the task that is included at the very beginning of the prompt.
-  - name: global_prefix
-    description: The string that is prepended to the prompt.
-  - name: global_suffix
-    description: The string that is appended to the prompt.
-  - name: instance_prefix
-    description: The string that is included before each instance (e.g., '\n\n').
-  - name: input_prefix
-    description: The string that is included before each input (e.g., 'Question:').
-  - name: input_suffix
-    description: The string that is included after each input (e.g., '\n').
-  - name: reference_prefix
-    description: The string that is included before each reference (for multiple-choice questions).
-  - name: reference_suffix
-    description: The string that is included after each reference (for multiple-choice questions).
-  - name: output_prefix
-    description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
-  - name: output_suffix
-    description: The string that is included after the correct answer/predicted output (e.g., '\n').
-  - name: substitutions
-    description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
-  - name: max_train_instances
-    description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
-  - name: max_eval_instances
-    description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
-  - name: num_outputs
-    description: Maximum number of possible outputs to generate by sampling multiple outputs.
-  - name: num_train_trials
-    description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
-  - name: sample_train
-    description: If true, randomly sample N training examples; if false, select N consecutive training examples
-  - name: model
-    description: Name of the language model (<creator_organization>/<model name>) to send requests to.
-  - name: model_deployment
-    description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
-  - name: temperature
-    description: Temperature parameter used in generation.
-  - name: max_tokens
-    description: Maximum number of tokens to generate.
-  - name: stop_sequences
-    description: List of sequences, where we stop generation if we encounter any of them.
-  - name: random
-    description: Random seed (string), which guarantees reproducibility.
-  - name: multi_label
-    description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
-############################################################
-metrics:
-  # Infrastructure metrics:
-  - name: num_perplexity_tokens
-    display_name: '# tokens'
-    description: Average number of tokens in the predicted output (for language modeling, the input too).
-  - name: num_bytes
-    display_name: '# bytes'
-    description: Average number of bytes in the predicted output (for language modeling, the input too).
-
-  - name: num_references
-    display_name: '# ref'
-    description: Number of references.
-  - name: num_train_trials
-    display_name: '# trials'
-    description: Number of trials, where in each trial we choose an independent, random set of training instances.
-  - name: estimated_num_tokens_cost
-    display_name: 'cost'
-    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
-  - name: num_prompt_tokens
-    display_name: '# prompt tokens'
-    description: Number of tokens in the prompt.
-  - name: num_prompt_characters
-    display_name: '# prompt chars'
-    description: Number of characters in the prompt.
-  - name: num_completion_tokens
-    display_name: '# completion tokens'
-    description: Actual number of completion tokens (over all completions).
-  - name: num_output_tokens
-    display_name: '# output tokens'
-    description: Actual number of output tokens.
-  - name: max_num_output_tokens
-    display_name: 'Max output tokens'
-    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
-  - name: num_requests
-    display_name: '# requests'
-    description: Number of distinct API requests.
-  - name: num_instances
-    display_name: '# eval'
-    description: Number of evaluation instances.
-  - name: num_train_instances
-    display_name: '# train'
-    description: Number of training instances (e.g., in-context examples).
-  - name: prompt_truncated
-    display_name: truncated
-    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
-  - name: finish_reason_length
-    display_name: finish b/c length
-    description: Fraction of instances where the the output was terminated because of the max tokens limit.
-  - name: finish_reason_stop
-    display_name: finish b/c stop
-    description: Fraction of instances where the the output was terminated because of the stop sequences.
-  - name: finish_reason_endoftext
-    display_name: finish b/c endoftext
-    description: Fraction of instances where the the output was terminated because the end of text token was generated.
-  - name: finish_reason_unknown
-    display_name: finish b/c unknown
-    description: Fraction of instances where the the output was terminated for unknown reasons.
-  - name: num_completions
-    display_name: '# completions'
-    description: Number of completions.
-  - name: predicted_index
-    display_name: Predicted index
-    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
-
-  # Vision Language metrics [text]:
-  - name: edit_similarity
-    display_name: Edit similarity (Levenshtein)
-    short_display_name: Edit sim.
-    lower_is_better: false
-    description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
-
-  # Vision Language metrics [image]:
-  - name: block_emd_similarity
-    display_name: Block Earth Mover Similarity
-    short_display_name: Block EMS
-    description: Block Earth Mover Similarity
-    lower_is_better: false
-  - name: block_emd_similarity_white
-    display_name: Block Earth Mover Similarity (white)
-    short_display_name: Block EMS (white)
-    description: Block Earth Mover Similarity (white)
-    lower_is_better: false
-  - name: block_emd_similarity_median_color
-    display_name: Block Earth Mover Similarity (median)
-    short_display_name: Block EMS (median)
-    description: Block Earth Mover Similarity (median)
-    lower_is_better: false
-  - name: pixel_similarity
-    display_name: Pixel Similarity
-    short_display_name: PS
-    description: Pixel Similarity between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: sift_similarity
-    display_name: SIFT Similarity
-    short_display_name: SIFT
-    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: compilation_success
-    display_name: Compilation success
-    description: Fraction of instances where the generated code compiles successfully.
-    lower_is_better: false
-  - name: lpips_similarity
-    display_name: LPIPS similarity
-    short_display_name: LPIPS
-    description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: fid_similarity
-    display_name: FID similarity
-    short_display_name: FID
-    description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: ssim_similarity
-    display_name: SSIM
-    short_display_name: SSIM
-    description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
-    lower_is_better: false
-
-  # Accuracy metrics:
-  - name: exact_match
-    display_name: Exact match
-    short_display_name: EM
-    description: Fraction of instances that the predicted output matches a correct reference exactly.
-    lower_is_better: false
-  - name: quasi_exact_match
-    display_name: Quasi-exact match
-    short_display_name: EM
-    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
-    lower_is_better: false
-  - name: prefix_exact_match
-    display_name: Prefix exact match
-    short_display_name: PEM
-    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
-    lower_is_better: false
-  - name: quasi_prefix_exact_match
-    # TODO: should call this prefix_quasi_exact_match
-    display_name: Prefix quasi-exact match
-    short_display_name: PEM
-    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
-    lower_is_better: false
-
-############################################################
-perturbations:
-  - name: robustness
-    display_name: Robustness
-    description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
-
-############################################################
-metric_groups:
-  - name: accuracy
-    display_name: Compilation Rate and Earth Mover Similarity
-    metrics:
-      - name: ${main_name}
-        split: ${main_split}
-      - name: compilation_success
-        split: ${main_split}
-
-  - name: generation_image
-    display_name: Generation (image)
-    metrics:
-      - name: pixel_similarity
-        split: ${main_split}
-      - name: compilation_success
-        split: ${main_split}
-      - name: fid_similarity
-        split: ${main_split}
-      - name: block_emd_similarity
-        split: ${main_split}
-      - name: block_emd_similarity_white
-        split: ${main_split}
-      - name: block_emd_similarity_median_color
-        split: ${main_split}
-
-  - name: generation_text
-    display_name: Generation (text)
-    metrics:
-      - name: edit_similarity
-        split: ${main_split}
-
-############################################################
-run_groups:
-  - name: core_scenarios
-    display_name: Image2Structure
-    description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
-    category: All scenarios
-    subgroups:
-      - image2latex
-      - image2webpage
-      - image2musicsheet
-
-  - name: image2latex
-    display_name: Image2LaTeX
-    description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
-    metric_groups:
-      - accuracy
-      - generation_image
-      - generation_text
-    environment:
-      main_name: block_emd_similarity
-      main_split: valid
-    taxonomy:
-      task: image-to-text
-      what: mathematical equations, tables, algorithms, tikz
-      who: n/a
-      when: "2024"
-      language: English
-
-  - name: image2webpage
-    display_name: Image2webpage
-    description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
-    metric_groups:
-      - accuracy
-      - generation_image
-      - generation_text
-    environment:
-      main_name: block_emd_similarity
-      main_split: valid
-    taxonomy:
-      task: image-to-text
-      what: css, html, javascript
-      who: n/a
-      when: "2024"
-      language: English
-
-  - name: image2musicsheet
-    display_name: Image2musicsheet
-    description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
-    metric_groups:
-      - accuracy
-      - generation_image
-    environment:
-      main_name: block_emd_similarity
-      main_split: valid
-    taxonomy:
-      task: image-to-text
-      what: music sheets
-      who: n/a
-      when: "2024"
-      language: English