crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Note: this release of crfm-helm has been flagged as potentially problematic.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
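Note on the image2structure → image2struct renames listed above: code that imports these modules by their old paths must follow the rename. A minimal sketch of the updated import, assuming the Python module paths mirror the file paths above (the class name LatexScenario is used for illustration only; verify against the installed package):

# crfm-helm 0.5.1 used: helm.benchmark.scenarios.vision_language.image2structure.latex_scenario
# crfm-helm 0.5.3 renames the package to image2struct:
from helm.benchmark.scenarios.vision_language.image2struct.latex_scenario import LatexScenario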
helm/benchmark/static/config.js (deleted)
@@ -1,3 +0,0 @@
- window.BENCHMARK_OUTPUT_BASE_URL = "benchmark_output";
- window.SUITE = "latest";
- window.RELEASE = null;
helm/benchmark/static/general.js (deleted)
@@ -1,122 +0,0 @@
- function assert(condition, message) {
- if (!condition) {
- throw message || "Assertion failed";
- }
- }
-
- function encodeUrlParams(params) {
- let s = "";
- for (let k in params) {
- if (params[k] != null) {
- s += (s === "" ? "?" : "&") + k + "=" + encodeURIComponent(params[k]);
- }
- }
- return s;
- }
-
- function decodeUrlParams(str) {
- const params = {};
- if (str === "") return params;
- const items = str.substring(1).split(/&/);
- for (let i = 0; i < items.length; i++) {
- const pair = items[i].split(/=/);
- params[pair[0]] = decodeURIComponent(pair[1]);
- }
- return params;
- }
-
- function updateBrowserLocation(params) {
- // Update the address bar
- window.history.pushState(
- {},
- "",
- window.location.pathname + encodeUrlParams(params),
- );
- }
-
- function createCookie(key, value, days) {
- let expires = "";
- if (days) {
- const date = new Date();
- date.setTime(date.getTime() + days * 24 * 60 * 60 * 1000);
- expires = "; expires=" + date.toUTCString();
- }
- document.cookie = key + "=" + value + expires + "; path=/";
- }
-
- function readCookie(key) {
- let tokens = document.cookie.split(";");
- for (let i = 0; i < tokens.length; i++) {
- const [k, v] = tokens[i].trim().split("=", 2);
- if (key === k) return v;
- }
- return null;
- }
-
- function eraseCookie(key) {
- createCookie(key, "", -1);
- }
-
- function renderTimestamp(timestamp) {
- if (!timestamp) return null;
- const d = new Date(timestamp * 1000);
- return d.toLocaleString();
- }
-
- function renderDict(data) {
- return JSON.stringify(data).substring(0, 10000);
- }
-
- function loadScript(src, onload, onerror) {
- // Using jquery doesn't work, so do it in with our bare hands.
- const s = document.createElement("script");
- s.src = src;
- s.onload = onload;
- s.onerror = onerror;
- document.head.appendChild(s);
- }
-
- function getRandomString() {
- const vocab =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
- let text = "";
- for (let i = 0; i < 6; i++)
- text += vocab.charAt(Math.floor(Math.random() * vocab.length));
- return text;
- }
-
- function round(x, n) {
- const base = Math.pow(10, n);
- return Math.round(x * base) / base;
- }
-
- function multilineHtml(s) {
- return s.replace(/\n/g, "<br>");
- }
-
- function renderError(e) {
- return $("<div>").addClass("alert alert-danger").append(multilineHtml(e));
- }
-
- function helpIcon(help, link) {
- // Show a ?
- return $("<a>", { href: link, target: "blank_", class: "help-icon" }).append(
- $("<img>", { src: "info-icon.png", width: 15, title: help }),
- );
- }
-
- const markdownConverter = new showdown.Converter({ optionKey: "value" });
- function renderMarkdown(markdown) {
- return markdown && markdownConverter.makeHtml(markdown);
- }
-
- function refreshHashLocation() {
- // If we request a hash location (URL contains #foo), the problem is #foo
- // might not exist (since it's generated). Call this function to jump to the
- // hash location once all the anchors are generated.
- if (location.hash) {
- const hash = location.hash;
- location.hash = "";
- location.hash = hash;
- }
- }
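The deleted general.js above bundled small utilities for the legacy static frontend, including URL-parameter encoding. For reference, a rough Python equivalent of the encodeUrlParams helper shown above (an illustrative sketch, not part of the crfm-helm package):

from urllib.parse import urlencode

def encode_url_params(params: dict) -> str:
    # Like the deleted JS helper: drop null values, then build "?k1=v1&k2=v2" (or "" if empty).
    filtered = {k: v for k, v in params.items() if v is not None}
    return "?" + urlencode(filtered) if filtered else ""

# encode_url_params({"suite": "v1", "group": None}) -> "?suite=v1"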
Binary file
Binary file
helm/benchmark/static/index.html (deleted)
@@ -1,68 +0,0 @@
- <!DOCTYPE html>
- <html lang="en">
- <head>
- <title>Holistic Evaluation of Language Models (HELM)</title>
- <meta charset="utf-8">
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
- <link rel="stylesheet" type="text/css" href="benchmarking.css">
- </head>
-
- <body>
- <div class="container-fluid">
- <nav class="navbar navbar-expand-sm navbar-light bg-faded">
- <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
- <span class="navbar-toggler-icon"></span>
- </button>
-
- <a class="nav-link active" href="https://crfm.stanford.edu"><img src="images/crfm-logo.png" width="100"/></a>
- <div class="collapse navbar-collapse" id="nav-content">
- <ul class="navbar-nav">
- <li class="nav-item"><a class="nav-link active" href="?"><img src="images/helm-logo-simple.png" width="80"/></a></li>
- <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
- <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
- <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
- <!--
- TODO(#1441): Enable plots.
- <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
- -->
- <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
- </ul>
- </div>
-
- <div class="text-right" id="summary" style="white-space: nowrap">
- </div>
- </nav>
-
- <div class="row">
- <div class="col-sm-12" id="main">
- </div>
- </div>
- </div>
-
- <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
- <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
- <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
- <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
- <script src="https://cdnjs.cloudflare.com/ajax/libs/js-yaml/4.1.0/js-yaml.min.js"></script>
- <script src="https://cdnjs.cloudflare.com/ajax/libs/showdown/2.0.3/showdown.min.js"></script>
- <script src="https://cdnjs.cloudflare.com/ajax/libs/handlebars.js/4.7.7/handlebars.min.js"></script>
- <!-- *GTAG*
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-T0MW28MP3W"></script>
- <script>
- window.dataLayer = window.dataLayer || [];
- function gtag() {
- dataLayer.push(arguments);
- }
- gtag('js', new Date());
- gtag('config', 'G-T0MW28MP3W');
- </script>
- *GTAG* -->
- <script src="config.js"></script>
- <script src="general.js"></script>
- <script src="utils.js"></script>
- <script src="json-urls.js"></script>
- <script src="benchmarking.js"></script>
- <script src="plot-captions.js"></script>
- </body>
- </html>
Binary file
helm/benchmark/static/json-urls.js (deleted)
@@ -1,69 +0,0 @@
- ////////////////////////////////////////////////////////////
- // Helper functions for getting URLs of JSON files
- function versionBaseUrl() {
- if (window.RELEASE) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/releases/${window.RELEASE}`;
- } else {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${window.SUITE}`;
- }
- }
-
- function schemaJsonUrl() {
- return `${versionBaseUrl()}/schema.json`;
- }
-
- function summaryJsonUrl() {
- return `${versionBaseUrl()}/summary.json`;
- }
-
- function runsToRunSuitesJsonUrl() {
- return `${versionBaseUrl()}/runs_to_run_suites.json`;
- }
-
- function runSpecsJsonUrl() {
- return `${versionBaseUrl()}/run_specs.json`;
- }
-
- function groupsMetadataJsonUrl() {
- return `${versionBaseUrl()}/groups_metadata.json`;
- }
-
- function groupsJsonUrl() {
- return `${versionBaseUrl()}/groups.json`;
- }
-
- function groupJsonUrl(groupName) {
- return `${versionBaseUrl()}/groups/${groupName}.json`;
- }
-
- function runSpecJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/run_spec.json`;
- }
-
- function scenarioJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/scenario.json`;
- }
-
- function scenarioStateJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/scenario_state.json`;
- }
-
- function statsJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/stats.json`;
- }
-
- function instancesJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/instances.json`;
- }
-
- function predictionsJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_predictions.json`;
- }
-
- function requestsJsonUrl(suite, runSpecName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
- }
-
- function plotUrl(suite, plotName) {
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
- }
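The deleted json-urls.js above documents the benchmark output layout that the legacy frontend read from. A Python sketch of the same path scheme, with the paths taken from the JS helpers above (illustrative only; not an API exposed by crfm-helm):

from typing import Optional

def version_base_url(base_url: str, release: Optional[str], suite: str) -> str:
    # Mirrors versionBaseUrl above: a release path if a release is set, otherwise a single suite path.
    return f"{base_url}/releases/{release}" if release else f"{base_url}/runs/{suite}"

def run_artifact_url(base_url: str, suite: str, run_spec_name: str, artifact: str) -> str:
    # Mirrors runSpecJsonUrl, statsJsonUrl, etc.: per-run JSON files live under
    # <base_url>/runs/<suite>/<run_spec_name>/<artifact>.json
    return f"{base_url}/runs/{suite}/{run_spec_name}/{artifact}.json"

# run_artifact_url("benchmark_output", "latest", "mmlu:subject=anatomy", "stats")
# -> "benchmark_output/runs/latest/mmlu:subject=anatomy/stats.json"  (run spec name is illustrative)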
helm/benchmark/static/plot-captions.js (deleted)
@@ -1,27 +0,0 @@
- ////////////////////////////////////////////////////////////
- // Dictionary of plot captions
-
- const plotCaptions = {
- generic_summary:
- "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
- model_ranking_all:
- "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
- accuracy_v_x:
- "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
- metric_correlation:
- "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
- accuracy_v_access:
- "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
- accuracy_over_num_parameters:
- "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
- accuracy_over_release_date:
- "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
- accuracy_over_the_pile_perplexity:
- "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
- targeted_evals:
- "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
- in_context_ablations:
- "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
- mc_ablations:
- "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios.",
- };
helm/benchmark/static/schema_image2structure.yaml (deleted)
@@ -1,304 +0,0 @@
- ---
- ############################################################
- adapter:
- - name: method
- description: The high-level strategy for converting instances into a prompt for the language model.
- values:
- - name: generation
- description: Given the input, the model generates the output free-form.
- - name: generation_multimodal
- description: Given the multimodal input, the model generates the output free-form.
- - name: multiple_choice_joint
- description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- - name: multiple_choice_separate_original
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- - name: multiple_choice_separate_calibrated
- description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- - name: language_modeling
- description: Given the input, the model assigns the sequence a probability.
- - name: instructions
- description: The description of the task that is included at the very beginning of the prompt.
- - name: global_prefix
- description: The string that is prepended to the prompt.
- - name: global_suffix
- description: The string that is appended to the prompt.
- - name: instance_prefix
- description: The string that is included before each instance (e.g., '\n\n').
- - name: input_prefix
- description: The string that is included before each input (e.g., 'Question:').
- - name: input_suffix
- description: The string that is included after each input (e.g., '\n').
- - name: reference_prefix
- description: The string that is included before each reference (for multiple-choice questions).
- - name: reference_suffix
- description: The string that is included after each reference (for multiple-choice questions).
- - name: output_prefix
- description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- - name: output_suffix
- description: The string that is included after the correct answer/predicted output (e.g., '\n').
- - name: substitutions
- description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- - name: max_train_instances
- description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- - name: max_eval_instances
- description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- - name: num_outputs
- description: Maximum number of possible outputs to generate by sampling multiple outputs.
- - name: num_train_trials
- description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- - name: sample_train
- description: If true, randomly sample N training examples; if false, select N consecutive training examples
- - name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- - name: model_deployment
- description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- - name: temperature
- description: Temperature parameter used in generation.
- - name: max_tokens
- description: Maximum number of tokens to generate.
- - name: stop_sequences
- description: List of sequences, where we stop generation if we encounter any of them.
- - name: random
- description: Random seed (string), which guarantees reproducibility.
- - name: multi_label
- description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.
-
- ############################################################
- metrics:
- # Infrastructure metrics:
- - name: num_perplexity_tokens
- display_name: '# tokens'
- description: Average number of tokens in the predicted output (for language modeling, the input too).
- - name: num_bytes
- display_name: '# bytes'
- description: Average number of bytes in the predicted output (for language modeling, the input too).
-
- - name: num_references
- display_name: '# ref'
- description: Number of references.
- - name: num_train_trials
- display_name: '# trials'
- description: Number of trials, where in each trial we choose an independent, random set of training instances.
- - name: estimated_num_tokens_cost
- display_name: 'cost'
- description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
- - name: num_prompt_tokens
- display_name: '# prompt tokens'
- description: Number of tokens in the prompt.
- - name: num_prompt_characters
- display_name: '# prompt chars'
- description: Number of characters in the prompt.
- - name: num_completion_tokens
- display_name: '# completion tokens'
- description: Actual number of completion tokens (over all completions).
- - name: num_output_tokens
- display_name: '# output tokens'
- description: Actual number of output tokens.
- - name: max_num_output_tokens
- display_name: 'Max output tokens'
- description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
- - name: num_requests
- display_name: '# requests'
- description: Number of distinct API requests.
- - name: num_instances
- display_name: '# eval'
- description: Number of evaluation instances.
- - name: num_train_instances
- display_name: '# train'
- description: Number of training instances (e.g., in-context examples).
- - name: prompt_truncated
- display_name: truncated
- description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
- - name: finish_reason_length
- display_name: finish b/c length
- description: Fraction of instances where the the output was terminated because of the max tokens limit.
- - name: finish_reason_stop
- display_name: finish b/c stop
- description: Fraction of instances where the the output was terminated because of the stop sequences.
- - name: finish_reason_endoftext
- display_name: finish b/c endoftext
- description: Fraction of instances where the the output was terminated because the end of text token was generated.
- - name: finish_reason_unknown
- display_name: finish b/c unknown
- description: Fraction of instances where the the output was terminated for unknown reasons.
- - name: num_completions
- display_name: '# completions'
- description: Number of completions.
- - name: predicted_index
- display_name: Predicted index
- description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
-
- # Vision Language metrics [text]:
- - name: edit_similarity
- display_name: Edit similarity (Levenshtein)
- short_display_name: Edit sim.
- lower_is_better: false
- description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
-
- # Vision Language metrics [image]:
- - name: block_emd_similarity
- display_name: Block Earth Mover Similarity
- short_display_name: Block EMS
- description: Block Earth Mover Similarity
- lower_is_better: false
- - name: block_emd_similarity_white
- display_name: Block Earth Mover Similarity (white)
- short_display_name: Block EMS (white)
- description: Block Earth Mover Similarity (white)
- lower_is_better: false
- - name: block_emd_similarity_median_color
- display_name: Block Earth Mover Similarity (median)
- short_display_name: Block EMS (median)
- description: Block Earth Mover Similarity (median)
- lower_is_better: false
- - name: pixel_similarity
- display_name: Pixel Similarity
- short_display_name: PS
- description: Pixel Similarity between an image generated by the model and the target image.
- lower_is_better: false
- - name: sift_similarity
- display_name: SIFT Similarity
- short_display_name: SIFT
- description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
- lower_is_better: false
- - name: compilation_success
- display_name: Compilation success
- description: Fraction of instances where the generated code compiles successfully.
- lower_is_better: false
- - name: lpips_similarity
- display_name: LPIPS similarity
- short_display_name: LPIPS
- description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
- lower_is_better: false
- - name: fid_similarity
- display_name: FID similarity
- short_display_name: FID
- description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
- lower_is_better: false
- - name: ssim_similarity
- display_name: SSIM
- short_display_name: SSIM
- description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
- lower_is_better: false
-
- # Accuracy metrics:
- - name: exact_match
- display_name: Exact match
- short_display_name: EM
- description: Fraction of instances that the predicted output matches a correct reference exactly.
- lower_is_better: false
- - name: quasi_exact_match
- display_name: Quasi-exact match
- short_display_name: EM
- description: Fraction of instances that the predicted output matches a correct reference up to light processing.
- lower_is_better: false
- - name: prefix_exact_match
- display_name: Prefix exact match
- short_display_name: PEM
- description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
- lower_is_better: false
- - name: quasi_prefix_exact_match
- # TODO: should call this prefix_quasi_exact_match
- display_name: Prefix quasi-exact match
- short_display_name: PEM
- description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
- lower_is_better: false
-
- ############################################################
- perturbations:
- - name: robustness
- display_name: Robustness
- description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
-
- ############################################################
- metric_groups:
- - name: accuracy
- display_name: Compilation Rate and Earth Mover Similarity
- metrics:
- - name: ${main_name}
- split: ${main_split}
- - name: compilation_success
- split: ${main_split}
-
- - name: generation_image
- display_name: Generation (image)
- metrics:
- - name: pixel_similarity
- split: ${main_split}
- - name: compilation_success
- split: ${main_split}
- - name: fid_similarity
- split: ${main_split}
- - name: block_emd_similarity
- split: ${main_split}
- - name: block_emd_similarity_white
- split: ${main_split}
- - name: block_emd_similarity_median_color
- split: ${main_split}
-
- - name: generation_text
- display_name: Generation (text)
- metrics:
- - name: edit_similarity
- split: ${main_split}
-
- ############################################################
- run_groups:
- - name: core_scenarios
- display_name: Image2Structure
- description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
- category: All scenarios
- subgroups:
- - image2latex
- - image2webpage
- - image2musicsheet
-
- - name: image2latex
- display_name: Image2LaTeX
- description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
- metric_groups:
- - accuracy
- - generation_image
- - generation_text
- environment:
- main_name: block_emd_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: mathematical equations, tables, algorithms, tikz
- who: n/a
- when: "2024"
- language: English
-
- - name: image2webpage
- display_name: Image2webpage
- description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
- metric_groups:
- - accuracy
- - generation_image
- - generation_text
- environment:
- main_name: block_emd_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: css, html, javascript
- who: n/a
- when: "2024"
- language: English
-
- - name: image2musicsheet
- display_name: Image2musicsheet
- description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
- metric_groups:
- - accuracy
- - generation_image
- environment:
- main_name: block_emd_similarity
- main_split: valid
- taxonomy:
- task: image-to-text
- what: music sheets
- who: n/a
- when: "2024"
- language: English
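In the schema above, the metric_groups entries reference ${main_name} and ${main_split}, which each run group fills in through its environment block (for example, image2latex sets main_name: block_emd_similarity and main_split: valid). An illustrative sketch of that substitution, not HELM's actual schema-resolution code:

from string import Template

environment = {"main_name": "block_emd_similarity", "main_split": "valid"}
metric_entry = {"name": "${main_name}", "split": "${main_split}"}

# Resolve the ${...} placeholders in a metric group entry using the run group's environment.
resolved = {key: Template(value).substitute(environment) for key, value in metric_entry.items()}
# resolved == {"name": "block_emd_similarity", "split": "valid"}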