crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (209) hide show
  1. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
  2. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
  3. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  5. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  6. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  7. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  8. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  9. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  10. helm/benchmark/annotation/call_center_annotator.py +258 -0
  11. helm/benchmark/annotation/financebench_annotator.py +79 -0
  12. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  13. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  14. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  15. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  16. helm/benchmark/annotation/live_qa_annotator.py +37 -45
  17. helm/benchmark/annotation/medication_qa_annotator.py +36 -44
  18. helm/benchmark/annotation/model_as_judge.py +96 -0
  19. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  20. helm/benchmark/annotation/xstest_annotator.py +100 -0
  21. helm/benchmark/metrics/annotation_metrics.py +108 -0
  22. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  23. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  24. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  25. helm/benchmark/metrics/safety_metrics.py +79 -0
  26. helm/benchmark/metrics/summac/model_summac.py +3 -3
  27. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  28. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  29. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  30. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  31. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  32. helm/benchmark/model_metadata_registry.py +3 -3
  33. helm/benchmark/presentation/create_plots.py +1 -1
  34. helm/benchmark/presentation/schema.py +3 -0
  35. helm/benchmark/presentation/summarize.py +106 -256
  36. helm/benchmark/presentation/test_run_entry.py +1 -0
  37. helm/benchmark/presentation/test_summarize.py +145 -3
  38. helm/benchmark/run.py +15 -0
  39. helm/benchmark/run_expander.py +83 -30
  40. helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
  41. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  42. helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
  43. helm/benchmark/run_specs/experimental_run_specs.py +52 -0
  44. helm/benchmark/run_specs/finance_run_specs.py +82 -1
  45. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  46. helm/benchmark/run_specs/vlm_run_specs.py +100 -24
  47. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  48. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  49. helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
  50. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  51. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  52. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  53. helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
  54. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  55. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  56. helm/benchmark/scenarios/raft_scenario.py +1 -1
  57. helm/benchmark/scenarios/scenario.py +1 -1
  58. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  59. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  60. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  61. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  62. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  63. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  64. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  65. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  66. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  67. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  68. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  69. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  70. helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
  71. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  72. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  73. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  74. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  75. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  76. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  77. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  78. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
  79. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
  80. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  81. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  82. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  83. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
  84. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  85. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  86. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  87. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  88. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
  90. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  92. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  93. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
  94. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
  95. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  97. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  98. helm/benchmark/server.py +1 -6
  99. helm/benchmark/static/schema_air_bench.yaml +750 -750
  100. helm/benchmark/static/schema_bhasa.yaml +709 -0
  101. helm/benchmark/static/schema_call_center.yaml +232 -0
  102. helm/benchmark/static/schema_cleva.yaml +768 -0
  103. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  104. helm/benchmark/static/schema_ewok.yaml +367 -0
  105. helm/benchmark/static/schema_finance.yaml +55 -9
  106. helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
  107. helm/benchmark/static/schema_legal.yaml +566 -0
  108. helm/benchmark/static/schema_safety.yaml +266 -0
  109. helm/benchmark/static/schema_tables.yaml +149 -8
  110. helm/benchmark/static/schema_thai.yaml +21 -0
  111. helm/benchmark/static/schema_vhelm.yaml +137 -101
  112. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  113. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  114. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  115. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  116. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  117. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  118. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  119. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  120. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  121. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  122. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  123. helm/benchmark/static_build/index.html +2 -2
  124. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  125. helm/benchmark/window_services/tokenizer_service.py +0 -5
  126. helm/clients/ai21_client.py +71 -1
  127. helm/clients/anthropic_client.py +7 -19
  128. helm/clients/huggingface_client.py +38 -37
  129. helm/clients/nvidia_nim_client.py +35 -0
  130. helm/clients/openai_client.py +18 -4
  131. helm/clients/palmyra_client.py +24 -0
  132. helm/clients/perspective_api_client.py +11 -6
  133. helm/clients/test_client.py +4 -6
  134. helm/clients/together_client.py +22 -0
  135. helm/clients/vision_language/open_flamingo_client.py +1 -2
  136. helm/clients/vision_language/palmyra_vision_client.py +28 -13
  137. helm/common/cache.py +8 -30
  138. helm/common/images_utils.py +6 -0
  139. helm/common/key_value_store.py +9 -9
  140. helm/common/mongo_key_value_store.py +5 -4
  141. helm/common/request.py +16 -0
  142. helm/common/test_cache.py +1 -48
  143. helm/common/tokenization_request.py +0 -9
  144. helm/config/model_deployments.yaml +444 -329
  145. helm/config/model_metadata.yaml +513 -111
  146. helm/config/tokenizer_configs.yaml +140 -11
  147. helm/proxy/example_queries.py +14 -21
  148. helm/proxy/server.py +0 -9
  149. helm/proxy/services/remote_service.py +0 -6
  150. helm/proxy/services/server_service.py +6 -20
  151. helm/proxy/services/service.py +0 -6
  152. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  153. helm/tokenizers/ai21_tokenizer.py +51 -59
  154. helm/tokenizers/cohere_tokenizer.py +0 -75
  155. helm/tokenizers/huggingface_tokenizer.py +0 -1
  156. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  157. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  158. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  159. helm/benchmark/data_overlap/light_scenario.py +0 -60
  160. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  161. helm/benchmark/static/benchmarking.css +0 -156
  162. helm/benchmark/static/benchmarking.js +0 -1705
  163. helm/benchmark/static/config.js +0 -3
  164. helm/benchmark/static/general.js +0 -122
  165. helm/benchmark/static/images/crfm-logo.png +0 -0
  166. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  167. helm/benchmark/static/images/helm-logo.png +0 -0
  168. helm/benchmark/static/images/language-model-helm.png +0 -0
  169. helm/benchmark/static/images/organizations/ai21.png +0 -0
  170. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  171. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  172. helm/benchmark/static/images/organizations/cohere.png +0 -0
  173. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  174. helm/benchmark/static/images/organizations/google.png +0 -0
  175. helm/benchmark/static/images/organizations/meta.png +0 -0
  176. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  177. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  178. helm/benchmark/static/images/organizations/openai.png +0 -0
  179. helm/benchmark/static/images/organizations/together.png +0 -0
  180. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  181. helm/benchmark/static/images/organizations/yandex.png +0 -0
  182. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  183. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  184. helm/benchmark/static/index.html +0 -68
  185. helm/benchmark/static/info-icon.png +0 -0
  186. helm/benchmark/static/json-urls.js +0 -69
  187. helm/benchmark/static/plot-captions.js +0 -27
  188. helm/benchmark/static/utils.js +0 -285
  189. helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
  190. helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
  191. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  192. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  193. helm/benchmark/window_services/ai21_window_service.py +0 -247
  194. helm/benchmark/window_services/cohere_window_service.py +0 -101
  195. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  196. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  197. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  198. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  199. helm/tokenizers/ice_tokenizer.py +0 -30
  200. helm/tokenizers/test_ice_tokenizer.py +0 -57
  201. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  202. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  203. {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
  204. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  205. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  206. /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
  207. /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
  208. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
  209. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -1,3 +0,0 @@
1
- window.BENCHMARK_OUTPUT_BASE_URL = "benchmark_output";
2
- window.SUITE = "latest";
3
- window.RELEASE = null;
@@ -1,122 +0,0 @@
1
- function assert(condition, message) {
2
- if (!condition) {
3
- throw message || "Assertion failed";
4
- }
5
- }
6
-
7
- function encodeUrlParams(params) {
8
- let s = "";
9
- for (let k in params) {
10
- if (params[k] != null) {
11
- s += (s === "" ? "?" : "&") + k + "=" + encodeURIComponent(params[k]);
12
- }
13
- }
14
- return s;
15
- }
16
-
17
- function decodeUrlParams(str) {
18
- const params = {};
19
- if (str === "") return params;
20
- const items = str.substring(1).split(/&/);
21
- for (let i = 0; i < items.length; i++) {
22
- const pair = items[i].split(/=/);
23
- params[pair[0]] = decodeURIComponent(pair[1]);
24
- }
25
- return params;
26
- }
27
-
28
- function updateBrowserLocation(params) {
29
- // Update the address bar
30
- window.history.pushState(
31
- {},
32
- "",
33
- window.location.pathname + encodeUrlParams(params),
34
- );
35
- }
36
-
37
- function createCookie(key, value, days) {
38
- let expires = "";
39
- if (days) {
40
- const date = new Date();
41
- date.setTime(date.getTime() + days * 24 * 60 * 60 * 1000);
42
- expires = "; expires=" + date.toUTCString();
43
- }
44
- document.cookie = key + "=" + value + expires + "; path=/";
45
- }
46
-
47
- function readCookie(key) {
48
- let tokens = document.cookie.split(";");
49
- for (let i = 0; i < tokens.length; i++) {
50
- const [k, v] = tokens[i].trim().split("=", 2);
51
- if (key === k) return v;
52
- }
53
- return null;
54
- }
55
-
56
- function eraseCookie(key) {
57
- createCookie(key, "", -1);
58
- }
59
-
60
- function renderTimestamp(timestamp) {
61
- if (!timestamp) return null;
62
- const d = new Date(timestamp * 1000);
63
- return d.toLocaleString();
64
- }
65
-
66
- function renderDict(data) {
67
- return JSON.stringify(data).substring(0, 10000);
68
- }
69
-
70
- function loadScript(src, onload, onerror) {
71
- // Using jquery doesn't work, so do it in with our bare hands.
72
- const s = document.createElement("script");
73
- s.src = src;
74
- s.onload = onload;
75
- s.onerror = onerror;
76
- document.head.appendChild(s);
77
- }
78
-
79
- function getRandomString() {
80
- const vocab =
81
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
82
- let text = "";
83
- for (let i = 0; i < 6; i++)
84
- text += vocab.charAt(Math.floor(Math.random() * vocab.length));
85
- return text;
86
- }
87
-
88
- function round(x, n) {
89
- const base = Math.pow(10, n);
90
- return Math.round(x * base) / base;
91
- }
92
-
93
- function multilineHtml(s) {
94
- return s.replace(/\n/g, "<br>");
95
- }
96
-
97
- function renderError(e) {
98
- return $("<div>").addClass("alert alert-danger").append(multilineHtml(e));
99
- }
100
-
101
- function helpIcon(help, link) {
102
- // Show a ?
103
- return $("<a>", { href: link, target: "blank_", class: "help-icon" }).append(
104
- $("<img>", { src: "info-icon.png", width: 15, title: help }),
105
- );
106
- }
107
-
108
- const markdownConverter = new showdown.Converter({ optionKey: "value" });
109
- function renderMarkdown(markdown) {
110
- return markdown && markdownConverter.makeHtml(markdown);
111
- }
112
-
113
- function refreshHashLocation() {
114
- // If we request a hash location (URL contains #foo), the problem is #foo
115
- // might not exist (since it's generated). Call this function to jump to the
116
- // hash location once all the anchors are generated.
117
- if (location.hash) {
118
- const hash = location.hash;
119
- location.hash = "";
120
- location.hash = hash;
121
- }
122
- }
Binary file
Binary file
@@ -1,68 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <title>Holistic Evaluation of Language Models (HELM)</title>
5
- <meta charset="utf-8">
6
- <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
7
- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
8
- <link rel="stylesheet" type="text/css" href="benchmarking.css">
9
- </head>
10
-
11
- <body>
12
- <div class="container-fluid">
13
- <nav class="navbar navbar-expand-sm navbar-light bg-faded">
14
- <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
15
- <span class="navbar-toggler-icon"></span>
16
- </button>
17
-
18
- <a class="nav-link active" href="https://crfm.stanford.edu"><img src="images/crfm-logo.png" width="100"/></a>
19
- <div class="collapse navbar-collapse" id="nav-content">
20
- <ul class="navbar-nav">
21
- <li class="nav-item"><a class="nav-link active" href="?"><img src="images/helm-logo-simple.png" width="80"/></a></li>
22
- <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
23
- <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
24
- <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
25
- <!--
26
- TODO(#1441): Enable plots.
27
- <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
28
- -->
29
- <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
30
- </ul>
31
- </div>
32
-
33
- <div class="text-right" id="summary" style="white-space: nowrap">
34
- </div>
35
- </nav>
36
-
37
- <div class="row">
38
- <div class="col-sm-12" id="main">
39
- </div>
40
- </div>
41
- </div>
42
-
43
- <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
44
- <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
45
- <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
46
- <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
47
- <script src="https://cdnjs.cloudflare.com/ajax/libs/js-yaml/4.1.0/js-yaml.min.js"></script>
48
- <script src="https://cdnjs.cloudflare.com/ajax/libs/showdown/2.0.3/showdown.min.js"></script>
49
- <script src="https://cdnjs.cloudflare.com/ajax/libs/handlebars.js/4.7.7/handlebars.min.js"></script>
50
- <!-- *GTAG*
51
- <script async src="https://www.googletagmanager.com/gtag/js?id=G-T0MW28MP3W"></script>
52
- <script>
53
- window.dataLayer = window.dataLayer || [];
54
- function gtag() {
55
- dataLayer.push(arguments);
56
- }
57
- gtag('js', new Date());
58
- gtag('config', 'G-T0MW28MP3W');
59
- </script>
60
- *GTAG* -->
61
- <script src="config.js"></script>
62
- <script src="general.js"></script>
63
- <script src="utils.js"></script>
64
- <script src="json-urls.js"></script>
65
- <script src="benchmarking.js"></script>
66
- <script src="plot-captions.js"></script>
67
- </body>
68
- </html>
Binary file
@@ -1,69 +0,0 @@
1
- ////////////////////////////////////////////////////////////
2
- // Helper functions for getting URLs of JSON files
3
- function versionBaseUrl() {
4
- if (window.RELEASE) {
5
- return `${BENCHMARK_OUTPUT_BASE_URL}/releases/${window.RELEASE}`;
6
- } else {
7
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${window.SUITE}`;
8
- }
9
- }
10
-
11
- function schemaJsonUrl() {
12
- return `${versionBaseUrl()}/schema.json`;
13
- }
14
-
15
- function summaryJsonUrl() {
16
- return `${versionBaseUrl()}/summary.json`;
17
- }
18
-
19
- function runsToRunSuitesJsonUrl() {
20
- return `${versionBaseUrl()}/runs_to_run_suites.json`;
21
- }
22
-
23
- function runSpecsJsonUrl() {
24
- return `${versionBaseUrl()}/run_specs.json`;
25
- }
26
-
27
- function groupsMetadataJsonUrl() {
28
- return `${versionBaseUrl()}/groups_metadata.json`;
29
- }
30
-
31
- function groupsJsonUrl() {
32
- return `${versionBaseUrl()}/groups.json`;
33
- }
34
-
35
- function groupJsonUrl(groupName) {
36
- return `${versionBaseUrl()}/groups/${groupName}.json`;
37
- }
38
-
39
- function runSpecJsonUrl(suite, runSpecName) {
40
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/run_spec.json`;
41
- }
42
-
43
- function scenarioJsonUrl(suite, runSpecName) {
44
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/scenario.json`;
45
- }
46
-
47
- function scenarioStateJsonUrl(suite, runSpecName) {
48
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/scenario_state.json`;
49
- }
50
-
51
- function statsJsonUrl(suite, runSpecName) {
52
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/stats.json`;
53
- }
54
-
55
- function instancesJsonUrl(suite, runSpecName) {
56
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/instances.json`;
57
- }
58
-
59
- function predictionsJsonUrl(suite, runSpecName) {
60
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_predictions.json`;
61
- }
62
-
63
- function requestsJsonUrl(suite, runSpecName) {
64
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
65
- }
66
-
67
- function plotUrl(suite, plotName) {
68
- return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
69
- }
@@ -1,27 +0,0 @@
1
- ////////////////////////////////////////////////////////////
2
- // Dictionary of plot captions
3
-
4
- const plotCaptions = {
5
- generic_summary:
6
- "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
7
- model_ranking_all:
8
- "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
9
- accuracy_v_x:
10
- "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
11
- metric_correlation:
12
- "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
13
- accuracy_v_access:
14
- "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
15
- accuracy_over_num_parameters:
16
- "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
17
- accuracy_over_release_date:
18
- "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
19
- accuracy_over_the_pile_perplexity:
20
- "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
21
- targeted_evals:
22
- "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
23
- in_context_ablations:
24
- "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
25
- mc_ablations:
26
- "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios.",
27
- };
@@ -1,285 +0,0 @@
1
- ////////////////////////////////////////////////////////////
2
- // Helper functions for visualizing benchmarking
3
-
4
- function describeField(field) {
5
- let result = field.name + ": " + field.description;
6
- if (field.values) {
7
- result +=
8
- "\nPossible values:\n" +
9
- field.values
10
- .map((value) => `- ${value.name}: ${value.description}`)
11
- .join("\n");
12
- }
13
- return result;
14
- }
15
-
16
- function renderStopSequence(value) {
17
- return JSON.stringify(value);
18
- }
19
-
20
- function renderFieldValue(field, value) {
21
- if (!field.values) {
22
- if (field.name === "stop_sequences") {
23
- return renderStopSequence(value);
24
- }
25
- return value;
26
- }
27
- const valueField = field.values.find(
28
- (valueField) => valueField.name === value,
29
- );
30
- return $("<a>", {
31
- title: valueField ? valueField.description : "(no description)",
32
- }).append(value);
33
- }
34
-
35
- function perturbationEquals(perturbation1, perturbation2) {
36
- if (perturbation1 == null) {
37
- return perturbation2 == null;
38
- }
39
- if (perturbation2 == null) {
40
- return perturbation1 == null;
41
- }
42
- return renderDict(perturbation1) === renderDict(perturbation2);
43
- }
44
-
45
- function metricNameEquals(name1, name2) {
46
- return (
47
- name1.name === name2.name &&
48
- name1.split === name2.split &&
49
- name1.sub_split === name2.sub_split &&
50
- perturbationEquals(name1.perturbation, name2.perturbation)
51
- );
52
- }
53
-
54
- function renderPerturbation(perturbation) {
55
- if (!perturbation) {
56
- return "original";
57
- }
58
- // The perturbation field must have the "name" subfield
59
- const verbose = false;
60
- if (verbose) {
61
- const fields_str = Object.keys(perturbation)
62
- .filter((key) => key !== "name")
63
- .map((key) => `${key}=${perturbation[key]}`)
64
- .join(", ");
65
-
66
- return perturbation.name + (fields_str ? "(" + fields_str + ")" : "");
67
- } else {
68
- return perturbation.name;
69
- }
70
- }
71
-
72
- function renderMetricName(name) {
73
- // Return a short name (suitable for a cell of a table)
74
- // Example: name = {name: 'exact_match'}
75
- let result = name.name.bold();
76
- if (name.split) {
77
- result +=
78
- " on " + name.split + (name.sub_split ? "/" + name.sub_split : "");
79
- }
80
- if (name.perturbation) {
81
- result += " with " + renderPerturbation(name.perturbation);
82
- }
83
- return result;
84
- }
85
-
86
- function describeMetricName(field, name) {
87
- // Return a longer description that explains the name
88
- let result = describeField(field);
89
- if (name.split) {
90
- result += `\n* on ${name.split}: evaluated on the subset of ${name.split} instances`;
91
- }
92
- if (name.perturbation) {
93
- result += `\n* with ${renderPerturbation(
94
- name.perturbation,
95
- )}: applied this perturbation`;
96
- }
97
- return result;
98
- }
99
-
100
- function renderStatName(schema, statName) {
101
- const metric = schema.metricsField(statName);
102
- if (metric.display_name) {
103
- return metric.display_name;
104
- } else {
105
- const formattedName = statName.replaceAll("_", " ");
106
- const capitalizedName =
107
- formattedName.charAt(0).toUpperCase() + formattedName.slice(1);
108
- return capitalizedName;
109
- }
110
- }
111
-
112
- function renderPerturbationName(schema, perturbationName) {
113
- return schema.perturbationsField(perturbationName).display_name;
114
- }
115
-
116
- function renderScenarioDisplayName(scenario, scenarioSpec) {
117
- // Describe the scenario
118
- const name = scenario.name; // e.g. mmlu
119
- const args = scenarioSpec.args; // e.g., {subject: 'philosophy'}
120
- if (Object.keys(args).length > 0) {
121
- return name + " (" + renderDict(args) + ")";
122
- } else {
123
- return name;
124
- }
125
- }
126
-
127
- ////////////////////////////////////////////////////////////
128
- // Generic utility functions
129
-
130
- function renderHeader(header, body) {
131
- return $("<div>").append($("<h4>").append(header)).append(body);
132
- }
133
-
134
- function getJSONList(paths, callback, defaultValue) {
135
- // Fetch the JSON files `paths`, and pass the list of results into `callback`.
136
- const responses = {};
137
- const deferred = $.Deferred();
138
- $.when(
139
- ...paths.map((path) =>
140
- $.getJSON(path, {}, (response) => {
141
- responses[path] = response;
142
- }),
143
- ),
144
- ).then(
145
- () => {
146
- const result = paths.map((path) => responses[path] || defaultValue);
147
- if (callback) {
148
- callback(result);
149
- }
150
- deferred.resolve(result);
151
- },
152
- (error) => {
153
- console.error(
154
- "Failed to load / parse:",
155
- paths.filter((path) => !(path in responses)),
156
- );
157
- console.error(error.responseText);
158
- const result = paths.map((path) => responses[path] || defaultValue);
159
- if (callback) {
160
- callback(result);
161
- }
162
- deferred.resolve(result);
163
- },
164
- );
165
- return deferred.promise();
166
- }
167
-
168
- function getLast(l) {
169
- return l[l.length - 1];
170
- }
171
-
172
- function sortListWithReferenceOrder(list, referenceOrder) {
173
- // Return items in `list` based on referenceOrder.
174
- // Example:
175
- // - list = [3, 5, 2], referenceOrder = [2, 5]
176
- // - Returns [2, 5, 3]
177
- function getKey(x) {
178
- const i = referenceOrder.indexOf(x);
179
- return i === -1 ? 9999 : i; // Put unknown items at the end
180
- }
181
- list.sort(([a, b]) => getKey(a) - getKey(b));
182
- }
183
-
184
- function canonicalizeList(lists, compare) {
185
- // Takes as input a list of lists and optional compare function,
186
- // and returns the list of unique elements (preserving order).
187
- // compare(a, b) should return 0 if and only if a equals b.
188
- // Example: lists = [[1, 2, 3], [2, 3, 4]]
189
- // => [1, 2, 3, 4]
190
- // Example: lists = [[1, 2, 3], [2, 3, 4]], compare = (a, b) => a % 2 - b % 2
191
- // => [1, 2]
192
- const result = [];
193
- lists.forEach((list) => {
194
- list.forEach((elem) => {
195
- const inResult = compare
196
- ? result.some((resultElem) => compare(resultElem, elem) === 0)
197
- : result.indexOf(elem) >= 0;
198
- if (!inResult) {
199
- result.push(elem);
200
- }
201
- });
202
- });
203
- return result;
204
- }
205
-
206
- function dict(entries) {
207
- // Make a dictionary (object) out of the key/value `entries`
208
- const obj = {};
209
- entries.forEach(([key, value]) => {
210
- obj[key] = value;
211
- });
212
- return obj;
213
- }
214
-
215
- function findDiff(items) {
216
- // `items` is a list of dictionaries.
217
- // Return a corresponding list of dictionaries where all the common keys have been removed.
218
- const commonKeys = Object.keys(items[0]).filter((key) =>
219
- items.every(
220
- (item) => JSON.stringify(item[key]) === JSON.stringify(items[0][key]),
221
- ),
222
- );
223
- return items.map((item) => {
224
- return dict(
225
- Object.entries(item).filter(
226
- (entry) => commonKeys.indexOf(entry[0]) === -1,
227
- ),
228
- );
229
- });
230
- }
231
-
232
- function renderDict(obj) {
233
- return Object.entries(obj)
234
- .map(([key, value]) => `${key}=${value}`)
235
- .join(",");
236
- }
237
-
238
- function truncateMiddle(text, border) {
239
- // Return `text` with only `border` characters at the beginning and at the end
240
- // Example: "this is a test", border=4 ==> "this...(6 characters)...test"
241
- if (text.length <= border * 2) {
242
- return text;
243
- }
244
- const numRemoved = text.length - 2 * border;
245
- return (
246
- text.substring(0, border) +
247
- ' <span style="color: lightgray">...(' +
248
- numRemoved +
249
- " characters)...</span> " +
250
- text.substring(text.length - border)
251
- );
252
- }
253
-
254
- function substitute(str, environment) {
255
- if (!str) {
256
- return str;
257
- }
258
- for (let key in environment) {
259
- str = str.replace("${" + key + "}", environment[key]);
260
- }
261
- return str;
262
- }
263
-
264
- function renderAccess(access) {
265
- return $("<span>", { class: "access-" + access + " btn" }).append(access);
266
- }
267
-
268
- function renderItems(items) {
269
- // [1, 2, 3] => "[1 | 2 | 3]"
270
- const $result = $("<div>");
271
- $result.append("[ ");
272
- items.forEach((item, index) => {
273
- if (index > 0) {
274
- $result.append(" | ");
275
- }
276
- $result.append(item);
277
- });
278
- $result.append(" ]");
279
- return $result;
280
- }
281
-
282
- function getSuiteForRun(runNameToSuite, runName) {
283
- suite = window.RELEASE ? runNameToSuite[runName] : window.SUITE;
284
- return suite;
285
- }