crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/static/benchmarking.js
CHANGED

@@ -124,6 +124,44 @@ $(function () {
     return $table;
   }
 
+  function renderPlots() {
+    const container = $('<div>', {class: "container"});
+    const links = $('<div>');
+    container.append(links);
+    const tableLinks = [];
+
+    function renderPlot(name, title) {
+      const plot = $('<div>', {class: "plot"});
+      const caption = $('<div>', {class: "plot-caption"}).append(plotCaptions[name]);
+
+      plot.append($('<h3>').append($('<a>', {id: title}).append(title)));
+      plot.append(caption);
+      plot.append($('<img>', {src: plotUrl(suite, name), class: "img-fluid"}));
+      container.append(plot);
+      tableLinks.push($('<a>', {href: '#' + title}).append(title));
+    }
+
+    renderPlot("generic_summary", "Metric spread for core scenarios");
+    renderPlot("model_ranking_all", "Head-to-head win rate per each model");
+
+    renderPlot("accuracy_v_x", "Accuracy as a function of other metrics");
+    renderPlot("metric_correlation", "Correlation between metrics");
+
+    renderPlot("accuracy_v_access", "Accuracy as a function of model access");
+    renderPlot("accuracy_over_num_parameters", "Accuracy across model sizes");
+    renderPlot("accuracy_over_release_date", "Accuracy over time");
+    renderPlot("accuracy_over_the_pile_perplexity", "Accuracy as a function of The Pile perplexity");
+
+    renderPlot("targeted_evals", "Targeted evaluations");
+
+    renderPlot("in_context_ablations", "Number of in-context examples ablation");
+    renderPlot("mc_ablations", "Multiple-choice adaptation ablation");
+
+    links.append(renderItems(tableLinks));
+
+    return container;
+  }
+
   function renderRunsOverview(runSpecs) {
     let query = '';
     const $search = $('<input>', {type: 'text', size: 40, placeholder: 'Enter regex query (enter to open all)'});

@@ -1170,6 +1208,11 @@
     $main.empty()
     $main.append(renderHeader('Scenarios', renderScenarios()));
     refreshHashLocation();
+  } else if (urlParams.plots) {
+    // Plots
+    $main.empty()
+    $main.append(renderHeader('Plots', renderPlots()));
+    refreshHashLocation();
   } else if (urlParams.runSpec || urlParams.runSpecs || urlParams.runSpecRegex) {
     // Predictions for a set of run specs (matching a regular expression)
     $main.text('Loading runs...');
helm/benchmark/static/index.html
CHANGED
@@ -22,6 +22,7 @@
           <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
           <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
           <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
+          <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
           <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
         </ul>
       </div>

@@ -48,5 +49,6 @@
     <script src="json-urls-root.js"></script>
     <script src="json-urls.js"></script>
     <script src="benchmarking.js"></script>
+    <script src="plot-captions.js"></script>
   </body>
 </html>
helm/benchmark/static/json-urls.js
CHANGED

@@ -48,3 +48,7 @@ function predictionsJsonUrl(suite, runSpecName) {
 function requestsJsonUrl(suite, runSpecName) {
   return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
 }
+
+function plotUrl(suite, plotName) {
+  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
+}
helm/benchmark/static/plot-captions.js
ADDED

@@ -0,0 +1,16 @@
+////////////////////////////////////////////////////////////
+// Dictionary of plot captions
+
+const plotCaptions = {
+  "generic_summary": "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
+  "model_ranking_all": "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
+  "accuracy_v_x": "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "metric_correlation": "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similar for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "accuracy_v_access": "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
+  "accuracy_over_num_parameters": "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
+  "accuracy_over_release_date": "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
+  "accuracy_over_the_pile_perplexity": "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
+  "targeted_evals": "Model accuracy on scenarios targeting specific performance components (language, knowledge, reasoning).",
+  "in_context_ablations": "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
+  "mc_ablations": "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios."
+};
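The model_ranking_all caption above defines the head-to-head win rate precisely. As a quick illustration of that definition only (not HELM's actual implementation; the toy scores and the tie-handling rule below are assumptions), a minimal Python sketch:

from typing import Dict

# Toy accuracy table (model -> scenario -> score); purely illustrative, not HELM's data structures.
scores: Dict[str, Dict[str, float]] = {
    "model_a": {"scenario_1": 0.80, "scenario_2": 0.60},
    "model_b": {"scenario_1": 0.70, "scenario_2": 0.65},
    "model_c": {"scenario_1": 0.75, "scenario_2": 0.55},
}

def head_to_head_win_rate(model: str) -> float:
    """Fraction of (other model, scenario) comparisons that `model` wins; ties count as half a win."""
    wins = 0.0
    total = 0
    for other in scores:
        if other == model:
            continue
        for scenario, score in scores[model].items():
            total += 1
            if score > scores[other][scenario]:
                wins += 1.0
            elif score == scores[other][scenario]:
                wins += 0.5
    return wins / total

for name in scores:
    print(name, round(head_to_head_win_rate(name), 3))

A model that tops every scenario prints 1.0; one that wins exactly half of its random pairings prints 0.5, matching the coin-flip interpretation in the caption.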
helm/benchmark/static/schema.yaml
CHANGED

@@ -30,6 +30,27 @@ models:
     access: limited
     num_parameters: 17000000000
     release_date: 2022-10-28
+  - name: ai21/j2-jumbo
+    display_name: Jurassic-2 Jumbo (178B)
+    description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 178000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-grande
+    display_name: Jurassic-2 Grande (17B)
+    description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 17000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-large
+    display_name: Jurassic-2 Large (7.5B)
+    description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 7500000000
+    release_date: 2023-03-09
 
   # Aleph Alpha
   # TODO: add Luminous World when it's released
@@ -92,6 +113,13 @@ models:
     num_parameters: 11000000000
     release_date: 2021-10-15
 
+  # BigCode
+  - name: huggingface/santacoder
+    display_name: SantaCoder (1.1B)
+    description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
+    creator_organization: BigCode
+    access: open
+
   # Cohere
   - name: cohere/xlarge-20220609
     display_name: Cohere xlarge v20220609 (52.4B)
@@ -135,6 +163,22 @@ models:
     access: limited
     num_parameters: 6100000000
     release_date: 2022-11-08
+  - name: cohere/command-medium-beta
+    display_name: Cohere Command beta (6.1B)
+    description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+    creator_organization: Cohere
+    access: limited
+    num_parameters: 6100000000
+    release_date: 2022-11-08
+    todo: true
+  - name: cohere/command-xlarge-beta
+    display_name: Cohere Command beta (52.4B)
+    description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+    creator_organization: Cohere
+    access: limited
+    num_parameters: 52400000000
+    release_date: 2022-11-08
+    todo: true
 
   # DeepMind
   - name: deepmind/gopher
@@ -188,7 +232,6 @@ models:
     description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
     creator_organization: Google
     access: open
-    todo: true
 
   - name: google/palm
     display_name: PaLM (540B)
@@ -197,7 +240,35 @@ models:
     access: closed
     todo: true
 
+  # HazyResearch
+  - name: together/h3-2.7b
+    display_name: H3 (2.7B)
+    description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
+    creator_organization: HazyResearch
+    access: open
+    num_parameters: 2700000000
+    release_date: 2023-01-23
+    todo: true
+
   # Meta
+  - name: together/opt-iml-175b
+    display_name: OPT-IML (175B)
+    description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 175000000000
+    release_date: 2022-12-22
+    todo: true
+
+  - name: together/opt-iml-30b
+    display_name: OPT-IML (30B)
+    description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 30000000000
+    release_date: 2022-12-22
+    todo: true
+
   - name: together/opt-175b
     display_name: OPT (175B)
     description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
@@ -223,6 +294,15 @@ models:
     release_date: 2022-11-15
     todo: true
 
+  - name: together/galactica-30b
+    display_name: Galactica (30B)
+    description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 30000000000
+    release_date: 2022-11-15
+    todo: true
+
   # Microsoft/NVIDIA
   - name: microsoft/TNLGv2_530B
     display_name: TNLG v2 (530B)
@@ -327,6 +407,12 @@ models:
     description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
     creator_organization: OpenAI
     access: limited
+  - name: openai/gpt-3.5-turbo-0301
+    display_name: gpt-3.5-turbo-0301
+    description: Sibling model of text-davinci-003 that is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-03-01
   - name: openai/chat-gpt
     display_name: ChatGPT
     description: Sibling model to InstructGPT which interacts in a conversational way. See [OpenAI's announcement](https://openai.com/blog/chatgpt/). The size of the model is unknown.
@@ -344,6 +430,24 @@ models:
     num_parameters: 6700000000
     release_date: 2022-11-29
     todo: true
+  - name: together/gpt-neoxt-chat-base-20b
+    display_name: GPT-NeoXT-Chat-Base (20B)
+    description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
+    creator_organization: Together
+    access: open
+    num_parameters: 20000000000
+    release_date: 2023-03-08
+    todo: true
+
+  # Salesforce
+  - name: together/codegen
+    display_name: CodeGen (16B)
+    description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
+    creator_organization: Tsinghua
+    access: open
+    num_parameters: 16000000000
+    release_date: 2022-03-25
+    todo: true
 
   # Tsinghua
   - name: together/glm
@@ -354,6 +458,15 @@ models:
     num_parameters: 130000000000
     release_date: 2022-08-04
 
+  - name: together/codegeex
+    display_name: CodeGeeX (13B)
+    description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
+    creator_organization: Tsinghua
+    access: open
+    num_parameters: 13000000000
+    release_date: 2022-09-19
+    todo: true
+
   # Yandex
   - name: together/yalm
     display_name: YaLM (100B)
@@ -563,6 +676,14 @@ metrics:
     display_name: F1
     description: Average F1 score in terms of word overlap between the model output and correct reference.
     lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
   - name: absolute_value_difference
     display_name: Absolute difference
     short_display_name: Diff.
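For readers unfamiliar with the distinction these two new metrics draw, a minimal scikit-learn sketch of macro vs. micro averaging (illustrative only; HELM's own computation lives in helm/benchmark/metrics/classification_metrics.py, which is not shown in this diff):

from sklearn.metrics import f1_score

# Toy single-label classification outputs; purely illustrative.
y_true = ["contract", "tort", "contract", "property", "tort", "contract"]
y_pred = ["contract", "contract", "contract", "property", "tort", "tort"]

# Macro-F1: unweighted mean of per-class F1, so rare classes weigh as much as frequent ones.
print(f1_score(y_true, y_pred, average="macro"))
# Micro-F1: F1 over pooled counts; with exactly one label per example this equals accuracy.
print(f1_score(y_true, y_pred, average="micro"))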
@@ -1094,6 +1215,14 @@ metric_groups:
       - name: monte_carlo_entropy
         split: ${main_split}
 
+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
 ############################################################
 run_groups:
   ## Top-level
@@ -2031,6 +2160,30 @@ run_groups:
       when: n/a
       language: synthetic
 
+  - name: lextreme
+    display_name: LEXTREME
+    description: A Multilingual Legal Benchmark for Natural Language Understanding
+    metric_groups:
+      - classification_metrics
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+
+  - name: lex_glue
+    display_name: LexGLUE
+    description: A Benchmark Dataset for Legal Language Understanding in English
+    metric_groups:
+      - classification_metrics
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+
   - name: entity_data_imputation
     display_name: Data imputation
     description: Scenario from [Mei et al. (2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability to impute missing entities in a data table.
helm/benchmark/window_services/cohere_window_service.py
CHANGED

@@ -141,3 +141,23 @@ class CohereWindowService(LocalWindowService):
             result = result[:-1]
 
         return result
+
+
+class CohereCommandWindowService(CohereWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_request_length(self) -> int:
+        """
+        The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
+        If we exceed the `max_sequence_length`, we get the following error:
+
+        Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
+        exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
+
+        For the Command model, in rare situations, the co.tokenize returns a shorter list of tokens
+        than the co.generate. This causes sequence length errors for rare inputs. Cohere's advice is
+        to reduce the sequence length to 2020 to avoid these issues.
+        """
+        return 2020
helm/benchmark/window_services/flan_t5_window_service.py
ADDED

@@ -0,0 +1,29 @@
+from .encoder_decoder_window_service import EncoderDecoderWindowService
+from .tokenizer_service import TokenizerService
+
+
+class FlanT5WindowService(EncoderDecoderWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        """Return the max sequence length."""
+        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
+        return 512 - 1
+
+    @property
+    def end_of_text_token(self) -> str:
+        """The end of text token."""
+        return "</s>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer to use when sending a request."""
+        return "google/flan-t5-xxl"
+
+    @property
+    def prefix_token(self) -> str:
+        """The prefix token is the same as the end of text token."""
+        # echo=True is not supported
+        return ""
helm/benchmark/window_services/huggingface_window_service.py
ADDED

@@ -0,0 +1,39 @@
+from helm.proxy.clients.huggingface_tokenizer import HuggingFaceTokenizers
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig
+
+
+class HuggingFaceWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService, model_config: HuggingFaceModelConfig):
+        super().__init__(service)
+        self._tokenizer_name = str(model_config)
+        tokenizer = HuggingFaceTokenizers.get_tokenizer(self._tokenizer_name)
+        self._prefix_token = tokenizer.bos_token
+        self._end_of_text_token = tokenizer.eos_token
+        self._max_request_length = tokenizer.model_max_length
+
+    @property
+    def max_sequence_length(self) -> int:
+        """Return the max sequence length of this tokenizer."""
+        return self._max_request_length
+
+    @property
+    def max_request_length(self) -> int:
+        """Return the max request length of this tokenizer."""
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        """The end of text token."""
+        return self._end_of_text_token
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer to use when sending a request."""
+        return self._tokenizer_name
+
+    @property
+    def prefix_token(self) -> str:
+        """The prefix token."""
+        return self._prefix_token
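For context, the three values cached in the constructor above come straight off the Hugging Face tokenizer object. A minimal sketch of the same attributes read directly through the transformers library (the gpt2 tokenizer is only an example, not something this diff uses):

from transformers import AutoTokenizer

# Inspect the attributes HuggingFaceWindowService reads: prefix/end-of-text tokens and the window size.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.bos_token)         # "<|endoftext|>"
print(tokenizer.eos_token)         # "<|endoftext|>"
print(tokenizer.model_max_length)  # 1024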
helm/benchmark/window_services/santacoder_window_service.py
ADDED

@@ -0,0 +1,27 @@
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+
+
+class SantaCoderWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        return 2048
+
+    @property
+    def max_request_length(self) -> int:
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        return "<|endoftext|>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        return "bigcode/santacoder"
+
+    @property
+    def prefix_token(self) -> str:
+        return self.end_of_text_token
helm/benchmark/window_services/test_flan_t5_window_service.py
ADDED

@@ -0,0 +1,12 @@
+import tempfile
+
+from helm.benchmark.window_services.test_t511b_window_service import TestT511bWindowService
+from helm.benchmark.window_services.window_service_factory import TokenizerService, WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service
+
+
+class TestFlanT5WindowService(TestT511bWindowService):
+    def setup_method(self):
+        self.path: str = tempfile.mkdtemp()
+        service: TokenizerService = get_tokenizer_service(self.path)
+        self.window_service = WindowServiceFactory.get_window_service("together/flan-t5-xxl", service)
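The test above also doubles as a usage example for the factory. A hedged sketch of what the resolved window service exposes (the printed values mirror the FlanT5WindowService properties shown earlier; this is illustrative, not part of the diff):

import tempfile

from helm.benchmark.window_services.test_utils import get_tokenizer_service
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory

# Resolve a window service by model name, then read the limits the adapters rely on.
service = get_tokenizer_service(tempfile.mkdtemp())
window_service = WindowServiceFactory.get_window_service("together/flan-t5-xxl", service)
print(window_service.tokenizer_name)       # "google/flan-t5-xxl"
print(window_service.max_sequence_length)  # 511 (512 - 1 for the appended <extra_id_0>)
print(window_service.end_of_text_token)    # "</s>"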
helm/benchmark/window_services/wider_ai21_window_service.py
ADDED

@@ -0,0 +1,13 @@
+from .ai21_window_service import AI21WindowService
+
+
+class WiderAI21WindowService(AI21WindowService):
+    @property
+    def max_sequence_length(self) -> int:
+        """
+        Return the max sequence length of the larger AI21 Jurassic-2 models.
+
+        The AI21 server automatically prepends a token to every prompt,
+        so the actual max sequence length is 8192 - 1 = 8191.
+        """
+        return 8191
helm/benchmark/window_services/window_service_factory.py
CHANGED

@@ -1,7 +1,14 @@
-from helm.proxy.models import
+from helm.proxy.models import (
+    get_model,
+    get_model_names_with_tag,
+    Model,
+    AI21_WIDER_CONTEXT_WINDOW_TAG,
+    WIDER_CONTEXT_WINDOW_TAG,
+)
 from .ai21_window_service import AI21WindowService
+from .wider_ai21_window_service import WiderAI21WindowService
 from .anthropic_window_service import AnthropicWindowService
-from .cohere_window_service import CohereWindowService
+from .cohere_window_service import CohereWindowService, CohereCommandWindowService
 from .luminous_window_service import (
     LuminousBaseWindowService,
     LuminousExtendedWindowService,

@@ -12,17 +19,21 @@ from .openai_window_service import OpenAIWindowService
 from .wider_openai_window_service import WiderOpenAIWindowService
 from .mt_nlg_window_service import MTNLGWindowService
 from .bloom_window_service import BloomWindowService
+from .huggingface_window_service import HuggingFaceWindowService
 from .ice_window_service import ICEWindowService
+from .santacoder_window_service import SantaCoderWindowService
 from .gpt2_window_service import GPT2WindowService
 from .gptj_window_service import GPTJWindowService
 from .gptneox_window_service import GPTNeoXWindowService
 from .opt_window_service import OPTWindowService
 from .t0pp_window_service import T0ppWindowService
 from .t511b_window_service import T511bWindowService
+from .flan_t5_window_service import FlanT5WindowService
 from .ul2_window_service import UL2WindowService
 from .yalm_window_service import YaLMWindowService
 from .window_service import WindowService
 from .tokenizer_service import TokenizerService
+from helm.proxy.clients.huggingface_client import get_huggingface_model_config
 
 
 class WindowServiceFactory:

@@ -37,9 +48,13 @@ class WindowServiceFactory:
         engine: str = model.engine
 
         window_service: WindowService
-
+        huggingface_model_config = get_huggingface_model_config(model_name)
+        if huggingface_model_config:
+            window_service = HuggingFaceWindowService(service=service, model_config=huggingface_model_config)
+        elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
             window_service = WiderOpenAIWindowService(service)
-
+        # For the Google models, we approximate with the OpenAIWindowService
+        elif organization == "openai" or organization == "simple" or organization == "google":
             window_service = OpenAIWindowService(service)
         elif organization == "AlephAlpha":
             if engine == "luminous-base":

@@ -56,6 +71,8 @@ class WindowServiceFactory:
             window_service = MTNLGWindowService(service)
         elif organization == "anthropic":
             window_service = AnthropicWindowService(service)
+        elif engine == "santacoder":
+            window_service = SantaCoderWindowService(service)
         elif model_name == "huggingface/gpt2":
             window_service = GPT2WindowService(service)
         elif model_name == "together/bloom":

@@ -66,22 +83,32 @@
             window_service = ICEWindowService(service)
         elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "gooseai/gpt-j-6b"]:
             window_service = GPTJWindowService(service)
-        elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b"]:
+        elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b", "together/gpt-neoxt-chat-base-20b"]:
             window_service = GPTNeoXWindowService(service)
+        elif model_name == "together/h3-2.7b":
+            window_service = GPT2WindowService(service)
         elif model_name in ["together/opt-66b", "together/opt-175b"]:
            window_service = OPTWindowService(service)
         elif model_name == "together/t0pp":
             window_service = T0ppWindowService(service)
         elif model_name == "together/t5-11b":
             window_service = T511bWindowService(service)
+        elif model_name == "together/flan-t5-xxl":
+            window_service = FlanT5WindowService(service)
         elif model_name == "together/ul2":
             window_service = UL2WindowService(service)
         elif model_name == "together/yalm":
             window_service = YaLMWindowService(service)
         elif organization == "cohere":
-
+            if "command" in engine:
+                window_service = CohereCommandWindowService(service)
+            else:
+                window_service = CohereWindowService(service)
         elif organization == "ai21":
-
+            if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG):
+                window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
+            else:
+                window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
         else:
             raise ValueError(f"Unhandled model name: {model_name}")
 