crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0

helm/benchmark/static/benchmarking.js
@@ -124,6 +124,44 @@ $(function () {
     return $table;
   }

+  function renderPlots() {
+    const container = $('<div>', {class: "container"});
+    const links = $('<div>');
+    container.append(links);
+    const tableLinks = [];
+
+    function renderPlot(name, title) {
+      const plot = $('<div>', {class: "plot"});
+      const caption = $('<div>', {class: "plot-caption"}).append(plotCaptions[name]);
+
+      plot.append($('<h3>').append($('<a>', {id: title}).append(title)));
+      plot.append(caption);
+      plot.append($('<img>', {src: plotUrl(suite, name), class: "img-fluid"}));
+      container.append(plot);
+      tableLinks.push($('<a>', {href: '#' + title}).append(title));
+    }
+
+    renderPlot("generic_summary", "Metric spread for core scenarios");
+    renderPlot("model_ranking_all", "Head-to-head win rate per each model");
+
+    renderPlot("accuracy_v_x", "Accuracy as a function of other metrics");
+    renderPlot("metric_correlation", "Correlation between metrics");
+
+    renderPlot("accuracy_v_access", "Accuracy as a function of model access");
+    renderPlot("accuracy_over_num_parameters", "Accuracy across model sizes");
+    renderPlot("accuracy_over_release_date", "Accuracy over time");
+    renderPlot("accuracy_over_the_pile_perplexity", "Accuracy as a function of The Pile perplexity");
+
+    renderPlot("targeted_evals", "Targeted evaluations");
+
+    renderPlot("in_context_ablations", "Number of in-context examples ablation");
+    renderPlot("mc_ablations", "Multiple-choice adaptation ablation");
+
+    links.append(renderItems(tableLinks));
+
+    return container;
+  }
+
   function renderRunsOverview(runSpecs) {
     let query = '';
     const $search = $('<input>', {type: 'text', size: 40, placeholder: 'Enter regex query (enter to open all)'});
@@ -1170,6 +1208,11 @@ $(function () {
     $main.empty()
     $main.append(renderHeader('Scenarios', renderScenarios()));
     refreshHashLocation();
+  } else if (urlParams.plots) {
+    // Plots
+    $main.empty()
+    $main.append(renderHeader('Plots', renderPlots()));
+    refreshHashLocation();
   } else if (urlParams.runSpec || urlParams.runSpecs || urlParams.runSpecRegex) {
     // Predictions for a set of run specs (matching a regular expression)
     $main.text('Loading runs...');

helm/benchmark/static/index.html
@@ -22,6 +22,7 @@
       <li class="nav-item"><a class="nav-link active" href="?models=1">Models</a></li>
       <li class="nav-item"><a class="nav-link active" href="?scenarios=1">Scenarios</a></li>
       <li class="nav-item"><a class="nav-link active" href="?groups=1">Results</a></li>
+      <li class="nav-item"><a class="nav-link active" href="?plots=1">Plots</a></li>
       <li class="nav-item"><a class="nav-link active" href="?runs=1">Raw runs</a></li>
     </ul>
   </div>
@@ -48,5 +49,6 @@
     <script src="json-urls-root.js"></script>
     <script src="json-urls.js"></script>
     <script src="benchmarking.js"></script>
+    <script src="plot-captions.js"></script>
   </body>
 </html>

helm/benchmark/static/json-urls.js
@@ -48,3 +48,7 @@ function predictionsJsonUrl(suite, runSpecName) {
 function requestsJsonUrl(suite, runSpecName) {
   return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/${runSpecName}/display_requests.json`;
 }
+
+function plotUrl(suite, plotName) {
+  return `${BENCHMARK_OUTPUT_BASE_URL}/runs/${suite}/plots/${plotName}.png`;
+}

helm/benchmark/static/plot-captions.js
@@ -0,0 +1,16 @@
+////////////////////////////////////////////////////////////
+// Dictionary of plot captions
+
+const plotCaptions = {
+  "generic_summary": "Metrics for every model on every core scenario as a means for indicating the spread on a per-metric basis.",
+  "model_ranking_all": "The fraction of head-to-head comparisons between the given model and all other models, across all scenarios, where the given model is higher along the metric (e.g. more accurate in the accuracy subfigure). If a model was the highest for the given metric for every scenario, it would receive a score of 1.0; if a model received a score of 0.5, then if a scenario and second model were chosen at random, the outcome of the comparison would be a coin flip. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation.",
+  "accuracy_v_x": "The relationship between accuracy (x-axis) and each of the 6 metrics (calibration, robustness, fairness, social bias, toxicity, efficiency) we study in this work across all core scenarios and for all models. For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "metric_correlation": "The Pearson correlation between each metric and every other metric (x-axis). The small blue dots denote the correlation on each individual scenario, while the larger orange dots average the correlation across scenarios. Trends are qualitatively similarly for other correlation measures (e.g. Spearman correlation). For calibration error, we measure ECE-10; for bias, we measure bias in gender representation; and for efficiency, we measure denoised inference time.",
+  "accuracy_v_access": "The relationship between access (open vs. limited vs. closed) and model accuracy for each of the core scenarios. Shaded bars indicate the performance of the best model for that scenario, whereas the solid bars indicate the performance of the overall most accurate model across all core scenarios.",
+  "accuracy_over_num_parameters": "Cumulative plot, depicting the accuracy of the most accurate model up to a given size across all core scenarios.",
+  "accuracy_over_release_date": "The relationship between time (x-axis) and the accuracy of models (y-axis) across the core scenarios.",
+  "accuracy_over_the_pile_perplexity": "The relationship between log bits-per-byte (BPB) on The Pile and the accuracy on each core scenario.",
+  "targeted_evals": "Model accuracy on scenario targeting specific performance components (language, knowledge, reasoning).",
+  "in_context_ablations": "For each model, we set the maximum number of in-context examples to [0, 1, 2, 4, 8, 16] and fit as many in-context examples as possible within the context window. We plot performance as a function of the average number of in-context examples actually used.",
+  "mc_ablations": "For each adaptation method (joint, separate, and separate calibrated), we compare models across scenarios."
+};

helm/benchmark/static/schema.yaml
@@ -30,6 +30,27 @@ models:
     access: limited
     num_parameters: 17000000000
     release_date: 2022-10-28
+  - name: ai21/j2-jumbo
+    display_name: Jurassic-2 Jumbo (178B)
+    description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 178000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-grande
+    display_name: Jurassic-2 Grande (17B)
+    description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 17000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-large
+    display_name: Jurassic-2 Large (7.5B)
+    description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 7500000000
+    release_date: 2023-03-09

   # Aleph Alpha
   # TODO: add Luminous World when it's released
@@ -92,6 +113,13 @@ models:
     num_parameters: 11000000000
     release_date: 2021-10-15

+  # BigCode
+  - name: huggingface/santacoder
+    display_name: SantaCoder (1.1B)
+    description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
+    creator_organization: BigCode
+    access: open
+
   # Cohere
   - name: cohere/xlarge-20220609
     display_name: Cohere xlarge v20220609 (52.4B)
@@ -135,6 +163,22 @@ models:
     access: limited
     num_parameters: 6100000000
     release_date: 2022-11-08
+  - name: cohere/command-medium-beta
+    display_name: Cohere Command beta (6.1B)
+    description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+    creator_organization: Cohere
+    access: limited
+    num_parameters: 6100000000
+    release_date: 2022-11-08
+    todo: true
+  - name: cohere/command-xlarge-beta
+    display_name: Cohere Command beta (52.4B)
+    description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+    creator_organization: Cohere
+    access: limited
+    num_parameters: 52400000000
+    release_date: 2022-11-08
+    todo: true

   # DeepMind
   - name: deepmind/gopher
@@ -188,7 +232,6 @@ models:
     description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
     creator_organization: Google
     access: open
-    todo: true

   - name: google/palm
     display_name: PaLM (540B)
@@ -197,7 +240,35 @@ models:
     access: closed
     todo: true

+  # HazyResearch
+  - name: together/h3-2.7b
+    display_name: H3 (2.7B)
+    description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
+    creator_organization: HazyResearch
+    access: open
+    num_parameters: 2700000000
+    release_date: 2023-01-23
+    todo: true
+
   # Meta
+  - name: together/opt-iml-175b
+    display_name: OPT-IML (175B)
+    description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 175000000000
+    release_date: 2022-12-22
+    todo: true
+
+  - name: together/opt-iml-30b
+    display_name: OPT-IML (30B)
+    description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 30000000000
+    release_date: 2022-12-22
+    todo: true
+
   - name: together/opt-175b
     display_name: OPT (175B)
     description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
@@ -223,6 +294,15 @@ models:
     release_date: 2022-11-15
     todo: true

+  - name: together/galactica-30b
+    display_name: Galactica (30B)
+    description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lectures notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
+    creator_organization: Meta
+    access: open
+    num_parameters: 30000000000
+    release_date: 2022-11-15
+    todo: true
+
   # Microsoft/NVIDIA
   - name: microsoft/TNLGv2_530B
     display_name: TNLG v2 (530B)
@@ -327,6 +407,12 @@ models:
     description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
     creator_organization: OpenAI
     access: limited
+  - name: openai/gpt-3.5-turbo-0301
+    display_name: gpt-3.5-turbo-0301
+    description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-03-01
   - name: openai/chat-gpt
     display_name: ChatGPT
     description: Sibling model to InstructGPT which interacts in a conversational way. See [OpenAI's announcement](https://openai.com/blog/chatgpt/). The size of the model is unknown.
@@ -344,6 +430,24 @@ models:
     num_parameters: 6700000000
     release_date: 2022-11-29
     todo: true
+  - name: together/gpt-neoxt-chat-base-20b
+    display_name: GPT-NeoXT-Chat-Base (20B)
+    description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
+    creator_organization: Together
+    access: open
+    num_parameters: 20000000000
+    release_date: 2023-03-08
+    todo: true
+
+  # Salesforce
+  - name: together/codegen
+    display_name: CodeGen (16B)
+    description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([blog](https://arxiv.org/pdf/2203.13474.pdf)).
+    creator_organization: Tsinghua
+    access: open
+    num_parameters: 16000000000
+    release_date: 2022-03-25
+    todo: true

   # Tsinghua
   - name: together/glm
@@ -354,6 +458,15 @@ models:
     num_parameters: 130000000000
     release_date: 2022-08-04

+  - name: together/codegeex
+    display_name: CodeGeeX (13B)
+    description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
+    creator_organization: Tsinghua
+    access: open
+    num_parameters: 13000000000
+    release_date: 2022-09-19
+    todo: true
+
   # Yandex
   - name: together/yalm
     display_name: YaLM (100B)
@@ -563,6 +676,14 @@ metrics:
     display_name: F1
     description: Average F1 score in terms of word overlap between the model output and correct reference.
     lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
   - name: absolute_value_difference
     display_name: Absolute difference
     short_display_name: Diff.
@@ -1094,6 +1215,14 @@ metric_groups:
       - name: monte_carlo_entropy
         split: ${main_split}

+  - name: classification_metrics
+    display_name: Classification metrics
+    metrics:
+      - name: classification_macro_f1
+        split: ${main_split}
+      - name: classification_micro_f1
+        split: ${main_split}
+
 ############################################################
 run_groups:
   ## Top-level
@@ -2031,6 +2160,30 @@ run_groups:
       when: n/a
       language: synthetic

+  - name: lextreme
+    display_name: LEXTREME
+    description: A Multilingual Legal Benchmark for Natural Language Understanding
+    metric_groups:
+      - classification_metrics
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+
+  - name: lex_glue
+    display_name: LexGLUE
+    description: A Benchmark Dataset for Legal Language Understanding in English
+    metric_groups:
+      - classification_metrics
+      - calibration
+      - efficiency
+      - general_information
+    environment:
+      main_name: classification_macro_f1
+      main_split: test
+
   - name: entity_data_imputation
     display_name: Data imputation
     description: Scenario from [Mei et al. (2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability to impute missing entities in a data table.

helm/benchmark/window_services/cohere_window_service.py
@@ -141,3 +141,23 @@ class CohereWindowService(LocalWindowService):
             result = result[:-1]

         return result
+
+
+class CohereCommandWindowService(CohereWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_request_length(self) -> int:
+        """
+        The max request length of the model. For Cohere, this is the same as the `max_sequence_length`.
+        If we exceed the `max_sequence_length`, we get the following error:
+
+        Request failed with too many tokens: total number of tokens (prompt and prediction) cannot
+        exceed 2048 - received 2049. Try using a shorter prompt or a smaller max_tokens value.
+
+        For the Command model, in rare situations, the co.tokenize returns a shorter list of tokens
+        than the co.generate. This causes sequence length errors for rare inputs. Cohere's advice is
+        to reduce the sequence length to 2020 to avoid these issues.
+        """
+        return 2020

helm/benchmark/window_services/flan_t5_window_service.py
@@ -0,0 +1,29 @@
+from .encoder_decoder_window_service import EncoderDecoderWindowService
+from .tokenizer_service import TokenizerService
+
+
+class FlanT5WindowService(EncoderDecoderWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        """Return the max sequence length."""
+        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
+        return 512 - 1
+
+    @property
+    def end_of_text_token(self) -> str:
+        """The end of text token."""
+        return "</s>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer to use when sending a request."""
+        return "google/flan-t5-xxl"
+
+    @property
+    def prefix_token(self) -> str:
+        """The prefix token is the same as the end of text token."""
+        # echo=True is not supported
+        return ""

helm/benchmark/window_services/huggingface_window_service.py
@@ -0,0 +1,39 @@
+from helm.proxy.clients.huggingface_tokenizer import HuggingFaceTokenizers
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+from helm.proxy.clients.huggingface_client import HuggingFaceModelConfig
+
+
+class HuggingFaceWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService, model_config: HuggingFaceModelConfig):
+        super().__init__(service)
+        self._tokenizer_name = str(model_config)
+        tokenizer = HuggingFaceTokenizers.get_tokenizer(self._tokenizer_name)
+        self._prefix_token = tokenizer.bos_token
+        self._end_of_text_token = tokenizer.eos_token
+        self._max_request_length = tokenizer.model_max_length
+
+    @property
+    def max_sequence_length(self) -> int:
+        """Return the max sequence length of this tokenizer."""
+        return self._max_request_length
+
+    @property
+    def max_request_length(self) -> int:
+        """Return the max request length of this tokenizer."""
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        """The end of text token."""
+        return self._end_of_text_token
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer to use when sending a request."""
+        return self._tokenizer_name
+
+    @property
+    def prefix_token(self) -> str:
+        """The prefix token."""
+        return self._prefix_token

helm/benchmark/window_services/santacoder_window_service.py
@@ -0,0 +1,27 @@
+from .local_window_service import LocalWindowService
+from .tokenizer_service import TokenizerService
+
+
+class SantaCoderWindowService(LocalWindowService):
+    def __init__(self, service: TokenizerService):
+        super().__init__(service)
+
+    @property
+    def max_sequence_length(self) -> int:
+        return 2048
+
+    @property
+    def max_request_length(self) -> int:
+        return self.max_sequence_length
+
+    @property
+    def end_of_text_token(self) -> str:
+        return "<|endoftext|>"
+
+    @property
+    def tokenizer_name(self) -> str:
+        return "bigcode/santacoder"
+
+    @property
+    def prefix_token(self) -> str:
+        return self.end_of_text_token

helm/benchmark/window_services/test_flan_t5_window_service.py
@@ -0,0 +1,12 @@
+import tempfile
+
+from helm.benchmark.window_services.test_t511b_window_service import TestT511bWindowService
+from helm.benchmark.window_services.window_service_factory import TokenizerService, WindowServiceFactory
+from helm.benchmark.window_services.test_utils import get_tokenizer_service
+
+
+class TestFlanT5WindowService(TestT511bWindowService):
+    def setup_method(self):
+        self.path: str = tempfile.mkdtemp()
+        service: TokenizerService = get_tokenizer_service(self.path)
+        self.window_service = WindowServiceFactory.get_window_service("together/flan-t5-xxl", service)

helm/benchmark/window_services/wider_ai21_window_service.py
@@ -0,0 +1,13 @@
+from .ai21_window_service import AI21WindowService
+
+
+class WiderAI21WindowService(AI21WindowService):
+    @property
+    def max_sequence_length(self) -> int:
+        """
+        Return the max sequence length of the larger AI21 Jurassic-2 models.
+
+        The AI21 server automatically prepends a token to every prompt,
+        so the actual max sequence length is 8192 - 1 = 8191.
+        """
+        return 8191

helm/benchmark/window_services/window_service_factory.py
@@ -1,7 +1,14 @@
-from helm.proxy.models import get_model, get_model_names_with_tag, Model, WIDER_CONTEXT_WINDOW_TAG
+from helm.proxy.models import (
+    get_model,
+    get_model_names_with_tag,
+    Model,
+    AI21_WIDER_CONTEXT_WINDOW_TAG,
+    WIDER_CONTEXT_WINDOW_TAG,
+)
 from .ai21_window_service import AI21WindowService
+from .wider_ai21_window_service import WiderAI21WindowService
 from .anthropic_window_service import AnthropicWindowService
-from .cohere_window_service import CohereWindowService
+from .cohere_window_service import CohereWindowService, CohereCommandWindowService
 from .luminous_window_service import (
     LuminousBaseWindowService,
     LuminousExtendedWindowService,
@@ -12,17 +19,21 @@ from .openai_window_service import OpenAIWindowService
 from .wider_openai_window_service import WiderOpenAIWindowService
 from .mt_nlg_window_service import MTNLGWindowService
 from .bloom_window_service import BloomWindowService
+from .huggingface_window_service import HuggingFaceWindowService
 from .ice_window_service import ICEWindowService
+from .santacoder_window_service import SantaCoderWindowService
 from .gpt2_window_service import GPT2WindowService
 from .gptj_window_service import GPTJWindowService
 from .gptneox_window_service import GPTNeoXWindowService
 from .opt_window_service import OPTWindowService
 from .t0pp_window_service import T0ppWindowService
 from .t511b_window_service import T511bWindowService
+from .flan_t5_window_service import FlanT5WindowService
 from .ul2_window_service import UL2WindowService
 from .yalm_window_service import YaLMWindowService
 from .window_service import WindowService
 from .tokenizer_service import TokenizerService
+from helm.proxy.clients.huggingface_client import get_huggingface_model_config


 class WindowServiceFactory:
@@ -37,9 +48,13 @@ class WindowServiceFactory:
         engine: str = model.engine

         window_service: WindowService
-        if model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
+        huggingface_model_config = get_huggingface_model_config(model_name)
+        if huggingface_model_config:
+            window_service = HuggingFaceWindowService(service=service, model_config=huggingface_model_config)
+        elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
             window_service = WiderOpenAIWindowService(service)
-        elif organization == "openai" or organization == "simple":
+        # For the Google models, we approximate with the OpenAIWindowService
+        elif organization == "openai" or organization == "simple" or organization == "google":
             window_service = OpenAIWindowService(service)
         elif organization == "AlephAlpha":
             if engine == "luminous-base":
@@ -56,6 +71,8 @@ class WindowServiceFactory:
             window_service = MTNLGWindowService(service)
         elif organization == "anthropic":
             window_service = AnthropicWindowService(service)
+        elif engine == "santacoder":
+            window_service = SantaCoderWindowService(service)
         elif model_name == "huggingface/gpt2":
             window_service = GPT2WindowService(service)
         elif model_name == "together/bloom":
@@ -66,22 +83,32 @@ class WindowServiceFactory:
             window_service = ICEWindowService(service)
         elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "gooseai/gpt-j-6b"]:
             window_service = GPTJWindowService(service)
-        elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b"]:
+        elif model_name in ["together/gpt-neox-20b", "gooseai/gpt-neo-20b", "together/gpt-neoxt-chat-base-20b"]:
             window_service = GPTNeoXWindowService(service)
+        elif model_name == "together/h3-2.7b":
+            window_service = GPT2WindowService(service)
         elif model_name in ["together/opt-66b", "together/opt-175b"]:
             window_service = OPTWindowService(service)
         elif model_name == "together/t0pp":
             window_service = T0ppWindowService(service)
         elif model_name == "together/t5-11b":
             window_service = T511bWindowService(service)
+        elif model_name == "together/flan-t5-xxl":
+            window_service = FlanT5WindowService(service)
         elif model_name == "together/ul2":
             window_service = UL2WindowService(service)
         elif model_name == "together/yalm":
             window_service = YaLMWindowService(service)
         elif organization == "cohere":
-            window_service = CohereWindowService(service)
+            if "command" in engine:
+                window_service = CohereCommandWindowService(service)
+            else:
+                window_service = CohereWindowService(service)
         elif organization == "ai21":
-            window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
+            if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG):
+                window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
+            else:
+                window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
         else:
             raise ValueError(f"Unhandled model name: {model_name}")
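
For orientation, here is a minimal, hypothetical sketch of how the new window-service dispatch above can be exercised, modeled on test_flan_t5_window_service.py; the throwaway cache directory and the choice of cohere/command-xlarge-beta are assumptions made purely for illustration, not part of the released package.

    import tempfile

    from helm.benchmark.window_services.test_utils import get_tokenizer_service
    from helm.benchmark.window_services.window_service_factory import WindowServiceFactory

    # Build a TokenizerService backed by a temporary cache path, the same way
    # the window-service tests in this release do.
    service = get_tokenizer_service(tempfile.mkdtemp())

    # Engines containing "command" are routed to CohereCommandWindowService,
    # whose max_request_length is capped at 2020 tokens per the diff above.
    window_service = WindowServiceFactory.get_window_service("cohere/command-xlarge-beta", service)
    print(window_service.max_request_length)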