crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
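To reproduce this comparison locally, here is a minimal sketch (hypothetical usage, not part of the package) that confirms which release is installed before and after upgrading:

```python
# A minimal sketch: check the installed crfm-helm release with the standard
# library so you know which side of this diff you are running.
from importlib.metadata import version

print(version("crfm-helm"))  # e.g. "0.2.2" after upgrading from 0.2.0
```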
helm/proxy/models.py
CHANGED
```diff
@@ -11,8 +11,14 @@ EMBEDDING_MODEL_TAG: str = "embedding"
 FULL_FUNCTIONALITY_TEXT_MODEL_TAG: str = "full_functionality_text"
 LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG: str = "limited_functionality_text"
 
+# ChatML format
+CHATML_MODEL_TAG: str = "chatml"
+
 # For OpenAI models with wider context windows
-WIDER_CONTEXT_WINDOW_TAG: str = "wider_context_window"
+WIDER_CONTEXT_WINDOW_TAG: str = "wider_context_window"  # 4000 tokens
+
+# For AI21 Jurassic-2 models with wider context windows
+AI21_WIDER_CONTEXT_WINDOW_TAG: str = "ai21_wider_context_window"
 
 # To fetch models that use these tokenizers
 GPT2_TOKENIZER_TAG: str = "gpt2_tokenizer"
@@ -122,6 +128,31 @@ ALL_MODELS = [
         description="Jurassic-1 Large (7.5B parameters)",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
     ),
+    # AI21 Jurassic-2 Models: https://www.ai21.com/blog/introducing-j2
+    Model(
+        group="jurassic",
+        creator_organization="AI21 Labs",
+        name="ai21/j2-jumbo",
+        display_name="Jurassic-2 Jumbo (178B)",
+        description="Jurassic-2 Jumbo (178B parameters)",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
+    ),
+    Model(
+        group="jurassic",
+        creator_organization="AI21 Labs",
+        name="ai21/j2-grande",
+        display_name="Jurassic-2 Grande (17B)",
+        description="Jurassic-2 Grande (17B parameters) with a few tweaks to the training process.",
+        tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
+    ),
+    Model(
+        group="jurassic",
+        creator_organization="AI21 Labs",
+        name="ai21/j2-large",
+        display_name="Jurassic-2 Large (7.5B)",
+        description="Jurassic-2 Large (7.5B parameters)",
+        tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
+    ),
     # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous
     Model(
         group="luminous",
@@ -250,6 +281,24 @@ ALL_MODELS = [
         description="Cohere small v20220720 (410M parameters)",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
     ),
+    Model(
+        group="cohere",
+        creator_organization="Cohere",
+        name="cohere/command-medium-beta",
+        display_name="Cohere Command beta (6.1B)",
+        description="Cohere Command beta (6.1B parameters) is fine-tuned from the medium model "
+        "to respond well with instruction-like prompts",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
+    ),
+    Model(
+        group="cohere",
+        creator_organization="Cohere",
+        name="cohere/command-xlarge-beta",
+        display_name="Cohere Command beta (52.4B)",
+        description="Cohere Command beta (52.4B parameters) is fine-tuned from the XL model "
+        "to respond well with instruction-like prompts",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
+    ),
     # EleutherAI
     Model(
         group="together",
@@ -285,6 +334,14 @@ ALL_MODELS = [
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
     ),
     # HuggingFace
+    Model(
+        group="huggingface",
+        creator_organization="OpenAI",
+        name="huggingface/gpt2",
+        display_name="GPT-2 (1.5B)",
+        description="GPT-2 (1.5B parameters)",
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
+    ),
     Model(
         group="huggingface",
         creator_organization="EleutherAI",
@@ -293,6 +350,15 @@ ALL_MODELS = [
         description="GPT-J (6B parameters) autoregressive language model trained on The Pile.",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
     ),
+    Model(
+        group="huggingface",
+        creator_organization="BigCode",
+        name="huggingface/santacoder",
+        display_name="SantaCoder (1.1B)",
+        description="SantaCoder (1.1B parameters) model trained on the Python, Java, and "
+        "JavaScript subset of The Stack (v1.1).",
+        tags=[CODE_MODEL_TAG],
+    ),
     # Google
     Model(
         group="together",
@@ -306,6 +372,15 @@ ALL_MODELS = [
         # Does not support echo=True
         tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
     ),
+    Model(
+        group="together",
+        creator_organization="Google",
+        name="together/flan-t5-xxl",
+        display_name="Flan-T5 (11B)",
+        description="Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks.",
+        # Does not support echo=True
+        tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
+    ),
     Model(
         group="together",
         creator_organization="Google",
@@ -323,12 +398,13 @@ ALL_MODELS = [
             NLG_PREFIX_TAG,
         ],
     ),
+    # H3 model
     Model(
-        group="
-        creator_organization="
-        name="
-        display_name="
-        description="
+        group="together",
+        creator_organization="HazyResearch",
+        name="together/h3-2.7b",
+        display_name="H3 (2.7B)",
+        description="H3 (2.7B parameters) is a decoder-only language model based on state space models.",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
     ),
     # OPT
@@ -480,7 +556,21 @@ ALL_MODELS = [
         description="Code model that is a stronger, multilingual version of the Codex (12B) model in the paper.",
         tags=[CODE_MODEL_TAG, GPT2_TOKENIZER_TAG],
     ),
-    # ChatGPT
+    # ChatGPT: https://openai.com/blog/chatgpt
+    Model(
+        group="gpt3",
+        creator_organization="OpenAI",
+        name="openai/gpt-3.5-turbo-0301",
+        display_name="gpt-3.5-turbo-0301",
+        # https://platform.openai.com/docs/models/gpt-3-5
+        description="Sibling model of text-davinci-003 is optimized for chat but works well "
+        "for traditional completions tasks as well. Snapshot from 2023-03-01.",
+        # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
+        # sequence length is smaller at 4087 with one user input message and one assistant
+        # output message because ChatGPT uses special tokens for message roles and boundaries.
+        # We use a rounded-down sequence length of 4000 to account for these special tokens.
+        tags=[TEXT_MODEL_TAG, WIDER_CONTEXT_WINDOW_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
+    ),
     Model(
         group="gpt3",
         creator_organization="OpenAI",
@@ -532,6 +622,14 @@ ALL_MODELS = [
         description="GPT-JT (6B parameters) is a fork of GPT-J",
         tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
     ),
+    Model(
+        group="together",
+        creator_organization="Together",
+        name="together/gpt-neoxt-chat-base-20b",
+        display_name="GPT-NeoXT-Chat-Base (20B)",
+        description="GPT-NeoXT-Chat-Base (20B parameters) is a fork of GPT-NeoX",
+        tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, CHATML_MODEL_TAG, GPTNEO_TOKENIZER_TAG],
+    ),
     # Tsinghua
     Model(
         group="together",
@@ -557,6 +655,16 @@ ALL_MODELS = [
         # https://github.com/stanford-crfm/benchmarking/issues/738
         tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG],
    ),
+    # PaLM
+    Model(
+        group="google",
+        creator_organization="Google",
+        name="google/palm",
+        display_name="PaLM (540B)",
+        description="Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips "
+        "([paper](https://arxiv.org/pdf/2204.02311.pdf)).",
+        tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
+    ),
     # For debugging
     Model(
         group="simple",
```
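The new constants (`CHATML_MODEL_TAG`, `AI21_WIDER_CONTEXT_WINDOW_TAG`) and `Model` entries above are plain registry data, so they can be inspected directly. A minimal sketch (assuming crfm-helm 0.2.2 is installed) that relies only on names visible in the hunks above:

```python
# List the registry entries carrying the tags introduced in this diff.
# ALL_MODELS, the tag constants, and the Model fields (name, display_name,
# tags) all appear in the hunks above.
from helm.proxy.models import ALL_MODELS, AI21_WIDER_CONTEXT_WINDOW_TAG, CHATML_MODEL_TAG

new_tags = {CHATML_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG}
for model in ALL_MODELS:
    if new_tags & set(model.tags):
        # Expected hits include ai21/j2-grande, ai21/j2-large,
        # and together/gpt-neoxt-chat-base-20b.
        print(f"{model.name}: {model.display_name} (tags: {model.tags})")
```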
helm/benchmark/presentation/present.py
DELETED
```diff
@@ -1,249 +0,0 @@
-import argparse
-import os
-import traceback
-
-from tqdm import tqdm
-from typing import List, Optional
-
-from helm.common.authentication import Authentication
-from helm.common.general import write_lines
-from helm.common.hierarchical_logger import hlog, htrack
-from helm.benchmark.run import run_benchmarking, add_run_args, validate_args, LATEST_SYMLINK
-from helm.benchmark.runner import RunSpec
-from helm.benchmark.presentation.run_entry import read_run_entries
-from helm.proxy.services.remote_service import add_service_args, create_authentication
-
-"""
-Runs all the RunSpecs in run_specs.conf and outputs JSON files.
-TODO: rename this file to `run_all.py`
-
-Usage:
-
-    venv/bin/helm-run
-
-"""
-
-
-class AllRunner:
-    """Runs all RunSpecs specified in the configuration file."""
-
-    def __init__(
-        self,
-        auth: Authentication,
-        conf_paths: List[str],
-        url: str,
-        local: bool,
-        local_path: str,
-        output_path: str,
-        suite: str,
-        num_threads: int,
-        dry_run: Optional[bool],
-        skip_instances: bool,
-        max_eval_instances: Optional[int],
-        num_train_trials: Optional[int],
-        models_to_run: Optional[List[str]],
-        groups_to_run: Optional[List[str]],
-        exit_on_error: bool,
-        priority: Optional[int],
-        mongo_uri: str,
-    ):
-        self.auth: Authentication = auth
-        self.conf_paths: List[str] = conf_paths
-        self.url: str = url
-        self.local: bool = local
-        self.local_path: str = local_path
-        self.output_path: str = output_path
-        self.suite: str = suite
-        self.num_threads: int = num_threads
-        self.dry_run: Optional[bool] = dry_run
-        self.skip_instances: bool = skip_instances
-        self.max_eval_instances: Optional[int] = max_eval_instances
-        self.num_train_trials: Optional[int] = num_train_trials
-        self.models_to_run: Optional[List[str]] = models_to_run
-        self.groups_to_run: Optional[List[str]] = groups_to_run
-        self.exit_on_error: bool = exit_on_error
-        self.priority: Optional[int] = priority
-        self.mongo_uri = mongo_uri
-
-    @htrack(None)
-    def run(self):
-        run_specs: List[RunSpec] = []
-        runs_dir: str = os.path.join(self.output_path, "runs")
-        suite_dir: str = os.path.join(runs_dir, self.suite)
-
-        run_entries = read_run_entries(self.conf_paths)
-
-        for entry in tqdm(run_entries.entries):
-            # Filter by priority
-            priority: int = entry.priority
-            if self.priority is not None and priority > self.priority:
-                continue
-
-            try:
-                new_run_specs = run_benchmarking(
-                    run_spec_descriptions=[entry.description],
-                    auth=self.auth,
-                    url=self.url,
-                    local=self.local,
-                    local_path=self.local_path,
-                    num_threads=self.num_threads,
-                    output_path=self.output_path,
-                    suite=self.suite,
-                    dry_run=self.dry_run,
-                    skip_instances=self.skip_instances,
-                    max_eval_instances=self.max_eval_instances,
-                    num_train_trials=self.num_train_trials,
-                    groups=entry.groups,
-                    models_to_run=self.models_to_run,
-                    groups_to_run=self.groups_to_run,
-                    mongo_uri=self.mongo_uri,
-                )
-                run_specs.extend(new_run_specs)
-
-            except Exception as e:
-                if self.exit_on_error:
-                    raise e
-                else:
-                    hlog(f"Error when running {entry.description}:\n{traceback.format_exc()}")
-
-        if len(run_specs) == 0:
-            hlog("There were no RunSpecs or they got filtered out.")
-            return
-
-        hlog(f"{len(run_entries.entries)} entries produced into {len(run_specs)} run specs")
-
-        if self.skip_instances:
-            self.write_parallel_commands(suite_dir, run_specs)
-
-        # Create a symlink runs/latest -> runs/<name_of_suite>,
-        # so runs/latest always points to the latest run suite.
-        symlink_path: str = os.path.abspath(os.path.join(runs_dir, LATEST_SYMLINK))
-        if os.path.islink(symlink_path):
-            # Remove the previous symlink if it exists.
-            os.unlink(symlink_path)
-        os.symlink(os.path.abspath(suite_dir), symlink_path)
-
-    def write_parallel_commands(self, suite_dir: str, run_specs: List[RunSpec]):
-        """
-        Print out scripts to run after.
-        """
-        # Print out all the models and groups that we're touching.
-        models = set()
-        groups = set()
-        for run_spec in run_specs:
-            models.add(run_spec.adapter_spec.model)
-            for group in run_spec.groups:
-                groups.add(group)
-        hlog(f"{len(models)} models: {' '.join(models)}")
-        hlog(f"{len(groups)} groups: {' '.join(groups)}")
-
-        # Write wrapper for helm-run that can be used through Slurm
-        lines = [
-            "#!/bin/bash",
-            "",
-            ". venv/bin/activate",
-            'helm-run "$@"',
-        ]
-        write_lines(os.path.join(suite_dir, "helm-run.sh"), lines)
-
-        # Write out bash script for launching the entire benchmark
-        lines = []
-        for model in models:
-            for group in groups:
-                # Try to match the arguments of `run_benchmarking`
-                # Build arguments
-                present_args = []
-                present_args.append(f"--confs {' '.join(self.conf_paths)}")
-                if self.local:
-                    present_args.append("--local")
-                present_args.append(f"--num-threads {self.num_threads}")
-                present_args.append(f"--suite {self.suite}")
-                if self.max_eval_instances is not None:
-                    present_args.append(f"--max-eval-instances {self.max_eval_instances}")
-                present_args.append(f"--models-to-run {model}")
-                present_args.append(f"--scenario-groups-to-run {group}")
-
-                lines.append(
-                    f"sbatch --partition john "
-                    f"--cpus {self.num_threads} "
-                    f"-o benchmark_output/runs/{self.suite}/slurm-%j.out "
-                    f"{suite_dir}/helm-run.sh "
-                    f"{' '.join(present_args)}"
-                )
-        lines.append("echo '# Run these after Slurm jobs terminate'")
-        lines.append(f"echo 'helm-run --local --suite {self.suite} --skip-instances'")
-        lines.append(f"echo 'helm-summarize --suite {self.suite}'")
-        write_lines(os.path.join(suite_dir, "run-all.sh"), lines)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=["src/helm/benchmark/presentation/run_specs.conf"],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        default=None,
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
-
-    runner = AllRunner(
-        # Use a dummy API key when `skip_instances` or `local` is set.
-        # The benchmarking framework will not make any requests to the proxy server when
-        # `skip_instances` is set, so a valid API key is not necessary.
-        # Setting `local` will run and cache everything locally.
-        auth=Authentication("") if args.skip_instances or args.local else create_authentication(args),
-        conf_paths=args.conf_paths,
-        url=args.server_url,
-        local=args.local,
-        local_path=args.local_path,
-        output_path=args.output_path,
-        suite=args.suite,
-        num_threads=args.num_threads,
-        dry_run=args.dry_run,
-        skip_instances=args.skip_instances,
-        max_eval_instances=args.max_eval_instances,
-        num_train_trials=args.num_train_trials,
-        models_to_run=args.models_to_run,
-        groups_to_run=args.groups_to_run,
-        exit_on_error=args.exit_on_error,
-        priority=args.priority,
-        mongo_uri=args.mongo_uri,
-    )
-
-    # Run the benchmark!
-    runner.run()
-
-    hlog("Done.")
-
-
-if __name__ == "__main__":
-    main()
```
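The deleted `AllRunner` wrapper above orchestrated `run_benchmarking` over every entry in `run_specs.conf`; per the `helm/benchmark/run.py +144 -48` and `entry_points.txt +2 -1` entries in the file list, that responsibility moves into `helm-run` itself in 0.2.2. A hedged sketch of the replacement invocation, assuming (not confirmed by this diff alone) that `helm-run` resolves to `helm.benchmark.run:main` and still accepts the `--conf-paths`, `--suite`, and `--max-eval-instances` flags the wrapper used:

```python
# Hedged sketch: drive the consolidated helm-run entry point programmatically.
# The entry-point target and flag names are assumptions carried over from the
# deleted wrapper's argument parser; verify against your installed version.
import sys

from helm.benchmark.run import main

sys.argv = [
    "helm-run",
    "--conf-paths", "src/helm/benchmark/presentation/run_specs.conf",
    "--suite", "v1",
    "--max-eval-instances", "10",
]
main()
```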