goPEST-0.0.11-py3-none-any.whl
- goPEST-0.0.11.dist-info/LICENSE +11 -0
- goPEST-0.0.11.dist-info/METADATA +95 -0
- goPEST-0.0.11.dist-info/RECORD +30 -0
- goPEST-0.0.11.dist-info/WHEEL +5 -0
- goPEST-0.0.11.dist-info/entry_points.txt +2 -0
- goPEST-0.0.11.dist-info/top_level.txt +1 -0
- gopest/__init__.py +11 -0
- gopest/_version.py +16 -0
- gopest/check_slaves.py +402 -0
- gopest/commands.py +80 -0
- gopest/common.py +194 -0
- gopest/data/case.pst +67 -0
- gopest/data/goPESTconfig.aut2.toml +95 -0
- gopest/data/goPESTconfig.toml +94 -0
- gopest/data/goPESTobs.list +793 -0
- gopest/data/goPESTpar.list +95 -0
- gopest/make_case_pst.py +229 -0
- gopest/obs.py +297 -0
- gopest/obs_def.py +2086 -0
- gopest/par.py +332 -0
- gopest/par_def.py +313 -0
- gopest/pest_model.py +245 -0
- gopest/rename_latest_files.py +35 -0
- gopest/run_beopest.py +205 -0
- gopest/run_ns_pr.py +617 -0
- gopest/submit_beopest.py +931 -0
- gopest/utils/__init__.py +0 -0
- gopest/utils/gener_groups.py +192 -0
- gopest/utils/t2listingh5.py +376 -0
- gopest/utils/waiwera_listing.py +587 -0
gopest/submit_beopest.py
ADDED
@@ -0,0 +1,931 @@
#!/bin/python

import os
import subprocess

from gopest.common import config as cfg
from gopest.common import runtime

"""
Run this script to submit BeoPEST jobs on NeSI using Slurm. This includes
master and slave jobs. Note the BeoPEST here uses TCP communication.

1. Modify the following paragraph of settings.
2. Set up the PEST directory properly.
3. Run "python submit_beopest.py" from the main PEST directory.

Angus Yeh
a.yeh@auckland.ac.nz
June 2015
"""
PROJECT = cfg['nesi']['project']
PROJECT_MAUI = cfg['nesi']['maui']['project']
PROJECT_MAHUIKA = cfg['nesi']['mahuika']['project']

WALLTIME_MASTER = cfg['nesi']['walltime_master']
WALLTIME_SLAVES = WALLTIME_MASTER

WALLTIME_FORWARD = cfg['nesi']['walltime_forward']

MAUI_NTASKS = cfg['nesi']['maui']['ntasks']
MAHUIKA_NTASKS = cfg['nesi']['mahuika']['ntasks']

NUM_SLAVES = cfg['pest']['num_slaves']

MEM_PER_SLAVE = "5000" # MB
MEM_MASTER = "500"
MEM_FORWARD = "4000"

# SIMULATOR = "~/bin/autough2_6b" # specify absolute path if not in system path
# SIMULATOR = "waiwera-Mahuika" # specify absolute path if not in system path
# 'waiwera': local native waiwera, installed in path
# 'waiwera-dkr': running locally using Docker with pywaiwera installed
# 'waiwera-Maui': calling submit_beopest.py and using Maui
# 'waiwera-Mahuika': calling submit_beopest.py and using Mahuika
SIMULATOR = cfg['simulator']['executable']

PST_NAME = cfg['pest']['case-name']
PESTDIR = cfg['pest']['dir']
SLAVEDIR = cfg['pest']['slave_dirs']
BEOPEST = cfg['pest']['executable']
PORT = cfg['pest']['port']

SWITCHES = " ".join(cfg['pest']['switches'])
# additional switches for beopest, eg /s for restart, /p1 to parallelise the 1st model run
# should not use /p1 with svda, no /p1 with obsreref_10 either
# /hpstart, make sure PST_NAME.hp exists
# /i jco reuse, make sure PST_NAME.jco exists
# NOTE it's possible to use /hpstart and /i together, will start update tests right away
# NOTE working directory is assumed to be where this script is launched

# if /f is used, PEST_HP runs a set of forward runs using .par files
F_PAR_SETS = ["prandom", "11", "50", "40", PST_NAME+".rrf"]
# Enter filename base of parameter value files:
# Enter first index to use:
# Enter last index to use:
# Enter parallel run packet size:
# Enter name for run results file:

# used for slurm to redirect as standard input
use_input = False
with open('_input', 'w') as f:
    # if PEST_HP asks name of jacobian file, then use file named PST_NAME + ".jco"
    if "/i" in SWITCHES:
        f.write(PST_NAME + ".jco\n")
        use_input = True
    if "/f" in SWITCHES:
        f.write("\n".join(F_PAR_SETS))
        use_input = True
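# Illustrative example only (not part of the original behaviour): assuming a
# case name of "case" and SWITCHES containing "/f", the _input file written
# above would answer the PEST_HP prompts listed earlier, one per line:
#   prandom
#   11
#   50
#   40
#   case.rrf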

SLAVES_ON_SCRATCH = False
KEEP_TARGZ_SLAVES = False
# normally when a slave ends, the slave directory will be tar-gz-ed for debugging,
# but this can be huge due to TOUGH2 listing files; these files can be summarised
# (using head and tail) to save space
SUMMARY_LARGE_FILES = "" #"*.listing"

MAIN_DIR = os.getcwd()

# use absolute path if known
if PESTDIR:
    BEOPEST = os.path.join(PESTDIR,BEOPEST)

# list of NeSI modules that need to be loaded for running scripts/AUTOUGH2 etc.
ENV_MODULES = [
    'module load gimkl/2018b',
    'module load Python-Geo/3.7.3-gimkl-2018b',
    'source /nesi/project/uoa00124/env-py3-gopest/bin/activate',
    # 18/12/2022 3:34:08 a.m.
    # # "module GCC/4.9.2",
    # # "module Python/2.7.11-foss-2015a",
    # "module load gimkl/2017a",
    # "module load GCC/7.1.0",
    # "module load Python-Geo/2.7.14-gimkl-2017a",
]
# user can load these modules by command 'source _load_modules.sh'
with open('_load_modules.sh', 'w') as f:
    f.write('\n'.join([
        "echo !!! Please source this file to modify environment in calling shell",
        "echo \" 'source _load_modules.sh'\"",
    ] + ENV_MODULES))

ENV_MODULES_MAUI = cfg['nesi']['maui']['env_init']
ENV_MODULES_MAHUIKA = cfg['nesi']['mahuika']['env_init']

# communication files:
# pest files
# goPEST*
# pest_model*
# real_model* (from gopest.common.runtime)
# data_* (from toml)
# gs_* (from toml)

pst_name = cfg['pest']['case-name']
slave_files = [
    pst_name + '.pst',
    # communication files:
    "_input",
    "_pest_dir",
    "_tough2",
    "_logfile",
    "_master_out",
    # goPEST working files
    "pest_model.ins",
    "pest_model.tpl",
    "goPESTconfig.toml",
    "goPESTpar.list",
    "goPESTobs.list",
    # user supplied function, may not exist
    "goPESTuser.py",
]
# forward run files -> generated from toml
slave_files += [
    # runtime['filename']['save'],
    runtime['filename']['incon'],
    runtime['filename']['dat_orig'],
]
slave_files += runtime['filename']['all_geoms']
slave_files += runtime['filename']['dat_seq']
# slave_files += runtime['filename']['lst_seq']
# user files -> copied from toml
slave_files += cfg['files']['slave']

master_files = slave_files
if "/hpstart" in cfg['pest']['switches']:
    master_files.append(pst_name + '.hp')
if "/i" in cfg['pest']['switches']:
    master_files.append(pst_name + '.jco')

master_files += cfg['files']['master']

# All files needed in slave directory
# TODO: should have both master_files and slave_files
FILE_LIST = " ".join(slave_files)

"""
Details
-------

This explains the working solution for using BeoPEST on NeSI, which is
implemented by this script.

This version of BeoPEST uses TCP; MPI is not supported with some of the SVD
features John Doherty added (11/12/2014).

Slurm job files generated for submission:

_master_job.sl -> this is the job file for launching the master beopest along
                  with a set of slaves, using N+1 cpus
_slaves_job.sl -> job file for slaves, using N cpus
_multi.conf -> used by the slaves job file, calls _run_a_slave.sh
_run_master.sh -> record which host it is running on, run master beopest
_run_a_slave.sh -> copy required files to unique slave directories, then
                   launch a single beopest in slave mode.

_master_dir -> so slave jobs know where the master directory is
_master_out -> so slave jobs know where to print information
_pest_dir -> so slaves know where to find PEST/BeoPEST utilities
_tough2 -> so slaves know which simulator (SIMULATOR) to use

This set of scripts/job files basically launches a beopest master in the main
directory by calling:

    beopest case /h :4004

In the master job file, a command is used to detect which host it is run on. The
info is saved to a file _master_host in the main directory. Hence the slaves
are set to run with a small delay, to ensure the _master_host file exists.

Then an sbatch --dependency after:JOB_ID is used to launch slaves. Note it's
after: not afterok: as seen in the examples. This runs the slaves after the
master STARTED (instead of finished).

The slaves job file uses srun --multi-prog, which reads the _multi.conf file
that tells it how to launch each slave. Each slave is launched using a bash
script. The script includes the code to:
    1. create a unique slave dir, and copy required files
    2. change directory
    3. obtain the hostname of the master beopest (from file _master_host)
    4. launch beopest case /h masterhost:4004

Also, something bad might happen if more than one BeoPEST master uses the same
communication port. So it's a good idea to set PORT to something different
when you receive this script. Usually some large number is okay.

Tips For Users
--------------
- You can launch more slave jobs by copying the commands shown on screen after
  submission. Note the dependency jobid is important.
- Job ids are shown on the screen, you can cancel jobs by "scancel JOBID"
- cleanups can be done by:
    rm -r slave_*
    rm *.out
    rm _*
  (be careful not to delete your own files, if they match the names here.)
- On linux, you have to change a script into an 'executable' if you want to
  run it directly like "./myscript.sh", instead of "bash myscript.sh".
  This requires you to issue the command:
    chmod +x myscript.sh
- Within each of the slave directories, you can find a file called
  _master_dir; this file contains the master's working directory. This is
  useful if some scripts running on a slave need to know where the master
  job is located. (I use this to aid the transfer of updated .incon files
  across all slaves.) You can use a command like this to get it back into
  a bash variable:
    MASTERDIR=`cat _master_dir`
    cp ${MASTERDIR}/a ./b
- I use rsync to copy everything in one dir to another. This is useful when
  you want to make some changes to a set of PEST setup/files. Be careful
  with the slashes here. eg.
    rsync -ar --progress case_1/ case_wai_2

NeSI related:
- Read the first page when you ssh into the nesi login node.
- use the command show_my_projects to see what projects you are on; these
  project directories should be where you keep model files etc.
- compiling must be done on the build nodes, go to a build node by ssh
  build-wm.
- use private keys to simplify the login/scp procedure, on Windows I use
  Putty and related tools such as WinSCP etc.


TODOs
-----
- make cleaning up easier

"""
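# In short (an illustrative summary of the docstring above, not extra
# behaviour): submit_cli() writes the job files and then runs
#   sbatch _master_job.sl
# and prints the matching
#   sbatch --dependency after:<master job id> _slaves_job.sl
# command for adding more slaves. Inside the jobs, the master runs
#   beopest <case> <switches> /h :<PORT>
# and each slave runs
#   beopest <case> /h <masterhost>:<PORT>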
# python 2.6 does not support the newer/easier subprocess.check_output
def check_output_old(cmd):
    ''' so use os.system and capture the output via a temporary file '''
    import tempfile
    handle, fname = tempfile.mkstemp(prefix='tmpout', dir='.', text=True)
    code = os.system(cmd + " > " + fname)
    with open(fname, 'r') as f:
        out = f.readlines()
    os.close(handle)
    os.remove(fname)
    if code != 0:
        print("Command %s failed: %s" % (cmd, "".join(out)))
    return "".join(out)

def check_output(cmd):
    return subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True).decode()

def write_to(filename, line):
    with open(filename, 'w') as f:
        f.write(line)
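# For example (illustrative, mirroring how check_output is used further down):
#   out = check_output("sbatch _forward.sl")   # e.g. "Submitted batch job 123456"
#   jobid = out.strip().split()[3]             # -> "123456"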

def get_master_dir():
    """ _master_dir is created and copied to slave dirs """
    try:
        with open('_master_dir', 'r') as f:
            line = f.readlines()[0].strip()
            return line
    except:
        return '..'

def get_slave_id():
    """ returns a string as id, needs to be called from within a 'slave' directory.
    NOTE _procid should be written by the master program that invokes slaves.
    Otherwise the current (sub) directory name will be used. """
    try:
        with open('_procid', 'r') as f:
            line = f.readlines()[0].strip()
            return line
    except:
        # get only the last part of the current working dir
        return os.path.basename(os.path.normpath(os.getcwd()))

def get_master_out():
    try:
        with open('_master_out', 'r') as f:
            line = f.readlines()[0].strip()
            return line
    except:
        return 'STDOUT'

def create_job_name():
    """ use the directory's name, with spaces and underscores removed, for now. """
    name = os.path.split(check_output("pwd"))[-1].replace(' ','').replace('_','')
    if name.startswith('case'):
        name = 'c' + name[4:]
    return name.strip()
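# Illustrative example (hypothetical directory name): for a working directory
# named "case_wai 2", removing spaces and underscores gives "casewai2", and the
# leading "case" is shortened to "c", so the job name becomes "cwai2".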

def gen_master_sl(fname="_master_job.sl"):
    jobname = create_job_name()
    txt = [
        "#!/bin/bash",
        "#SBATCH -J %s" % jobname,
        "#SBATCH -A %s # Project Account" % PROJECT,
        "#SBATCH --time=%s # Walltime" % WALLTIME_MASTER,
        # "#SBATCH --ntasks=1 # number of tasks",
        # "#SBATCH --mem=%s # memory/cpu (in MB)" % MEM_MASTER,
        # # "#SBATCH --workdir=%s # working dir" % MAIN_DIR,
        # "#SBATCH --ntasks=%i # number of tasks" % int(NUM_SLAVES / 5),
        "#SBATCH --ntasks=1 # number of tasks",
        "#SBATCH --cpus-per-task=%i # number of CPUs" % max(int(NUM_SLAVES / 5),1),
        "#SBATCH --overcommit # allow many tasks on one CPU",
        "#SBATCH --mem=%i # memory/cpu (in MB)" % (int(MEM_PER_SLAVE) * int(NUM_SLAVES / 5) + int(MEM_MASTER)),
        "#SBATCH --profile task",
        "#SBATCH --acctg-freq=1",
    ]
    if use_input:
        txt += ["#SBATCH --input=_input"]
    txt += [
        "",
        "echo running %s..." % fname,
        "",
        "rm -rf _jobs",
        "mkdir _jobs",
        "",
        "function finish {",
        " cd _jobs",
        " for f in *",
        " do",
        " echo EXIT $SLURM_JOB_ID, master script cancelling child job: $f",
        " scancel --clusters=maui,mahuika $f",
        " done",
        " cd ..",
        "}",
        "trap finish EXIT",
        "",
    ]
    txt += ENV_MODULES
    txt += [
        "",
        "MASTERDIR=`pwd`",
        "echo ${MASTERDIR} > _master_dir",
        "",
        # "rm -f run_ns_pr.log",
        # "echo ${MASTERDIR}/run_ns_pr.log > _logfile",
        "",
        "# run master",
        # "srun --exclusive -n1 bash %s/_run_master.sh" % MAIN_DIR,
        "bash %s/_run_master.sh &" % MAIN_DIR,
        "",
        "",
        "# run slaves",
        "echo starting %i PEST slaves on single node..." % NUM_SLAVES,
        "for i in {1..%i}" % NUM_SLAVES,
        "do",
        " bash _run_a_slave.sh ${i} &",
        "done",
        "",
        "# for both background shell jobs (master and slaves srun) to finish",
        "wait",
        "",
    ]
    with open(fname, 'w') as f:
        f.write("\n".join(txt))

def gen_slaves_sl(fname="_slaves_job.sl"):
    # be careful: this might not work if the memory string contains text such as 'MB'
    jobname = 'S_' + create_job_name()
    txt = "\n".join([
        "#!/bin/bash",
        "#SBATCH -J %s" % jobname,
        "#SBATCH -A %s # Project Account" % PROJECT,
        "#SBATCH --time=%s # Walltime" % WALLTIME_SLAVES,
        "#SBATCH --ntasks=1 # number of tasks",
        "#SBATCH --cpus-per-task=%i # number of CPUs" % max(int(NUM_SLAVES / 10),1),
        "#SBATCH --overcommit # allow many tasks on one CPU",
        "#SBATCH --mem=%i # memory/cpu (in MB)" % (int(MEM_PER_SLAVE) * int(NUM_SLAVES / 5)),
        "#SBATCH --profile task",
        "#SBATCH --acctg-freq=1",
        # "#SBATCH --workdir=%s # working dir" % MAIN_DIR,
        "echo running %s..." % fname,
    ] + ENV_MODULES + [
        "",
        "# run slaves",
        "echo starting %i PEST slaves on single node..." % NUM_SLAVES,
        "for i in {1..%i}" % NUM_SLAVES,
        "do",
        # " echo slave id ${i}",
        " bash _run_a_slave.sh ${i} &",
        "done",
        "",
        "# for both background shell jobs (master and slaves srun) to finish",
        "wait",
        "",
    ])
    f = open(fname, 'w')
    f.write(txt)
    f.close()

def gen_forward_sl(cmd, fname="_forward.sl"):
    cwd = os.getcwd()
    txt = "\n".join([
        "#!/bin/bash",
        "#SBATCH -J %s" % get_slave_id(),
        "#SBATCH -A %s # Project Account" % PROJECT,
        "#SBATCH --time=%s # Walltime" % WALLTIME_FORWARD,
        "#SBATCH --ntasks=1 # number of tasks",
        "#SBATCH --mem=%s # memory/cpu (in MB)" % MEM_FORWARD,
        # "#SBATCH --workdir=%s # working dir" % cwd,
        # "#SBATCH --output=%s # print to master .out" % get_master_out(),
        # "#SBATCH --error=%s # print to master .out" % get_master_out(),
        # "#SBATCH --open-mode=append # make sure don't overwrite master out",
        "",
        "function finish {",
        " echo -- forward job $SLURM_JOB_ID exiting at $(pwd)",
        " rm _status_on_nesi",
        " rm %s/_jobs/$SLURM_JOB_ID" % get_master_dir(),
        "}",
        "trap finish EXIT",
        "",
        "",
        "echo -- forward job $SLURM_JOB_ID starting at $(pwd)",
        "touch %s/_jobs/$SLURM_JOB_ID" % get_master_dir(),
        "",
    ] + ENV_MODULES + [
        "",
        "srun %s" % cmd,
        "",
    ])
    with open(fname, 'w') as f:
        f.write(txt)

def gen_forward_mahuika_sl(cmd, fname="_forward.sl"):
    cwd = os.getcwd()
    txt = "\n".join([
        "#!/bin/bash -e",
        "#SBATCH -J %s" % get_slave_id(),
        "#SBATCH -A %s # Project Account" % PROJECT_MAHUIKA,
        "#SBATCH --export=NONE # don't carry env over",
        "#SBATCH --time=%s # Walltime" % WALLTIME_FORWARD,
        "#SBATCH --ntasks=%i # number of tasks" % MAHUIKA_NTASKS,
        "#SBATCH --mem=%s # memory/cpu (in MB)" % MEM_FORWARD,
        #"#SBATCH --mem-per-cpu=%i # memory/cpu (in MB)" % int(float(MEM_FORWARD)/float(MAHUIKA_NTASKS)),
        #"#SBATCH --mem=%s # memory/cpu (in MB)" % MEM_FORWARD,
        #"#SBATCH --output=%s # print to master .out" % get_master_out(),
        #"#SBATCH --error=%s # print to master .out" % get_master_out(),
        #"#SBATCH --open-mode=append # make sure don't overwrite master out",
        "",
        "function finish {",
        " echo -- forward job $SLURM_JOB_ID exiting at $(pwd)",
        " rm _status_on_nesi",
        " rm %s/_jobs/$SLURM_JOB_ID" % get_master_dir(),
        "}",
        "trap finish EXIT",
        "",
        "",
        "echo -- forward job $SLURM_JOB_ID starting at $(pwd)",
        "touch %s/_jobs/$SLURM_JOB_ID" % get_master_dir(),
        "",
    ] + ENV_MODULES_MAHUIKA + [
        "",
        "export SLURM_EXPORT_ENV=ALL",
        "srun %s" % cmd,
        "",
    ])
    with open(fname, 'w') as f:
        f.write(txt)

def gen_forward_maui_sl(cmd, fname="_forward.sl"):
    cwd = os.getcwd()
    txt = "\n".join([
        "#!/bin/bash -e",
        "#SBATCH -J %s" % get_slave_id(),
        "#SBATCH -A %s # Project Account" % PROJECT_MAUI,
        "#SBATCH --clusters=maui # from mahuika to maui",
        "#SBATCH --export=NONE # don't carry env over",
        "#SBATCH --time=%s # Walltime" % WALLTIME_FORWARD,
        "#SBATCH --ntasks=%i # number of tasks" % MAUI_NTASKS,
        #"#SBATCH --mem-per-cpu=%i # memory/cpu (in MB)" % int(float(MEM_FORWARD)/float(MAUI_NTASKS)),
        "#SBATCH --mem=%s # memory/cpu (in MB)" % MEM_FORWARD,
        #"#SBATCH --output=%s # print to master .out" % get_master_out(),
        #"#SBATCH --error=%s # print to master .out" % get_master_out(),
        #"#SBATCH --open-mode=append # make sure don't overwrite master out",
        "#SBATCH --partition=nesi_research",
        #"#SBATCH --qos=nesi_debug",
        "",
        "function finish {",
        " echo -- forward job $SLURM_JOB_ID exiting at $(pwd)",
        " rm _status_on_nesi",
        " rm %s/_jobs/$SLURM_JOB_ID" % get_master_dir(),
        "}",
        "trap finish EXIT",
        "",
        "",
        "echo -- forward job $SLURM_JOB_ID starting at $(pwd)",
        "touch %s/_jobs/$SLURM_JOB_ID" % get_master_dir(),
        "",
    ] + ENV_MODULES_MAUI + [
        "",
        "export SLURM_EXPORT_ENV=ALL",
        "srun %s" % cmd,
        "",
    ])
    with open(fname, 'w') as f:
        f.write(txt)

def gen_run_master(fname="_run_master.sh"):
    if use_input:
        cmd = "%s %s %s /h :%s < _input" % (BEOPEST, PST_NAME, SWITCHES, PORT)
    else:
        cmd = "%s %s %s /h :%s" % (BEOPEST, PST_NAME, SWITCHES, PORT)
    txt = "\n".join([
        "#!/bin/bash",
        "",
        "echo Master working at $(hostname)",
        "echo $(hostname) > _master_host",
        "",
        "echo $(pwd)/slurm-${SLURM_JOB_ID}.out > _master_out",
        "",
        "echo Master running command: %s" % cmd,
        cmd,
        "",
    ])
    f = open(fname, 'w')
    f.write(txt)
    f.close()

def gen_run_single_slave(fname="_run_a_slave.sh"):
    # use jobid and procid to ensure slave directories are unique.
    # add ./ into PATH so that PEST can run model files without ./
    # "slave_${SLURM_JOB_ID}_${SLURM_STEP_ID}_${SLURM_PROCID}_${SLURM_LOCALID}_$1"
    slave_name = "slave_${SLURM_JOB_ID}_$1"
    slave_dir = os.path.join(SLAVEDIR, slave_name)
    cmd = "%s %s /h ${MASTERHOST}:%s" % (BEOPEST, PST_NAME, PORT)
    lines = [
        "#!/bin/bash",
        "",
        "echo running %s $1 in %s..." % (fname, slave_dir),
        "",
        "sleep 5",
        "mkdir -pv %s" % slave_dir,
        # "cp * %s/" % slave_dir,
        "cp %s %s/" % (FILE_LIST, slave_dir),
        "cd %s" % slave_dir,
        "echo A slave working at $(hostname):%s" % slave_dir,
        "echo %s > _master_dir" % MAIN_DIR, # needed for model scripts to copy incon/save
        "echo ${SLURM_JOB_ID}_$1 > _procid", # needed for obsreref book keeping
        "MASTERHOST=`awk '{print $1}' %s/_master_host`" % MAIN_DIR,
        "export PATH=./:$PATH",
        "echo Slave $(hostname)_${1} running command: %s" % cmd,
        # "sysctl fs.file-nr",
        # "chmod a+x model.bat",
        # "chmod a+x r_model.bat",
        # "chmod a+x d_model.bat",
        cmd,
        "",]
    if SUMMARY_LARGE_FILES:
        lines += [
            "# head and tail big result files",
            "for f in %s" % SUMMARY_LARGE_FILES,
            "do",
            " head --lines=20 $f > $f.head",
            " tail --lines=20 $f > $f.tail",
            " echo Removing $f...",
            " rm $f",
            "done",
            "",]
    if KEEP_TARGZ_SLAVES:
        # lines += [
        #     "cd ..",
        #     "",]
        lines += [
            "tar -zcf %s/%s.tar.gz %s" % (MAIN_DIR, slave_name, slave_dir),
            "",]
    with open(fname, 'w') as f:
        f.write("\n".join(lines))

def gen_test_dir(fname="_copy_to_test.sh"):
    # make a test directory, used for manually checking slave directory
    slave_dir = "%s/test" % MAIN_DIR
    txt = "\n".join([
        "#!/bin/bash",
        "",
        "mkdir -pv %s" % slave_dir,
        "cp %s %s/" % (FILE_LIST, slave_dir),
    ])
    f = open(fname, 'w')
    f.write(txt)
    f.close()

def sbatch_check(cmd, retry_sec=None, retry_limit=10):
    """ this runs sbatch and detects errors, only returns the slurm job id if
    successful. If retry_sec is not None, sbatch will be called again after
    sleeping retry_sec seconds, with a limited number of tries.
    """
    import sys
    import time
    if retry_sec is None:
        retry_limit = 0
    i = 0
    jobid = None
    while i < (retry_limit + 1):
        print("running %s ..." % cmd)
        out = check_output(cmd)
        i += 1
        try:
            jobid = out.strip().split()[3]
            break
        except IndexError:
            print("sbatch error, try no. %i: %s" % (i, out.strip()))
            if retry_sec is not None:
                # wait before retrying, as described in the docstring
                time.sleep(retry_sec)
        except:
            print("sbatch unexpected error:", sys.exc_info()[0])
            raise
        #if 'error' in out:
        #    print("%i: %s" % (i, out.strip()))
        #else:
        #    jobid = out.strip().split()[3]
        #    break
    return jobid
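# Typical use, as in the forward-run branches of submit_cli() below
# (illustrative only):
#   jobid = sbatch_check("sbatch _forward.sl", retry_sec=30, retry_limit=50)
#   if jobid is None:
#       print("submission failed after all retries")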

def proc_args():
    # couldn't use argparse because NeSI Python still defaults to 2.6
    import sys
    def get_opt(o):
        for i,s in enumerate(sys.argv[1:]):
            if s == o:
                return sys.argv[1:][i+1]
    def usage():
        print('\n'.join([
            "This script submits a BeoPEST/PEST_HP job on NeSI.",
            "  -f, --forward COMMAND",
            "      Submit a single forward run, COMMAND must be supplied.",
            "      Optional, if not supplied the whole PEST master/slaves run",
            "      will be submitted instead",
            "  -f2, --forward2 COMMAND",
            "      This is the same as -f/--forward, but instead of using the",
            "      less reliable 'sbatch --wait', it uses 'swait'.",
            "  -f3, --forward3 COMMAND",
            "      This is the same as -f/--forward, but uses a file-locking",
            "      mechanism instead of swait/sbatch, which rely on good",
            "      communication",
            "  -f3mahuika, --forward3mahuika COMMAND",
            "      This is the same as -f3/--forward3, but submits to Mahuika",
            "      instead.",
            "  -f3maui, --forward3maui COMMAND",
            "      This is the same as -f3/--forward3, but submits to Maui",
            "      instead.",
            "  -f3x, --forward3x COMMAND",
            "      This is the same as -f3/--forward3, but submission to",
            "      Maui or Mahuika depends on settings in goPESTconfig.toml",
            "  --dirs DIR_PATTERN --jobnowait COMMAND",
            "      Used to submit many jobs of COMMAND in directories specified",
            "      by DIR_PATTERN",
            "  --cancel",
            "      Cancel ALL jobs originated from the current directory (_jobs).",
            ]))
    option = {
        "forward": None,
        "forward2": None,
        "forward3": None,
        "forward3mahuika": None,
        "forward3maui": None,
        "forward3x": None,
        "dirs": None,
        "jobnowait": None,
        "cancel": False,
    }
    if '--help' in sys.argv[1:]:
        usage()
    elif '-h' in sys.argv[1:]:
        usage()
    elif '--forward' in sys.argv[1:]:
        option['forward'] = get_opt('--forward')
    elif '-f' in sys.argv[1:]:
        option['forward'] = get_opt('-f')
    elif '--forward2' in sys.argv[1:]:
        option['forward2'] = get_opt('--forward2')
    elif '-f2' in sys.argv[1:]:
        option['forward2'] = get_opt('-f2')
    elif '--forward3' in sys.argv[1:]:
        option['forward3'] = get_opt('--forward3')
    elif '-f3' in sys.argv[1:]:
        option['forward3'] = get_opt('-f3')
    elif '--forward3mahuika' in sys.argv[1:]:
        option['forward3mahuika'] = get_opt('--forward3mahuika')
    elif '-f3mahuika' in sys.argv[1:]:
        option['forward3mahuika'] = get_opt('-f3mahuika')

    elif '--forward3maui' in sys.argv[1:]:
        option['forward3maui'] = get_opt('--forward3maui')
    elif '-f3maui' in sys.argv[1:]:
        option['forward3maui'] = get_opt('-f3maui')
    elif '--forward3x' in sys.argv[1:]:
        option['forward3x'] = get_opt('--forward3x')
    elif '-f3x' in sys.argv[1:]:
        option['forward3x'] = get_opt('-f3x')
    elif '--dirs' in sys.argv[1:]:
        option['dirs'] = get_opt('--dirs')
        option['jobnowait'] = get_opt('--jobnowait')
    elif '--cancel' in sys.argv[1:]:
        option['cancel'] = True
    return option
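# Example invocations (illustrative; the forward COMMAND shown is hypothetical
# and depends on the local model setup):
#   python submit_beopest.py                          # submit master + slaves
#   python submit_beopest.py -f3x "python pest_model.py"
#   python submit_beopest.py --cancel                 # cancel jobs listed in _jobs/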

def submit_cli(argv=[]):
    if cfg['mode'] != 'nesi':
        raise Exception('Error! gopest submit can only run with mode = "nesi" ')

    from time import sleep
    # You can use chain jobs to create dependencies between jobs.
    # SLURM has an option -d or "--dependency" that allows you to
    # specify that a job is only allowed to start if another job finished.
    #
    # use 'after:' instead of 'afterok:' to start slaves after the master STARTED
    # (not after it's done)

    SCRIPT_DIR = os.getcwd()
    option = proc_args()

    ### print basic info
    print("You are working under directory: %s" % SCRIPT_DIR)
    write_to('_pest_dir', PESTDIR)
    write_to('_tough2', SIMULATOR)
    write_to('_master_dir', SCRIPT_DIR)

    # for job management, keeps a list of jobs submitted from this directory
    if not os.path.exists('_jobs'):
        os.makedirs('_jobs')

    ### if forward only, submit a single run job
    if option['forward'] is not None:
        gen_forward_sl(option['forward'])
        # important to have --wait here to block until the job finishes
        jobid = sbatch_check("sbatch --wait _forward.sl", retry_sec=30)
        if jobid is not None:
            ttime = check_output("sacct -j %s.0 -o totalcpu -n" % jobid).strip()
            print("\nForward job %s finished after %s" % (jobid, ttime))
        else:
            print("\nFailed to submit _forward.sl")
        exit()
    elif option['forward2'] is not None:
        """ This is the same as -f/--forward, but uses the more reliable swait.

        It happened a few times that sbatch --wait returned early with the error:
        sbatch: error: slurm_receive_msg: Socket timed out on send/recv
        operation. This may be triggered when the slurm daemon is very busy. The
        actual job in this case can still be running. So Gene (from NeSI)
        suggested using the swait script.
        """
        gen_forward_sl(option['forward2'])
        jobid = check_output("sbatch _forward.sl").strip().split()[3]
        print("\nJob %s submitted." % jobid)
        print("/share/bin/swait %s" % jobid)
        os.system("/share/bin/swait %s" % jobid)
        ttime = check_output("sacct -j %s.0 -o totalcpu -n" % jobid).strip()
        print("\nForward job %s finished after %s" % (jobid, ttime))
        exit()
    elif option['forward3'] is not None:
        """ This is the same as -f/--forward, but uses lock files instead of
        swait or sbatch --wait, which rely on NeSI's communications.
        """
        gen_forward_sl(option['forward3'])
        # _status_on_nesi will be removed once _forward.sl finishes,
        # regardless of how it terminated.
        with open('_status_on_nesi', 'w') as f:
            pass
        import random
        wait_t = random.random() # * 20.0 * 60.0
        print('.. waiting %f sec before submit ..' % wait_t)
        sleep(wait_t)
        jobid = sbatch_check("sbatch _forward.sl", retry_sec=30, retry_limit=50)
        if jobid is not None:
            while os.path.isfile('_status_on_nesi'):
                sleep(120)
            ttime = check_output("sacct -j %s.0 -o totalcpu -n" % jobid).strip()
            print("\nForward job %s finished after %s" % (jobid, ttime))
        else:
            os.remove('_status_on_nesi')
            print("\nFailed to submit _forward.sl")
        exit()
    elif option['forward3x'] is not None:
        """ This is the same as -f/--forward, but uses lock files instead of
        swait or sbatch --wait, which rely on NeSI's communications.
        """
        if cfg['nesi']['cluster_forward'] == 'mahuika':
            cmd = cfg['nesi']['mahuika']['executable'] + ' ' + option['forward3x']
            print('submit_beopest.py runs command (mahuika) ' + cmd)
            gen_forward_mahuika_sl(cmd)
        elif cfg['nesi']['cluster_forward'] == 'maui':
            cmd = cfg['nesi']['maui']['executable'] + ' ' + option['forward3x']
            print('submit_beopest.py runs command (maui) ' + cmd)
            gen_forward_maui_sl(cmd)
        else:
            raise Exception('only supports mahuika or maui')
        # _status_on_nesi will be removed once _forward.sl finishes,
        # regardless of how it terminated.
        with open('_status_on_nesi', 'w') as f:
            pass
        import random
        wait_t = random.random() * 5.0 # * 20.0 * 60.0
        print('.. waiting %f sec before submit ..' % wait_t)
        sleep(wait_t)
        jobid = sbatch_check("sbatch _forward.sl", retry_sec=30, retry_limit=50)
        if jobid is not None:
            while os.path.isfile('_status_on_nesi'):
                sleep(30)
            ttime = check_output("sacct --clusters=mahuika -j %s.0 -o totalcpu -n" % jobid).strip()
            print("\nForward job %s finished after %s" % (jobid, ttime))
        else:
            os.remove('_status_on_nesi')
            print("\nFailed to submit _forward.sl")
        exit()
    elif option['forward3mahuika'] is not None:
        """ This is the same as -f/--forward, but uses lock files instead of
        swait or sbatch --wait, which rely on NeSI's communications.
        """
        gen_forward_mahuika_sl(option['forward3mahuika'])
        # _status_on_nesi will be removed once _forward.sl finishes,
        # regardless of how it terminated.
        with open('_status_on_nesi', 'w') as f:
            pass
        import random
        wait_t = random.random() # * 20.0 * 60.0
        print('.. waiting %f sec before submit ..' % wait_t)
        sleep(wait_t)
        jobid = sbatch_check("sbatch _forward.sl", retry_sec=30, retry_limit=50)
        if jobid is not None:
            while os.path.isfile('_status_on_nesi'):
                sleep(120)
            ttime = check_output("sacct --clusters=mahuika -j %s.0 -o totalcpu -n" % jobid).strip()
            print("\nForward job %s finished after %s" % (jobid, ttime))
        else:
            os.remove('_status_on_nesi')
            print("\nFailed to submit _forward.sl")
        exit()
    elif option['forward3maui'] is not None:
        """ This is the same as -f/--forward, but uses lock files instead of
        swait or sbatch --wait, which rely on NeSI's communications.
        """
        gen_forward_maui_sl(option['forward3maui'])
        # _status_on_nesi will be removed once _forward.sl finishes,
        # regardless of how it terminated.
        with open('_status_on_nesi', 'w') as f:
            pass
        import random
        wait_t = random.random() # * 20.0 * 60.0
        print('.. waiting %f sec before submit ..' % wait_t)
        sleep(wait_t)
        jobid = sbatch_check("sbatch _forward.sl", retry_sec=30, retry_limit=50)
        if jobid is not None:
            while os.path.isfile('_status_on_nesi'):
                sleep(120)
            ttime = check_output("sacct --clusters=maui -j %s.0 -o totalcpu -n" % jobid).strip()
            print("\nForward job %s finished after %s" % (jobid, ttime))
        else:
            os.remove('_status_on_nesi')
            print("\nFailed to submit _forward.sl")
        exit()
    elif option['dirs'] is not None and option['jobnowait'] is not None:
        import glob
        import shutil
        write_to('_submit.out', '\n'.join([
            'Running --dirs "%s" --jobnowait "%s"' % (option['dirs'], option['jobnowait']),
            '',
            ]))
        for d in glob.glob(option['dirs']):
            shutil.copy('_master_dir', d)
            shutil.copy('_tough2', d)
            cwd = os.getcwd()
            os.chdir(d)
            write_to('_master_out', cwd+'/_submit.out')
            gen_forward_sl(option['jobnowait'])
            import random
            wait_t = random.random() * 20.0 * 60.0
            print('.. waiting %f sec before submit ..' % wait_t)
            sleep(wait_t)
            jobid = sbatch_check("sbatch _forward.sl", retry_sec=30, retry_limit=50)
            os.chdir(cwd)
        exit()
    elif option['cancel'] is True:
        import glob
        for f in glob.glob('_jobs/*'):
            print('Cancelling %s' % os.path.basename(f))
            os.system('scancel --clusters=maui,mahuika %s' % os.path.basename(f))
        exit()


    ### generate slurm scripts etc.
    gen_master_sl()
    gen_slaves_sl()
    gen_run_master()
    gen_run_single_slave()
    gen_test_dir()

    # slave files and dir will be handled within _run_a_slave.sh

    ### submit beopest master job, and get job id, master.sl will record hostname
    out = check_output("sbatch _master_job.sl").strip()
    print("BeoPEST/PEST_HP Master and Slaves: ", out)
    dependency = out.split()[3]

    write_to('_master_slurm_id', dependency)

    ### submit beopest slaves job, set to run after the master has started
    cmd = "sbatch --dependency after:%s _slaves_job.sl" % dependency
    #out = check_output(cmd).strip()
    #print("BeoPEST Slaves: ", out)
    print("Add more slaves (after master job is submitted) by using command:")
    print("    %s" % cmd)


if __name__ == "__main__":
    submit_cli()