rda-python-common 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rda_python_common/PgCMD.py +603 -0
- rda_python_common/PgDBI.py +2306 -0
- rda_python_common/PgFile.py +3118 -0
- rda_python_common/PgLOG.py +1689 -0
- rda_python_common/PgLock.py +640 -0
- rda_python_common/PgOPT.py +1740 -0
- rda_python_common/PgSIG.py +1164 -0
- rda_python_common/PgSplit.py +299 -0
- rda_python_common/PgUtil.py +1854 -0
- rda_python_common/__init__.py +0 -0
- rda_python_common/pg_cmd.py +493 -0
- rda_python_common/pg_dbi.py +1885 -0
- rda_python_common/pg_file.py +2462 -0
- rda_python_common/pg_lock.py +533 -0
- rda_python_common/pg_log.py +1352 -0
- rda_python_common/pg_opt.py +1447 -0
- rda_python_common/pg_pass.py +92 -0
- rda_python_common/pg_sig.py +879 -0
- rda_python_common/pg_split.py +260 -0
- rda_python_common/pg_util.py +1534 -0
- rda_python_common/pgpassword.py +92 -0
- rda_python_common-2.0.0.dist-info/METADATA +20 -0
- rda_python_common-2.0.0.dist-info/RECORD +27 -0
- rda_python_common-2.0.0.dist-info/WHEEL +5 -0
- rda_python_common-2.0.0.dist-info/entry_points.txt +3 -0
- rda_python_common-2.0.0.dist-info/licenses/LICENSE +21 -0
- rda_python_common-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1164 @@
|
|
|
1
|
+
#
|
|
2
|
+
###############################################################################
|
|
3
|
+
#
|
|
4
|
+
# Title : PgSIG.py
|
|
5
|
+
#
|
|
6
|
+
# Author : Zaihua Ji, zji@ucar.edu
|
|
7
|
+
# Date : 08/05/2020
|
|
8
|
+
# 2025-01-10 transferred to package rda_python_common from
|
|
9
|
+
# https://github.com/NCAR/rda-shared-libraries.git
|
|
10
|
+
# Purpose : python library module for start and control daemon process
|
|
11
|
+
#
|
|
12
|
+
# Github : https://github.com/NCAR/rda-python-common.git
|
|
13
|
+
#
|
|
14
|
+
###############################################################################
|
|
15
|
+
#
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
import sys
|
|
19
|
+
import errno
|
|
20
|
+
import signal
|
|
21
|
+
import time
|
|
22
|
+
from contextlib import contextmanager
|
|
23
|
+
from . import PgLOG
|
|
24
|
+
from . import PgDBI
|
|
25
|
+
|
|
26
|
+
VUSERS = []     # login names allowed to start this daemon; an empty list disables the check
CPIDS = {}      # child pid -> process name; allow up to 'mproc' processes at one time for daemon
CBIDS = {}      # allow up to 'bproc' background processes at one time for each child
# paths used when redirecting the standard streams; 'ERR'/'OUT' record the
# files currently opened for sys.stderr/sys.stdout by set_dump()
SDUMP = {
    'DEF' : '/dev/null',
    'ERR' : '',
    'OUT' : ''
}

# shared daemon state and timing settings used throughout this module
PGSIG = {
    'QUIT'  : 0,      # 1 if QUIT signal received, quit server if no child
    'MPROC' : 1,      # default number of multiple processes
    'BPROC' : 1,      # default number of multiple background processes
    'ETIME' : 20,     # default error waiting time (in seconds)
    'WTIME' : 120,    # default waiting time (in seconds)
    'DTIME' : 600,    # the daemon record refresh time (in seconds)
    'RTIME' : 2400,   # the web rda config unlocking and unconfigured system down waiting time (in seconds)
    'CTIME' : 4800,   # the lock cleaning & configured system down waiting time (in seconds)
    'PPID'  : -1,     # 1 - server, (> 1) - child, 0 - non-daemon mode
    'PID'   : 0,      # current process ID
    'DNAME' : '',     # daemon name
    'DSTR'  : '',     # string for daemon with user login name
    'MTIME' : 0,      # maximum daemon running time in seconds, 0 for unlimited
    'STIME' : 0,      # time the daemon is started
    'STRTM' : '',     # string format of 'STIME'
}
|
|
52
|
+
|
|
53
|
+
#
|
|
54
|
+
# add users for starting this daemon
|
|
55
|
+
#
|
|
56
|
+
def add_vusers(user = None, mores = None):
    """Register login names that may start this daemon.

    user  - a single login name to add; a false value resets the list first
    mores - optional iterable of further login names, always appended
    """
    global VUSERS

    if user:
        VUSERS.append(user)
    else:
        VUSERS = []   # no primary user given: start from an empty list

    if mores:
        VUSERS.extend(mores)
|
|
65
|
+
|
|
66
|
+
#
|
|
67
|
+
# valid user for starting this daemon
|
|
68
|
+
#
|
|
69
|
+
def check_vuser(user, aname = None):
    """Verify that `user` is one of the registered daemon users.

    Nothing is checked when `user` is empty or no users are registered in
    VUSERS.  Otherwise a user missing from VUSERS is reported through
    PgLOG.pglog() with the PgLOG.LGEREX action.

    user  - login name to validate
    aname - application name, used only in the message
    """
    if not (user and VUSERS):
        return
    if user in VUSERS:
        return

    allowed = ', '.join(VUSERS)
    PgLOG.pglog("{}: must be '{}' to run '{}' in Daemon mode".format(user, allowed, aname), PgLOG.LGEREX)
|
|
81
|
+
|
|
82
|
+
#
|
|
83
|
+
# turn this process into a daemon
|
|
84
|
+
#
|
|
85
|
+
# aname - application name, or daemon name
|
|
86
|
+
# uname - user login name to started the application
|
|
87
|
+
# mproc - upper limit of multiple child processes
|
|
88
|
+
# wtime - waiting time (in seconds) for next process for the daemon
|
|
89
|
+
# logon - turn on the logging if true
|
|
90
|
+
# bproc - multiple background processes if > 1
|
|
91
|
+
# mtime - maximum running time for the daemon if provided
|
|
92
|
+
#
|
|
93
|
+
def start_daemon(aname, uname, mproc = 1, wtime = 120, logon = 0, bproc = 1, mtime = 0):
    """Fork the current process and continue in the child as daemon `aname`.

    Exits (status 0) if check_daemon() already finds such a daemon.  After
    the fork the parent logs the start message and exits; only the child
    returns from this function, with signal handlers installed, the PGSIG
    state filled in and the standard streams redirected.

    aname - application (daemon) name
    uname - login name of the user starting the daemon
    mproc - child process limit, stored only when > 1
    wtime - polling wait time, passed through get_wait_time()
    logon - true to keep logging on in the daemon
    bproc - background process limit, stored only when > 1
    mtime - maximum running time, passed through get_wait_time()
    """

    dstr = "Daemon '{}'{} on {}".format(aname, (" By {}".format(uname) if uname else ''), PgLOG.PGLOG['HOSTNAME'])

    # refuse to start a second copy of the same daemon
    pid = check_daemon(aname, uname)
    if pid:
        PgLOG.pglog("***************** WARNNING **************************\n" +
                    "** {} is running as PID={}\n".format(dstr, pid) +
                    "** You need stop it before starting a new one!\n" +
                    "*****************************************************" , PgLOG.WARNLG)
        PgLOG.pglog("{} is already running as PID={}".format(dstr, pid), PgLOG.FRCLOG|PgLOG.MSGLOG)
        sys.exit(0)

    if mproc > 1: PGSIG['MPROC'] = mproc
    if bproc > 1: PGSIG['BPROC'] = bproc
    # get_wait_time() is not defined in this part of the file
    PGSIG['WTIME'] = get_wait_time(wtime, 120, "Polling Wait Time")
    PGSIG['MTIME'] = get_wait_time(mtime, 0, "Maximum Running Time")

    pid = process_fork(dstr)
    # cpid is the daemon's pid on both sides of the fork
    cpid = pid if pid > 0 else os.getpid()
    msg = "PID={},PL={},WI={}".format(cpid, PGSIG['MPROC'], PGSIG['WTIME'])
    if PGSIG['MTIME']: msg += ",MT={}".format(PGSIG['MTIME'])
    logmsg = "{}({}) started".format(dstr, msg)
    if logon: logmsg += " With Logging On"
    if pid > 0:
        # parent: report the start and leave the child running
        PgLOG.pglog(logmsg, PgLOG.WARNLG)
        sys.exit(0)

    # child: become a session leader and clear the file mode creation mask
    os.setsid()
    os.umask(0)

    # setup to catch signals in daemon only
    signal.signal(signal.SIGCHLD, clean_dead_child)
    signal.signal(signal.SIGQUIT, signal_catch)
    signal.signal(signal.SIGUSR1, signal_catch)
    signal.signal(signal.SIGUSR2, signal_catch)
    PGSIG['DSTR'] = dstr
    PGSIG['DNAME'] = aname
    PGSIG['STIME'] = int(time.time())
    PGSIG['STRTM'] = PgLOG.current_datetime(PGSIG['STIME'])
    PGSIG['PPID'] = 1
    PGSIG['PID'] = cpid

    # read standard input from SDUMP['DEF'] ('/dev/null' by default)
    sys.stdin = open(SDUMP['DEF'])
    PgLOG.cmdlog("{} By {}".format(logmsg, PGSIG['STRTM']))

    if logon:
        PgLOG.PGLOG['LOGMASK'] &= ~(PgLOG.WARNLG|PgLOG.EMLLOG)   # turn off warn/email in daemon
        set_dump()
    else:
        PgLOG.PGLOG['LOGMASK'] &= ~(PgLOG.LGWNEM)   # turn off log/warn/email in daemon
        set_dump(SDUMP['DEF'])

    PgLOG.PGLOG['BCKGRND'] = 1   # make sure the background flag is always on
    PgDBI.pgdisconnect(1)        # disconnect database in daemon
|
|
148
|
+
|
|
149
|
+
#
|
|
150
|
+
# set dump output file
|
|
151
|
+
#
|
|
152
|
+
def set_dump(default = None):
    """Point sys.stderr and sys.stdout at their dump files.

    The targets come from the ERRDUMP and OUTDUMP settings returned by
    PgLOG.get_environment(), falling back to `default`.  When a target is
    still empty, a path under PgLOG.PGLOG['LOGPATH'] is used: the error
    file (derived from the log file name by replacing a trailing '.log'
    with '.err' when not set yet) for stderr and the log file for stdout.
    A stream is reopened, in append mode, only when its target differs
    from the one recorded in SDUMP.
    """
    errdump = PgLOG.get_environment("ERRDUMP", default)
    outdump = PgLOG.get_environment("OUTDUMP", default)
    cfg = PgLOG.PGLOG

    if not errdump:
        if not cfg['ERRFILE']:
            cfg['ERRFILE'] = re.sub(r'\.log$', '.err', cfg['LOGFILE'], 1)
        errdump = "{}/{}".format(cfg['LOGPATH'], cfg['ERRFILE'])
    if not outdump:
        outdump = "{}/{}".format(cfg['LOGPATH'], cfg['LOGFILE'])

    # stderr first, then stdout; each is switched only when the path changed
    for key, stream, target in (('ERR', 'stderr', errdump), ('OUT', 'stdout', outdump)):
        if target != SDUMP[key]:
            setattr(sys, stream, open(target, 'a'))
            SDUMP[key] = target
|
|
170
|
+
|
|
171
|
+
#
|
|
172
|
+
# stop daemon and log the ending info
|
|
173
|
+
#
|
|
174
|
+
def stop_daemon(msg):
    """Log that the daemon stopped gracefully.

    msg - optional reason, appended to the log line as ' with <msg>'

    Message logging is switched back on in PgLOG.PGLOG['LOGMASK'] before
    the final line is written.
    """
    reason = (" with " + msg) if msg else ''
    PgLOG.PGLOG['LOGMASK'] |= PgLOG.MSGLOG   # turn on logging before daemon stops
    final = "{} Started at {}, Stopped gracefully{} by {}".format(
        PGSIG['DSTR'], PGSIG['STRTM'], reason, PgLOG.current_datetime())
    PgLOG.pglog(final, PgLOG.LOGWRN)
|
|
179
|
+
|
|
180
|
+
#
|
|
181
|
+
# check if a daemon is running already
|
|
182
|
+
#
|
|
183
|
+
# aname - application name for the daemon
|
|
184
|
+
# uname - user login name who started the daemon
|
|
185
|
+
#
|
|
186
|
+
# return the process id if yes and 0 if not
|
|
187
|
+
#
|
|
188
|
+
def check_daemon(aname, uname = None):
    """Look for an already running daemon of application `aname`.

    The `ps` listing is scanned for a process whose parent pid column is 1
    and whose pid differs from the current process.

    aname - application name for the daemon
    uname - login name who started the daemon; validated with
            check_vuser() and used to restrict the listing when given

    Returns the pid found, or 0 when there is none.
    """
    if uname:
        check_vuser(uname, aname)
        command = "ps -u {} -f | grep {} | grep ' 1 '".format(uname, aname)
        pattern = r"^\s*{}\s+(\d+)\s+1\s+".format(uname)
    else:
        command = "ps -C {} -f | grep ' 1 '".format(aname)
        pattern = r"^\s*\w+\s+(\d+)\s+1\s+"

    listing = PgLOG.pgsystem(command, PgLOG.LOGWRN, 20+1024)
    if not listing:
        return 0

    own_pid = os.getpid()
    for row in listing.split('\n'):
        hit = re.match(pattern, row)
        if not hit:
            continue
        found = int(hit.group(1))
        if found > 0 and found != own_pid:
            return found

    return 0
|
|
208
|
+
|
|
209
|
+
#
|
|
210
|
+
# check if an application is running already, other than the current process
|
|
211
|
+
#
|
|
212
|
+
# aname - application name
|
|
213
|
+
# uname - user login name who started the application
|
|
214
|
+
# sargv - argument string
|
|
215
|
+
#
|
|
216
|
+
# return the process id if yes and 0 if not
|
|
217
|
+
#
|
|
218
|
+
def check_application(aname, uname = None, sargv = None):
    """Find a running instance of `aname` other than the current process chain.

    aname - application name
    uname - login name who started the application; validated with
            check_vuser() and used to restrict the `ps` listing when given
    sargv - argument string; when given, a candidate only counts if the
            argument text captured from its `ps` line occurs inside sargv

    Returns the pid of the first qualifying process, or 0 if there is none.
    """

    if uname:
        check_vuser(uname, aname)
        pcmd = "ps -u {} -f | grep {} | grep -v ' grep '".format(uname, aname)
        mp = r"^\s*{}\s+(\d+)\s+(\d+)\s+.*{}\S*\s+(.*)$".format(uname, aname)
    else:
        pcmd = "ps -C {} -f".format(aname)
        mp = r"^\s*\w+\s+(\d+)\s+(\d+)\s+.*{}\S*\s+(.*)$".format(aname)

    buf = PgLOG.pgsystem(pcmd, PgLOG.LOGWRN, 20+1024)
    if not buf: return 0

    # cpids starts with this process and its parent, and grows with the
    # parent pids of any listed process already known to be in the list
    cpids = [os.getpid(), os.getppid()]
    pids = []     # candidate pids, parallel to ppids/astrs
    ppids = []
    astrs = []    # captured argument text, filled only when sargv is given
    lines = buf.split('\n')
    for line in lines:
        ms = re.match(mp, line)
        if not ms: continue
        pid = int(ms.group(1))
        ppid = int(ms.group(2))
        if pid in cpids:
            if ppid not in cpids: cpids.append(ppid)
            continue
        pids.append(pid)
        ppids.append(ppid)
        if sargv: astrs.append(ms.group(3))

    pcnt = len(pids)
    if not pcnt: return 0

    # drop candidates that turned out to be in cpids; every removal can
    # extend cpids, so the scan restarts from the first candidate
    i = 0
    while i < pcnt:
        pid = pids[i]
        if pid and pid in cpids:
            pids[i] = 0
            ppid = ppids[i]
            if ppid not in cpids: cpids.append(ppid)
            i = 0
        else:
            i += 1

    # first surviving candidate (zeroed entries were eliminated above)
    for i in range(pcnt):
        pid = pids[i]
        if pid and (not sargv or sargv.find(astrs[i]) > -1): return pid

    return 0
|
|
267
|
+
|
|
268
|
+
#
|
|
269
|
+
# validate if the current process is a single one. Quit if not
|
|
270
|
+
#
|
|
271
|
+
def validate_single_process(aname, uname = None, sargv = None, logact = PgLOG.LOGWRN):
    """Exit if another instance of the application is already running.

    aname, uname, sargv - passed to check_application()
    logact              - PgLOG action used for the message

    Returns None when no other instance is found; otherwise logs the
    message and calls sys.exit(0).
    """
    running = check_application(aname, uname, sargv)
    if not running:
        return

    text = aname
    if sargv:
        text += ' ' + sargv
    text += ": already running as PID={} on {}".format(running, PgLOG.PGLOG['HOSTNAME'])
    if uname:
        text += ' By ' + uname
    text += ', Quit Now'

    PgLOG.pglog(text, logact)
    sys.exit(0)
|
|
281
|
+
|
|
282
|
+
#
|
|
283
|
+
# check how many processes are running for an application already
|
|
284
|
+
#
|
|
285
|
+
# aname - application name
|
|
286
|
+
# uname - user login name who started the application
|
|
287
|
+
# argv - argument string
|
|
288
|
+
#
|
|
289
|
+
# return the number of processes (exclude the child one)
|
|
290
|
+
#
|
|
291
|
+
def check_multiple_application(aname, uname = None, sargv = None):
    """Count running instances of `aname` outside the current process chain.

    aname - application name
    uname - login name who started the application; validated with
            check_vuser() and used to restrict the `ps` listing when given
    sargv - argument string; when given, an instance only counts if the
            argument text captured from its `ps` line occurs inside sargv

    Processes listed as children of another listed instance are not
    counted separately.  Returns the number of instances found.
    """

    if uname:
        check_vuser(uname, aname)
        pcmd = "ps -u {} -f | grep {} | grep -v ' grep '".format(uname, aname)
        mp = r"^\s*{}\s+(\d+)\s+(\d+)\s+.*{}\S*\s+(.*)$".format(uname, aname)
    else:
        pcmd = "ps -C {} -f".format(aname)
        mp = r"^\s*\w+\s+(\d+)\s+(\d+)\s+.*{}\S*\s+(.*)$".format(aname)

    buf = PgLOG.pgsystem(pcmd, PgLOG.LOGWRN, 20+1024)
    if not buf: return 0

    # dpids holds pids that must not be counted: this process, its parent,
    # further ancestors (parent pid > 1) and children of listed instances
    dpids = [os.getpid(), os.getppid()]
    pids = []     # candidate pids, parallel to ppids/astrs
    ppids = []
    astrs = []    # captured argument text, filled only when sargv is given
    for line in buf.split('\n'):
        ms = re.match(mp, line)
        if not ms: continue
        pid = int(ms.group(1))
        ppid = int(ms.group(2))
        if pid in dpids:
            if ppid > 1 and ppid not in dpids: dpids.append(ppid)
            continue
        elif ppid in pids:
            if pid not in dpids: dpids.append(pid)
            continue
        pids.append(pid)
        ppids.append(ppid)
        if sargv: astrs.append(ms.group(3))

    pcnt = len(pids)
    if not pcnt: return 0

    # eliminate candidates that became excluded after they were collected;
    # every removal can change dpids, so the scan restarts from the top
    i = 0
    while i < pcnt:
        pid = pids[i]
        ppid = ppids[i]
        if pid:
            if pid in dpids:
                if ppid > 1 and ppid not in dpids: dpids.append(ppid)
                # clear THIS entry before restarting; the chained form
                # 'i = pids[i] = 0' binds i first and so cleared pids[0]
                pids[i] = 0
                i = 0
                continue
            elif ppid in pids:
                if pid not in dpids: dpids.append(pid)
                pids[i] = 0
                i = 0
                continue
        i += 1

    ccnt = 0
    for i in range(pcnt):
        if pids[i] and (not sargv or sargv.find(astrs[i]) > -1): ccnt += 1

    return ccnt
|
|
347
|
+
|
|
348
|
+
#
|
|
349
|
+
# validate if the running processes reach the limit for the given app; Quit if yes
|
|
350
|
+
#
|
|
351
|
+
def validate_multiple_process(aname, plimit, uname = None, sargv = None, logact = PgLOG.LOGWRN):
    """Exit if the application already runs in `plimit` or more processes.

    aname, uname, sargv - passed to check_multiple_application()
    plimit              - number of running processes at which to quit
    logact              - PgLOG action used for the message

    Returns None while the count is below the limit; otherwise logs the
    message and calls sys.exit(0).
    """
    running = check_multiple_application(aname, uname, sargv)
    if running < plimit:
        return

    text = aname
    if sargv:
        text += ' ' + sargv
    text += ": already running in {} processes on {}".format(running, PgLOG.PGLOG['HOSTNAME'])
    if uname:
        text += ' By ' + uname
    text += ', Quit Now'

    PgLOG.pglog(text, logact)
    sys.exit(0)
|
|
361
|
+
|
|
362
|
+
#
|
|
363
|
+
# fork process
|
|
364
|
+
#
|
|
365
|
+
# return the defined result from call of fork
|
|
366
|
+
#
|
|
367
|
+
def process_fork(dstr):
    """Fork the current process, retrying while the system reports EAGAIN.

    dstr - description of the caller, used as the prefix of error messages

    Returns the os.fork() result (0 in the child, the child pid in the
    parent).  Up to 10 attempts are made, sleeping 5 seconds after each
    EAGAIN failure; any other OSError, or running out of attempts, is
    reported through PgLOG.pglog() with the PgLOG.LGEREX action.
    """

    for _ in range(10):    # try 10 times
        try:
            return os.fork()
        except OSError as e:
            if e.errno == errno.EAGAIN:
                # os has no sleep(); wait with time.sleep() before retrying
                time.sleep(5)
            else:
                PgLOG.pglog("{}: {}".format(dstr, str(e)), PgLOG.LGEREX)
                break

    PgLOG.pglog("{}: too many tries (10) for os.fork()".format(dstr), PgLOG.LGEREX)
|
|
381
|
+
|
|
382
|
+
#
|
|
383
|
+
# process the predefined signals
|
|
384
|
+
#
|
|
385
|
+
def signal_catch(signum, frame):
    """Signal handler for SIGQUIT, SIGUSR1 and SIGUSR2.

    SIGQUIT sets PGSIG['QUIT']; SIGUSR1 turns message logging on, or
    raises the debug level when logging is already on; SIGUSR2 turns
    logging and debugging off and redirects the dump back to
    SDUMP['DEF'].  The signal is logged, and in the server or non-daemon
    process (PGSIG['PPID'] <= 1) it is passed on to every pid in CPIDS.

    signum - number of the signal received
    frame  - current stack frame supplied by the signal module (unused)
    """

    # role of this process, for the log line only
    if PGSIG['PPID'] == 1:
        tmp = 'Server'
    elif PGSIG['PPID'] > 1:
        tmp = 'Child'
    else:
        tmp = 'Process'

    # describe the signal and what it is about to do
    if signum == signal.SIGQUIT:
        sname = "<{} - signal.SIGQUIT - Quit>".format(signum)
    elif signum == signal.SIGUSR1:
        linfo = 'Logging On'
        if PgLOG.PGLOG['LOGMASK']&PgLOG.MSGLOG: linfo += ' & Debugging On'
        sname = "<{} - signal.SIGUSR1 - {}>".format(signum, linfo)
    elif signum == signal.SIGUSR2:
        if PgLOG.PGLOG['DBGLEVEL']:
            linfo = 'Logging off & Debugging Off'
        else:
            linfo = 'Logging Off'
        sname = "<{} - signal.SIGUSR2 - {}>".format(signum, linfo)
    else:
        sname = "<{} - Signal Not Supports Yet>".format(signum)

    # make sure the dump goes to a real file while the message is logged
    dumpon = 1 if SDUMP['OUT'] and SDUMP['OUT'] != SDUMP['DEF'] else 0
    if not dumpon: set_dump()
    PgLOG.pglog("catches {} in {} {}".format(sname, tmp, PGSIG['DSTR']), PgLOG.LOGWRN|PgLOG.FRCLOG)

    if signum == signal.SIGUSR1:
        if PgLOG.PGLOG['LOGMASK']&PgLOG.MSGLOG:
            PgLOG.PGLOG['DBGLEVEL'] = 1000   # turn logon twice
        else:
            PgLOG.PGLOG['LOGMASK'] |= PgLOG.MSGLOG   # turn on logging
    elif signum == signal.SIGUSR2:
        PgLOG.PGLOG['LOGMASK'] &= ~(PgLOG.MSGLOG)   # turn off logging
        PgLOG.PGLOG['DBGLEVEL'] = 0   # turn off debugging
        set_dump(SDUMP['DEF'])
    else:
        # restore the default dump if it was switched only for the message
        if not dumpon: set_dump(SDUMP['DEF'])
        if signum == signal.SIGQUIT: PGSIG['QUIT'] = 1

    if PGSIG['PPID'] <= 1 and len(CPIDS) > 0:   # passing signal to child processes
        for pid in CPIDS: kill_process(pid, signum)
|
|
428
|
+
|
|
429
|
+
#
|
|
430
|
+
# wrapper function to call os.kill() logging caught error based on logact
|
|
431
|
+
# return PgLOG.SUCCESS is success; PgLog.FAILURE if not
|
|
432
|
+
#
|
|
433
|
+
def kill_process(pid, signum, logact = 0):
    """Send signal `signum` to process `pid` through os.kill().

    pid    - target process id
    signum - signal number, or a signal enum member
    logact - PgLOG action for reporting a failure; 0 keeps it silent

    Returns PgLOG.SUCCESS when os.kill() raised nothing, PgLOG.FAILURE
    otherwise.
    """
    try:
        os.kill(pid, signum)
    except Exception as err:
        if logact:
            # a plain int has no name; enum members are shown as NAME-number
            if type(signum) is int:
                label = str(signum)
            else:
                label = "{}-{}".format(signum.name, int(signum))
            PgLOG.pglog("Error pass signal {} to pid {}: {}".format(label, pid, str(err)), logact)
        return PgLOG.FAILURE

    return PgLOG.SUCCESS
|
|
449
|
+
|
|
450
|
+
#
|
|
451
|
+
# wait child process to finish
|
|
452
|
+
#
|
|
453
|
+
def clean_dead_child(signum, frame):
    """SIGCHLD handler: reap finished children without blocking.

    Calls os.waitpid(-1, os.WNOHANG) until there is no child left or, for
    the second time, no finished child is reported.  In the server or
    non-daemon process (PGSIG['PPID'] < 2) each reaped pid is removed
    from CPIDS.

    signum - number of the signal received (unused)
    frame  - current stack frame supplied by the signal module (unused)
    """

    live = 0

    while True:
        try:
            dpid, status = os.waitpid(-1, os.WNOHANG)
        except ChildProcessError:
            break    # no child process any more
        except Exception as e:
            # PgLOG.PGLOG is the settings dict and cannot be called;
            # the message has to go through PgLOG.pglog()
            PgLOG.pglog("Error check child process: {}".format(str(e)), PgLOG.ERRLOG)
            break
        else:
            if dpid == 0:
                if live > 0: break    # wait twice if a process is still alive
                live += 1
            elif PGSIG['PPID'] < 2:
                CPIDS.pop(dpid, None)
|
|
471
|
+
|
|
472
|
+
#
|
|
473
|
+
# send signal to daemon and exit
|
|
474
|
+
#
|
|
475
|
+
def signal_daemon(sname, aname, uname):
    """Send a control signal to the running daemon, then exit.

    sname - signal name, case-insensitive: 'quit'/'stop' (SIGQUIT),
            'logon'/'on' (SIGUSR1) or 'logoff'/'off' (SIGUSR2)
    aname - application name of the daemon
    uname - login name who started the daemon

    Always finishes with sys.exit(0), whether or not a daemon was found.
    """
    owner = (" By " + uname) if uname else ""
    dstr = "Daemon '{}'{} on {}".format(aname, owner, PgLOG.PGLOG['HOSTNAME'])
    pid = check_daemon(aname, uname)

    if not (pid > 0):
        PgLOG.pglog(dstr + ": not running currently", PgLOG.LOGWRN|PgLOG.FRCLOG)
        sys.exit(0)

    dstr += " (PID = {})".format(pid)
    if re.match(r'^(quit|stop)$', sname, re.I):
        signum, msg = signal.SIGQUIT, "QUIT"
    elif re.match(r'^(logon|on)$', sname, re.I):
        signum, msg = signal.SIGUSR1, "Logging ON"
    elif re.match(r'^(logoff|off)$', sname, re.I):
        signum, msg = signal.SIGUSR2, "Logging OFF"
        PgLOG.PGLOG['DBGLEVEL'] = 0
    else:
        PgLOG.pglog("{}: invalid Signal for {}".format(sname, dstr), PgLOG.LGEREX)

    delivered = kill_process(pid, signum, PgLOG.LOGERR)
    if delivered == PgLOG.SUCCESS:
        PgLOG.pglog("{}: signal sent to {}".format(msg, dstr), PgLOG.LOGWRN|PgLOG.FRCLOG)

    sys.exit(0)
|
|
501
|
+
|
|
502
|
+
#
|
|
503
|
+
# start a time child to run the command in case hanging
|
|
504
|
+
#
|
|
505
|
+
def timeout_command(cmd, logact = PgLOG.LOGWRN, cmdopt = 4):
    """Run `cmd` in a timeout child so a hanging command can be killed.

    cmd    - command line to run
    logact - PgLOG action; the PgLOG.EXITLG bit is always cleared
    cmdopt - option value handed to PgLOG.pgsystem()

    In the child the command is executed and the child then exits with
    status 0; only the parent returns from this function.
    """
    logact &= ~PgLOG.EXITLG

    PgLOG.pglog("> " + cmd, logact)
    in_child = start_timeout_child(cmd, logact)
    if not in_child:
        return

    PgLOG.pgsystem(cmd, logact, cmdopt)
    sys.exit(0)
|
|
513
|
+
|
|
514
|
+
#
|
|
515
|
+
# start a timeout child process
|
|
516
|
+
#
|
|
517
|
+
# return: 1 - in child, 0 - in parent
|
|
518
|
+
#
|
|
519
|
+
def start_timeout_child(msg, logact = PgLOG.LOGWRN):
    """Fork a child and, in the parent, kill it if it runs too long.

    msg    - description of the work, used in log messages
    logact - PgLOG action for the timeout message

    Returns 1 in the child, right after the fork.  The parent polls the
    child every 2 seconds, up to PgLOG.PGLOG['TIMEOUT'] times; if the
    child is still running after that, its descendants and then the child
    itself are sent SIGKILL and the timeout is logged.  The parent
    returns 0.
    """

    pid = process_fork(msg)

    if pid == 0:   # in child
        signal.signal(signal.SIGQUIT, signal_catch)   # catch quit signal only
        PGSIG['PPID'] = PGSIG['PID']
        PGSIG['PID'] = pid = os.getpid()
        PgLOG.cmdlog("Timeout child to " + msg, time.time(), 0)
        PgDBI.pgdisconnect(0)   # disconnect database in child
        return 1

    # in parent
    for _ in range(PgLOG.PGLOG['TIMEOUT']):
        if not check_process(pid): break
        time.sleep(2)    # sys has no sleep(); time.sleep() is the working call

    if check_process(pid):
        msg += ": timeout({} secs) in CPID {}".format(2*PgLOG.PGLOG['TIMEOUT'], pid)
        pids = kill_children(pid, 0)
        time.sleep(6)
        # compare against PgLOG.SUCCESS explicitly, as kill_children() does
        if kill_process(pid, signal.SIGKILL, PgLOG.LOGERR) == PgLOG.SUCCESS: pids.insert(0, pid)

        if pids: msg += "\nProcess({}) Killed".format(','.join(map(str, pids)))
        PgLOG.pglog(msg, logact)

    return 0
|
|
546
|
+
|
|
547
|
+
#
|
|
548
|
+
# kill children recursively start from the deepest and return the pids got killed
|
|
549
|
+
#
|
|
550
|
+
def kill_children(pid, logact = PgLOG.LOGWRN):
    """Send SIGKILL to every descendant of `pid`, deepest first.

    pid    - process whose children, listed by `ps --ppid`, are killed
    logact - PgLOG action for the listing, the kills and the summary

    Returns the list of pids that were killed successfully.
    """
    killed = []
    listing = PgLOG.pgsystem("ps --ppid {} -o pid".format(pid), logact, 20)
    rows = listing.split('\n') if listing else []

    for row in rows:
        hit = re.match(r'^\s*(\d+)', row)
        if not hit:
            continue
        child = int(hit.group(1))
        if not check_process(child):
            continue
        # handle the grandchildren before the child itself
        deeper = kill_children(child, logact)
        if deeper:
            killed = deeper + killed
        if kill_process(child, signal.SIGKILL, logact) == PgLOG.SUCCESS:
            killed.insert(0, child)

    if logact and len(killed):
        PgLOG.pglog("Process({}) Killed".format(','.join(map(str, killed))), logact)

    return killed
|
|
568
|
+
|
|
569
|
+
#
|
|
570
|
+
# start a child process
|
|
571
|
+
# pname - unique process name
|
|
572
|
+
#
|
|
573
|
+
def start_child(pname, logact = PgLOG.LOGWRN, dowait = 0):
    """Fork a child process named `pname` when multiple processes are enabled.

    pname  - unique process name, recorded in CPIDS under the child's pid
    logact - PgLOG action for messages; the PgLOG.EXITLG bit is cleared
    dowait - when true, wait for a free slot instead of giving up

    Returns 1 when no child is needed (PGSIG['MPROC'] < 2) and after a
    successful fork, in both parent and child; -1 when check_child()
    reports `pname` as still running; or the value of PgLOG.pglog() when
    the daemon is in QUIT mode or no slot is free and dowait is false.
    """

    global CBIDS
    if PGSIG['MPROC'] < 2: return 1    # no need child process

    if logact&PgLOG.EXITLG: logact &= ~PgLOG.EXITLG
    if logact&PgLOG.MSGLOG: logact |= PgLOG.FRCLOG

    if PGSIG['QUIT']:
        return PgLOG.pglog("{} is in QUIT mode, cannot start CPID for {}".format(PGSIG['DSTR'], pname), logact)
    elif len(CPIDS) >= PGSIG['MPROC']:
        # all slots look taken: recount the children that are really alive
        i = 0
        while True:
            pcnt = check_child(None, 0, logact)
            if pcnt < PGSIG['MPROC']: break
            if dowait:
                # show_wait_message() is not defined in this part of the file
                show_wait_message(i, "{}-{}: wait any {} child processes".format(PGSIG['DSTR'], pname, pcnt), logact, dowait)
                i += 1
            else:
                return PgLOG.pglog("{}-{}: {} child processes already running at {}".format(PGSIG['DSTR'], pname, pcnt, PgLOG.current_datetime()), logact)

    if check_child(pname): return -1    # process is running already

    pid = process_fork(PGSIG['DSTR'])
    if pid:
        CPIDS[pid] = pname    # record the child process id
        PgLOG.pglog("{}: starts CPID {} for {}".format(PGSIG['DSTR'], pid, pname))
    else:
        signal.signal(signal.SIGQUIT, signal.SIG_DFL)    # turn off catch QUIT signal in child
        PgLOG.PGLOG['LOGMASK'] &= ~PgLOG.WARNLG    # turn off warn in child
        PGSIG['PPID'] = PGSIG['PID']
        PGSIG['PID'] = pid = os.getpid()
        PGSIG['MPROC'] = 1    # 1 in child process
        CBIDS = {}    # empty background process info in case not
        PGSIG['DSTR'] += ": CPID {} for {}".format(pid, pname)
        PgLOG.cmdlog("CPID {} for {}".format(pid, pname))
        PgDBI.pgdisconnect(0)    # disconnect database in child

    return 1    # child started successfully
|
|
612
|
+
|
|
613
|
+
#
|
|
614
|
+
# get child process id for given pname
|
|
615
|
+
#
|
|
616
|
+
def pname2cpid(pname):
    """Return the child pid recorded in CPIDS for `pname`, or 0 if none."""
    matches = (cpid for cpid, name in CPIDS.items() if name == pname)
    return next(matches, 0)
|
|
622
|
+
|
|
623
|
+
#
|
|
624
|
+
# check one or all child processes if they are still running
|
|
625
|
+
# pname - unique process name if given
|
|
626
|
+
# pid - check this specified process id if given
|
|
627
|
+
# dowait - 0 no wait, 1 wait all done, -1 wait only when all children are running
|
|
628
|
+
# return the number of running processes if dowait == 0 or 1
|
|
629
|
+
# return the number of none-running processes if dowait == -1
|
|
630
|
+
#
|
|
631
|
+
def check_child(pname, pid = 0, logact = PgLOG.LOGWRN, dowait = 0):
    """Check whether one or all child processes are still running.

    pname  - unique process name; its pid is looked up in CPIDS if needed
    pid    - check this specific process id if given
    logact - PgLOG action for messages; the PgLOG.EXITLG bit is cleared
    dowait - 0 no wait, 1 wait all done, -1 wait only when all children
             are running

    Entries of finished processes are removed from CPIDS.  Returns the
    number of running processes for dowait 0 or 1, and the number of
    non-running ones for dowait -1; always 0 when PGSIG['MPROC'] < 2.
    """

    if PGSIG['MPROC'] < 2: return 0    # no child process

    if logact&PgLOG.EXITLG: logact &= ~PgLOG.EXITLG
    ccnt = i = 0
    # ccnt is the number of processes expected to run; set for dowait < 0 only
    if dowait < 0: ccnt = 1 if (pid or pname) else PGSIG['MPROC']
    while True:
        pcnt = 0    # running processes found in this pass
        if not pid and pname: pid = pname2cpid(pname)
        if pid:
            if check_process(pid):    # process is not done yet
                if pname:
                    PgLOG.pglog("{}({}): Child still running".format(pname, pid), logact)
                else:
                    PgLOG.pglog("{}: Child still running".format(pid), logact)
                pcnt = 1
            elif pid in CPIDS:
                del CPIDS[pid]    # clean the saved info for the process
        elif not pname:
            # no specific target: walk a snapshot so CPIDS can be edited
            cpids = list(CPIDS)
            for cpid in cpids:
                if check_process(cpid):    # process is not done yet
                    pcnt += 1
                elif cpid in CPIDS:
                    del CPIDS[cpid]

        if pcnt == 0 or dowait == 0 or pcnt < ccnt: break
        # show_wait_message() is not defined in this part of the file
        show_wait_message(i, "{}: wait {}/{} child processes".format(PGSIG['DSTR'], pcnt, PGSIG['MPROC']), logact, dowait)
        i += 1

    return (ccnt - pcnt) if ccnt else pcnt
|
|
663
|
+
|
|
664
|
+
#
|
|
665
|
+
# start this process in none daemon mode
|
|
666
|
+
#
|
|
667
|
+
# aname - application name, or daemon name
|
|
668
|
+
# cact - short action name
|
|
669
|
+
# uname - user login name to started the application
|
|
670
|
+
# mproc - upper limit of muiltiple child processes
|
|
671
|
+
# wtime - waiting time (in seconds) for next process
|
|
672
|
+
#
|
|
673
|
+
def start_none_daemon(aname, cact = None, uname = None, mproc = 1, wtime = 120, logon = 1, bproc = 1):
    """Set up signal handling and PGSIG state for a non-daemon run.

    aname - application name
    cact  - short action name, added to the description string
    uname - login name who started the application; validated with
            check_vuser() when given
    mproc - upper limit of multiple child processes
    wtime - waiting time for the next process, passed to get_wait_time()
    logon - false to turn message logging off
    bproc - upper limit of background processes
    """

    dstr = aname
    if cact: dstr += " for Action " + cact
    if uname:
        dstr += " By " + uname
        check_vuser(uname, aname)

    signal.signal(signal.SIGQUIT, signal_catch)    # catch quit signal only
    signal.signal(signal.SIGCHLD, clean_dead_child)
    PGSIG['DSTR'] = dstr
    PGSIG['DNAME'] = aname
    PGSIG['PPID'] = 0    # 0 marks non-daemon mode
    PGSIG['PID'] = os.getpid()
    PGSIG['MPROC'] = mproc
    PGSIG['BPROC'] = bproc
    # get_wait_time() is not defined in this part of the file
    PgLOG.PGLOG['CMDTIME'] = PGSIG['WTIME'] = get_wait_time(wtime, 120, "Polling Wait Time")
    if PGSIG['MPROC'] > 1:
        PgLOG.cmdlog("starts non-daemon {}(ML={},WI={})".format(aname, PGSIG['MPROC'], PGSIG['WTIME']))
        # NOTE(review): nesting under the MPROC test is reconstructed from a
        # listing without indentation - confirm against the original file
        if not logon: PgLOG.PGLOG['LOGMASK'] &= ~PgLOG.MSGLOG    # turn off message logging
|
|
693
|
+
|
|
694
|
+
#
|
|
695
|
+
# check one process id other than the current one if it is still running
|
|
696
|
+
# pid - specified process id
|
|
697
|
+
# pmsg - process message if given
|
|
698
|
+
#
|
|
699
|
+
def check_process(pid):
    """Tell whether process `pid` shows up in the `ps -p` listing.

    pid - process id to look for

    Returns 1 when a line of the listing consists of the pid alone
    (leading blanks allowed), 0 otherwise.
    """
    listing = PgLOG.pgsystem("ps -p {} -o pid".format(pid), PgLOG.LGWNEX, 20)
    if not listing:
        return 0

    pattern = r'^\s*{}$'.format(pid)
    found = any(re.match(pattern, row) for row in listing.split('\n'))
    return 1 if found else 0
|
|
709
|
+
|
|
710
|
+
#
|
|
711
|
+
# check a process id on give host
|
|
712
|
+
#
|
|
713
|
+
def check_host_pid(host, pid, pmsg = None, logact = PgLOG.LOGWRN):
    """Check a process id on a given host through the `rdaps` command.

    host   - host name, passed as '-h <host>' when given
    pid    - process id, passed as '-p <pid>'
    pmsg   - message to log when the command returns output
    logact - PgLOG action; the PgLOG.EXITLG bit is cleared for pmsg

    Returns 1 when `rdaps` produced output; without output, -1 if
    PgLOG.PGLOG['SYSERR'] is set and 0 if not.
    """
    target = (" -h " + host) if host else ''
    command = 'rdaps' + target + " -p {}".format(pid)

    output = PgLOG.pgsystem(command, logact, 276)   # 4+16+256
    if not output:
        return -1 if PgLOG.PGLOG['SYSERR'] else 0

    if pmsg:
        PgLOG.pglog(pmsg, logact&(~PgLOG.EXITLG))
    return 1
|
|
722
|
+
|
|
723
|
+
#
|
|
724
|
+
# check one process id on a given host name if it is still running, with default timeout
|
|
725
|
+
# pid - specified process id
|
|
726
|
+
# ppid - specified parent process id
|
|
727
|
+
# uname - user login name who started the daemon
|
|
728
|
+
# host - host name the pid supposed to be running on
|
|
729
|
+
# aname - application name
|
|
730
|
+
# pmsg - process message if given
|
|
731
|
+
#
|
|
732
|
+
# return 1 if process is still alive, 0 if it died already, -1 on error checking
|
|
733
|
+
#
|
|
734
|
+
def check_host_process(host, pid, ppid = 0, uname = None, aname = None, pmsg = None, logact = PgLOG.LOGWRN):
    """Check a process on a given host through the `rdaps` command.

    host   - host name, passed as '-h <host>' when given
    pid    - process id, passed as '-p <pid>' when given
    ppid   - parent process id, passed as '-P <ppid>' when given
    uname  - login name, passed as '-u <uname>' when given
    aname  - application name, passed as '-a <aname>' when given
    pmsg   - message to log when the command returns output
    logact - PgLOG action; the PgLOG.EXITLG bit is cleared for pmsg

    Returns 1 when `rdaps` produced output; without output, -1 if
    PgLOG.PGLOG['SYSERR'] is set and 0 if not.
    """
    command = "rdaps"
    if host:
        command += " -h " + host
    # numeric selectors are formatted, name selectors are concatenated
    for flag, number in ((" -p {}", pid), (" -P {}", ppid)):
        if number:
            command += flag.format(number)
    for flag, name in ((" -u ", uname), (" -a ", aname)):
        if name:
            command += flag + name

    output = PgLOG.pgsystem(command, logact, 276)   # 4+16+256
    if not output:
        return -1 if PgLOG.PGLOG['SYSERR'] else 0

    if pmsg:
        PgLOG.pglog(pmsg, logact&(~PgLOG.EXITLG))
    return 1
|
|
746
|
+
|
|
747
|
+
#
|
|
748
|
+
# get a single slurm status record
|
|
749
|
+
#
|
|
750
|
+
def get_slurm_info(bcmd, logact = PgLOG.LOGWRN):
    """Run `bcmd` and parse one status record from its tabular output.

    bcmd   - command to run through PgLOG.pgsystem()
    logact - PgLOG action passed to PgLOG.pgsystem()

    Lines are skipped until a header starting with JOBID is found.  The
    header words, upper-cased, become the keys of the returned dict and
    the columns of the first following line with at least as many columns
    supply the values; surplus columns are appended, blank-separated, to
    the last key's value.  Returns an empty dict when there is no output
    or no record.
    """

    stat = {}
    buf = PgLOG.pgsystem(bcmd, logact, 16)
    if not buf: return stat

    chkt = 1    # 1 while still looking for the header line
    lines = buf.split('\n')
    for line in lines:
        if chkt:
            if re.match(r'^\s*JOBID\s', line, re.I):
                ckeys = re.split(r'\s+', PgLOG.pgtrim(line))
                kcnt = len(ckeys)
                chkt = 0
        else:
            if re.match(r'^-----', line): continue    # separator line
            vals = re.split(r'\s+', PgLOG.pgtrim(line))
            vcnt = len(vals)
            if vcnt >= kcnt:
                for i in range(kcnt):
                    ckeys[i] = ckeys[i].upper()
                    stat[ckeys[i]] = vals[i]

                if vcnt > kcnt:
                    for i in range(kcnt, vcnt):
                        stat[ckeys[kcnt-1]] += ' ' + str(vals[i])
                # NOTE(review): 'break' is placed inside the 'vcnt >= kcnt'
                # branch; the listing this was rebuilt from has no
                # indentation - confirm against the original file
                break

    return stat
|
|
779
|
+
|
|
780
|
+
#
|
|
781
|
+
# get a single pbs status record via qstat
|
|
782
|
+
#
|
|
783
|
+
def get_pbs_info(qopts, multiple = 0, logact = 0, chkcnt = 1):
    """Get PBS job status by parsing the output of 'qstat -n -w <qopts>'.

    qopts    - options/job ids appended to the qstat command line
    multiple - if true, every key maps to a list of values (one per parsed
               line); otherwise each key maps to a single string
    logact   - logging action passed to PgLOG.pgsystem()
    chkcnt   - maximum number of qstat attempts; 6 seconds are slept
               between attempts (not after the last one)

    The header line starting with 'Job ID' provides the keys.  Columns
    1, 3 and 9 are renamed to 'UserName', 'JobName' and 'State'; columns 7
    and 8 get a 'Reqd' prefix and column 10 an 'Elap' prefix; a trailing
    'Node' key is appended.  Parsing of values starts after the '-----'
    separator line.  A line with one token is stored under 'Node'.

    Returns a dict, empty if qstat gave no output.
    """
    stat = {}
    buf = None
    for attempt in range(chkcnt):
        buf = PgLOG.pgsystem("qstat -n -w {}".format(qopts), logact, 16)
        if buf: break
        # pause only when another attempt will follow
        if attempt + 1 < chkcnt: time.sleep(6)

    if not buf: return stat

    ckeys = None
    kcnt = 0
    before_separator = True
    for line in buf.split('\n'):
        if ckeys is None:
            if re.match(r'^Job ID', line):
                line = re.sub(r'^Job ID', 'JobID', line, 1)
                ckeys = re.split(r'\s+', PgLOG.pgtrim(line))
                ckeys[1] = 'UserName'
                ckeys[3] = 'JobName'
                # each prefixed key is built from its OWN column name
                ckeys[7] = 'Reqd' + ckeys[7]
                ckeys[8] = 'Reqd' + ckeys[8]
                ckeys[9] = 'State'
                ckeys[10] = 'Elap' + ckeys[10]
                ckeys.append('Node')
                kcnt = len(ckeys)
                if multiple:
                    for key in ckeys:
                        stat[key] = []
        elif before_separator:
            if re.match(r'^-----', line): before_separator = False
        else:
            vals = re.split(r'\s+', PgLOG.pgtrim(line))
            vcnt = len(vals)
            if vcnt == 1:
                # single-token line: stored under the last key ('Node')
                if multiple:
                    stat[ckeys[-1]].append(vals[0])
                else:
                    stat[ckeys[-1]] = vals[0]
                    break
            elif vcnt > 1:
                # keep only the leading digits of the job id
                ms = re.match(r'^(\d+)', vals[0])
                if ms: vals[0] = ms.group(1)
                # zip() ignores values beyond the known columns
                for key, val in zip(ckeys, vals):
                    if multiple:
                        stat[key].append(val)
                    else:
                        stat[key] = val
                if vcnt == kcnt: break

    return stat
|
|
837
|
+
|
|
838
|
+
#
|
|
839
|
+
# get multiple slurm status records
|
|
840
|
+
#
|
|
841
|
+
def get_slurm_multiple(bcmd, logact = PgLOG.LOGWRN):
    """Run a slurm query command and return all of its status records.

    bcmd   - command line passed to PgLOG.pgsystem()
    logact - logging action passed to PgLOG.pgsystem()

    The header line starting with JOBID (case-insensitive) provides the
    upper-cased keys.  Records are read only after a line starting with
    '-----'.  Each line with at least as many fields as columns adds one
    value to every key's list; surplus fields are appended, separated by
    spaces, to that record's value of the last column.

    Returns a dict mapping each column name to a list of values, or 0 when
    there is no output or no record was found.
    """
    output = PgLOG.pgsystem(bcmd, logact, 16)
    if not output: return 0

    table = {}
    columns = None
    separator_seen = False
    nrows = 0
    for text in output.split('\n'):
        if columns is None:
            # still searching for the header line
            if re.match(r'^\s*JOBID\s', text, re.I):
                columns = [name.upper() for name in re.split(r'\s+', PgLOG.pgtrim(text))]
                for name in columns:
                    table[name] = []
            continue

        if not separator_seen:
            if re.match(r'^-----', text): separator_seen = True
            continue

        fields = re.split(r'\s+', PgLOG.pgtrim(text))
        ncol = len(columns)
        if len(fields) < ncol: continue

        for name, value in zip(columns, fields):
            table[name].append(value)
        if len(fields) > ncol:
            # the last column of this record absorbs the extra fields
            table[columns[-1]][nrows] += ' ' + ' '.join(fields[ncol:])
        nrows += 1

    return table if nrows else 0
|
|
874
|
+
|
|
875
|
+
#
|
|
876
|
+
# check status of a slurm batch id
|
|
877
|
+
# bid - specified batch id
|
|
878
|
+
#
|
|
879
|
+
# return dict of batch status; an empty dict if no status can be retrieved
|
|
880
|
+
#
|
|
881
|
+
def check_slurm_status(bid, logact = PgLOG.LOGWRN):
    """Return the 'sacct' status record of slurm batch id bid.

    The record is the dict produced by get_slurm_info() for the sacct
    command built here; logact is passed through unchanged.
    """
    sacct_cmd = "sacct -o jobid,user,totalcpu,elapsed,ncpus,state,jobname,nodelist -j {}".format(bid)
    return get_slurm_info(sacct_cmd, logact)
|
|
884
|
+
|
|
885
|
+
#
|
|
886
|
+
# check status of a pbs batch id
|
|
887
|
+
# bid - specified batch id
|
|
888
|
+
#
|
|
889
|
+
# return dict of batch status; an empty dict if no status can be retrieved
|
|
890
|
+
#
|
|
891
|
+
def check_pbs_status(bid, logact = PgLOG.LOGWRN):
    """Return the 'qhist -w -j <bid>' history record of a PBS batch id.

    bid    - batch id passed to qhist
    logact - logging action passed to PgLOG.pgsystem()

    The first line starting with 'Job' is the header.  Its multi-word
    titles are collapsed into single tokens ('Job ID' -> 'JobID',
    'Elapsed (h)' -> 'WallTime(h)', ...) and the second column is renamed
    'UserName'.  The first later line that has at least as many fields as
    the header supplies the values; blank or shorter lines are skipped
    instead of raising IndexError.

    Returns a dict, empty when there is no output or no usable record.
    """
    stat = {}
    buf = PgLOG.pgsystem("qhist -w -j {}".format(bid), logact, 20)
    if not buf: return stat

    # (pattern, replacement) pairs collapsing multi-word column titles
    renames = (
        (r'^Job ID', 'JobID'),
        (r'Finish Time', 'FinishTime'),
        (r'Req Mem', 'ReqMem'),
        (r'Used Mem\(GB\)', 'UsedMem(GB)'),
        (r'Avg CPU \(%\)', 'AvgCPU(%)'),
        (r'Elapsed \(h\)', 'WallTime(h)'),
        (r'Job Name', 'JobName'),
    )

    ckeys = None
    for line in buf.split('\n'):
        if ckeys is None:
            if re.match(r'^Job', line):
                for pattern, label in renames:
                    line = re.sub(pattern, label, line, 1)
                ckeys = re.split(r'\s+', PgLOG.pgtrim(line))
                if len(ckeys) > 1: ckeys[1] = 'UserName'
            continue

        vals = re.split(r'\s+', PgLOG.pgtrim(line))
        if len(vals) < len(ckeys): continue   # blank or truncated line

        stat.update(zip(ckeys, vals))
        break

    return stat
|
|
920
|
+
|
|
921
|
+
#
|
|
922
|
+
# check if a slurm batch id is live
|
|
923
|
+
# bid - specified batch id
|
|
924
|
+
#
|
|
925
|
+
# return 1 if process is still alive, 0 if it died already, -1 if its status cannot be retrieved
|
|
926
|
+
#
|
|
927
|
+
def check_slurm_process(bid, pmsg = None, logact = PgLOG.LOGWRN):
    """Check through 'squeue -l -j <bid>' whether a slurm batch job is live.

    bid    - slurm batch id
    pmsg   - message logged together with the state when the job is live
    logact - logging action; the exit flag is removed before logging

    Returns 1 if the STATE column is one of RUNNING, PENDING, SUSPENDE,
    COMPLETI, CONFIGUR or REQUEUE_; 0 for any other state; -1 if
    get_slurm_info() returned no record.
    """
    stat = get_slurm_info("squeue -l -j {}".format(bid), logact)
    if not stat: return -1

    ms = re.match(r'^(RUNNING|PENDING|SUSPENDE|COMPLETI|CONFIGUR|REQUEUE_)$', stat['STATE'])
    if ms is None: return 0

    if pmsg:
        PgLOG.pglog("{}, STATE={}".format(pmsg, ms.group(1)), logact & ~PgLOG.EXITLG)
    return 1
|
|
940
|
+
|
|
941
|
+
#
|
|
942
|
+
# check if a pbs batch id is live
|
|
943
|
+
# bid - specified batch id
|
|
944
|
+
#
|
|
945
|
+
# return 1 if process is still alive, 0 if it died already, -1 if its status cannot be retrieved
|
|
946
|
+
#
|
|
947
|
+
def check_pbs_process(bid, pmsg = None, logact = PgLOG.LOGWRN):
    """Check through get_pbs_info() whether a PBS batch job is live.

    bid    - PBS batch id
    pmsg   - message prefix; when given, the outcome is appended to it and
             the result is logged
    logact - logging action; the exit flag is removed before logging

    Returns 1 if the job's 'State' is one of B, R, Q, S, H, W or X;
    0 for any other state; -1 if no status record was returned.
    """
    stat = get_pbs_info(bid, 0, logact)

    if not stat:
        ret = -1
        outcome = ", Process Not Exists and returns -1"
    else:
        ms = re.match(r'^(B|R|Q|S|H|W|X)$', stat['State'])
        if ms:
            ret = 1
            outcome = ", STATE='{}' and returns 1".format(ms.group(1))
        else:
            ret = 0
            outcome = ", STATE='{}' and returns 0".format(stat['State'])

    if pmsg: PgLOG.pglog(pmsg + outcome, logact & ~PgLOG.EXITLG)

    return ret
|
|
966
|
+
|
|
967
|
+
#
|
|
968
|
+
# get wait time
|
|
969
|
+
#
|
|
970
|
+
def get_wait_time(wtime, default, tmsg):
    """Convert a wait-time specification into seconds.

    wtime   - an int (returned unchanged), a string of digits (seconds), or
              digits followed by one unit letter D, H, M or S in either
              case (days, hours, minutes, seconds)
    default - used in place of wtime when wtime is empty/zero/None
    tmsg    - text included in the error message for an invalid value

    Returns the number of seconds.  An invalid specification is reported
    through PgLOG.pglog() with PgLOG.LGEREX.
    """
    if not wtime: wtime = default   # use default time

    if isinstance(wtime, int): return wtime
    if re.match(r'^\d+$', wtime): return int(wtime)

    ms = re.match(r'^(\d+)([DHMS])$', wtime, re.I)
    if not ms:
        PgLOG.pglog("{}: '{}' NOT in (D,H,M,S)".format(wtime, tmsg), PgLOG.LGEREX)
        # NOTE(review): LGEREX presumably makes pglog() exit; return 0 as a
        # defensive fallback in case it ever returns - confirm
        return 0

    # the pattern is case-insensitive, so normalize the unit before lookup
    seconds_per_unit = {'S': 1, 'M': 60, 'H': 3600, 'D': 86400}
    return int(ms.group(1)) * seconds_per_unit[ms.group(2).upper()]
|
|
992
|
+
|
|
993
|
+
#
|
|
994
|
+
# start a background process and record its id; check PgLOG.pgsystem() in PgLOG.py for
|
|
995
|
+
# valid cmdopt values
|
|
996
|
+
#
|
|
997
|
+
def start_background(cmd, logact = PgLOG.LOGWRN, cmdopt = 5, dowait = 0):
    """Start cmd as a background shell command and record its process id.

    cmd    - command line to run
    logact - logging action
    cmdopt - bit flags: 1 - log the start with the caller's action
             (PgLOG.WARNLG otherwise), 2 - append stdout to the log file,
             4 - append stderr to the error file, 8 - log the start via
             PgLOG.cmdlog() instead of PgLOG.pglog()
    dowait - if true, wait for a free slot when PGSIG['BPROC'] background
             processes are already running; otherwise give up

    When PGSIG['BPROC'] is below 2 the command is run through
    PgLOG.pgsystem() and its result is returned.  Otherwise returns the
    result of record_background(), or of the PgLOG.pglog() call made when
    no slot is free and dowait is false.
    """
    if PGSIG['BPROC'] < 2:
        return PgLOG.pgsystem(cmd, logact, cmdopt)   # no background

    act = logact & (~PgLOG.EXITLG)
    if act & PgLOG.MSGLOG: act |= PgLOG.FRCLOG   # make sure background calls always logged

    if len(CBIDS) >= PGSIG['BPROC']:
        waits = 0
        bcnt = check_background(None, 0, act)
        while bcnt >= PGSIG['BPROC']:
            if not dowait:
                return PgLOG.pglog("{}-{}: {} background calls already at {}".format(PGSIG['DSTR'], cmd, bcnt, PgLOG.current_datetime()), act)
            show_wait_message(waits, "{}-{}: wait any {} background calls".format(PGSIG['DSTR'], cmd, bcnt), act, dowait)
            waits += 1
            bcnt = check_background(None, 0, act)

    startlog = act if cmdopt & 1 else PgLOG.WARNLG
    if cmdopt & 8:
        PgLOG.cmdlog("starts '{}'".format(cmd), None, startlog)
    else:
        PgLOG.pglog("{}({})-{} >{} &".format(PgLOG.PGLOG['HOSTNAME'], os.getpid(), PgLOG.current_datetime(), cmd), startlog)

    # build the output redirections requested by cmdopt
    redirects = []
    if cmdopt & 2:
        redirects.append(" >> {}/{}".format(PgLOG.PGLOG['LOGPATH'], PgLOG.PGLOG['LOGFILE']))
    if cmdopt & 4:
        if not PgLOG.PGLOG['ERRFILE']:
            # derive the error file name from the log file name
            PgLOG.PGLOG['ERRFILE'] = re.sub(r'\.log$', '.err', PgLOG.PGLOG['LOGFILE'], 1)
        redirects.append(" 2>> {}/{}".format(PgLOG.PGLOG['LOGPATH'], PgLOG.PGLOG['ERRFILE']))

    os.system(cmd + "".join(redirects) + " &")
    return record_background(cmd, logact)
|
|
1032
|
+
|
|
1033
|
+
#
|
|
1034
|
+
# get background process id for given bcmd
|
|
1035
|
+
#
|
|
1036
|
+
def bcmd2cbid(bcmd):
    """Return the first id in CBIDS whose recorded command equals bcmd.

    Returns 0 when no recorded command matches.
    """
    return next((cbid for cbid in CBIDS if CBIDS[cbid] == bcmd), 0)
|
|
1042
|
+
|
|
1043
|
+
#
|
|
1044
|
+
# check one or all child processes if they are still running
|
|
1045
|
+
# bid - check this specified background process id if given
|
|
1046
|
+
# return the number of processes that are still running
|
|
1047
|
+
#
|
|
1048
|
+
def check_background(bcmd, bid = 0, logact = PgLOG.LOGWRN, dowait = 0):
    """Check whether one or all recorded background processes still run.

    bcmd   - background command; used to look up bid when bid is not given
    bid    - background process id to check; when neither bid nor bcmd
             identifies a process, every id recorded in CBIDS is checked
    logact - logging action; the exit flag is always removed
    dowait - if true, keep waiting (via show_wait_message) until no checked
             process is running any more

    Processes found finished are removed from CBIDS.
    Returns the number of processes still running; always 0 when
    PGSIG['BPROC'] is below 2.
    """
    if PGSIG['BPROC'] < 2: return 0   # no background process

    logact &= ~PgLOG.EXITLG
    if not bid and bcmd: bid = bcmd2cbid(bcmd)

    loop = 0
    while True:
        bcnt = 0
        if bid:
            if check_process(bid):   # process is not done yet
                if bcmd:
                    PgLOG.pglog("{}({}): Background process still running".format(bcmd, bid), logact)
                else:
                    PgLOG.pglog("{}: Background process still running".format(bid), logact)
                bcnt = 1
            elif bid in CBIDS:
                del CBIDS[bid]   # clean the saved info for the process
        elif not bcmd:
            # iterate over a snapshot so finished entries can be removed,
            # and use a separate name so the bid argument is not overwritten
            for cbid in list(CBIDS):
                if check_process(cbid):   # process is not done yet
                    bcnt += 1
                else:
                    CBIDS.pop(cbid, None)

        if not (bcnt and dowait): break
        show_wait_message(loop, "{}: wait {}/{} background processes".format(PGSIG['DSTR'], bcnt, PGSIG['MPROC']), logact, dowait)
        loop += 1

    return bcnt
|
|
1078
|
+
|
|
1079
|
+
#
|
|
1080
|
+
# check and record process id for background command; return 1 if successful;
|
|
1081
|
+
# 0 otherwise; -1 if done already
|
|
1082
|
+
#
|
|
1083
|
+
def record_background(bcmd, logact = PgLOG.LOGWRN):
    """Find the process id of a just-started background command and save it.

    bcmd   - background command line; its first whitespace-delimited token
             is used as the application name to search for
    logact - logging action passed to PgLOG.pgsystem()

    The 'ps' listing of users PgLOG.PGLOG['CURUID'] and
    PgLOG.PGLOG['GDEXUSER'] is searched, up to two times with a 2 second
    pause in between, for a line whose third column is 1 and which contains
    the application name.  The application name and the command tail read
    from 'ps' are compared literally, not as regular expressions.

    Returns 1 if a process id was recorded in CBIDS, -1 if the matching
    process id is already recorded, 0 if no matching process was found.
    """
    ms = re.match(r'^(\S+)', bcmd)
    aname = ms.group(1) if ms else bcmd

    # escape the name so regex metacharacters in it are matched literally
    mp = re.compile(r"^\s*(\S+)\s+(\d+)\s+1\s+.*{}(.*)$".format(re.escape(aname)))
    pc = "ps -u {},{} -f | grep ' 1 ' | grep {}".format(PgLOG.PGLOG['CURUID'], PgLOG.PGLOG['GDEXUSER'], aname)
    attempts = 2
    for attempt in range(attempts):
        buf = PgLOG.pgsystem(pc, logact, 20+1024)
        if buf:
            for line in buf.split('\n'):
                ms = mp.match(line)
                if not ms: continue
                (uid, sbid, acmd) = ms.groups()
                bid = int(sbid)
                if bid in CBIDS: return -1
                if uid == PgLOG.PGLOG['GDEXUSER']:
                    acmd = re.sub(r'^\.(pl|py)\s+', '', acmd, 1)
                    # NOTE(review): nesting of this test under the GDEXUSER
                    # check is reconstructed - confirm against the original
                    if bcmd.startswith(aname + acmd): continue
                CBIDS[bid] = bcmd
                return 1
        # pause only when another attempt will follow
        if attempt + 1 < attempts: time.sleep(2)

    return 0
|
|
1111
|
+
|
|
1112
|
+
#
|
|
1113
|
+
# sleep for given period for the daemon, stops if maximum running time reached
|
|
1114
|
+
#
|
|
1115
|
+
def sleep_daemon(wtime = 0, mtime = None):
    """Sleep for the daemon's wait period unless its run time is used up.

    wtime - seconds to sleep; PGSIG['WTIME'] when not given
    mtime - maximum running time in seconds; PGSIG['MTIME'] when None;
            no limit is applied when it is not positive

    If the time elapsed since PGSIG['STIME'] has reached mtime,
    PGSIG['QUIT'] is set to 1 and no sleep happens.
    Returns the number of seconds slept (0 if the limit was reached).
    """
    wtime = wtime or PGSIG['WTIME']
    if mtime is None: mtime = PGSIG['MTIME']

    if mtime > 0 and int(time.time()) - PGSIG['STIME'] >= mtime:
        PGSIG['QUIT'] = 1
        return 0

    if wtime: time.sleep(wtime)
    return wtime
|
|
1128
|
+
|
|
1129
|
+
#
|
|
1130
|
+
# show wait message every dintv and then sleep for PGSIG['WTIME']
|
|
1131
|
+
#
|
|
1132
|
+
def show_wait_message(loop, msg, logact = PgLOG.LOGWRN, dowait = 0):
    """Log a wait message on every 30th loop, then optionally sleep.

    loop   - current wait-loop counter; msg is logged (with the current
             datetime appended) when it is a positive multiple of 30
    msg    - message text
    logact - logging action passed to PgLOG.pglog()
    dowait - if true, sleep for PGSIG['WTIME'] seconds afterwards
    """
    report_due = loop > 0 and loop % 30 == 0
    if report_due:
        PgLOG.pglog("{} at {}".format(msg, PgLOG.current_datetime()), logact)

    if dowait:
        time.sleep(PGSIG['WTIME'])
|
|
1138
|
+
|
|
1139
|
+
#
|
|
1140
|
+
# register a time out function to raise a time out error
|
|
1141
|
+
#
|
|
1142
|
+
@contextmanager
def pgtimeout(seconds = 0, logact = 0):
    """Context manager that abandons the enclosed block after a timeout.

    seconds - timeout in seconds; PgLOG.PGLOG['TIMEOUT'] when not given
    logact  - accepted for interface compatibility; not used here

    SIGALRM is routed to raise_pgtimeout() and an alarm is set.  A
    TimeoutError raised inside the block is suppressed, so execution
    resumes after the 'with' statement.  On exit the pending alarm is
    cancelled and SIGALRM is set to be ignored.
    """
    if not seconds: seconds = PgLOG.PGLOG['TIMEOUT']
    signal.signal(signal.SIGALRM, raise_pgtimeout)
    signal.alarm(seconds)
    try:
        yield
    except TimeoutError:
        pass
    finally:
        # cancel a still-pending alarm so it cannot fire after the block
        signal.alarm(0)
        signal.signal(signal.SIGALRM, signal.SIG_IGN)
|
|
1154
|
+
|
|
1155
|
+
def raise_pgtimeout(signum, frame):
    """Signal handler that raises TimeoutError; both arguments are unused."""
    raise TimeoutError()
|
|
1157
|
+
|
|
1158
|
+
def timeout_func():
    """Demonstrate pgtimeout(): a 10 second sleep inside a 1 second timeout block."""
    import time

    # Add a timeout block.
    with pgtimeout(1):
        print('entering block')
        time.sleep(10)
        print('This should never get printed because the line before timed out')
|