pwact 0.1.11__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pwact-0.1.11 → pwact-0.1.13}/PKG-INFO +1 -1
- pwact-0.1.13/pwact/active_learning/slurm/slurm.py +382 -0
- pwact-0.1.13/pwact/active_learning/slurm/slurm_tool.py +35 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/cmd_infos.py +1 -1
- pwact-0.1.13/pwact/utils/app_lib/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact.egg-info/PKG-INFO +1 -1
- {pwact-0.1.11 → pwact-0.1.13}/pwact.egg-info/SOURCES.txt +3 -0
- {pwact-0.1.11 → pwact-0.1.13}/setup.py +1 -1
- {pwact-0.1.11 → pwact-0.1.13}/LICENSE +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/README.md +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/environment.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/explore/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/explore/run_model_md.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/explore/select_image.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/init_bulk/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/init_bulk/aimd.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/init_bulk/duplicate_scale.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/init_bulk/init_bulk_run.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/init_bulk/relabel.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/init_bulk/relax.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/label/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/label/labeling.py +0 -0
- {pwact-0.1.11/pwact/active_learning/test → pwact-0.1.13/pwact/active_learning/slurm}/__init__.py +0 -0
- {pwact-0.1.11/pwact/active_learning/train → pwact-0.1.13/pwact/active_learning/test}/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/test/test.py +0 -0
- {pwact-0.1.11/pwact/active_learning/user_input → pwact-0.1.13/pwact/active_learning/train}/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/train/dp_kpu.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/train/train_model.py +0 -0
- {pwact-0.1.11/pwact/active_learning/user_input/train_param → pwact-0.1.13/pwact/active_learning/user_input}/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/init_bulk_input.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/iter_input.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/resource.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/scf_param.py +0 -0
- {pwact-0.1.11/pwact/bin → pwact-0.1.13/pwact/active_learning/user_input/train_param}/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/train_param/model_param.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/train_param/nep_param.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/train_param/nn_feature_type.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/train_param/optimizer_param.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/train_param/train_param.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/train_param/work_file_param.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/active_learning/user_input/workdir.py +0 -0
- {pwact-0.1.11/pwact/data_format → pwact-0.1.13/pwact/bin}/__init__.py +0 -0
- {pwact-0.1.11/pwact/utils → pwact-0.1.13/pwact/data_format}/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/data_format/configop.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/main.py +0 -0
- {pwact-0.1.11/pwact/utils/app_lib → pwact-0.1.13/pwact/utils}/__init__.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/app_lib/common.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/app_lib/cp2k.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/app_lib/lammps.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/app_lib/pwmat.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/constant.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/file_operation.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/format_input_output.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/json_operation.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/pre_al_data_util.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/process_tool.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact/utils/slurm_script.py +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact.egg-info/dependency_links.txt +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact.egg-info/entry_points.txt +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/pwact.egg-info/top_level.txt +0 -0
- {pwact-0.1.11 → pwact-0.1.13}/setup.cfg +0 -0
pwact-0.1.13/pwact/active_learning/slurm/slurm.py

@@ -0,0 +1,382 @@
+from enum import Enum
+from subprocess import Popen, PIPE
+import os
+import sys
+import time
+import shutil
+from pwact.active_learning.slurm.slurm_tool import get_jobs
+
+class JobStatus(Enum):
+    unsubmitted = 1
+    waiting = 2        # PD
+    running = 3        # R
+    terminated = 4
+    finished = 5
+    unknown = 100
+    resubmit_failed = 6
+    submit_limit: int = 3
+
+def get_slurm_sbatch_cmd(job_dir: str, job_name: str):
+    cmd = "cd {} && sbatch {}".format(job_dir, job_name)
+    return cmd
+
+class SlurmJob(object):
+    def __init__(self, job_id=None, status=JobStatus.unsubmitted, user=None, name=None, nodes=None, nodelist=None, partition=None) -> None:
+        self.job_id = job_id
+        self.status = status
+        self.user = user
+        self.name = name
+        self.partition = partition
+        self.nodes = nodes
+        self.nodelist = nodelist
+        self.submit_num = 0
+
+    def set_cmd(self, script_path: str):
+        # such as "sbatch main_MD_test.sh"
+        self.slurm_job_run_dir = os.path.dirname(script_path)
+        self.slurm_job_name = os.path.basename(script_path)
+        slurm_cmd = get_slurm_sbatch_cmd(self.slurm_job_run_dir, self.slurm_job_name)
+        self.submit_cmd = slurm_cmd
+
+    '''
+    description:
+        the job_type could be:
+        cp2k/relax, cp2k/scf, cp2k/aimd, pwmat/relax, pwmat/scf, pwmat/aimd, vasp/relax, vasp/scf, vasp/aimd, lammps
+    param {*} self
+    param {*} tag
+    param {str} job_type
+    return {*}
+    author: wuxingxing
+    '''
+    def set_tag(self, tag, job_type: str = None):
+        self.job_finish_tag = tag
+        if job_type is not None:  # used to decide whether a lammps md task terminated because of "ERROR: there are two atoms too close"
+            self.job_type = job_type.lower()
+        else:
+            self.job_type = None
+
+    def submit(self):
+        # ret = Popen([self.submit_cmd + " " + self.job_script], stdout=PIPE, stderr=PIPE, shell=True)
+        ret = Popen([self.submit_cmd], stdout=PIPE, stderr=PIPE, shell=True)
+        stdout, stderr = ret.communicate()
+        if str(stderr, encoding='ascii') != "":
+            raise RuntimeError(stderr)
+        job_id = str(stdout, encoding='ascii').replace('\n', '').split()[-1]
+        self.job_id = job_id
+        self.submit_num += 1
+        status = self.update_status()
+        print("# job {} submitted!".format(self.job_id))
+
+    def scancel_job(self):
+        ret = Popen(["scancel " + self.job_id], shell=True, stdout=PIPE, stderr=PIPE)
+        time.sleep(1)
+        stdout, stderr = ret.communicate()
+        print("scancel job {}".format(self.job_id))
+        # print(str(stderr, encoding='ascii'))
+
+    def update_status(self):
+        self.status = self.check_status()
+        return self.status
+
+    def check_status_no_tag(self):
+        ret = Popen(["squeue --job " + self.job_id], shell=True, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = ret.communicate()
+        if ret.returncode != 0:
+            if "Invalid job id specified" in str(stderr, encoding='ascii'):
+                return JobStatus.finished
+            else:
+                print("status command squeue fails to execute")
+                print("error info: " + str(stderr, encoding='ascii'))
+                print("return code: " + str(ret.returncode))
+                sys.exit()
+        status_line = str(stdout, encoding='ascii').split('\n')[-2]
+        status_word = status_line.split()[4]
+        if status_word in ["PD", "CF", "S"]:
+            return JobStatus.waiting
+        elif status_word in ["R", "CG"]:
+            return JobStatus.running
+        elif status_word in ["C", "E", "K", "BF", "CA", "CD", "F", "NF", "PR", "SE", "ST", "TO"]:
+            return JobStatus.finished
+        elif status_word in ["RH"]:  # for a job in 'RH' status, scancel the job and report it as finished
+            self.scancel_job()
+            return JobStatus.finished
+        else:
+            return JobStatus.unknown
+
+    def check_status(self):
+        ret = Popen(["squeue --job " + self.job_id], shell=True, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = ret.communicate()
+        if ret.returncode != 0:
+            if "Invalid job id specified" in str(stderr, encoding='ascii'):
+                if os.path.exists(self.job_finish_tag):
+                    print("job {} finished: the cmd is {}.".format(self.job_id, self.submit_cmd))
+                    return JobStatus.finished
+                else:
+                    return JobStatus.terminated
+            else:
+                print("status command squeue fails to execute")
+                print("error info: " + str(stderr, encoding='ascii'))
+                print("return code: " + str(ret.returncode))
+                sys.exit()
+        status_line = str(stdout, encoding='ascii').split('\n')[-2]
+        status_word = status_line.split()[4]
+        if status_word in ["PD", "CF", "S"]:
+            return JobStatus.waiting
+        elif status_word in ["R", "CG"]:
+            return JobStatus.running
+        elif status_word in ["C", "E", "K", "BF", "CA", "CD", "F", "NF", "PR", "SE", "ST", "TO"]:
+            if os.path.exists(self.job_finish_tag):
+                print("job {} finished: the cmd is {}.".format(self.job_id, self.submit_cmd))
+                return JobStatus.finished
+            else:
+                # for a lammps md job that stopped because of 'ERROR: there are two atoms too close', set the job status to finished
+                if self.job_type is not None and self.job_type == "lammps":
+                    end_normal = self.check_lammps_out_file()
+                    if end_normal:
+                        with open(self.job_finish_tag, 'w') as wf:
+                            wf.writelines("Job done!")
+                        print("job {} finished: the cmd is {}.".format(self.job_id, self.submit_cmd))
+                        return JobStatus.finished
+                return JobStatus.terminated
+        elif status_word in ["RH"]:  # for a job in 'RH' status, scancel the job, then decide from the finish tag
+            self.scancel_job()
+            if os.path.exists(self.job_finish_tag):
+                print("job {} finished: the cmd is {}.".format(self.job_id, self.submit_cmd))
+                return JobStatus.finished
+            else:
+                return JobStatus.terminated
+        else:
+            return JobStatus.unknown
+
+    def running_work(self):
+        self.submit()
+        while True:
+            status = self.check_status()
+            if (status == JobStatus.waiting) or \
+               (status == JobStatus.running):
+                time.sleep(10)
+            else:
+                break
+
+        assert(status == JobStatus.finished)
+        return status
+
+    def get_slurm_works_dir(self):
+        with open(os.path.join(self.slurm_job_run_dir, self.slurm_job_name), 'r') as rf:
+            lines = rf.readlines()
+        work_dir_list = []
+        for line in lines:
+            if 'cd ' in line:
+                work_dir = line.split()[-1].strip()
+                work_dir_list.append(work_dir)
+        return work_dir_list
+
+    '''
+    description:
+        if the job is an md task that stopped because of 'ERROR: there are two atoms too close', let the task end normally
+    param {*} self
+    return {*}
+    author: wuxingxing
+    '''
+    def check_lammps_out_file(self):
+        # read the last line of each md.log file
+        md_dirs = self.get_slurm_works_dir()
+        try:
+            for md_dir in md_dirs:
+                tag_md_file = os.path.join(md_dir, "tag.md.success")
+                md_log = os.path.join(md_dir, "md.log")
+                if os.path.exists(tag_md_file):
+                    continue
+                if not os.path.exists(md_log):
+                    return False
+
+                with open(md_log, "rb") as file:
+                    file.seek(-2, 2)              # seek to the second-to-last byte of the file
+                    while file.read(1) != b'\n':  # scan backwards byte by byte for the last newline
+                        file.seek(-2, 1)          # step back two bytes
+                    last_line = file.readline().decode().strip()  # read the last line and strip trailing whitespace
+                if "ERROR: there are two atoms" in last_line:
+                    with open(tag_md_file, 'w') as wf:
+                        wf.writelines("ERROR: there are two atoms too close")
+                    return True
+                elif "Total wall time" in last_line:
+                    with open(tag_md_file, 'w') as wf:
+                        wf.writelines("Job Done!")
+                    return True
+                else:
+                    return False
+            return True
+        except Exception as e:
+            return False
+
+
+class Mission(object):
+    def __init__(self, mission_id=None) -> None:
+        self.mission_id = mission_id
+        self.job_list: list[SlurmJob] = []
+
+    def add_job(self, job: SlurmJob):
+        self.job_list.append(job)
+
+    def pop_job(self, job_id):
+        del_job, index = self.get_job(job_id)
+        self.job_list.remove(del_job)
+
+    def get_job(self, job_id):
+        for i, job in enumerate(self.job_list):
+            if job.job_id == job_id:
+                return job, i
+
+    def update_job_state(self, job_id, state):
+        up_job, index = self.get_job(job_id)
+        up_job.status = state
+        self.job_list[index] = up_job
+
+    def get_running_jobs(self):
+        job_list: list[SlurmJob] = []
+        for job in self.job_list:
+            if (job.status == JobStatus.waiting) or (job.status == JobStatus.running):
+                job_list.append(job)
+        return job_list
+
+    def move_slurm_log_to_slurm_work_dir(self, slurm_log_dir_source: str):
+        for job in self.job_list:
+            slurm_log_source = os.path.join(slurm_log_dir_source, "slurm-{}.out".format(job.job_id))
+            slurm_job_log_target = os.path.join(os.path.dirname(job.slurm_job_path), os.path.basename(slurm_log_source))
+            if os.path.exists(slurm_log_source):
+                shutil.move(slurm_log_source, slurm_job_log_target)
+
+    '''
+    Description:
+        a missing job_finish_tag means the job ended in error
+    param {*} self
+    Returns:
+    Author: WU Xingxing
+    '''
+    def get_error_jobs(self):
+        job_list: list[SlurmJob] = []
+        for job in self.job_list:
+            if os.path.exists(job.job_finish_tag) is False:
+                job_list.append(job)
+        return job_list
+
+    def all_job_finished(self, error_type: str = None):
+        error_jobs = self.get_error_jobs()
+        if len(error_jobs) >= 1:
+            error_log_content = ""
+            for error_job in error_jobs:
+                error_log_path = os.path.join(error_job.slurm_job_run_dir, "slurm-{}.out".format(error_job.job_id))
+                error_log_content += "JOB ERROR! The cmd '{}' failed!\nFor more details on errors, please refer to the following documents:\n"\
+                    .format(error_job.submit_cmd)
+
+                slurm_content = " Slurm script file is {}\n The slurm log is {}\n"\
+                    .format(os.path.join(error_job.slurm_job_run_dir, error_job.slurm_job_name), error_log_path)
+
+                tmp_error = None
+                if error_type is not None:
+                    work_dirs = error_job.get_slurm_works_dir()
+                    if len(work_dirs) > 0:
+                        tmp_error = " Task logs under this slurm job:\n"
+                        for _ in work_dirs:
+                            job_error_log = "{}/{}".format(_, error_type)
+                            job_finish_tag = "{}/{}".format(_, error_job.job_finish_tag)
+                            if os.path.exists(job_error_log) and not os.path.exists(job_finish_tag):
+                                tmp_error += " {}\n".format(job_error_log)
+
+                error_log_content += slurm_content
+                if tmp_error is not None:
+                    error_log_content += tmp_error
+                error_log_content += "\n\n"
+            raise Exception(error_log_content)
+        return True
+
+    def commit_jobs(self):
+        for job in self.job_list:
+            if job.status == JobStatus.unsubmitted:
+                job.submit()
+
+    '''
+    description:
+        return all job ids; the job id is the slurm job id
+    param {*} self
+    return {*}
+    author: wuxingxing
+    '''
+    def get_all_job_ids(self):
+        job_id_list = []
+        for job in self.job_list:
+            job_id_list.append(job.job_id)
+        return job_id_list
+
+    def check_running_job(self):
+        while True:
+            for job in self.job_list:
+                # print(job.status, job.job_id)
+                if job.status == JobStatus.resubmit_failed or job.status == JobStatus.finished:  # a job resubmitted more than 3 times is not checked again
+                    continue
+                status = job.check_status()
+                self.update_job_state(job.job_id, status)
+            # if a job failed, resubmit it until it has been resubmitted more than 3 times
+            self.resubmit_jobs()
+            if len(self.get_running_jobs()) == 0:
+                break
+            time.sleep(10)
+        # error_jobs = self.get_error_jobs()
+        # if len(error_jobs) > 0:
+        #     error_info = "job error: {}".format([_.job_id for _ in error_jobs])
+        #     raise Exception(error_info)
+        return True
+
+    def resubmit_jobs(self):
+        for job in self.job_list:
+            if job.status == JobStatus.terminated:
+                if job.submit_num <= JobStatus.submit_limit.value:
+                    print("resubmit job {}: {}, the time is {}\n".format(job.job_id, job.submit_cmd, job.submit_num))
+                    job.submit()
+                else:
+                    job.status = JobStatus.resubmit_failed
+
+    '''
+    Description:
+        after some jobs finish while others are terminated, try to recover the terminated jobs.
+    param {*} self
+    Returns:
+    Author: WU Xingxing
+    '''
+    def re_submmit_terminated_jobs(self):
+        error_jobs = self.get_error_jobs()
+        if len(error_jobs) == 0:
+            return
+        self.job_list.clear()
+        self.job_list.extend(error_jobs)
+        self.reset_job_state()
+        self.commit_jobs()
+        self.check_running_job()
+
+    def reset_job_state(self):
+        for job in self.job_list:
+            job.status = JobStatus.unsubmitted
+
+def scancle_job(work_dir: str):
+    job_id_list = get_jobs(work_dir)
+    print("the jobs to be cancelled are:")
+    print(job_id_list)
+    for job_id in job_id_list:
+        job = SlurmJob(job_id=job_id)
+        status = job.check_status_no_tag()  # get status
+        if status == JobStatus.waiting or status == JobStatus.running:  # still queued or running
+            job.scancel_job()
+            # time.sleep(2)
+            # status = job.check_status_no_tag()
+            # if JobStatus.finished == status:
+            #     print("scancel job {} successfully\n\n".format(job_id))
+            # else:
+            #     print("Scancel job {} failed, please manually check and cancel this task!\n\n".format(job_id))
+    time.sleep(5)
+    for job_id in job_id_list:
+        job = SlurmJob(job_id=job_id)
+        status = job.check_status_no_tag()  # get status
+        if JobStatus.finished == status:
+            print("scancel job {} successfully".format(job_id))
+        else:
+            print("Scancel job {} failed, please manually check and cancel this task!\n".format(job_id))
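For orientation, a minimal sketch of how the new SlurmJob and Mission classes are driven, inferred only from the code above; the directory paths below are illustrative placeholders, not values taken from the package:

    from pwact.active_learning.slurm.slurm import SlurmJob, Mission

    # one SlurmJob per generated sbatch script (the paths here are hypothetical)
    job = SlurmJob()
    job.set_cmd("/path/to/md_work/main_MD_test.sh")  # becomes "cd /path/to/md_work && sbatch main_MD_test.sh"
    job.set_tag("/path/to/md_work/tag.md.success", job_type="lammps")

    mission = Mission()
    mission.add_job(job)
    mission.commit_jobs()        # sbatch every unsubmitted job
    mission.check_running_job()  # poll squeue every 10 s, resubmitting terminated jobs up to the submit limit
    mission.all_job_finished(error_type="md.log")  # raises with a summary if any finish tag is missing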
pwact-0.1.13/pwact/active_learning/slurm/slurm_tool.py

@@ -0,0 +1,35 @@
+import subprocess
+import os
+
+def get_job_ids():
+    # use squeue to list all job ids of the current user
+    user = os.getlogin()  # name of the currently logged-in user
+    result = subprocess.run(['squeue', '-u', user, '-o', '%i', '-h'], capture_output=True, text=True)
+    job_ids = result.stdout.strip().split()
+    return job_ids
+
+def get_slurm_script_path(job_id):
+    # use scontrol to show the job details and extract the slurm script path from them
+    result = subprocess.run(['scontrol', 'show', 'job', str(job_id)], capture_output=True, text=True)
+    for line in result.stdout.splitlines():
+        if "Command=" in line:
+            script_path = line.split('=')[1]
+            return script_path
+    return None
+
+def get_jobs(work_dir: str):
+    job_ids = get_job_ids()
+    jobs = []
+    for job_id in job_ids:
+        script_path = get_slurm_script_path(job_id)
+        # if script_path:
+        #     print(f"Job ID: {job_id}, Slurm Script Path: {script_path}")
+        # else:
+        #     print(f"Job ID: {job_id}, Slurm Script Path: Not Found")
+        if script_path and work_dir in script_path:
+            jobs.append(job_id)
+    return jobs
+
+
+# if __name__ == "__main__":
+#     get_jobs("run_iter/iter.0000/temp_run_iter_work/label/scf")
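This helper is what the new scancle_job(work_dir) function in slurm.py consumes; a hedged usage sketch, reusing the work directory from the commented-out example at the bottom of the file:

    from pwact.active_learning.slurm.slurm import scancle_job

    # cancels every queued or running job of the current user whose sbatch
    # script path (the scontrol "Command=" field) contains this work directory
    scancle_job("run_iter/iter.0000/temp_run_iter_work/label/scf")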
{pwact-0.1.11 → pwact-0.1.13}/pwact.egg-info/SOURCES.txt

@@ -21,6 +21,9 @@ pwact/active_learning/init_bulk/relabel.py
 pwact/active_learning/init_bulk/relax.py
 pwact/active_learning/label/__init__.py
 pwact/active_learning/label/labeling.py
+pwact/active_learning/slurm/__init__.py
+pwact/active_learning/slurm/slurm.py
+pwact/active_learning/slurm/slurm_tool.py
 pwact/active_learning/test/__init__.py
 pwact/active_learning/test/test.py
 pwact/active_learning/train/__init__.py
{pwact-0.1.11 → pwact-0.1.13}/setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
 
 setuptools.setup(
     name="pwact",
-    version="0.1.11",
+    version="0.1.13",
     author="LonxunQuantum",
     author_email="lonxun@pwmat.com",
     description="PWACT is an open-source automated active learning platform based on PWMLFF for efficient data sampling.",
All remaining files (including the renamed __init__.py modules listed above) are unchanged between 0.1.11 and 0.1.13.