oh-my-batch 0.1.0.dev0__py3-none-any.whl → 0.1.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oh_my_batch/job.py +35 -43
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/METADATA +3 -3
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/RECORD +6 -6
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/LICENSE +0 -0
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/WHEEL +0 -0
- {oh_my_batch-0.1.0.dev0.dist-info → oh_my_batch-0.1.0.dev2.dist-info}/entry_points.txt +0 -0
oh_my_batch/job.py
CHANGED
@@ -12,29 +12,22 @@ from .util import expand_globs, shell_run, parse_csv
|
|
12
12
|
|
13
13
|
logger = logging.getLogger(__name__)
|
14
14
|
|
15
|
+
class JobState:
|
16
|
+
NULL = 0
|
17
|
+
PENDING = 1
|
18
|
+
RUNNING = 2
|
19
|
+
CANCELLED = 3
|
20
|
+
COMPLETED = 4
|
21
|
+
FAILED = 5
|
22
|
+
UNKNOWN = 6
|
15
23
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
obj.terminal = terminal
|
24
|
-
obj.status_name = status_name
|
25
|
-
return obj
|
26
|
-
|
27
|
-
value: int # type: ignore
|
28
|
-
terminal: bool
|
29
|
-
status_name: str
|
30
|
-
|
31
|
-
NULL = (0, True, "NULL")
|
32
|
-
PENDING = (1, False, "PENDING")
|
33
|
-
RUNNING = (2, False, "RUNNING")
|
34
|
-
CANCELLED = (3, True, "CANCELLED")
|
35
|
-
COMPLETED = (4, True, "COMPLETED")
|
36
|
-
FAILED = (5, True, "FAILED")
|
37
|
-
UNKNOWN = (6, False, "UNKNOWN")
|
24
|
+
@classmethod
|
25
|
+
def is_terminal(cls, state: int):
|
26
|
+
return state in (JobState.NULL, JobState.COMPLETED, JobState.FAILED, JobState.CANCELLED)
|
27
|
+
|
28
|
+
@classmethod
|
29
|
+
def is_success(cls, state: int):
|
30
|
+
return state == JobState.COMPLETED
|
38
31
|
|
39
32
|
|
40
33
|
def new_job(script: str):
|
@@ -69,9 +62,6 @@ class BaseJobManager:
|
|
69
62
|
scripts = set(os.path.normpath(s) for s in expand_globs(script))
|
70
63
|
logger.info('Scripts to submit: %s', scripts)
|
71
64
|
|
72
|
-
if recovery and recover_scripts != scripts:
|
73
|
-
raise ValueError('Scripts to submit are different from scripts in recovery file')
|
74
|
-
|
75
65
|
for script_file in scripts:
|
76
66
|
if script_file not in recover_scripts:
|
77
67
|
jobs.append(new_job(script_file))
|
@@ -87,7 +77,7 @@ class BaseJobManager:
|
|
87
77
|
break
|
88
78
|
|
89
79
|
# stop if all jobs are terminal and no job is to be submitted
|
90
|
-
if (all(j['state']
|
80
|
+
if (all(JobState.is_terminal(j['state']) for j in jobs) and
|
91
81
|
not any(should_submit(j, max_tries) for j in jobs)):
|
92
82
|
break
|
93
83
|
|
@@ -108,19 +98,21 @@ class Slurm(BaseJobManager):
|
|
108
98
|
|
109
99
|
def _update_jobs(self, jobs: List[dict], max_tries: int, submit_opts: str):
|
110
100
|
# query job status
|
111
|
-
job_ids =
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
101
|
+
job_ids = [j['id'] for j in jobs if j['id']]
|
102
|
+
if job_ids:
|
103
|
+
query_cmd = f'{self._sacct_bin} -X -P --format=JobID,JobName,State -j {",".join(job_ids)}'
|
104
|
+
user = os.environ.get('USER')
|
105
|
+
if user:
|
106
|
+
query_cmd += f' -u {user}'
|
107
|
+
|
108
|
+
cp = shell_run(query_cmd)
|
109
|
+
if cp.returncode != 0:
|
110
|
+
logger.error('Failed to query job status: %s', cp.stderr.decode('utf-8'))
|
111
|
+
return jobs
|
112
|
+
logger.info('Job status: %s', cp.stdout.decode('utf-8'))
|
113
|
+
new_state = parse_csv(cp.stdout.decode('utf-8'))
|
114
|
+
else:
|
115
|
+
new_state = []
|
124
116
|
|
125
117
|
for job in jobs:
|
126
118
|
for row in new_state:
|
@@ -130,8 +122,8 @@ class Slurm(BaseJobManager):
|
|
130
122
|
logger.warning('Unknown job %s state: %s',row['JobID'], row['State'])
|
131
123
|
break
|
132
124
|
else:
|
133
|
-
job['
|
134
|
-
|
125
|
+
if job['id']:
|
126
|
+
logger.error('Job %s not found in sacct output', job['id'])
|
135
127
|
|
136
128
|
# check if there are jobs to be (re)submitted
|
137
129
|
for job in jobs:
|
@@ -171,8 +163,8 @@ class Slurm(BaseJobManager):
|
|
171
163
|
|
172
164
|
|
173
165
|
def should_submit(job: dict, max_tries: int):
|
174
|
-
state:
|
175
|
-
if not state
|
166
|
+
state: int = job['state']
|
167
|
+
if not JobState.is_terminal(state):
|
176
168
|
return False
|
177
169
|
if job['tries'] >= max_tries:
|
178
170
|
return False
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: oh-my-batch
|
3
|
-
Version: 0.1.0.dev0
|
3
|
+
Version: 0.1.0.dev2
|
4
4
|
Summary:
|
5
5
|
License: GPL
|
6
6
|
Author: weihong.xu
|
@@ -118,13 +118,13 @@ You can use `omb job` to track the state of the jobs.
|
|
118
118
|
```bash
|
119
119
|
|
120
120
|
omb job slurm \
|
121
|
-
submit tmp/*.slurm --max_tries 3 --wait --recovery lammps.
|
121
|
+
submit tmp/*.slurm --max_tries 3 --wait --recovery lammps-jobs.json
|
122
122
|
```
|
123
123
|
|
124
124
|
The above command will submit the batch scripts to the job scheduler,
|
125
125
|
and wait for the jobs to finish. If a job fails, it will be retried at most 3 times.
|
126
126
|
|
127
|
-
The `--recovery` option will save the job information to `lammps.
|
127
|
+
The `--recovery` option will save the job information to `lammps-jobs.json` file,
|
128
128
|
if `omb job` is interrupted, you can run the exact same command to recover the job status,
|
129
129
|
so that you don't need to resubmit the jobs that are already submitted.
|
130
130
|
|
@@ -5,10 +5,10 @@ oh_my_batch/assets/functions.sh,sha256=eORxFefV-XrWbG-2I6u-c8uf1XxOQ31LaeVHBumwz
|
|
5
5
|
oh_my_batch/batch.py,sha256=e73N-xwxMvgxnWwFMp33PQD1Dy-T-ATjANlwtPRHPQM,3016
|
6
6
|
oh_my_batch/cli.py,sha256=G_JxqX0Zbx_EbcDxXbYjJ_4O-EOhmkF1lcMWgQ5ZPqo,375
|
7
7
|
oh_my_batch/combo.py,sha256=AHFD5CLoczqtjcfl2Rb4A2ucoQU40-cWtDOYjtP-yY4,7680
|
8
|
-
oh_my_batch/job.py,sha256=
|
8
|
+
oh_my_batch/job.py,sha256=_fETBYpuSd_hNHKnXSwYcSU3OXtU7PO-P2QMfhE-Wfs,5788
|
9
9
|
oh_my_batch/util.py,sha256=H8B4zVNH5xRp-NG_uypgvtmz2YSpXy_6LK5ROv6SYrc,2116
|
10
|
-
oh_my_batch-0.1.0.
|
11
|
-
oh_my_batch-0.1.0.
|
12
|
-
oh_my_batch-0.1.0.
|
13
|
-
oh_my_batch-0.1.0.
|
14
|
-
oh_my_batch-0.1.0.
|
10
|
+
oh_my_batch-0.1.0.dev2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
11
|
+
oh_my_batch-0.1.0.dev2.dist-info/METADATA,sha256=Novvp5-MhKR9J0Q6Al833vA6GjT7EVRtDv4ADmnluxk,4456
|
12
|
+
oh_my_batch-0.1.0.dev2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
13
|
+
oh_my_batch-0.1.0.dev2.dist-info/entry_points.txt,sha256=ZY2GutSoNjjSyJ4qO2pTeseKUFgoTYdvmgkuZZkwi68,77
|
14
|
+
oh_my_batch-0.1.0.dev2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|