oh-my-batch 0.1.0.dev3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/PKG-INFO +14 -14
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/README.md +13 -13
- oh_my_batch-0.2.0/oh_my_batch/__init__.py +4 -0
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/batch.py +46 -16
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/combo.py +0 -1
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/job.py +14 -12
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/util.py +16 -2
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/pyproject.toml +1 -1
- oh_my_batch-0.1.0.dev3/oh_my_batch/__init__.py +0 -0
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/LICENSE +0 -0
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/__main__.py +0 -0
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/assets/__init__.py +0 -0
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/assets/functions.sh +0 -0
- {oh_my_batch-0.1.0.dev3 → oh_my_batch-0.2.0}/oh_my_batch/cli.py +0 -0
--- oh_my_batch-0.1.0.dev3/PKG-INFO
+++ oh_my_batch-0.2.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: oh-my-batch
-Version: 0.1.0.dev3
+Version: 0.2.0
 Summary:
 License: GPL
 Author: weihong.xu
@@ -17,6 +17,11 @@ Requires-Dist: fire (>=0.7.0,<0.8.0)
 Description-Content-Type: text/markdown
 
 # oh-my-batch
+
+[](https://badge.fury.io/py/oh-my-batch)
+[](https://pypi.org/project/oh-my-batch/)
+[](https://pypi.org/project/oh-my-batch/)
+
 A simple tool to manipulate batch tasks designed for scientific computing community.
 
 ## Features
@@ -41,7 +46,6 @@ for example, different temperatures 300K, 400K, 500K, against each data file.
 In this case, you can use `omb combo` command to generate a series of input files for you.
 
 ```bash
-#! /bin/bash
 # prepare fake data files
 mkdir -p tmp/
 touch tmp/1.data tmp/2.data tmp/3.data
@@ -87,7 +91,6 @@ You want to package them into 2 batch scripts to submit to a job scheduler.
 You can use `omb batch` to generate batch scripts for you like this:
 
 ```bash
-#! /bin/bash
 cat > tmp/lammps_header.sh <<EOF
 #!/bin/bash
 #SBATCH -J lmp
@@ -96,9 +99,9 @@ cat > tmp/lammps_header.sh <<EOF
 EOF
 
 omb batch \
-
-
-
+  add_work_dirs tmp/tasks/* - \
+  add_header_files tmp/lammps_header.sh - \
+  add_cmds "checkpoint lmp.done ./run.sh" - \
 make tmp/lmp-{i}.slurm --concurrency 2
 ```
 
@@ -112,19 +115,16 @@ You can run the above script by `./examples/omb-batch.sh`,
 ### Track the state of job in job schedular
 
 Let's continue the above example, now you have submitted the batch scripts to the job scheduler.
-
-You can use `omb job` to track the state of the jobs.
+In this case, you can use `omb job` to track the state of the jobs.
 
 ```bash
-
-omb job slurm \
-    submit tmp/*.slurm --max_tries 3 --wait --recovery lammps-jobs.json
+omb job slurm submit tmp/*.slurm --max_tries 3 --wait --recovery lammps-jobs.json
 ```
 
 The above command will submit the batch scripts to the job scheduler,
 and wait for the jobs to finish. If the job fails, it will retry for at most 3 times.
 
-The `--recovery` option will save the job information to `lammps-jobs.json` file
-
-so that you don't need to resubmit the jobs that are
+The `--recovery` option will save the job information to `lammps-jobs.json` file.
+If `omb job` is interrupted, you can rerun the exact same command to recover the job status,
+so that you don't need to resubmit the jobs that are still running or completed.
--- oh_my_batch-0.1.0.dev3/README.md
+++ oh_my_batch-0.2.0/README.md
@@ -1,4 +1,9 @@
 # oh-my-batch
+
+[](https://badge.fury.io/py/oh-my-batch)
+[](https://pypi.org/project/oh-my-batch/)
+[](https://pypi.org/project/oh-my-batch/)
+
 A simple tool to manipulate batch tasks designed for scientific computing community.
 
 ## Features
@@ -23,7 +28,6 @@ for example, different temperatures 300K, 400K, 500K, against each data file.
 In this case, you can use `omb combo` command to generate a series of input files for you.
 
 ```bash
-#! /bin/bash
 # prepare fake data files
 mkdir -p tmp/
 touch tmp/1.data tmp/2.data tmp/3.data
@@ -69,7 +73,6 @@ You want to package them into 2 batch scripts to submit to a job scheduler.
 You can use `omb batch` to generate batch scripts for you like this:
 
 ```bash
-#! /bin/bash
 cat > tmp/lammps_header.sh <<EOF
 #!/bin/bash
 #SBATCH -J lmp
@@ -78,9 +81,9 @@ cat > tmp/lammps_header.sh <<EOF
 EOF
 
 omb batch \
-
-
-
+  add_work_dirs tmp/tasks/* - \
+  add_header_files tmp/lammps_header.sh - \
+  add_cmds "checkpoint lmp.done ./run.sh" - \
 make tmp/lmp-{i}.slurm --concurrency 2
 ```
 
@@ -94,18 +97,15 @@ You can run the above script by `./examples/omb-batch.sh`,
 ### Track the state of job in job schedular
 
 Let's continue the above example, now you have submitted the batch scripts to the job scheduler.
-
-You can use `omb job` to track the state of the jobs.
+In this case, you can use `omb job` to track the state of the jobs.
 
 ```bash
-
-omb job slurm \
-    submit tmp/*.slurm --max_tries 3 --wait --recovery lammps-jobs.json
+omb job slurm submit tmp/*.slurm --max_tries 3 --wait --recovery lammps-jobs.json
 ```
 
 The above command will submit the batch scripts to the job scheduler,
 and wait for the jobs to finish. If the job fails, it will retry for at most 3 times.
 
-The `--recovery` option will save the job information to `lammps-jobs.json` file
-
-so that you don't need to resubmit the jobs that are
+The `--recovery` option will save the job information to `lammps-jobs.json` file.
+If `omb job` is interrupted, you can rerun the exact same command to recover the job status,
+so that you don't need to resubmit the jobs that are still running or completed.
--- oh_my_batch-0.1.0.dev3/oh_my_batch/batch.py
+++ oh_my_batch-0.2.0/oh_my_batch/batch.py
@@ -13,7 +13,7 @@ class BatchMaker:
         self._script_bottom = []
         self._command = []
 
-    def
+    def add_work_dirs(self, *dir: str):
         """
         Add working directories
 
@@ -22,39 +22,55 @@ class BatchMaker:
         self._work_dirs.extend(expand_globs(dir))
         return self
 
-    def
+    def add_header_files(self, *file: str, encoding='utf-8'):
         """
         Add script header from files
 
         :param file: File path
         :param encoding: File encoding
         """
-
-        self._script_header.append(f.read())
+        self._script_header.extend(load_files(*file, encoding=encoding))
         return self
 
-    def
+    def add_headers(self, *header: str):
+        """
+        Add script header
+
+        :param header: Header lines
+        """
+        self._script_header.extend(header)
+        return self
+
+    def add_bottom_files(self, *file: str, encoding='utf-8'):
         """
         Add script bottom from files
 
         :param file: File path
         :param encoding: File encoding
         """
-
-
+        self._script_bottom.extend(load_files(*file, encoding=encoding))
+        return self
+
+    def add_bottoms(self, *bottom: str):
+        """
+        Add script bottom
 
-
+        :param bottom: Bottom lines
+        """
+        self._script_bottom.extend(bottom)
+        return self
+
+    def add_cmd_files(self, *file: str, encoding='utf-8'):
         """
         Add commands from files to run under every working directory
 
         :param file: File path
         :param encoding: File encoding
         """
-
-        self._command.append(f.read())
+        self._command.extend(load_files(*file, encoding=encoding))
         return self
 
-    def
+    def add_cmds(self, *cmd: str):
         """
         add commands to run under every working directory
 
@@ -68,10 +84,10 @@ class BatchMaker:
         Make batch script files from the previous setup
 
         :param path: Path to save batch script files, use {i} to represent index
-        :param concurrency: Number of
+        :param concurrency: Number of scripts to to make
         """
         # inject pre-defined functions
-        self.
+        self.add_header_files(get_asset('functions.sh'))
 
         header = '\n'.join(self._script_header)
         bottom = '\n'.join(self._script_bottom)
@@ -80,10 +96,10 @@ class BatchMaker:
         work_dirs_arr = "\n".join(shlex.quote(w) for w in work_dirs)
         body.extend([
             '[ -n "$PBS_O_WORKDIR" ] && cd $PBS_O_WORKDIR # fix PBS',
-            f'
+            f'WORK_DIRS=({work_dirs_arr})',
             '',
-            'for
-            'pushd $
+            'for WORK_DIR in "${WORK_DIRS[@]}"; do',
+            'pushd $WORK_DIR',
             *self._command,
             'popd',
             'done'
@@ -94,3 +110,17 @@ class BatchMaker:
         with open(out_path, 'w', encoding=encoding) as f:
             f.write(script)
         os.chmod(out_path, mode_translate(str(mode)))
+
+
+def load_files(*file, encoding='utf-8', raise_invalid=False):
+    """
+    Load files from paths
+
+    :param files: List of file paths
+    :return: List of file contents
+    """
+    result = []
+    for file in expand_globs(file, raise_invalid=raise_invalid):
+        with open(file, 'r', encoding=encoding) as f:
+            result.append(f.read())
+    return result
--- oh_my_batch-0.1.0.dev3/oh_my_batch/job.py
+++ oh_my_batch-0.2.0/oh_my_batch/job.py
@@ -1,5 +1,4 @@
 from typing import List
-from enum import Enum
 
 import logging
 import json
@@ -7,11 +6,12 @@ import time
 import os
 import re
 
-from .util import expand_globs, shell_run, parse_csv
+from .util import expand_globs, shell_run, parse_csv, ensure_dir, log_cp
 
 
 logger = logging.getLogger(__name__)
 
+
 class JobState:
     NULL = 0
     PENDING = 1
@@ -59,7 +59,7 @@ class BaseJobManager:
         recover_scripts = set(j['script'] for j in jobs)
         logger.info('Scripts in recovery files: %s', recover_scripts)
 
-        scripts = set(
+        scripts = set(norm_path(s) for s in expand_globs(script, raise_invalid=True))
         logger.info('Scripts to submit: %s', scripts)
 
         for script_file in scripts:
@@ -70,6 +70,7 @@ class BaseJobManager:
         while True:
             self._update_jobs(jobs, max_tries, opts)
             if recovery:
+                ensure_dir(recovery)
                 with open(recovery, 'w', encoding='utf-8') as f:
                     json.dump(jobs, f, indent=2)
 
@@ -101,20 +102,18 @@ class Slurm(BaseJobManager):
         job_ids = [j['id'] for j in jobs if j['id']]
         if job_ids:
             query_cmd = f'{self._sacct_bin} -X -P --format=JobID,JobName,State -j {",".join(job_ids)}'
-            user = os.environ.get('USER')
-            if user:
-                query_cmd += f' -u {user}'
-
             cp = shell_run(query_cmd)
             if cp.returncode != 0:
-                logger.error('Failed to query job status: %s', cp
+                logger.error('Failed to query job status: %s', log_cp(cp))
                 return jobs
-            logger.info('Job status
+            logger.info('Job status:\n%s', cp.stdout.decode('utf-8'))
             new_state = parse_csv(cp.stdout.decode('utf-8'))
         else:
             new_state = []
 
         for job in jobs:
+            if not job['id']:
+                continue
             for row in new_state:
                 if job['id'] == row['JobID']:
                     job['state'] = self._map_state(row['State'])
@@ -122,8 +121,7 @@ class Slurm(BaseJobManager):
                     logger.warning('Unknown job %s state: %s',row['JobID'], row['State'])
                     break
             else:
-
-                logger.error('Job %s not found in sacct output', job['id'])
+                logger.error('Job %s not found in sacct output', job['id'])
 
         # check if there are jobs to be (re)submitted
         for job in jobs:
@@ -135,7 +133,7 @@ class Slurm(BaseJobManager):
             cp = shell_run(submit_cmd)
             if cp.returncode != 0:
                 job['state'] = JobState.FAILED
-                logger.error('Failed to submit job: %s', cp
+                logger.error('Failed to submit job: %s', log_cp(cp))
             else:
                 job['id'] = self._parse_job_id(cp.stdout.decode('utf-8'))
                 assert job['id'], 'Failed to parse job id'
@@ -169,3 +167,7 @@ def should_submit(job: dict, max_tries: int):
     if job['tries'] >= max_tries:
         return False
     return state != JobState.COMPLETED
+
+
+def norm_path(path: str):
+    return os.path.normpath(os.path.abspath(path))
--- oh_my_batch-0.1.0.dev3/oh_my_batch/util.py
+++ oh_my_batch-0.2.0/oh_my_batch/util.py
@@ -19,7 +19,7 @@ def expand_globs(patterns: Iterable[str], raise_invalid=False) -> List[str]:
     """
     paths = []
     for pattern in patterns:
-        result = glob.glob(pattern, recursive=True)
+        result = glob.glob(pattern, recursive=True)
         if raise_invalid and len(result) == 0:
             raise FileNotFoundError(f'No file found for {pattern}')
         for p in result:
@@ -83,4 +83,18 @@ def parse_csv(text: str, delimiter="|"):
     Parse CSV text to list of dictionaries
     """
     reader = csv.DictReader(text.splitlines(), delimiter=delimiter)
-    return list(reader)
+    return list(reader)
+
+
+def log_cp(cp):
+    """
+    Log child process
+    """
+    log = f'Command: {cp.args}\nReturn code: {cp.returncode}'
+
+    out = cp.stdout.decode('utf-8').strip()
+    if out:
+        log += f'\nSTDOUT:\n{out}'
+    err = cp.stderr.decode('utf-8').strip()
+    if err:
+        log += f'\nSTDERR:\n{err}'
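`log_cp` formats an ordinary `subprocess.CompletedProcess` whose output streams were captured as bytes, which is presumably what `shell_run` returns (`shell_run` itself is not shown in this diff). A standalone sketch that mirrors the same formatting:

```python
# Standalone sketch mirroring log_cp(): format a CompletedProcess whose
# stdout/stderr were captured as bytes.
import subprocess

cp = subprocess.run('echo hello && echo oops >&2', shell=True, capture_output=True)

log = f'Command: {cp.args}\nReturn code: {cp.returncode}'
out = cp.stdout.decode('utf-8').strip()
if out:
    log += f'\nSTDOUT:\n{out}'
err = cp.stderr.decode('utf-8').strip()
if err:
    log += f'\nSTDERR:\n{err}'
print(log)
```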