oh-my-batch 0.0.1.dev0__py3-none-any.whl → 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oh_my_batch/__main__.py +5 -0
- oh_my_batch/assets/__init__.py +7 -0
- oh_my_batch/assets/functions.sh +20 -0
- oh_my_batch/batch.py +96 -0
- oh_my_batch/cli.py +25 -0
- oh_my_batch/combo.py +202 -0
- oh_my_batch/job.py +179 -0
- oh_my_batch/util.py +86 -0
- oh_my_batch-0.1.0.dev0.dist-info/METADATA +130 -0
- oh_my_batch-0.1.0.dev0.dist-info/RECORD +14 -0
- oh_my_batch-0.1.0.dev0.dist-info/entry_points.txt +4 -0
- oh_my_batch-0.0.1.dev0.dist-info/METADATA +0 -20
- oh_my_batch-0.0.1.dev0.dist-info/RECORD +0 -5
- {on_my_batch → oh_my_batch}/__init__.py +0 -0
- {oh_my_batch-0.0.1.dev0.dist-info → oh_my_batch-0.1.0.dev0.dist-info}/LICENSE +0 -0
- {oh_my_batch-0.0.1.dev0.dist-info → oh_my_batch-0.1.0.dev0.dist-info}/WHEEL +0 -0
oh_my_batch/assets/functions.sh
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
|
2
|
+
checkpoint() {
    # Usage: checkpoint <flag_file> <command> [arg1] [arg2] ...
    # Run <command> only when <flag_file> does not exist; on success, write a
    # timestamp into <flag_file> so subsequent runs skip the command.
    local flag_file="$1"
    shift # Remove the first argument so $@ contains only the command and its arguments
    if [ -f "$flag_file" ]; then
        cat "$flag_file"
    else
        "$@" # Execute the command
        local exit_code=$?
        if [ $exit_code -eq 0 ]; then
            local current_time=$(date '+%Y-%m-%d %H:%M:%S')
            printf 'Command succeeded at %s\n' "$current_time" > "$flag_file"
            echo "Created flag file '$flag_file' with timestamp: $current_time"
        else
            # FIX: the message used backticks inside double quotes, which is
            # command substitution — it re-executed the failed command just to
            # print the error. Quote with single quotes instead.
            echo "Command '$@' failed with exit code $exit_code"
            return $exit_code
        fi
    fi
}
|
oh_my_batch/batch.py
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
import shlex
|
2
|
+
import os
|
3
|
+
|
4
|
+
from .util import split_list, ensure_dir, expand_globs, mode_translate
|
5
|
+
from .assets import get_asset
|
6
|
+
|
7
|
+
|
8
|
+
class BatchMaker:
    """
    Build executable batch scripts that run a set of commands in every
    working directory.

    Intended to be used as a fluent chain: add work dirs, headers, bottoms
    and commands, then call :meth:`make` to write the scripts.
    """

    def __init__(self):
        self._work_dirs = []
        self._script_header = []
        self._script_bottom = []
        self._command = []

    def add_work_dir(self, *dir: str):
        """
        Add working directories

        :param dir: Directories to work on, can be glob patterns
        """
        self._work_dirs.extend(expand_globs(dir))
        return self

    def add_header_file(self, file: str, encoding='utf-8'):
        """
        Add script header from files

        :param file: File path
        :param encoding: File encoding
        """
        with open(file, 'r', encoding=encoding) as f:
            self._script_header.append(f.read())
        return self

    def add_bottom_file(self, file: str, encoding='utf-8'):
        """
        Add script bottom from files

        :param file: File path
        :param encoding: File encoding
        """
        with open(file, 'r', encoding=encoding) as f:
            self._script_bottom.append(f.read())
        # FIX: `return self` was missing here, breaking the fluent chaining
        # that every other add_* method supports.
        return self

    def add_command_file(self, file: str, encoding='utf-8'):
        """
        Add commands from files to run under every working directory

        :param file: File path
        :param encoding: File encoding
        """
        with open(file, 'r', encoding=encoding) as f:
            self._command.append(f.read())
        return self

    def add_command(self, *cmd: str):
        """
        Add commands to run under every working directory

        :param cmd: Commands to run, can be multiple
        """
        self._command.extend(cmd)
        return self

    def make(self, path: str, concurrency=1, encoding='utf-8', mode='755'):
        """
        Make batch script files from the previous setup

        :param path: Path to save batch script files, use {i} to represent index
        :param concurrency: Number of scripts to generate; work dirs are split evenly among them
        :param encoding: Encoding of the generated script files
        :param mode: File mode (octal string) applied to the generated scripts, e.g. '755'
        """
        # inject pre-defined functions (e.g. `checkpoint`) into the header
        # NOTE(review): this mutates self._script_header, so calling make()
        # twice injects functions.sh twice — confirm whether that matters.
        self.add_header_file(get_asset('functions.sh'))

        header = '\n'.join(self._script_header)
        bottom = '\n'.join(self._script_bottom)
        for i, work_dirs in enumerate(split_list(self._work_dirs, concurrency)):
            body = []
            # one shell-quoted directory per line, consumed by the bash array below
            work_dirs_arr = "\n".join(shlex.quote(w) for w in work_dirs)
            body.extend([
                '[ -n "$PBS_O_WORKDIR" ] && cd $PBS_O_WORKDIR # fix PBS',
                f'work_dirs=({work_dirs_arr})',
                '',
                'for work_dir in "${work_dirs[@]}"; do',
                'pushd $work_dir',
                *self._command,
                'popd',
                'done'
            ])
            script = '\n'.join([header, *body, bottom])
            out_path = path.format(i=i)
            ensure_dir(out_path)
            with open(out_path, 'w', encoding=encoding) as f:
                f.write(script)
            os.chmod(out_path, mode_translate(str(mode)))
|
oh_my_batch/cli.py
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
import fire
|
2
|
+
|
3
|
+
class JobCli:
    """Sub-command group exposing job-scheduler managers (``omb job ...``)."""

    def slurm(self):
        """Manage jobs through the Slurm scheduler.

        Returns the ``Slurm`` class so python-fire can expose its methods
        (e.g. ``submit``) as further sub-commands.
        """
        from .job import Slurm
        return Slurm
|
8
|
+
|
9
|
+
|
10
|
+
class OhMyBatch:
    """Root of the ``omb`` command line interface.

    Each method returns a class or object for python-fire to expose as a
    sub-command; imports are deferred so unused sub-commands cost nothing.
    """

    def combo(self):
        """Generate folders/files from combinations of parameters."""
        from .combo import ComboMaker
        return ComboMaker

    def batch(self):
        """Generate batch scripts from multiple working directories."""
        from .batch import BatchMaker
        return BatchMaker

    def job(self):
        """Track the state of jobs in a job scheduler."""
        return JobCli()
|
22
|
+
|
23
|
+
|
24
|
+
def main():
    """Console-script entry point: expose the OhMyBatch CLI via python-fire."""
    fire.Fire(OhMyBatch)
|
oh_my_batch/combo.py
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
from itertools import product
|
2
|
+
from string import Template
|
3
|
+
import random
|
4
|
+
import os
|
5
|
+
|
6
|
+
from .util import expand_globs, mode_translate
|
7
|
+
|
8
|
+
class ComboMaker:
    """
    Build combinations of variables and render them into files.

    Variables added without ``broadcast`` are combined with a Cartesian
    product; broadcast variables are then assigned one value per combo
    (cycling if shorter than the number of combos).
    """

    def __init__(self, seed=None):
        """
        ComboMaker constructor

        :param seed: Seed for random number generator
        """
        self._product_vars = {}
        self._broadcast_vars = {}
        if seed is not None:
            random.seed(seed)
        self._combos = []

    def add_seq(self, key: str, start: int, stop: int, step: int = 1, broadcast=False):
        """
        Add a variable with a sequence of integer values.

        Note: like ``range``, the ``stop`` value itself is excluded.

        :param key: Variable name
        :param start: Start value (inclusive)
        :param stop: Stop value (exclusive)
        :param step: Step
        :param broadcast: If True, values are broadcasted, otherwise they are producted when making combos
        """
        args = list(range(start, stop, step))
        self.add_var(key, *args, broadcast=broadcast)
        return self

    def add_randint(self, key: str, n: int, a: int, b: int, broadcast=False, seed=None):
        """
        Add a variable with random integer values

        :param key: Variable name
        :param n: Number of values
        :param a: Lower bound (inclusive)
        :param b: Upper bound (inclusive)
        :param broadcast: If True, values are broadcasted, otherwise they are producted when making combos
        :param seed: Seed for random number generator
        """
        if seed is not None:
            random.seed(seed)
        args = [random.randint(a, b) for _ in range(n)]
        self.add_var(key, *args, broadcast=broadcast)
        return self

    def add_rand(self, key: str, n: int, a: float, b: float, broadcast=False, seed=None):
        """
        Add a variable with random float values

        :param key: Variable name
        :param n: Number of values
        :param a: Lower bound
        :param b: Upper bound
        :param broadcast: If True, values are broadcasted, otherwise they are producted when making combos
        :param seed: Seed for random number generator
        """
        if seed is not None:
            random.seed(seed)
        args = [random.uniform(a, b) for _ in range(n)]
        self.add_var(key, *args, broadcast=broadcast)
        return self

    def add_files(self, key: str, *path: str, broadcast=False, abs=False):
        """
        Add a variable with files by glob pattern

        For example, suppose there are 3 files named 1.txt, 2.txt, 3.txt in data directory,
        then calling add_files('DATA_FILE', 'data/*.txt') will add list ["data/1.txt", "data/2.txt", "data/3.txt"]
        to the variable DATA_FILE.

        :param key: Variable name
        :param path: Path to files, can include glob pattern
        :param broadcast: If True, values are broadcasted, otherwise they are producted when making combos
        :param abs: If True, path will be turned into absolute path
        """
        args = expand_globs(path, raise_invalid=True)
        if not args:
            raise ValueError(f"No files found for {path}")
        if abs:
            args = [os.path.abspath(p) for p in args]
        self.add_var(key, *args, broadcast=broadcast)
        return self

    def add_files_as_one(self, key: str, path: str, broadcast=False, sep=' ', abs=False):
        """
        Add a variable with files by glob pattern as one string

        Unlike add_files, this function joins the files with a delimiter.
        For example, suppose there are 1.txt, 2.txt, 3.txt in data directory,
        then calling add_files_as_one('DATA_FILE', 'data/*.txt') will add string "data/1.txt data/2.txt data/3.txt"
        to the variable DATA_FILE.

        :param key: Variable name
        :param path: Path to files, can include glob pattern
        :param broadcast: If True, values are broadcasted, otherwise they are producted when making combos
        :param sep: Separator to join files
        :param abs: If True, path will be turned into absolute path
        """
        # FIX: a bare string used to be passed straight to expand_globs, which
        # iterates its argument and therefore globbed the path one CHARACTER
        # at a time. Wrap a single pattern in a list.
        patterns = [path] if isinstance(path, str) else list(path)
        args = expand_globs(patterns, raise_invalid=True)
        if not args:
            raise ValueError(f"No files found for {path}")
        if abs:
            args = [os.path.abspath(p) for p in args]
        self.add_var(key, sep.join(args), broadcast=broadcast)
        return self

    def add_var(self, key: str, *args, broadcast=False):
        """
        Add a variable with values

        :param key: Variable name
        :param args: Values
        :param broadcast: If True, values are broadcasted, otherwise they are producted when making combos
        """
        # 'i' is injected by make_files as the combo index
        if key == 'i':
            raise ValueError("Variable name 'i' is reserved")

        # a key must be exclusively product or broadcast, never both
        if broadcast:
            if key in self._product_vars:
                raise ValueError(f"Variable {key} already defined as product variable")
            self._broadcast_vars.setdefault(key, []).extend(args)
        else:
            if key in self._broadcast_vars:
                raise ValueError(f"Variable {key} already defined as broadcast variable")
            self._product_vars.setdefault(key, []).extend(args)
        return self

    def shuffle(self, *keys: str, seed=None):
        """
        Shuffle variables

        :param keys: Variable names to shuffle
        :param seed: Seed for random number generator
        """
        if seed is not None:
            random.seed(seed)

        for key in keys:
            if key in self._product_vars:
                random.shuffle(self._product_vars[key])
            elif key in self._broadcast_vars:
                random.shuffle(self._broadcast_vars[key])
            else:
                raise ValueError(f"Variable {key} not found")
        return self

    def make_files(self, template: str, dest: str, delimiter='$', mode=None):
        """
        Make files from template

        The template file can include variables with delimiter.
        For example, if delimiter is '$', then the template file can include $var1, $var2, ...

        The destination can also include variables in string format style.
        For example, if dest is 'output/{i}.txt', then files are saved as output/0.txt, output/1.txt, ...

        :param template: Path to template file
        :param dest: Path pattern to destination file
        :param delimiter: Delimiter for variables in template, default is '$',
            can be changed to other character, e.g $$, @, ...
        :param mode: Optional file mode (octal string, e.g. '755') applied to each generated file
        """
        _delimiter = delimiter

        class _Template(Template):
            delimiter = _delimiter

        combos = self._make_combos()
        # the template is identical for every combo, so read it once
        with open(template, 'r') as f:
            template_text = f.read()
        for i, combo in enumerate(combos):
            text = _Template(template_text).safe_substitute(combo)
            _dest = dest.format(i=i, **combo)
            dest_dir = os.path.dirname(_dest)
            # guard: os.makedirs('') raises when dest is a bare filename
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)
            with open(_dest, 'w') as f:
                f.write(text)
            if mode is not None:
                os.chmod(_dest, mode_translate(str(mode)))
        return self

    def done(self):
        """
        End of command chain
        """
        pass

    def _make_combos(self):
        """Materialize pending vars into combos and append to self._combos."""
        if not self._product_vars and not self._broadcast_vars:
            return self._combos
        keys = self._product_vars.keys()
        values_list = product(*self._product_vars.values())
        combos = [dict(zip(keys, values)) for values in values_list]
        # broadcast vars are assigned one value per combo, cycling as needed
        for i, combo in enumerate(combos):
            for k, v in self._broadcast_vars.items():
                combo[k] = v[i % len(v)]
        self._combos.extend(combos)
        # pending vars are consumed; later add_* calls start a new batch
        self._product_vars = {}
        self._broadcast_vars = {}
        return self._combos
|
oh_my_batch/job.py
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
from typing import List
|
2
|
+
from enum import Enum
|
3
|
+
|
4
|
+
import logging
|
5
|
+
import json
|
6
|
+
import time
|
7
|
+
import os
|
8
|
+
import re
|
9
|
+
|
10
|
+
from .util import expand_globs, shell_run, parse_csv
|
11
|
+
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class JobState(bytes, Enum):
    """
    Job state enumeration

    Each member carries an integer value plus two extra attributes:
    ``terminal`` (no further state transitions expected from the scheduler)
    and ``status_name`` (human-readable name). The ``bytes`` mix-in with a
    custom ``__new__`` is the stdlib pattern (cf. ``http.HTTPStatus``) for
    attaching extra data to enum members.
    """
    def __new__(cls, value: int, terminal: bool, status_name: str) -> "JobState":
        obj = bytes.__new__(cls, [value])
        obj._value_ = value  # the int used for Enum value lookup
        obj.terminal = terminal
        obj.status_name = status_name
        return obj

    # annotations for type checkers; actual values are set in __new__
    value: int  # type: ignore
    terminal: bool
    status_name: str

    NULL = (0, True, "NULL")            # created but never submitted
    PENDING = (1, False, "PENDING")
    RUNNING = (2, False, "RUNNING")
    CANCELLED = (3, True, "CANCELLED")
    COMPLETED = (4, True, "COMPLETED")  # the only successful terminal state
    FAILED = (5, True, "FAILED")
    UNKNOWN = (6, False, "UNKNOWN")     # unrecognized scheduler state
|
38
|
+
|
39
|
+
|
40
|
+
def new_job(script: str):
    """
    Create a fresh tracking record for a batch script.

    :param script: path of the script file to be submitted
    :return: dict with empty id, NULL state and zero tries
    """
    return dict(id='', script=script, state=JobState.NULL, tries=0)
|
44
|
+
|
45
|
+
|
46
|
+
class BaseJobManager:
    """
    Template-method base for scheduler-specific job managers.

    ``submit`` drives a submit/track loop with optional recovery-file
    persistence; subclasses implement ``_update_jobs`` to reconcile job
    states with the scheduler and (re)submit jobs as needed.
    """

    def submit(self, *script: str, recovery: str = '', wait=False,
               timeout=None, opts='', max_tries=1, interval=10):
        """
        Submit scripts

        :param script: Script files to submit, can be glob patterns.
        :param recovery: Recovery file to store the state of the submitted scripts
        :param wait: If True, wait for the job to finish
        :param timeout: Timeout in seconds for waiting
        :param opts: Additional options for submit command
        :param max_tries: Maximum number of tries for each job
        :param interval: Interval in seconds for checking job status
        """
        jobs = []
        # resume from a previous interrupted run if a recovery file exists
        if recovery and os.path.exists(recovery):
            with open(recovery, 'r', encoding='utf-8') as f:
                jobs = json.load(f)

        recover_scripts = set(j['script'] for j in jobs)
        logger.info('Scripts in recovery files: %s', recover_scripts)

        scripts = set(os.path.normpath(s) for s in expand_globs(script))
        logger.info('Scripts to submit: %s', scripts)

        # a recovery file is only valid for the exact same set of scripts
        if recovery and recover_scripts != scripts:
            raise ValueError('Scripts to submit are different from scripts in recovery file')

        # track only scripts not already present in the recovery data
        for script_file in scripts:
            if script_file not in recover_scripts:
                jobs.append(new_job(script_file))

        current = time.time()
        while True:
            self._update_jobs(jobs, max_tries, opts)
            if recovery:
                # NOTE(review): job['state'] holds JobState members here;
                # confirm they round-trip through JSON as intended — the
                # reloaded recovery data will not contain JobState instances.
                with open(recovery, 'w', encoding='utf-8') as f:
                    json.dump(jobs, f, indent=2)

            if not wait:
                break

            # stop if all jobs are terminal and not job to be submitted
            if (all(j['state'].terminal for j in jobs) and
                    not any(should_submit(j, max_tries) for j in jobs)):
                break

            if timeout and time.time() - current > timeout:
                logger.error('Timeout, current state: %s', jobs)
                break

            time.sleep(interval)

    def _update_jobs(self, jobs: List[dict], max_tries: int, submit_opts: str):
        # Subclasses: refresh each job's state from the scheduler and
        # (re)submit those for which should_submit() is true.
        raise NotImplementedError
|
102
|
+
|
103
|
+
|
104
|
+
class Slurm(BaseJobManager):
    """
    Job manager for the Slurm scheduler, built on ``sbatch``/``sacct``.
    """

    def __init__(self, sbatch='sbatch', sacct='sacct'):
        """
        :param sbatch: sbatch executable used for submission
        :param sacct: sacct executable used for status queries
        """
        self._sbatch_bin = sbatch
        self._sacct_bin = sacct

    def _update_jobs(self, jobs: List[dict], max_tries: int, submit_opts: str):
        # query status only for jobs that have already been submitted
        job_ids = ','.join(j['id'] for j in jobs if j['id'])
        new_state = []
        if job_ids:
            # FIX: previously sacct was invoked even when no job had an id
            # yet, producing an empty `-j` argument and a malformed query on
            # the very first call.
            query_cmd = f'{self._sacct_bin} -X -P -j {job_ids} --format=JobID,JobName,State'

            user = os.environ.get('USER')
            if user:
                query_cmd += f' -u {user}'

            cp = shell_run(query_cmd)
            if cp.returncode != 0:
                logger.error('Failed to query job status: %s', cp.stderr.decode('utf-8'))
                return jobs
            logger.info('Job status: %s', cp.stdout.decode('utf-8'))
            new_state = parse_csv(cp.stdout.decode('utf-8'))

        # reconcile tracked jobs with the sacct output
        for job in jobs:
            if not job['id']:
                # FIX: not submitted yet — previously such jobs fell through
                # to the FAILED branch below and logged spurious errors.
                continue
            for row in new_state:
                if job['id'] == row['JobID']:
                    job['state'] = self._map_state(row['State'])
                    if job['state'] == JobState.UNKNOWN:
                        logger.warning('Unknown job %s state: %s', row['JobID'], row['State'])
                    break
            else:
                # a submitted job missing from sacct output is treated as failed
                job['state'] = JobState.FAILED
                logger.error('Job %s not found in sacct output', job['id'])

        # check if there are jobs to be (re)submitted
        for job in jobs:
            if should_submit(job, max_tries):
                job['tries'] += 1
                job['id'] = ''
                job['state'] = JobState.NULL
                submit_cmd = f'{self._sbatch_bin} {submit_opts} {job["script"]}'
                cp = shell_run(submit_cmd)
                if cp.returncode != 0:
                    job['state'] = JobState.FAILED
                    logger.error('Failed to submit job: %s', cp.stderr.decode('utf-8'))
                else:
                    job['id'] = self._parse_job_id(cp.stdout.decode('utf-8'))
                    assert job['id'], 'Failed to parse job id'
                    job['state'] = JobState.PENDING
                    logger.info('Job %s submitted', job['id'])

    def _map_state(self, state: str):
        """Map a raw sacct State string onto a JobState member."""
        # sacct reports cancellations as e.g. "CANCELLED by <uid>"
        if state.startswith('CANCELLED'):
            return JobState.CANCELLED
        return {
            'PENDING': JobState.PENDING,
            'RUNNING': JobState.RUNNING,
            'COMPLETED': JobState.COMPLETED,
            'FAILED': JobState.FAILED,
            'OUT_OF_MEMORY': JobState.FAILED,
            'TIMEOUT': JobState.FAILED,
        }.get(state, JobState.UNKNOWN)

    def _parse_job_id(self, output: str):
        """
        Parse job id from sbatch output

        :return: the first run of digits in the output, or '' if none
        """
        m = re.search(r'\d+', output)
        return m.group(0) if m else ''
|
171
|
+
|
172
|
+
|
173
|
+
def should_submit(job: dict, max_tries: int):
    """
    Decide whether a job should be (re)submitted.

    A job qualifies only when its state is terminal but not COMPLETED, and
    it still has tries left.
    """
    state: JobState = job['state']
    if state.terminal and job['tries'] < max_tries:
        return state != JobState.COMPLETED
    return False
|
oh_my_batch/util.py
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
from typing import List, Iterable
|
2
|
+
import subprocess as sp
|
3
|
+
import logging
|
4
|
+
import glob
|
5
|
+
import csv
|
6
|
+
import os
|
7
|
+
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
def expand_globs(patterns: Iterable[str], raise_invalid=False) -> List[str]:
    """
    Expand glob patterns in paths

    :param patterns: list of paths or glob patterns
    :param raise_invalid: if True, will raise error if no file found for a glob pattern
    :return: list of expanded paths, first-seen order, duplicates dropped
    :raises FileNotFoundError: when raise_invalid is True and a pattern matches nothing
    """
    paths = []
    for pattern in patterns:
        # FIX: only '*' used to trigger globbing, so '?' and '[...]' patterns
        # were silently treated as literal paths. Recognize all glob
        # metacharacters supported by glob.glob.
        if any(ch in pattern for ch in '*?['):
            result = glob.glob(pattern, recursive=True)
        else:
            result = [pattern]
        if raise_invalid and len(result) == 0:
            raise FileNotFoundError(f'No file found for {pattern}')
        for p in result:
            if p not in paths:
                paths.append(p)
            else:
                logger.warning('path %s already exists in the list', p)
    return paths
|
31
|
+
|
32
|
+
|
33
|
+
def split_list(l, n):
    """
    Split a list into at most n consecutive sub-lists of near-equal size.

    The first ``len(l) % n`` sub-lists receive one extra element; empty
    sub-lists are never yielded.

    :param l: The list to be split.
    :param n: The number of sub-lists to create.
    :return: A generator of sub-lists.
    """
    if n <= 0:
        raise ValueError("Number of sub-lists must be a positive integer")

    base, extra = divmod(len(l), n)
    offset = 0
    for idx in range(n):
        size = base + (1 if idx < extra else 0)
        if size == 0:
            # remaining sub-lists would all be empty
            break
        yield l[offset:offset + size]
        offset += size
|
53
|
+
|
54
|
+
|
55
|
+
def ensure_dir(path: str):
    """
    Ensure the parent directory of path exists.

    :param path: Path to directory or file.
    """
    parent = os.path.dirname(path)
    # FIX: os.makedirs('') raises FileNotFoundError; a bare filename has no
    # parent directory to create, so skip it.
    if parent:
        os.makedirs(parent, exist_ok=True)
|
62
|
+
|
63
|
+
|
64
|
+
def mode_translate(mode: str):
    """
    Parse a permission string written in octal, e.g. '755' -> 0o755.

    :param mode: permission digits as a string, e.g. '777', '755'
    :return: the equivalent integer file mode
    """
    return int(mode, base=8)
|
70
|
+
|
71
|
+
|
72
|
+
def shell_run(cmd: str):
    """
    Run a shell command, capturing stdout and stderr.

    :param cmd: Command to run
    :return: CompletedProcess with stdout/stderr captured as bytes
    """
    # capture_output=True is shorthand for stdout=PIPE, stderr=PIPE
    return sp.run(cmd, shell=True, capture_output=True)
|
79
|
+
|
80
|
+
|
81
|
+
def parse_csv(text: str, delimiter="|"):
    """
    Parse delimiter-separated text into a list of dictionaries.

    The first line is treated as the header row.

    :param text: the raw delimited text
    :param delimiter: field separator, defaults to '|'
    :return: one dict per data row, keyed by header names
    """
    rows = csv.DictReader(text.splitlines(), delimiter=delimiter)
    return [dict(row) for row in rows]
|
@@ -0,0 +1,130 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: oh-my-batch
|
3
|
+
Version: 0.1.0.dev0
|
4
|
+
Summary:
|
5
|
+
License: GPL
|
6
|
+
Author: weihong.xu
|
7
|
+
Author-email: xuweihong.cn@gmail.com
|
8
|
+
Requires-Python: >=3.8,<4.0
|
9
|
+
Classifier: License :: Other/Proprietary License
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Requires-Dist: fire (>=0.7.0,<0.8.0)
|
17
|
+
Description-Content-Type: text/markdown
|
18
|
+
|
19
|
+
# oh-my-batch
|
20
|
+
A simple tool to manipulate batch tasks.
|
21
|
+
|
22
|
+
The goal of this tool is to provide a handy command tool for
|
23
|
+
* `omb combo`: generate folders/files from different combinations of parameters
|
24
|
+
* `omb batch`: generate batch scripts from multiple working directories
|
25
|
+
* `omb job`: track the state of jobs in a job scheduler
|
26
|
+
|
27
|
+
## Install
|
28
|
+
```bash
|
29
|
+
pip install oh-my-batch
|
30
|
+
```
|
31
|
+
|
32
|
+
## Use cases
|
33
|
+
|
34
|
+
### Generate files from different combinations of parameters
|
35
|
+
|
36
|
+
It's common to generate files with different combinations of parameters in scientific computing.
|
37
|
+
For example, you have 3 LAMMPS data files in `tmp` directory: `tmp/1.data`, `tmp/2.data`, `tmp/3.data`.
|
38
|
+
And you want to generate a series of input files with different parameters,
|
39
|
+
for example, different temperatures 300K, 400K, 500K, against each data file.
|
40
|
+
|
41
|
+
In this case, you can use `omb combo` command to generate a series of input files for you.
|
42
|
+
|
43
|
+
```bash
|
44
|
+
#! /bin/bash
|
45
|
+
# prepare fake data files
|
46
|
+
mkdir -p tmp/
|
47
|
+
touch tmp/1.data tmp/2.data tmp/3.data
|
48
|
+
|
49
|
+
# prepare a lammps input file template
|
50
|
+
cat > tmp/in.lmp.tmp <<EOF
|
51
|
+
read_data $DATA_FILE
|
52
|
+
velocity all create $TEMP $RANDOM
|
53
|
+
run 1000
|
54
|
+
EOF
|
55
|
+
|
56
|
+
# prepare a run script template
|
57
|
+
cat > tmp/run.sh.tmp <<EOF
|
58
|
+
cat in.lmp # simulate running lammps
|
59
|
+
EOF
|
60
|
+
|
61
|
+
# generate input files
|
62
|
+
omb combo \
|
63
|
+
add_files DATA_FILE tmp/*.data - \
|
64
|
+
add_var TEMP 300 400 500 - \
|
65
|
+
add_randint RANDOM -n 3 -a 1 -b 1000 --broadcast - \
|
66
|
+
make_files tmp/in.lmp.tmp tmp/tasks/{i}-T-{TEMP}/in.lmp - \
|
67
|
+
make_files tmp/run.sh.tmp tmp/tasks/{i}-T-{TEMP}/run.sh --mode 755 - \
|
68
|
+
done
|
69
|
+
```
|
70
|
+
|
71
|
+
The above script will generate 9 folders in `tmp/tasks` directory
|
72
|
+
with names from `0-T-300`, `1-T-400`, `2-T-500`, `3-T-300` to `8-T-500`.
|
73
|
+
Each folder will contain a `in.lmp` file and a `run.sh` file.
|
74
|
+
|
75
|
+
The 9 folders are the combinations of 3 data files and 3 temperatures,
|
76
|
+
and each input file will have an independent random number between 1 and 1000 as `RANDOM`.
|
77
|
+
|
78
|
+
You can run the above script by `./examples/omb-combo.sh`,
|
79
|
+
and you can also run `omb combo --help` to see the detailed usage of `combo` command.
|
80
|
+
|
81
|
+
### Generate batch scripts from multiple working directories
|
82
|
+
It's common to submit a lot of jobs to a job scheduler. `omb batch` is designed to help you generate batch scripts from multiple working directories and package them into several batch scripts.
|
83
|
+
|
84
|
+
Let's continue the above example, now you have 9 folders in `tmp/tasks` directory.
|
85
|
+
You want to package them into 2 batch scripts to submit to a job scheduler.
|
86
|
+
|
87
|
+
You can use `omb batch` to generate batch scripts for you like this:
|
88
|
+
|
89
|
+
```bash
|
90
|
+
#! /bin/bash
|
91
|
+
cat > tmp/lammps_header.sh <<EOF
|
92
|
+
#!/bin/bash
|
93
|
+
#SBATCH -J lmp
|
94
|
+
#SBATCH -n 1
|
95
|
+
#SBATCH -t 1:00:00
|
96
|
+
EOF
|
97
|
+
|
98
|
+
omb batch \
|
99
|
+
add_work_dir tmp/tasks/* - \
|
100
|
+
add_header_file tmp/lammps_header.sh - \
|
101
|
+
add_command "checkpoint lmp.done ./run.sh" - \
|
102
|
+
make tmp/lmp-{i}.slurm --concurrency 2
|
103
|
+
```
|
104
|
+
|
105
|
+
You will find batch scripts `tmp/lmp-0.slurm` and `tmp/lmp-1.slurm` in `tmp` directory.
|
106
|
+
|
107
|
+
`omb batch` will provide some useful functions in the batch script.
|
108
|
+
For example, `checkpoint` will check if the job is done and skip the job if it's done.
|
109
|
+
|
110
|
+
You can run the above script by `./examples/omb-batch.sh`,
|
111
|
+
|
112
|
+
### Track the state of jobs in a job scheduler
|
113
|
+
|
114
|
+
Let's continue the above example, now you have submitted the batch scripts to the job scheduler.
|
115
|
+
|
116
|
+
You can use `omb job` to track the state of the jobs.
|
117
|
+
|
118
|
+
```bash
|
119
|
+
|
120
|
+
omb job slurm \
|
121
|
+
submit tmp/*.slurm --max_tries 3 --wait --recovery lammps.recovery
|
122
|
+
```
|
123
|
+
|
124
|
+
The above command will submit the batch scripts to the job scheduler,
|
125
|
+
and wait for the jobs to finish. If the job fails, it will retry for at most 3 times.
|
126
|
+
|
127
|
+
The `--recovery` option will save the job information to `lammps.recovery` file,
|
128
|
+
if `omb job` is interrupted, you can run the exact same command to recover the job status,
|
129
|
+
so that you don't need to resubmit the jobs that are already submitted.
|
130
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
oh_my_batch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
oh_my_batch/__main__.py,sha256=sWyFZMwWNvhkanwZSJRGfBBDoIevhC028dTSB67i6yI,61
|
3
|
+
oh_my_batch/assets/__init__.py,sha256=Exub46UbQaz2V2eXpQeiVfnThQpXaNeuyjlGY6gBSZc,130
|
4
|
+
oh_my_batch/assets/functions.sh,sha256=eORxFefV-XrWbG-2I6u-c8uf1XxOQ31LaeVHBumwzJ4,708
|
5
|
+
oh_my_batch/batch.py,sha256=e73N-xwxMvgxnWwFMp33PQD1Dy-T-ATjANlwtPRHPQM,3016
|
6
|
+
oh_my_batch/cli.py,sha256=G_JxqX0Zbx_EbcDxXbYjJ_4O-EOhmkF1lcMWgQ5ZPqo,375
|
7
|
+
oh_my_batch/combo.py,sha256=AHFD5CLoczqtjcfl2Rb4A2ucoQU40-cWtDOYjtP-yY4,7680
|
8
|
+
oh_my_batch/job.py,sha256=kup6Kwr3HFeCWAYJzJ1BET81_Dvbz1HxuHfmMPOpCnU,6080
|
9
|
+
oh_my_batch/util.py,sha256=H8B4zVNH5xRp-NG_uypgvtmz2YSpXy_6LK5ROv6SYrc,2116
|
10
|
+
oh_my_batch-0.1.0.dev0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
11
|
+
oh_my_batch-0.1.0.dev0.dist-info/METADATA,sha256=nUAgfApBgfXFhHD9-VWXAZsyGF1iJcO8bxYGMZKcGLI,4453
|
12
|
+
oh_my_batch-0.1.0.dev0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
13
|
+
oh_my_batch-0.1.0.dev0.dist-info/entry_points.txt,sha256=ZY2GutSoNjjSyJ4qO2pTeseKUFgoTYdvmgkuZZkwi68,77
|
14
|
+
oh_my_batch-0.1.0.dev0.dist-info/RECORD,,
|
@@ -1,20 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: oh-my-batch
|
3
|
-
Version: 0.0.1.dev0
|
4
|
-
Summary:
|
5
|
-
License: GPL
|
6
|
-
Author: weihong.xu
|
7
|
-
Author-email: xuweihong.cn@gmail.com
|
8
|
-
Requires-Python: >=3.8,<4.0
|
9
|
-
Classifier: License :: Other/Proprietary License
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
11
|
-
Classifier: Programming Language :: Python :: 3.8
|
12
|
-
Classifier: Programming Language :: Python :: 3.9
|
13
|
-
Classifier: Programming Language :: Python :: 3.10
|
14
|
-
Classifier: Programming Language :: Python :: 3.11
|
15
|
-
Classifier: Programming Language :: Python :: 3.12
|
16
|
-
Description-Content-Type: text/markdown
|
17
|
-
|
18
|
-
# oh-my-batch
|
19
|
-
A simple tool to manipulate batch tasks.
|
20
|
-
|
@@ -1,5 +0,0 @@
|
|
1
|
-
on_my_batch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
oh_my_batch-0.0.1.dev0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
3
|
-
oh_my_batch-0.0.1.dev0.dist-info/METADATA,sha256=j1kg17YPCOJs503Obz4RUb0vlaRLAZhM2BP68J5DcrA,614
|
4
|
-
oh_my_batch-0.0.1.dev0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
5
|
-
oh_my_batch-0.0.1.dev0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|