siliconcompiler 0.33.1__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- siliconcompiler/__init__.py +2 -0
- siliconcompiler/_metadata.py +1 -1
- siliconcompiler/apps/sc_issue.py +5 -3
- siliconcompiler/apps/sc_remote.py +0 -17
- siliconcompiler/apps/utils/replay.py +5 -5
- siliconcompiler/checklist.py +1 -1
- siliconcompiler/core.py +39 -48
- siliconcompiler/data/templates/replay/replay.sh.j2 +18 -1
- siliconcompiler/dependencyschema.py +392 -0
- siliconcompiler/design.py +664 -0
- siliconcompiler/flowgraph.py +32 -1
- siliconcompiler/metric.py +19 -0
- siliconcompiler/package/__init__.py +383 -223
- siliconcompiler/package/git.py +75 -77
- siliconcompiler/package/github.py +70 -97
- siliconcompiler/package/https.py +77 -93
- siliconcompiler/packageschema.py +260 -0
- siliconcompiler/pdk.py +2 -2
- siliconcompiler/record.py +57 -5
- siliconcompiler/remote/client.py +61 -13
- siliconcompiler/remote/server.py +109 -64
- siliconcompiler/report/dashboard/cli/board.py +1 -2
- siliconcompiler/scheduler/__init__.py +3 -1375
- siliconcompiler/scheduler/docker.py +268 -0
- siliconcompiler/scheduler/run_node.py +20 -19
- siliconcompiler/scheduler/scheduler.py +308 -0
- siliconcompiler/scheduler/schedulernode.py +934 -0
- siliconcompiler/scheduler/slurm.py +147 -163
- siliconcompiler/scheduler/taskscheduler.py +39 -52
- siliconcompiler/schema/__init__.py +3 -3
- siliconcompiler/schema/baseschema.py +256 -11
- siliconcompiler/schema/editableschema.py +4 -0
- siliconcompiler/schema/journal.py +210 -0
- siliconcompiler/schema/namedschema.py +31 -2
- siliconcompiler/schema/parameter.py +14 -1
- siliconcompiler/schema/parametervalue.py +1 -34
- siliconcompiler/schema/schema_cfg.py +211 -350
- siliconcompiler/tool.py +139 -37
- siliconcompiler/tools/_common/__init__.py +14 -11
- siliconcompiler/tools/builtin/concatenate.py +2 -2
- siliconcompiler/tools/builtin/verify.py +1 -2
- siliconcompiler/tools/openroad/scripts/common/procs.tcl +27 -25
- siliconcompiler/tools/slang/__init__.py +3 -2
- siliconcompiler/tools/vpr/route.py +69 -0
- siliconcompiler/tools/yosys/sc_synth_asic.tcl +0 -4
- siliconcompiler/toolscripts/_tools.json +13 -8
- siliconcompiler/toolscripts/ubuntu22/install-klayout.sh +4 -0
- siliconcompiler/toolscripts/ubuntu24/install-klayout.sh +4 -0
- siliconcompiler/utils/__init__.py +2 -23
- siliconcompiler/utils/flowgraph.py +5 -5
- siliconcompiler/utils/logging.py +2 -1
- {siliconcompiler-0.33.1.dist-info → siliconcompiler-0.34.0.dist-info}/METADATA +8 -6
- {siliconcompiler-0.33.1.dist-info → siliconcompiler-0.34.0.dist-info}/RECORD +57 -52
- {siliconcompiler-0.33.1.dist-info → siliconcompiler-0.34.0.dist-info}/WHEEL +1 -1
- siliconcompiler/scheduler/docker_runner.py +0 -254
- siliconcompiler/schema/journalingschema.py +0 -238
- {siliconcompiler-0.33.1.dist-info → siliconcompiler-0.34.0.dist-info}/entry_points.txt +0 -0
- {siliconcompiler-0.33.1.dist-info → siliconcompiler-0.34.0.dist-info}/licenses/LICENSE +0 -0
- {siliconcompiler-0.33.1.dist-info → siliconcompiler-0.34.0.dist-info}/top_level.txt +0 -0
siliconcompiler/scheduler/slurm.py

@@ -5,171 +5,155 @@ import stat
 import uuid
 import json
 import shutil
-from siliconcompiler import utils, SiliconCompilerError
-from siliconcompiler.package import get_cache_path
-from siliconcompiler.flowgraph import RuntimeFlowgraph
-
-# Full list of Slurm states, split into 'active' and 'inactive' categories.
-# Many of these do not apply to a minimal configuration, but we'll track them all.
-# https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
-SLURM_ACTIVE_STATES = [
-    'RUNNING',
-    'PENDING',
-    'CONFIGURING',
-    'COMPLETING',
-    'SIGNALING',
-    'STAGE_OUT',
-    'RESIZING',
-    'REQUEUED',
-]
-SLURM_INACTIVE_STATES = [
-    'BOOT_FAIL',
-    'CANCELLED',
-    'COMPLETED',
-    'DEADLINE',
-    'FAILED',
-    'NODE_FAIL',
-    'OUT_OF_MEMORY',
-    'PREEMPTED',
-    'RESV_DEL_HOLD',
-    'REQUEUE_FED',
-    'REQUEUE_HOLD',
-    'REVOKED',
-    'SPECIAL_EXIT',
-    'STOPPED',
-    'SUSPENDED',
-    'TIMEOUT',
-]
-
-
-###########################################################################
-def get_configuration_directory(chip):
-    '''
-    Helper function to get the configuration directory for the scheduler
-    '''
-
-    return f'{chip.getworkdir()}/configs'
-
-
-def init(chip):
-    if os.path.exists(chip._getcollectdir()):
-        # nothing to do
-        return
-
-    collect = False
-    flow = chip.get('option', 'flow')
-    entry_nodes = chip.schema.get("flowgraph", flow, field="schema").get_entry_nodes()
-
-    runtime = RuntimeFlowgraph(
-        chip.schema.get("flowgraph", flow, field='schema'),
-        from_steps=chip.get('option', 'from'),
-        to_steps=chip.get('option', 'to'),
-        prune_nodes=chip.get('option', 'prune'))
-
-    for (step, index) in runtime.get_nodes():
-        if (step, index) in entry_nodes:
-            collect = True
-
-    if collect:
-        chip.collect()
-
-
-###########################################################################
-def _defernode(chip, step, index, replay):
-    '''
-    Helper method to run an individual step on a slurm cluster.
-
-    Blocks until the compute node
-    finishes processing this step, and it sets the active/error bits.
-    '''
-
-    # Determine which HPC job scheduler being used.
-    scheduler_type = chip.get('option', 'scheduler', 'name', step=step, index=index)
-
-    if scheduler_type != 'slurm':
-        raise ValueError(f'{scheduler_type} is not a supported scheduler')
-
-    if not check_slurm():
-        raise SiliconCompilerError('slurm is not available or installed on this machine', chip=chip)
-
-    # Determine which cluster parititon to use. (Default value can be overridden on per-step basis)
-    partition = chip.get('option', 'scheduler', 'queue', step=step, index=index)
-    if not partition:
-        partition = _get_slurm_partition()
-
-    # Get the temporary UID associated with this job run.
-    job_hash = chip.get('record', 'remoteid')
-    if not job_hash:
-        # Generate a new uuid since it was not set
-        job_hash = uuid.uuid4().hex
-
-    job_name = f'{job_hash}_{step}{index}'
-
-    # Write out the current schema for the compute node to pick up.
-    cfg_dir = get_configuration_directory(chip)
-    cfg_file = f'{cfg_dir}/{step}{index}.json'
-    log_file = f'{cfg_dir}/{step}{index}.log'
-    script_file = f'{cfg_dir}/{step}{index}.sh'
-    os.makedirs(cfg_dir, exist_ok=True)
-
-    chip.set('option', 'scheduler', 'name', None, step=step, index=index)
-    chip.write_manifest(cfg_file)
-
-    # Allow user-defined compute node execution script if it already exists on the filesystem.
-    # Otherwise, create a minimal script to run the task using the SiliconCompiler CLI.
-    if not os.path.isfile(script_file):
-        with open(script_file, 'w') as sf:
-            sf.write(utils.get_file_template('slurm/run.sh').render(
-                cfg_file=shlex.quote(cfg_file),
-                build_dir=shlex.quote(chip.get("option", "builddir")),
-                step=shlex.quote(step),
-                index=shlex.quote(index),
-                cachedir=shlex.quote(get_cache_path(chip))
-            ))
-
-    # This is Python for: `chmod +x [script_path]`
-    os.chmod(script_file,
-             os.stat(script_file).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
-
-    schedule_cmd = ['srun',
-                    '--exclusive',
-                    '--partition', partition,
-                    '--chdir', chip.cwd,
-                    '--job-name', job_name,
-                    '--output', log_file]
-
-    # Only delay the starting time if the 'defer' Schema option is specified.
-    defer_time = chip.get('option', 'scheduler', 'defer', step=step, index=index)
-    if defer_time:
-        schedule_cmd.extend(['--begin', defer_time])
-
-    schedule_cmd.append(script_file)

-
-    # TODO: output should be fed to log, and stdout if quiet = False
-    step_result = subprocess.Popen(schedule_cmd,
-                                   stdout=subprocess.PIPE,
-                                   stderr=subprocess.STDOUT)
+import os.path

-
-
-
-
-
-
-def _get_slurm_partition():
-    partitions = subprocess.run(['sinfo', '--json'],
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.STDOUT)
-
-    if partitions.returncode != 0:
-        raise RuntimeError('Unable to determine partitions in slurm')
-
-    sinfo = json.loads(partitions.stdout.decode())
+from siliconcompiler import utils
+from siliconcompiler.package import RemoteResolver
+from siliconcompiler.flowgraph import RuntimeFlowgraph
+from siliconcompiler.scheduler.schedulernode import SchedulerNode

-    # Return the first listed partition
-    return sinfo['nodes'][0]['partitions'][0]

+class SlurmSchedulerNode(SchedulerNode):
+    def __init__(self, chip, step, index, replay=False):
+        super().__init__(chip, step, index, replay=replay)

-
-
+        # Get the temporary UID associated with this job run.
+        self.__job_hash = chip.get('record', 'remoteid')
+        if not self.__job_hash:
+            # Generate a new uuid since it was not set
+            self.__job_hash = uuid.uuid4().hex
+
+    @property
+    def jobhash(self):
+        return self.__job_hash
+
+    @staticmethod
+    def init(chip):
+        if os.path.exists(chip._getcollectdir()):
+            # nothing to do
+            return
+
+        collect = False
+        flow = chip.get('option', 'flow')
+        entry_nodes = chip.schema.get("flowgraph", flow, field="schema").get_entry_nodes()
+
+        runtime = RuntimeFlowgraph(
+            chip.schema.get("flowgraph", flow, field='schema'),
+            from_steps=chip.get('option', 'from'),
+            to_steps=chip.get('option', 'to'),
+            prune_nodes=chip.get('option', 'prune'))
+
+        for (step, index) in runtime.get_nodes():
+            if (step, index) in entry_nodes:
+                collect = True
+
+        if collect:
+            chip.collect()
+
+    @property
+    def is_local(self):
+        return False
+
+    @staticmethod
+    def get_configuration_directory(chip):
+        '''
+        Helper function to get the configuration directory for the scheduler
+        '''
+
+        return os.path.join(chip.getworkdir(), 'sc_configs')
+
+    @staticmethod
+    def get_job_name(jobhash, step, index):
+        return f'{jobhash}_{step}{index}'
+
+    @staticmethod
+    def get_runtime_file_name(jobhash, step, index, ext):
+        return f"{SlurmSchedulerNode.get_job_name(jobhash, step, index)}.{ext}"
+
+    @staticmethod
+    def get_slurm_partition():
+        partitions = subprocess.run(['sinfo', '--json'],
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.STDOUT)
+
+        if partitions.returncode != 0:
+            raise RuntimeError('Unable to determine partitions in slurm')
+
+        sinfo = json.loads(partitions.stdout.decode())
+
+        # Return the first listed partition
+        return sinfo['nodes'][0]['partitions'][0]
+
+    def run(self):
+        '''
+        Helper method to run an individual step on a slurm cluster.
+
+        Blocks until the compute node
+        finishes processing this step, and it sets the active/error bits.
+        '''
+
+        if shutil.which('sinfo') is None:
+            raise RuntimeError('slurm is not available or installed on this machine')
+
+        # Determine which cluster parititon to use.
+        partition = self.chip.get('option', 'scheduler', 'queue', step=self.step, index=self.index)
+        if not partition:
+            partition = SlurmSchedulerNode.get_slurm_partition()
+
+        # Write out the current schema for the compute node to pick up.
+        cfg_dir = SlurmSchedulerNode.get_configuration_directory(self.chip)
+        os.makedirs(cfg_dir, exist_ok=True)
+
+        cfg_file = os.path.join(cfg_dir, SlurmSchedulerNode.get_runtime_file_name(
+            self.__job_hash, self.step, self.index, "pkg.json"))
+        log_file = os.path.join(cfg_dir, SlurmSchedulerNode.get_runtime_file_name(
+            self.__job_hash, self.step, self.index, "log"))
+        script_file = os.path.join(cfg_dir, SlurmSchedulerNode.get_runtime_file_name(
+            self.__job_hash, self.step, self.index, "sh"))
+
+        # Remove scheduler as this is now a local run
+        self.chip.set('option', 'scheduler', 'name', None, step=self.step, index=self.index)
+        self.chip.write_manifest(cfg_file)
+
+        # Allow user-defined compute node execution script if it already exists on the filesystem.
+        # Otherwise, create a minimal script to run the task using the SiliconCompiler CLI.
+        if not os.path.isfile(script_file):
+            with open(script_file, 'w') as sf:
+                sf.write(utils.get_file_template('slurm/run.sh').render(
+                    cfg_file=shlex.quote(cfg_file),
+                    build_dir=shlex.quote(self.chip.get("option", "builddir")),
+                    step=shlex.quote(self.step),
+                    index=shlex.quote(self.index),
+                    cachedir=shlex.quote(str(RemoteResolver.determine_cache_dir(self.chip)))
+                ))
+
+        # This is Python for: `chmod +x [script_path]`
+        os.chmod(script_file,
+                 os.stat(script_file).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+
+        schedule_cmd = ['srun',
+                        '--exclusive',
+                        '--partition', partition,
+                        '--chdir', self.chip.cwd,
+                        '--job-name', SlurmSchedulerNode.get_job_name(self.__job_hash,
+                                                                      self.step, self.index),
+                        '--output', log_file]
+
+        # Only delay the starting time if the 'defer' Schema option is specified.
+        defer_time = self.chip.get('option', 'scheduler', 'defer', step=self.step, index=self.index)
+        if defer_time:
+            schedule_cmd.extend(['--begin', defer_time])
+
+        schedule_cmd.append(script_file)
+
+        # Run the 'srun' command, and track its output.
+        # TODO: output should be fed to log, and stdout if quiet = False
+        step_result = subprocess.Popen(schedule_cmd,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT)
+
+        # Wait for the subprocess call to complete. It should already be done,
+        # as it has closed its output stream. But if we don't call '.wait()',
+        # the '.returncode' value will not be set correctly.
+        step_result.wait()
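The hunk above replaces the module-level `init()`/`_defernode()` helpers with a `SlurmSchedulerNode` class derived from the new `SchedulerNode`. A minimal usage sketch, assuming a `Chip` whose flow and inputs are configured elsewhere; the design name and node below are illustrative, and only the constructor, `init()`, and `run()` signatures are taken from the diff:

```python
# Hypothetical sketch based on the hunk above; not the documented public API.
from siliconcompiler import Chip
from siliconcompiler.scheduler.slurm import SlurmSchedulerNode

chip = Chip('demo')                          # assumed: flow, tools and inputs set up elsewhere
SlurmSchedulerNode.init(chip)                # collects input files if an entry node will run
node = SlurmSchedulerNode(chip, 'syn', '0')  # (chip, step, index) per the diff
node.run()                                   # writes <jobhash>_syn0.{pkg.json,log,sh} under
                                             # <workdir>/sc_configs, then blocks on 'srun'
```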
siliconcompiler/scheduler/taskscheduler.py

@@ -11,11 +11,8 @@ from siliconcompiler import SiliconCompilerError
 from siliconcompiler import utils
 from siliconcompiler.flowgraph import RuntimeFlowgraph

-from siliconcompiler.schema import
+from siliconcompiler.schema import Journal

-from siliconcompiler.scheduler import slurm
-from siliconcompiler.scheduler import docker_runner
-from siliconcompiler.tools._common import get_tool_task
 from siliconcompiler.utils.logging import SCBlankLoggerFormatter


@@ -33,7 +30,7 @@ class TaskScheduler:
             raise ValueError(f"{hook} is not a valid callback")
         TaskScheduler.__callbacks[hook] = func

-    def __init__(self, chip):
+    def __init__(self, chip, tasks):
         self.__chip = chip
         self.__logger = self.__chip.logger
         self.__schema = self.__chip.schema
@@ -50,17 +47,21 @@ class TaskScheduler:
         # clip max parallel jobs to 1 <= jobs <= max_cores
         self.__max_parallel_run = max(1, min(self.__max_parallel_run, self.__max_cores))

+        self.__runtime_flow = RuntimeFlowgraph(
+            self.__flow,
+            from_steps=self.__chip.get('option', 'from'),
+            to_steps=self.__chip.get('option', 'to'),
+            prune_nodes=self.__chip.get('option', 'prune'))
+
         self.__log_queue = multiprocessing.Queue(-1)

         self.__nodes = {}
         self.__startTimes = {}
         self.__dwellTime = 0.1

-        self.__create_nodes()
-
-    def __create_nodes(self):
-        from siliconcompiler.scheduler import _executenode, _runtask
+        self.__create_nodes(tasks)

+    def __create_nodes(self, tasks):
         runtime = RuntimeFlowgraph(
             self.__flow,
             from_steps=set([step for step, _ in self.__flow.get_entry_nodes()]),
@@ -68,19 +69,11 @@ class TaskScheduler:

         init_funcs = set()

-
-            self.__flow,
-            from_steps=self.__chip.get('option', 'from'),
-            to_steps=self.__chip.get('option', 'to'),
-            prune_nodes=self.__chip.get('option', 'prune'))
-
-        for step, index in runtime_flow.get_nodes():
+        for step, index in self.__runtime_flow.get_nodes():
             if self.__record.get('status', step=step, index=index) != NodeStatus.PENDING:
                 continue

-
-            threads = self.__chip.get('tool', tool_name, 'task', task_name, 'threads',
-                                      step=step, index=index)
+            threads = tasks[(step, index)].threads
             if not threads:
                 threads = self.__max_threads
             threads = max(1, min(threads, self.__max_threads))
@@ -89,42 +82,21 @@ class TaskScheduler:
                 "name": f"{step}{index}",
                 "inputs": runtime.get_node_inputs(step, index, record=self.__record),
                 "proc": None,
-                "child_pipe": None,
                 "parent_pipe": None,
-                "local": False,
-                "tool": tool_name,
-                "task": task_name,
                 "threads": threads,
                 "running": False,
                 "manifest": os.path.join(self.__chip.getworkdir(step=step, index=index),
                                          'outputs',
-                                         f'{self.__chip.design}.pkg.json')
+                                         f'{self.__chip.design}.pkg.json'),
+                "node": tasks[(step, index)]
             }

-
-
-
-            if node_scheduler == 'slurm':
-                # Defer job to compute node
-                # If the job is configured to run on a cluster, collect the schema
-                # and send it to a compute node for deferred execution.
-                init_funcs.add(slurm.init)
-                exec_func = slurm._defernode
-            elif node_scheduler == 'docker':
-                # Run job in docker
-                init_funcs.add(docker_runner.init)
-                exec_func = docker_runner.run
-                task["local"] = True
-            else:
-                task["local"] = True
-
-            task["parent_pipe"], task["child_pipe"] = multiprocessing.Pipe()
-            task["proc"] = multiprocessing.Process(
-                target=_runtask,
-                args=(self.__chip, self.__flow.name(), step, index, exec_func),
-                kwargs={"pipe": task["child_pipe"],
-                        "queue": self.__log_queue})
+            task["parent_pipe"], pipe = multiprocessing.Pipe()
+            task["node"].set_queue(pipe, self.__log_queue)
+            task["node"].init_state()  # reinit access to remove holdover access

+            task["proc"] = multiprocessing.Process(target=task["node"].run)
+            init_funcs.add(task["node"].init)
             self.__nodes[(step, index)] = task

             # Call preprocessing for schedulers
@@ -217,13 +189,17 @@ class TaskScheduler:
             self.__logger.debug(f'{info["name"]} is complete merging: {manifest}')

             if os.path.exists(manifest):
-
+                Journal.replay_file(self.__schema, manifest)
+                # TODO: once tool is fixed this can go away
+                self.__schema.unset("arg", "step")
+                self.__schema.unset("arg", "index")

             if info["parent_pipe"] and info["parent_pipe"].poll(1):
                 try:
                     packages = info["parent_pipe"].recv()
                     if isinstance(packages, dict):
-
+                        for package, path in packages.items():
+                            self.__chip.get("package", field="schema")._set_cache(package, path)
                 except:  # noqa E722
                     pass

@@ -249,7 +225,7 @@ class TaskScheduler:
     def __allow_start(self, node):
         info = self.__nodes[node]

-        if not info["
+        if not info["node"].is_local:
             # using a different scheduler, so allow
             return True

@@ -286,7 +262,7 @@ class TaskScheduler:
                 if not NodeStatus.is_done(in_status):
                     ready = False
                     break
-                if NodeStatus.is_error(in_status) and info["
+                if NodeStatus.is_error(in_status) and not info["node"].is_builtin:
                     # Fail if any dependency failed for non-builtin task
                     self.__record.set("status", NodeStatus.ERROR, step=step, index=index)

@@ -295,7 +271,7 @@ class TaskScheduler:
                 any_success = any([status == NodeStatus.SUCCESS for status in inputs])
             else:
                 any_success = True
-            if ready and info["
+            if ready and info["node"].is_builtin and not any_success:
                 self.__record.set("status", NodeStatus.ERROR, step=step, index=index)

             if self.__record.get('status', step=step, index=index) == NodeStatus.ERROR:
@@ -318,3 +294,14 @@ class TaskScheduler:
             info["proc"].start()

         return changed
+
+    def check(self):
+        exit_steps = set([step for step, _ in self.__runtime_flow.get_exit_nodes()])
+        completed_steps = set([step for step, _ in
+                               self.__runtime_flow.get_completed_nodes(record=self.__record)])
+
+        unreached = set(exit_steps).difference(completed_steps)
+
+        if unreached:
+            raise RuntimeError(
+                f'These final steps could not be reached: {", ".join(sorted(unreached))}')
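The hunks above change the constructor to `TaskScheduler(chip, tasks)`, where `tasks` maps `(step, index)` tuples to `SchedulerNode` objects that now carry the threads, locality, and builtin flags the scheduler used to look up itself. A sketch of the implied calling convention, assuming a configured `Chip`; the node list is illustrative and the execution entry point is not shown in these hunks:

```python
# Hypothetical sketch of the new TaskScheduler contract implied by the hunks above.
from siliconcompiler import Chip
from siliconcompiler.scheduler.schedulernode import SchedulerNode
from siliconcompiler.scheduler.slurm import SlurmSchedulerNode
from siliconcompiler.scheduler.taskscheduler import TaskScheduler

chip = Chip('demo')                              # assumed: flow and options configured elsewhere
nodes_to_run = [('import', '0'), ('syn', '0')]   # illustrative node list

tasks = {}
for step, index in nodes_to_run:
    scheduler_name = chip.get('option', 'scheduler', 'name', step=step, index=index)
    node_cls = SlurmSchedulerNode if scheduler_name == 'slurm' else SchedulerNode
    tasks[(step, index)] = node_cls(chip, step, index)

scheduler = TaskScheduler(chip, tasks)   # new signature from the diff
# ... the scheduler launches each node's run() in its own process (see __create_nodes) ...
scheduler.check()                        # new in 0.34.0: raises RuntimeError if any exit
                                         # step of the runtime flowgraph was never reached
```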
siliconcompiler/schema/__init__.py

@@ -1,9 +1,9 @@
 from .parameter import Parameter, Scope, PerNode
+from .journal import Journal
 from .safeschema import SafeSchema
 from .editableschema import EditableSchema
 from .baseschema import BaseSchema
 from .cmdlineschema import CommandLineSchema
-from .journalingschema import JournalingSchema
 from .namedschema import NamedSchema
 from .packageschema import PackageSchema

@@ -15,10 +15,10 @@ __all__ = [
     "SafeSchema",
     "EditableSchema",
     "CommandLineSchema",
-    "JournalingSchema",
     "NamedSchema",
     "PackageSchema",
     "Parameter",
     "Scope",
-    "PerNode"
+    "PerNode",
+    "Journal"
 ]
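With `JournalingSchema` dropped from `siliconcompiler.schema`, journaled edits are replayed through the new `Journal` class instead. A minimal sketch of the merge step the task scheduler performs above, assuming `schema` is the live project schema and `manifest` is a finished node's `<design>.pkg.json`; only `Journal.replay_file()` and the `unset()` calls are taken from the diff:

```python
# Sketch of the manifest merge shown in the TaskScheduler hunk above.
from siliconcompiler.schema import Journal


def merge_node_manifest(schema, manifest):
    """Re-apply the journaled edits recorded in 'manifest' onto 'schema'."""
    Journal.replay_file(schema, manifest)
    # The scheduler also clears per-node arguments after the replay:
    schema.unset("arg", "step")
    schema.unset("arg", "index")
```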