earthkit-workflows 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cascade/benchmarks/__main__.py +58 -35
- cascade/executor/config.py +1 -0
- cascade/executor/executor.py +3 -0
- cascade/shm/server.py +1 -1
- earthkit/workflows/_version.py +1 -1
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.2.dist-info}/METADATA +1 -1
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.2.dist-info}/RECORD +10 -10
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.2.dist-info}/WHEEL +0 -0
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.2.dist-info}/top_level.txt +0 -0
cascade/benchmarks/__main__.py
CHANGED
|
@@ -140,20 +140,25 @@ def launch_executor(
|
|
|
140
140
|
logging.config.dictConfig(logging_config_filehandler(log_path))
|
|
141
141
|
else:
|
|
142
142
|
logging.config.dictConfig(logging_config)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
143
|
+
try:
|
|
144
|
+
logger.info(f"will set {gpu_count} gpus on host {i}")
|
|
145
|
+
os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
|
|
146
|
+
executor = Executor(
|
|
147
|
+
job_instance,
|
|
148
|
+
controller_address,
|
|
149
|
+
workers_per_host,
|
|
150
|
+
f"h{i}",
|
|
151
|
+
portBase,
|
|
152
|
+
shm_vol_gb,
|
|
153
|
+
log_base,
|
|
154
|
+
url_base,
|
|
155
|
+
)
|
|
156
|
+
executor.register()
|
|
157
|
+
executor.recv_loop()
|
|
158
|
+
except Exception:
|
|
159
|
+
# NOTE we log this to get the stacktrace into the logfile
|
|
160
|
+
logger.exception("executor failure")
|
|
161
|
+
raise
|
|
157
162
|
|
|
158
163
|
|
|
159
164
|
def run_locally(
|
|
@@ -169,31 +174,47 @@ def run_locally(
|
|
|
169
174
|
logging.config.dictConfig(logging_config_filehandler(log_path))
|
|
170
175
|
else:
|
|
171
176
|
logging.config.dictConfig(logging_config)
|
|
177
|
+
logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
|
|
172
178
|
launch = perf_counter_ns()
|
|
173
|
-
preschedule = precompute(job)
|
|
174
179
|
c = f"tcp://localhost:{portBase}"
|
|
175
180
|
m = f"tcp://localhost:{portBase+1}"
|
|
176
181
|
ps = []
|
|
177
|
-
for i, executor in enumerate(range(hosts)):
|
|
178
|
-
gpu_count = get_gpu_count(i, workers)
|
|
179
|
-
# NOTE forkserver/spawn seem to forget venv, we need fork
|
|
180
|
-
p = multiprocessing.get_context("fork").Process(
|
|
181
|
-
target=launch_executor,
|
|
182
|
-
args=(
|
|
183
|
-
job,
|
|
184
|
-
c,
|
|
185
|
-
workers,
|
|
186
|
-
portBase + 1 + i * 10,
|
|
187
|
-
i,
|
|
188
|
-
None,
|
|
189
|
-
gpu_count,
|
|
190
|
-
log_base,
|
|
191
|
-
"tcp://localhost",
|
|
192
|
-
),
|
|
193
|
-
)
|
|
194
|
-
p.start()
|
|
195
|
-
ps.append(p)
|
|
196
182
|
try:
|
|
183
|
+
# executors forking
|
|
184
|
+
for i, executor in enumerate(range(hosts)):
|
|
185
|
+
gpu_count = get_gpu_count(i, workers)
|
|
186
|
+
# NOTE forkserver/spawn seem to forget venv, we need fork
|
|
187
|
+
logger.debug(f"forking into executor on host {i}")
|
|
188
|
+
p = multiprocessing.get_context("fork").Process(
|
|
189
|
+
target=launch_executor,
|
|
190
|
+
args=(
|
|
191
|
+
job,
|
|
192
|
+
c,
|
|
193
|
+
workers,
|
|
194
|
+
portBase + 1 + i * 10,
|
|
195
|
+
i,
|
|
196
|
+
None,
|
|
197
|
+
gpu_count,
|
|
198
|
+
log_base,
|
|
199
|
+
"tcp://localhost",
|
|
200
|
+
),
|
|
201
|
+
)
|
|
202
|
+
p.start()
|
|
203
|
+
ps.append(p)
|
|
204
|
+
|
|
205
|
+
# compute preschedule
|
|
206
|
+
preschedule = precompute(job)
|
|
207
|
+
|
|
208
|
+
# check processes started healthy
|
|
209
|
+
for i, p in enumerate(ps):
|
|
210
|
+
if not p.is_alive():
|
|
211
|
+
# TODO ideally we would somehow connect this with the Register message
|
|
212
|
+
# consumption in the Controller -- but there we don't assume that
|
|
213
|
+
# executors are on the same physical host
|
|
214
|
+
raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
|
|
215
|
+
|
|
216
|
+
# start bridge itself
|
|
217
|
+
logger.debug("starting bridge")
|
|
197
218
|
b = Bridge(c, hosts)
|
|
198
219
|
start = perf_counter_ns()
|
|
199
220
|
run(job, b, preschedule, report_address=report_address)
|
|
@@ -201,7 +222,9 @@ def run_locally(
|
|
|
201
222
|
print(
|
|
202
223
|
f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
|
|
203
224
|
)
|
|
204
|
-
except:
|
|
225
|
+
except Exception:
|
|
226
|
+
# NOTE we log this to get the stacktrace into the logfile
|
|
227
|
+
logger.exception("controller failure, proceed with executor shutdown")
|
|
205
228
|
for p in ps:
|
|
206
229
|
if p.is_alive():
|
|
207
230
|
callback(m, ExecutorShutdown())
|
cascade/executor/config.py
CHANGED
|
@@ -21,6 +21,7 @@ logging_config = {
|
|
|
21
21
|
"forecastbox.worker": {"level": "DEBUG"},
|
|
22
22
|
"forecastbox.executor": {"level": "DEBUG"},
|
|
23
23
|
"cascade": {"level": "INFO"},
|
|
24
|
+
"cascade.benchmarks": {"level": "DEBUG"},
|
|
24
25
|
"cascade.low": {"level": "DEBUG"},
|
|
25
26
|
"cascade.shm": {"level": "DEBUG"},
|
|
26
27
|
"cascade.controller": {"level": "DEBUG"},
|
cascade/executor/executor.py
CHANGED
|
@@ -86,6 +86,7 @@ class Executor:
|
|
|
86
86
|
self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
|
|
87
87
|
|
|
88
88
|
self.terminating = False
|
|
89
|
+
logger.debug("register terminate function")
|
|
89
90
|
atexit.register(self.terminate)
|
|
90
91
|
# NOTE following inits are with potential side effects
|
|
91
92
|
self.mlistener = Listener(address_of(portBase))
|
|
@@ -99,6 +100,7 @@ class Executor:
|
|
|
99
100
|
shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
|
|
100
101
|
else:
|
|
101
102
|
shm_logging = logging_config
|
|
103
|
+
logger.debug("about to fork into shm process")
|
|
102
104
|
self.shm_process = ctx.Process(
|
|
103
105
|
target=shm_server,
|
|
104
106
|
args=(
|
|
@@ -114,6 +116,7 @@ class Executor:
|
|
|
114
116
|
dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
|
|
115
117
|
else:
|
|
116
118
|
dsr_logging = logging_config
|
|
119
|
+
logger.debug("about to fork into data server")
|
|
117
120
|
self.data_server = ctx.Process(
|
|
118
121
|
target=start_data_server,
|
|
119
122
|
args=(
|
cascade/shm/server.py
CHANGED
|
@@ -115,5 +115,5 @@ def entrypoint(
|
|
|
115
115
|
server.start()
|
|
116
116
|
except Exception as e:
|
|
117
117
|
# we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
|
|
118
|
-
logger.warning(f"shutdown issue: {e}")
|
|
118
|
+
logger.warning(f"shutdown issue: {repr(e)}")
|
|
119
119
|
server.atexit(0, None)
|
earthkit/workflows/_version.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
# Do not change! Do not track in version control!
|
|
2
|
-
__version__ = "0.4.1"
|
|
2
|
+
__version__ = "0.4.2"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: earthkit-workflows
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
|
|
5
5
|
Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
|
|
4
|
-
cascade/benchmarks/__main__.py,sha256=
|
|
4
|
+
cascade/benchmarks/__main__.py,sha256=g03xRzp58dXLHDj8kTPyPnbBOS5sRIAMTthFtFjDRbs,9876
|
|
5
5
|
cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
|
|
6
6
|
cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
|
|
7
7
|
cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
|
|
@@ -17,9 +17,9 @@ cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,
|
|
|
17
17
|
cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
|
|
18
18
|
cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
|
|
19
19
|
cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
|
|
20
|
-
cascade/executor/config.py,sha256=
|
|
20
|
+
cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
|
|
21
21
|
cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
|
|
22
|
-
cascade/executor/executor.py,sha256=
|
|
22
|
+
cascade/executor/executor.py,sha256=egPhfqhzYfeM77Hu10-mGHNVsQAdqmZOA7hmjFP1Q8M,13484
|
|
23
23
|
cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
|
|
24
24
|
cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
|
|
25
25
|
cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
|
|
@@ -53,9 +53,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
|
|
|
53
53
|
cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
|
|
54
54
|
cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
|
|
55
55
|
cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
|
|
56
|
-
cascade/shm/server.py,sha256=
|
|
56
|
+
cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
|
|
57
57
|
earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
|
|
58
|
-
earthkit/workflows/_version.py,sha256=
|
|
58
|
+
earthkit/workflows/_version.py,sha256=nkd71CReR3pz5TZ9pcVgB2cP1MDj4YK6VH9UGJYzXDM,72
|
|
59
59
|
earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
|
|
60
60
|
earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
|
|
61
61
|
earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
|
|
@@ -85,8 +85,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
|
|
|
85
85
|
earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
|
|
86
86
|
earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
|
|
87
87
|
earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
|
|
88
|
-
earthkit_workflows-0.4.
|
|
89
|
-
earthkit_workflows-0.4.
|
|
90
|
-
earthkit_workflows-0.4.
|
|
91
|
-
earthkit_workflows-0.4.
|
|
92
|
-
earthkit_workflows-0.4.
|
|
88
|
+
earthkit_workflows-0.4.2.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
|
|
89
|
+
earthkit_workflows-0.4.2.dist-info/METADATA,sha256=LWW-xDc0sq8cOdu6IpY335_MSFfe7Lmg1SHYT9cXjWA,1571
|
|
90
|
+
earthkit_workflows-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
91
|
+
earthkit_workflows-0.4.2.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
|
|
92
|
+
earthkit_workflows-0.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|