earthkit-workflows 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cascade/benchmarks/__main__.py +60 -37
- cascade/executor/config.py +1 -0
- cascade/executor/executor.py +5 -2
- cascade/executor/platform.py +24 -0
- cascade/gateway/router.py +2 -6
- cascade/shm/server.py +1 -1
- earthkit/workflows/_version.py +1 -1
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.3.dist-info}/METADATA +1 -1
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.3.dist-info}/RECORD +12 -11
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.3.dist-info}/WHEEL +0 -0
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {earthkit_workflows-0.4.1.dist-info → earthkit_workflows-0.4.3.dist-info}/top_level.txt +0 -0
cascade/benchmarks/__main__.py
CHANGED
|
@@ -28,12 +28,12 @@ import os
|
|
|
28
28
|
import subprocess
|
|
29
29
|
import sys
|
|
30
30
|
from concurrent.futures import ThreadPoolExecutor
|
|
31
|
-
from socket import getfqdn
|
|
32
31
|
from time import perf_counter_ns
|
|
33
32
|
|
|
34
33
|
import fire
|
|
35
34
|
import orjson
|
|
36
35
|
|
|
36
|
+
import cascade.executor.platform as platform
|
|
37
37
|
import cascade.low.into
|
|
38
38
|
from cascade.controller.impl import run
|
|
39
39
|
from cascade.executor.bridge import Bridge
|
|
@@ -140,20 +140,25 @@ def launch_executor(
|
|
|
140
140
|
logging.config.dictConfig(logging_config_filehandler(log_path))
|
|
141
141
|
else:
|
|
142
142
|
logging.config.dictConfig(logging_config)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
143
|
+
try:
|
|
144
|
+
logger.info(f"will set {gpu_count} gpus on host {i}")
|
|
145
|
+
os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
|
|
146
|
+
executor = Executor(
|
|
147
|
+
job_instance,
|
|
148
|
+
controller_address,
|
|
149
|
+
workers_per_host,
|
|
150
|
+
f"h{i}",
|
|
151
|
+
portBase,
|
|
152
|
+
shm_vol_gb,
|
|
153
|
+
log_base,
|
|
154
|
+
url_base,
|
|
155
|
+
)
|
|
156
|
+
executor.register()
|
|
157
|
+
executor.recv_loop()
|
|
158
|
+
except Exception:
|
|
159
|
+
# NOTE we log this to get the stacktrace into the logfile
|
|
160
|
+
logger.exception("executor failure")
|
|
161
|
+
raise
|
|
157
162
|
|
|
158
163
|
|
|
159
164
|
def run_locally(
|
|
@@ -169,31 +174,47 @@ def run_locally(
|
|
|
169
174
|
logging.config.dictConfig(logging_config_filehandler(log_path))
|
|
170
175
|
else:
|
|
171
176
|
logging.config.dictConfig(logging_config)
|
|
177
|
+
logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
|
|
172
178
|
launch = perf_counter_ns()
|
|
173
|
-
preschedule = precompute(job)
|
|
174
179
|
c = f"tcp://localhost:{portBase}"
|
|
175
180
|
m = f"tcp://localhost:{portBase+1}"
|
|
176
181
|
ps = []
|
|
177
|
-
for i, executor in enumerate(range(hosts)):
|
|
178
|
-
gpu_count = get_gpu_count(i, workers)
|
|
179
|
-
# NOTE forkserver/spawn seem to forget venv, we need fork
|
|
180
|
-
p = multiprocessing.get_context("fork").Process(
|
|
181
|
-
target=launch_executor,
|
|
182
|
-
args=(
|
|
183
|
-
job,
|
|
184
|
-
c,
|
|
185
|
-
workers,
|
|
186
|
-
portBase + 1 + i * 10,
|
|
187
|
-
i,
|
|
188
|
-
None,
|
|
189
|
-
gpu_count,
|
|
190
|
-
log_base,
|
|
191
|
-
"tcp://localhost",
|
|
192
|
-
),
|
|
193
|
-
)
|
|
194
|
-
p.start()
|
|
195
|
-
ps.append(p)
|
|
196
182
|
try:
|
|
183
|
+
# executors forking
|
|
184
|
+
for i, executor in enumerate(range(hosts)):
|
|
185
|
+
gpu_count = get_gpu_count(i, workers)
|
|
186
|
+
# NOTE forkserver/spawn seem to forget venv, we need fork
|
|
187
|
+
logger.debug(f"forking into executor on host {i}")
|
|
188
|
+
p = multiprocessing.get_context("fork").Process(
|
|
189
|
+
target=launch_executor,
|
|
190
|
+
args=(
|
|
191
|
+
job,
|
|
192
|
+
c,
|
|
193
|
+
workers,
|
|
194
|
+
portBase + 1 + i * 10,
|
|
195
|
+
i,
|
|
196
|
+
None,
|
|
197
|
+
gpu_count,
|
|
198
|
+
log_base,
|
|
199
|
+
"tcp://localhost",
|
|
200
|
+
),
|
|
201
|
+
)
|
|
202
|
+
p.start()
|
|
203
|
+
ps.append(p)
|
|
204
|
+
|
|
205
|
+
# compute preschedule
|
|
206
|
+
preschedule = precompute(job)
|
|
207
|
+
|
|
208
|
+
# check processes started healthy
|
|
209
|
+
for i, p in enumerate(ps):
|
|
210
|
+
if not p.is_alive():
|
|
211
|
+
# TODO ideally we would somehow connect this with the Register message
|
|
212
|
+
# consumption in the Controller -- but there we don't assume that
|
|
213
|
+
# executors are on the same physical host
|
|
214
|
+
raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
|
|
215
|
+
|
|
216
|
+
# start bridge itself
|
|
217
|
+
logger.debug("starting bridge")
|
|
197
218
|
b = Bridge(c, hosts)
|
|
198
219
|
start = perf_counter_ns()
|
|
199
220
|
run(job, b, preschedule, report_address=report_address)
|
|
@@ -201,7 +222,9 @@ def run_locally(
|
|
|
201
222
|
print(
|
|
202
223
|
f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
|
|
203
224
|
)
|
|
204
|
-
except:
|
|
225
|
+
except Exception:
|
|
226
|
+
# NOTE we log this to get the stacktrace into the logfile
|
|
227
|
+
logger.exception("controller failure, proceed with executor shutdown")
|
|
205
228
|
for p in ps:
|
|
206
229
|
if p.is_alive():
|
|
207
230
|
callback(m, ExecutorShutdown())
|
|
@@ -272,7 +295,7 @@ def main_dist(
|
|
|
272
295
|
idx,
|
|
273
296
|
shm_vol_gb,
|
|
274
297
|
gpu_count,
|
|
275
|
-
f"tcp://{getfqdn()}",
|
|
298
|
+
f"tcp://{platform.get_bindabble_self()}",
|
|
276
299
|
)
|
|
277
300
|
|
|
278
301
|
|
cascade/executor/config.py
CHANGED
|
@@ -21,6 +21,7 @@ logging_config = {
|
|
|
21
21
|
"forecastbox.worker": {"level": "DEBUG"},
|
|
22
22
|
"forecastbox.executor": {"level": "DEBUG"},
|
|
23
23
|
"cascade": {"level": "INFO"},
|
|
24
|
+
"cascade.benchmarks": {"level": "DEBUG"},
|
|
24
25
|
"cascade.low": {"level": "DEBUG"},
|
|
25
26
|
"cascade.shm": {"level": "DEBUG"},
|
|
26
27
|
"cascade.controller": {"level": "DEBUG"},
|
cascade/executor/executor.py
CHANGED
|
@@ -18,11 +18,11 @@ the tasks themselves.
|
|
|
18
18
|
import atexit
|
|
19
19
|
import logging
|
|
20
20
|
import os
|
|
21
|
-
import socket
|
|
22
21
|
from multiprocessing import get_context
|
|
23
22
|
from multiprocessing.process import BaseProcess
|
|
24
23
|
from typing import Iterable
|
|
25
24
|
|
|
25
|
+
import cascade.executor.platform as platform
|
|
26
26
|
import cascade.shm.api as shm_api
|
|
27
27
|
import cascade.shm.client as shm_client
|
|
28
28
|
from cascade.executor.comms import GraceWatcher, Listener, ReliableSender, callback
|
|
@@ -58,7 +58,7 @@ heartbeat_grace_ms = 2 * comms_default_timeout_ms
|
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
def address_of(port: int) -> BackboneAddress:
|
|
61
|
-
return f"tcp://{
|
|
61
|
+
return f"tcp://{platform.get_bindabble_self()}:{port}"
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
class Executor:
|
|
@@ -86,6 +86,7 @@ class Executor:
|
|
|
86
86
|
self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
|
|
87
87
|
|
|
88
88
|
self.terminating = False
|
|
89
|
+
logger.debug("register terminate function")
|
|
89
90
|
atexit.register(self.terminate)
|
|
90
91
|
# NOTE following inits are with potential side effects
|
|
91
92
|
self.mlistener = Listener(address_of(portBase))
|
|
@@ -99,6 +100,7 @@ class Executor:
|
|
|
99
100
|
shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
|
|
100
101
|
else:
|
|
101
102
|
shm_logging = logging_config
|
|
103
|
+
logger.debug("about to fork into shm process")
|
|
102
104
|
self.shm_process = ctx.Process(
|
|
103
105
|
target=shm_server,
|
|
104
106
|
args=(
|
|
@@ -114,6 +116,7 @@ class Executor:
|
|
|
114
116
|
dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
|
|
115
117
|
else:
|
|
116
118
|
dsr_logging = logging_config
|
|
119
|
+
logger.debug("about to fork into data server")
|
|
117
120
|
self.data_server = ctx.Process(
|
|
118
121
|
target=start_data_server,
|
|
119
122
|
args=(
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# (C) Copyright 2025- ECMWF.
|
|
2
|
+
#
|
|
3
|
+
# This software is licensed under the terms of the Apache Licence Version 2.0
|
|
4
|
+
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
|
5
|
+
# In applying this licence, ECMWF does not waive the privileges and immunities
|
|
6
|
+
# granted to it by virtue of its status as an intergovernmental organisation
|
|
7
|
+
# nor does it submit to any jurisdiction.
|
|
8
|
+
|
|
9
|
+
"""Macos-vs-Linux specific code"""
|
|
10
|
+
|
|
11
|
+
import socket
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_bindabble_self():
|
|
16
|
+
"""Returns a hostname such that zmq can bind to it"""
|
|
17
|
+
|
|
18
|
+
if sys.platform == "darwin":
|
|
19
|
+
# NOTE on macos, getfqdn usually returns like '66246.local', which can't then be bound to
|
|
20
|
+
# This is a stopper for running a cluster of macos devices -- but we don't plan that yet
|
|
21
|
+
return "localhost"
|
|
22
|
+
else:
|
|
23
|
+
# NOTE not sure if fqdn or hostname is better -- all we need is for it to be resolvable within cluster
|
|
24
|
+
return socket.gethostname() # socket.getfqdn()
|
cascade/gateway/router.py
CHANGED
|
@@ -14,12 +14,12 @@ import os
|
|
|
14
14
|
import subprocess
|
|
15
15
|
import uuid
|
|
16
16
|
from dataclasses import dataclass
|
|
17
|
-
from socket import getfqdn
|
|
18
17
|
from typing import Iterable
|
|
19
18
|
|
|
20
19
|
import orjson
|
|
21
20
|
import zmq
|
|
22
21
|
|
|
22
|
+
import cascade.executor.platform as platform
|
|
23
23
|
from cascade.controller.report import JobId, JobProgress, JobProgressStarted
|
|
24
24
|
from cascade.executor.comms import get_context
|
|
25
25
|
from cascade.gateway.api import JobSpec
|
|
@@ -131,11 +131,7 @@ class JobRouter:
|
|
|
131
131
|
|
|
132
132
|
def spawn_job(self, job_spec: JobSpec) -> JobId:
|
|
133
133
|
job_id = next_uuid(self.jobs.keys(), lambda: str(uuid.uuid4()))
|
|
134
|
-
|
|
135
|
-
base_addr = f"tcp://{getfqdn()}"
|
|
136
|
-
else:
|
|
137
|
-
# NOTE on macos, it seems getfqdn does not give zmq-bindable addr
|
|
138
|
-
base_addr = "tcp://localhost"
|
|
134
|
+
base_addr = f"tcp://{platform.get_bindabble_self()}"
|
|
139
135
|
socket = get_context().socket(zmq.PULL)
|
|
140
136
|
port = socket.bind_to_random_port(base_addr)
|
|
141
137
|
full_addr = f"{base_addr}:{port}"
|
cascade/shm/server.py
CHANGED
|
@@ -115,5 +115,5 @@ def entrypoint(
|
|
|
115
115
|
server.start()
|
|
116
116
|
except Exception as e:
|
|
117
117
|
# we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
|
|
118
|
-
logger.warning(f"shutdown issue: {e}")
|
|
118
|
+
logger.warning(f"shutdown issue: {repr(e)}")
|
|
119
119
|
server.atexit(0, None)
|
earthkit/workflows/_version.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
# Do not change! Do not track in version control!
|
|
2
|
-
__version__ = "0.4.1"
|
|
2
|
+
__version__ = "0.4.3"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: earthkit-workflows
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
|
|
5
5
|
Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
|
|
4
|
-
cascade/benchmarks/__main__.py,sha256=
|
|
4
|
+
cascade/benchmarks/__main__.py,sha256=UnIGFykZNw_WsuBbgWIxSEUZB_U4_Yra0VrmyMTEFQ8,9914
|
|
5
5
|
cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
|
|
6
6
|
cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
|
|
7
7
|
cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
|
|
@@ -17,10 +17,11 @@ cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,
|
|
|
17
17
|
cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
|
|
18
18
|
cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
|
|
19
19
|
cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
|
|
20
|
-
cascade/executor/config.py,sha256=
|
|
20
|
+
cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
|
|
21
21
|
cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
|
|
22
|
-
cascade/executor/executor.py,sha256=
|
|
22
|
+
cascade/executor/executor.py,sha256=ufl20jq1lObNONHiYkXDFFCNfv4xKVvFsC_gzhTLdKQ,13524
|
|
23
23
|
cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
|
|
24
|
+
cascade/executor/platform.py,sha256=Ym-J6v0Wx56r2z_1ZfK5cX1Zs8eMuu7l1SsNtVvT1Go,959
|
|
24
25
|
cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
|
|
25
26
|
cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
|
|
26
27
|
cascade/executor/runner/entrypoint.py,sha256=32i2U4fmEvQnsV1MTl0Xf8mK_1nbk1BEVJqIidd6MRM,8042
|
|
@@ -31,7 +32,7 @@ cascade/gateway/__init__.py,sha256=1EzMKdLFXEucj0YWOlyVqLx4suOntitwM03T_rRubIk,8
|
|
|
31
32
|
cascade/gateway/__main__.py,sha256=x6-DQin6ICvalHT9YcghGyVMoykEATOdN5ON9IeHPYA,862
|
|
32
33
|
cascade/gateway/api.py,sha256=-7HTUhK9idszVCwiVwyHMcNx7n6qRcyPWsLx2e19n3A,2511
|
|
33
34
|
cascade/gateway/client.py,sha256=1p4Tvrf-BH0LQHOES5rY1z3JNIfmXcqWG2kYl4rpcE0,4061
|
|
34
|
-
cascade/gateway/router.py,sha256=
|
|
35
|
+
cascade/gateway/router.py,sha256=81kS_ZtWaDV-WLcxaYgsYv-1WHy8aQxYGLmVL5CzOto,7549
|
|
35
36
|
cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
|
|
36
37
|
cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
|
|
37
38
|
cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
|
|
@@ -53,9 +54,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
|
|
|
53
54
|
cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
|
|
54
55
|
cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
|
|
55
56
|
cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
|
|
56
|
-
cascade/shm/server.py,sha256=
|
|
57
|
+
cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
|
|
57
58
|
earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
|
|
58
|
-
earthkit/workflows/_version.py,sha256=
|
|
59
|
+
earthkit/workflows/_version.py,sha256=42UO3Zhb1-CO_KjQGniDtgDR8xL_a28l7aAWQcA9o9Y,72
|
|
59
60
|
earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
|
|
60
61
|
earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
|
|
61
62
|
earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
|
|
@@ -85,8 +86,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
|
|
|
85
86
|
earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
|
|
86
87
|
earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
|
|
87
88
|
earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
|
|
88
|
-
earthkit_workflows-0.4.
|
|
89
|
-
earthkit_workflows-0.4.
|
|
90
|
-
earthkit_workflows-0.4.
|
|
91
|
-
earthkit_workflows-0.4.
|
|
92
|
-
earthkit_workflows-0.4.
|
|
89
|
+
earthkit_workflows-0.4.3.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
|
|
90
|
+
earthkit_workflows-0.4.3.dist-info/METADATA,sha256=whowFjotkctfsy1v7mH-CsoCIucWMZpFcf8bqajhmQk,1571
|
|
91
|
+
earthkit_workflows-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
92
|
+
earthkit_workflows-0.4.3.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
|
|
93
|
+
earthkit_workflows-0.4.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|