earthkit-workflows 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,12 +28,12 @@ import os
28
28
  import subprocess
29
29
  import sys
30
30
  from concurrent.futures import ThreadPoolExecutor
31
- from socket import getfqdn
32
31
  from time import perf_counter_ns
33
32
 
34
33
  import fire
35
34
  import orjson
36
35
 
36
+ import cascade.executor.platform as platform
37
37
  import cascade.low.into
38
38
  from cascade.controller.impl import run
39
39
  from cascade.executor.bridge import Bridge
@@ -140,20 +140,25 @@ def launch_executor(
140
140
  logging.config.dictConfig(logging_config_filehandler(log_path))
141
141
  else:
142
142
  logging.config.dictConfig(logging_config)
143
- logger.info(f"will set {gpu_count} gpus on host {i}")
144
- os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
145
- executor = Executor(
146
- job_instance,
147
- controller_address,
148
- workers_per_host,
149
- f"h{i}",
150
- portBase,
151
- shm_vol_gb,
152
- log_base,
153
- url_base,
154
- )
155
- executor.register()
156
- executor.recv_loop()
143
+ try:
144
+ logger.info(f"will set {gpu_count} gpus on host {i}")
145
+ os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
146
+ executor = Executor(
147
+ job_instance,
148
+ controller_address,
149
+ workers_per_host,
150
+ f"h{i}",
151
+ portBase,
152
+ shm_vol_gb,
153
+ log_base,
154
+ url_base,
155
+ )
156
+ executor.register()
157
+ executor.recv_loop()
158
+ except Exception:
159
+ # NOTE we log this to get the stacktrace into the logfile
160
+ logger.exception("executor failure")
161
+ raise
157
162
 
158
163
 
159
164
  def run_locally(
@@ -169,31 +174,47 @@ def run_locally(
169
174
  logging.config.dictConfig(logging_config_filehandler(log_path))
170
175
  else:
171
176
  logging.config.dictConfig(logging_config)
177
+ logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
172
178
  launch = perf_counter_ns()
173
- preschedule = precompute(job)
174
179
  c = f"tcp://localhost:{portBase}"
175
180
  m = f"tcp://localhost:{portBase+1}"
176
181
  ps = []
177
- for i, executor in enumerate(range(hosts)):
178
- gpu_count = get_gpu_count(i, workers)
179
- # NOTE forkserver/spawn seem to forget venv, we need fork
180
- p = multiprocessing.get_context("fork").Process(
181
- target=launch_executor,
182
- args=(
183
- job,
184
- c,
185
- workers,
186
- portBase + 1 + i * 10,
187
- i,
188
- None,
189
- gpu_count,
190
- log_base,
191
- "tcp://localhost",
192
- ),
193
- )
194
- p.start()
195
- ps.append(p)
196
182
  try:
183
+ # executors forking
184
+ for i, executor in enumerate(range(hosts)):
185
+ gpu_count = get_gpu_count(i, workers)
186
+ # NOTE forkserver/spawn seem to forget venv, we need fork
187
+ logger.debug(f"forking into executor on host {i}")
188
+ p = multiprocessing.get_context("fork").Process(
189
+ target=launch_executor,
190
+ args=(
191
+ job,
192
+ c,
193
+ workers,
194
+ portBase + 1 + i * 10,
195
+ i,
196
+ None,
197
+ gpu_count,
198
+ log_base,
199
+ "tcp://localhost",
200
+ ),
201
+ )
202
+ p.start()
203
+ ps.append(p)
204
+
205
+ # compute preschedule
206
+ preschedule = precompute(job)
207
+
208
+ # check processes started healthy
209
+ for i, p in enumerate(ps):
210
+ if not p.is_alive():
211
+ # TODO ideally we would somehow connect this with the Register message
212
+ # consumption in the Controller -- but there we don't assume that
213
+ # executors are on the same physical host
214
+ raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
215
+
216
+ # start bridge itself
217
+ logger.debug("starting bridge")
197
218
  b = Bridge(c, hosts)
198
219
  start = perf_counter_ns()
199
220
  run(job, b, preschedule, report_address=report_address)
@@ -201,7 +222,9 @@ def run_locally(
201
222
  print(
202
223
  f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
203
224
  )
204
- except:
225
+ except Exception:
226
+ # NOTE we log this to get the stacktrace into the logfile
227
+ logger.exception("controller failure, proceed with executor shutdown")
205
228
  for p in ps:
206
229
  if p.is_alive():
207
230
  callback(m, ExecutorShutdown())
@@ -272,7 +295,7 @@ def main_dist(
272
295
  idx,
273
296
  shm_vol_gb,
274
297
  gpu_count,
275
- f"tcp://{getfqdn()}",
298
+ f"tcp://{platform.get_bindabble_self()}",
276
299
  )
277
300
 
278
301
 
@@ -21,6 +21,7 @@ logging_config = {
21
21
  "forecastbox.worker": {"level": "DEBUG"},
22
22
  "forecastbox.executor": {"level": "DEBUG"},
23
23
  "cascade": {"level": "INFO"},
24
+ "cascade.benchmarks": {"level": "DEBUG"},
24
25
  "cascade.low": {"level": "DEBUG"},
25
26
  "cascade.shm": {"level": "DEBUG"},
26
27
  "cascade.controller": {"level": "DEBUG"},
@@ -18,11 +18,11 @@ the tasks themselves.
18
18
  import atexit
19
19
  import logging
20
20
  import os
21
- import socket
22
21
  from multiprocessing import get_context
23
22
  from multiprocessing.process import BaseProcess
24
23
  from typing import Iterable
25
24
 
25
+ import cascade.executor.platform as platform
26
26
  import cascade.shm.api as shm_api
27
27
  import cascade.shm.client as shm_client
28
28
  from cascade.executor.comms import GraceWatcher, Listener, ReliableSender, callback
@@ -58,7 +58,7 @@ heartbeat_grace_ms = 2 * comms_default_timeout_ms
58
58
 
59
59
 
60
60
  def address_of(port: int) -> BackboneAddress:
61
- return f"tcp://{socket.gethostname()}:{port}"
61
+ return f"tcp://{platform.get_bindabble_self()}:{port}"
62
62
 
63
63
 
64
64
  class Executor:
@@ -86,6 +86,7 @@ class Executor:
86
86
  self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
87
87
 
88
88
  self.terminating = False
89
+ logger.debug("register terminate function")
89
90
  atexit.register(self.terminate)
90
91
  # NOTE following inits are with potential side effects
91
92
  self.mlistener = Listener(address_of(portBase))
@@ -99,6 +100,7 @@ class Executor:
99
100
  shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
100
101
  else:
101
102
  shm_logging = logging_config
103
+ logger.debug("about to fork into shm process")
102
104
  self.shm_process = ctx.Process(
103
105
  target=shm_server,
104
106
  args=(
@@ -114,6 +116,7 @@ class Executor:
114
116
  dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
115
117
  else:
116
118
  dsr_logging = logging_config
119
+ logger.debug("about to fork into data server")
117
120
  self.data_server = ctx.Process(
118
121
  target=start_data_server,
119
122
  args=(
@@ -0,0 +1,24 @@
1
+ # (C) Copyright 2025- ECMWF.
2
+ #
3
+ # This software is licensed under the terms of the Apache Licence Version 2.0
4
+ # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
5
+ # In applying this licence, ECMWF does not waive the privileges and immunities
6
+ # granted to it by virtue of its status as an intergovernmental organisation
7
+ # nor does it submit to any jurisdiction.
8
+
9
+ """Macos-vs-Linux specific code"""
10
+
11
+ import socket
12
+ import sys
13
+
14
+
15
+ def get_bindabble_self():
16
+ """Returns a hostname such that zmq can bind to it"""
17
+
18
+ if sys.platform == "darwin":
19
+ # NOTE on macos, getfqdn usually returns like '66246.local', which can't then be bound to
20
+ # This is a stopper for running a cluster of macos devices -- but we don't plan that yet
21
+ return "localhost"
22
+ else:
23
+ # NOTE not sure if fqdn or hostname is better -- all we need is for it to be resolvable within cluster
24
+ return socket.gethostname() # socket.getfqdn()
cascade/gateway/router.py CHANGED
@@ -14,12 +14,12 @@ import os
14
14
  import subprocess
15
15
  import uuid
16
16
  from dataclasses import dataclass
17
- from socket import getfqdn
18
17
  from typing import Iterable
19
18
 
20
19
  import orjson
21
20
  import zmq
22
21
 
22
+ import cascade.executor.platform as platform
23
23
  from cascade.controller.report import JobId, JobProgress, JobProgressStarted
24
24
  from cascade.executor.comms import get_context
25
25
  from cascade.gateway.api import JobSpec
@@ -131,11 +131,7 @@ class JobRouter:
131
131
 
132
132
  def spawn_job(self, job_spec: JobSpec) -> JobId:
133
133
  job_id = next_uuid(self.jobs.keys(), lambda: str(uuid.uuid4()))
134
- if job_spec.use_slurm:
135
- base_addr = f"tcp://{getfqdn()}"
136
- else:
137
- # NOTE on macos, it seems getfqdn does not give zmq-bindable addr
138
- base_addr = "tcp://localhost"
134
+ base_addr = f"tcp://{platform.get_bindabble_self()}"
139
135
  socket = get_context().socket(zmq.PULL)
140
136
  port = socket.bind_to_random_port(base_addr)
141
137
  full_addr = f"{base_addr}:{port}"
cascade/shm/server.py CHANGED
@@ -115,5 +115,5 @@ def entrypoint(
115
115
  server.start()
116
116
  except Exception as e:
117
117
  # we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
118
- logger.warning(f"shutdown issue: {e}")
118
+ logger.warning(f"shutdown issue: {repr(e)}")
119
119
  server.atexit(0, None)
@@ -1,2 +1,2 @@
1
1
  # Do not change! Do not track in version control!
2
- __version__ = "0.4.1"
2
+ __version__ = "0.4.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: earthkit-workflows
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License-Expression: Apache-2.0
@@ -1,7 +1,7 @@
1
1
  cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
4
- cascade/benchmarks/__main__.py,sha256=9g2F-EH-M2DOgSEm_K3teFcJMzRV7cB5Kp3DWviyIkY,8729
4
+ cascade/benchmarks/__main__.py,sha256=UnIGFykZNw_WsuBbgWIxSEUZB_U4_Yra0VrmyMTEFQ8,9914
5
5
  cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
6
6
  cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
7
7
  cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
@@ -17,10 +17,11 @@ cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,
17
17
  cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
18
18
  cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
19
19
  cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
20
- cascade/executor/config.py,sha256=IXBheZCp8DYKMd42e8whR6Gar1jnJyT9BCl6_X2c0Wk,1521
20
+ cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
21
21
  cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
22
- cascade/executor/executor.py,sha256=MHG_b4GCFjx-E9nLYvIqQicoOfNcP6JMNtn1Q0l4VIY,13322
22
+ cascade/executor/executor.py,sha256=ufl20jq1lObNONHiYkXDFFCNfv4xKVvFsC_gzhTLdKQ,13524
23
23
  cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
24
+ cascade/executor/platform.py,sha256=Ym-J6v0Wx56r2z_1ZfK5cX1Zs8eMuu7l1SsNtVvT1Go,959
24
25
  cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
25
26
  cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
26
27
  cascade/executor/runner/entrypoint.py,sha256=32i2U4fmEvQnsV1MTl0Xf8mK_1nbk1BEVJqIidd6MRM,8042
@@ -31,7 +32,7 @@ cascade/gateway/__init__.py,sha256=1EzMKdLFXEucj0YWOlyVqLx4suOntitwM03T_rRubIk,8
31
32
  cascade/gateway/__main__.py,sha256=x6-DQin6ICvalHT9YcghGyVMoykEATOdN5ON9IeHPYA,862
32
33
  cascade/gateway/api.py,sha256=-7HTUhK9idszVCwiVwyHMcNx7n6qRcyPWsLx2e19n3A,2511
33
34
  cascade/gateway/client.py,sha256=1p4Tvrf-BH0LQHOES5rY1z3JNIfmXcqWG2kYl4rpcE0,4061
34
- cascade/gateway/router.py,sha256=iN-dc3L46aEy0EV57NNKYwaqIu0Au9kImu1pg-UbxwE,7680
35
+ cascade/gateway/router.py,sha256=81kS_ZtWaDV-WLcxaYgsYv-1WHy8aQxYGLmVL5CzOto,7549
35
36
  cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
36
37
  cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
37
38
  cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
@@ -53,9 +54,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
53
54
  cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
54
55
  cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
55
56
  cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
56
- cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
57
+ cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
57
58
  earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
58
- earthkit/workflows/_version.py,sha256=y47bDxdCo8QZJg-t8qedLIvdjhzndkJxexb9k-2JS1g,72
59
+ earthkit/workflows/_version.py,sha256=42UO3Zhb1-CO_KjQGniDtgDR8xL_a28l7aAWQcA9o9Y,72
59
60
  earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
60
61
  earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
61
62
  earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -85,8 +86,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
85
86
  earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
86
87
  earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
87
88
  earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
88
- earthkit_workflows-0.4.1.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
89
- earthkit_workflows-0.4.1.dist-info/METADATA,sha256=wntnQCnb78Ashzd9p3OHSzDlB4yH6Np9yd4jKO00EOI,1571
90
- earthkit_workflows-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
- earthkit_workflows-0.4.1.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
92
- earthkit_workflows-0.4.1.dist-info/RECORD,,
89
+ earthkit_workflows-0.4.3.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
90
+ earthkit_workflows-0.4.3.dist-info/METADATA,sha256=whowFjotkctfsy1v7mH-CsoCIucWMZpFcf8bqajhmQk,1571
91
+ earthkit_workflows-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
92
+ earthkit_workflows-0.4.3.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
93
+ earthkit_workflows-0.4.3.dist-info/RECORD,,