earthkit-workflows 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -140,20 +140,25 @@ def launch_executor(
140
140
  logging.config.dictConfig(logging_config_filehandler(log_path))
141
141
  else:
142
142
  logging.config.dictConfig(logging_config)
143
- logger.info(f"will set {gpu_count} gpus on host {i}")
144
- os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
145
- executor = Executor(
146
- job_instance,
147
- controller_address,
148
- workers_per_host,
149
- f"h{i}",
150
- portBase,
151
- shm_vol_gb,
152
- log_base,
153
- url_base,
154
- )
155
- executor.register()
156
- executor.recv_loop()
143
+ try:
144
+ logger.info(f"will set {gpu_count} gpus on host {i}")
145
+ os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
146
+ executor = Executor(
147
+ job_instance,
148
+ controller_address,
149
+ workers_per_host,
150
+ f"h{i}",
151
+ portBase,
152
+ shm_vol_gb,
153
+ log_base,
154
+ url_base,
155
+ )
156
+ executor.register()
157
+ executor.recv_loop()
158
+ except Exception:
159
+ # NOTE we log this to get the stacktrace into the logfile
160
+ logger.exception("executor failure")
161
+ raise
157
162
 
158
163
 
159
164
  def run_locally(
@@ -169,31 +174,47 @@ def run_locally(
169
174
  logging.config.dictConfig(logging_config_filehandler(log_path))
170
175
  else:
171
176
  logging.config.dictConfig(logging_config)
177
+ logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
172
178
  launch = perf_counter_ns()
173
- preschedule = precompute(job)
174
179
  c = f"tcp://localhost:{portBase}"
175
180
  m = f"tcp://localhost:{portBase+1}"
176
181
  ps = []
177
- for i, executor in enumerate(range(hosts)):
178
- gpu_count = get_gpu_count(i, workers)
179
- # NOTE forkserver/spawn seem to forget venv, we need fork
180
- p = multiprocessing.get_context("fork").Process(
181
- target=launch_executor,
182
- args=(
183
- job,
184
- c,
185
- workers,
186
- portBase + 1 + i * 10,
187
- i,
188
- None,
189
- gpu_count,
190
- log_base,
191
- "tcp://localhost",
192
- ),
193
- )
194
- p.start()
195
- ps.append(p)
196
182
  try:
183
+ # executors forking
184
+ for i, executor in enumerate(range(hosts)):
185
+ gpu_count = get_gpu_count(i, workers)
186
+ # NOTE forkserver/spawn seem to forget venv, we need fork
187
+ logger.debug(f"forking into executor on host {i}")
188
+ p = multiprocessing.get_context("fork").Process(
189
+ target=launch_executor,
190
+ args=(
191
+ job,
192
+ c,
193
+ workers,
194
+ portBase + 1 + i * 10,
195
+ i,
196
+ None,
197
+ gpu_count,
198
+ log_base,
199
+ "tcp://localhost",
200
+ ),
201
+ )
202
+ p.start()
203
+ ps.append(p)
204
+
205
+ # compute preschedule
206
+ preschedule = precompute(job)
207
+
208
+ # check processes started healthy
209
+ for i, p in enumerate(ps):
210
+ if not p.is_alive():
211
+ # TODO ideally we would somehow connect this with the Register message
212
+ # consumption in the Controller -- but there we don't assume that
213
+ # executors are on the same physical host
214
+ raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
215
+
216
+ # start bridge itself
217
+ logger.debug("starting bridge")
197
218
  b = Bridge(c, hosts)
198
219
  start = perf_counter_ns()
199
220
  run(job, b, preschedule, report_address=report_address)
@@ -201,7 +222,9 @@ def run_locally(
201
222
  print(
202
223
  f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
203
224
  )
204
- except:
225
+ except Exception:
226
+ # NOTE we log this to get the stacktrace into the logfile
227
+ logger.exception("controller failure, proceed with executor shutdown")
205
228
  for p in ps:
206
229
  if p.is_alive():
207
230
  callback(m, ExecutorShutdown())
@@ -21,6 +21,7 @@ logging_config = {
21
21
  "forecastbox.worker": {"level": "DEBUG"},
22
22
  "forecastbox.executor": {"level": "DEBUG"},
23
23
  "cascade": {"level": "INFO"},
24
+ "cascade.benchmarks": {"level": "DEBUG"},
24
25
  "cascade.low": {"level": "DEBUG"},
25
26
  "cascade.shm": {"level": "DEBUG"},
26
27
  "cascade.controller": {"level": "DEBUG"},
@@ -86,6 +86,7 @@ class Executor:
86
86
  self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
87
87
 
88
88
  self.terminating = False
89
+ logger.debug("register terminate function")
89
90
  atexit.register(self.terminate)
90
91
  # NOTE following inits are with potential side effects
91
92
  self.mlistener = Listener(address_of(portBase))
@@ -99,6 +100,7 @@ class Executor:
99
100
  shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
100
101
  else:
101
102
  shm_logging = logging_config
103
+ logger.debug("about to fork into shm process")
102
104
  self.shm_process = ctx.Process(
103
105
  target=shm_server,
104
106
  args=(
@@ -114,6 +116,7 @@ class Executor:
114
116
  dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
115
117
  else:
116
118
  dsr_logging = logging_config
119
+ logger.debug("about to fork into data server")
117
120
  self.data_server = ctx.Process(
118
121
  target=start_data_server,
119
122
  args=(
cascade/shm/server.py CHANGED
@@ -115,5 +115,5 @@ def entrypoint(
115
115
  server.start()
116
116
  except Exception as e:
117
117
  # we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
118
- logger.warning(f"shutdown issue: {e}")
118
+ logger.warning(f"shutdown issue: {repr(e)}")
119
119
  server.atexit(0, None)
@@ -1,2 +1,2 @@
1
1
  # Do not change! Do not track in version control!
2
- __version__ = "0.4.1"
2
+ __version__ = "0.4.2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: earthkit-workflows
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License-Expression: Apache-2.0
@@ -1,7 +1,7 @@
1
1
  cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
4
- cascade/benchmarks/__main__.py,sha256=9g2F-EH-M2DOgSEm_K3teFcJMzRV7cB5Kp3DWviyIkY,8729
4
+ cascade/benchmarks/__main__.py,sha256=g03xRzp58dXLHDj8kTPyPnbBOS5sRIAMTthFtFjDRbs,9876
5
5
  cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
6
6
  cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
7
7
  cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
@@ -17,9 +17,9 @@ cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,
17
17
  cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
18
18
  cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
19
19
  cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
20
- cascade/executor/config.py,sha256=IXBheZCp8DYKMd42e8whR6Gar1jnJyT9BCl6_X2c0Wk,1521
20
+ cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
21
21
  cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
22
- cascade/executor/executor.py,sha256=MHG_b4GCFjx-E9nLYvIqQicoOfNcP6JMNtn1Q0l4VIY,13322
22
+ cascade/executor/executor.py,sha256=egPhfqhzYfeM77Hu10-mGHNVsQAdqmZOA7hmjFP1Q8M,13484
23
23
  cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
24
24
  cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
25
25
  cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
@@ -53,9 +53,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
53
53
  cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
54
54
  cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
55
55
  cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
56
- cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
56
+ cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
57
57
  earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
58
- earthkit/workflows/_version.py,sha256=y47bDxdCo8QZJg-t8qedLIvdjhzndkJxexb9k-2JS1g,72
58
+ earthkit/workflows/_version.py,sha256=nkd71CReR3pz5TZ9pcVgB2cP1MDj4YK6VH9UGJYzXDM,72
59
59
  earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
60
60
  earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
61
61
  earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -85,8 +85,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
85
85
  earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
86
86
  earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
87
87
  earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
88
- earthkit_workflows-0.4.1.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
89
- earthkit_workflows-0.4.1.dist-info/METADATA,sha256=wntnQCnb78Ashzd9p3OHSzDlB4yH6Np9yd4jKO00EOI,1571
90
- earthkit_workflows-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
- earthkit_workflows-0.4.1.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
92
- earthkit_workflows-0.4.1.dist-info/RECORD,,
88
+ earthkit_workflows-0.4.2.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
89
+ earthkit_workflows-0.4.2.dist-info/METADATA,sha256=LWW-xDc0sq8cOdu6IpY335_MSFfe7Lmg1SHYT9cXjWA,1571
90
+ earthkit_workflows-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
+ earthkit_workflows-0.4.2.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
92
+ earthkit_workflows-0.4.2.dist-info/RECORD,,