earthkit-workflows 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,7 +36,7 @@ import cascade.low.into
36
36
  from cascade.controller.impl import run
37
37
  from cascade.executor.bridge import Bridge
38
38
  from cascade.executor.comms import callback
39
- from cascade.executor.config import logging_config
39
+ from cascade.executor.config import logging_config, logging_config_filehandler
40
40
  from cascade.executor.executor import Executor
41
41
  from cascade.executor.msg import BackboneAddress, ExecutorShutdown
42
42
  from cascade.low.core import JobInstance
@@ -105,8 +105,14 @@ def launch_executor(
105
105
  i: int,
106
106
  shm_vol_gb: int | None,
107
107
  gpu_count: int,
108
+ log_base: str | None,
108
109
  ):
109
- logging.config.dictConfig(logging_config)
110
+ if log_base is not None:
111
+ log_base = f"{log_base}.host{i}"
112
+ log_path = f"{log_base}.txt"
113
+ logging.config.dictConfig(logging_config_filehandler(log_path))
114
+ else:
115
+ logging.config.dictConfig(logging_config)
110
116
  logger.info(f"will set {gpu_count} gpus on host {i}")
111
117
  os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
112
118
  executor = Executor(
@@ -116,6 +122,7 @@ def launch_executor(
116
122
  f"h{i}",
117
123
  portBase,
118
124
  shm_vol_gb,
125
+ log_base,
119
126
  )
120
127
  executor.register()
121
128
  executor.recv_loop()
@@ -126,9 +133,14 @@ def run_locally(
126
133
  hosts: int,
127
134
  workers: int,
128
135
  portBase: int = 12345,
136
+ log_base: str | None = None,
129
137
  report_address: str | None = None,
130
138
  ):
131
- logging.config.dictConfig(logging_config)
139
+ if log_base is not None:
140
+ log_path = f"{log_base}.controller.txt"
141
+ logging.config.dictConfig(logging_config_filehandler(log_path))
142
+ else:
143
+ logging.config.dictConfig(logging_config)
132
144
  launch = perf_counter_ns()
133
145
  preschedule = precompute(job)
134
146
  c = f"tcp://localhost:{portBase}"
@@ -142,7 +154,7 @@ def run_locally(
142
154
  # NOTE forkserver/spawn seem to forget venv, we need fork
143
155
  p = multiprocessing.get_context("fork").Process(
144
156
  target=launch_executor,
145
- args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count),
157
+ args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count, log_base),
146
158
  )
147
159
  p.start()
148
160
  ps.append(p)
@@ -172,6 +184,7 @@ def main_local(
172
184
  job: str | None = None,
173
185
  instance: str | None = None,
174
186
  port_base: int = 12345,
187
+ log_base: str | None = None,
175
188
  ) -> None:
176
189
  jobInstance = get_job(job, instance)
177
190
  run_locally(
@@ -180,6 +193,7 @@ def main_local(
180
193
  workers_per_host,
181
194
  report_address=report_address,
182
195
  portBase=port_base,
196
+ log_base=log_base,
183
197
  )
184
198
 
185
199
 
@@ -28,7 +28,7 @@ import cascade.shm.client as shm_client
28
28
  from cascade.executor.comms import GraceWatcher, Listener, ReliableSender, callback
29
29
  from cascade.executor.comms import default_message_resend_ms as resend_grace_ms
30
30
  from cascade.executor.comms import default_timeout_ms as comms_default_timeout_ms
31
- from cascade.executor.config import logging_config
31
+ from cascade.executor.config import logging_config, logging_config_filehandler
32
32
  from cascade.executor.data_server import start_data_server
33
33
  from cascade.executor.msg import (
34
34
  Ack,
@@ -70,6 +70,7 @@ class Executor:
70
70
  host: HostId,
71
71
  portBase: int,
72
72
  shm_vol_gb: int | None = None,
73
+ log_base: str | None = None,
73
74
  ) -> None:
74
75
  self.job_instance = job_instance
75
76
  self.param_source = param_source(job_instance.edges)
@@ -78,6 +79,7 @@ class Executor:
78
79
  self.workers: dict[WorkerId, BaseProcess | None] = {
79
80
  WorkerId(host, f"w{i}"): None for i in range(workers)
80
81
  }
82
+ self.log_base = log_base
81
83
 
82
84
  self.datasets: set[DatasetId] = set()
83
85
  self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
@@ -92,17 +94,25 @@ class Executor:
92
94
  shm_port = portBase + 2
93
95
  shm_api.publish_client_port(shm_port)
94
96
  ctx = get_context("fork")
97
+ if log_base:
98
+ shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
99
+ else:
100
+ shm_logging = logging_config
95
101
  self.shm_process = ctx.Process(
96
102
  target=shm_server,
97
103
  args=(
98
104
  shm_port,
99
105
  shm_vol_gb * (1024**3) if shm_vol_gb else None,
100
- logging_config,
106
+ shm_logging,
101
107
  f"sCasc{host}",
102
108
  ),
103
109
  )
104
110
  self.shm_process.start()
105
111
  self.daddress = address_of(portBase + 1)
112
+ if log_base:
113
+ dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
114
+ else:
115
+ dsr_logging = logging_config
106
116
  self.data_server = ctx.Process(
107
117
  target=start_data_server,
108
118
  args=(
@@ -110,7 +120,7 @@ class Executor:
110
120
  self.daddress,
111
121
  self.host,
112
122
  shm_port,
113
- logging_config,
123
+ dsr_logging,
114
124
  ),
115
125
  )
116
126
  self.data_server.start()
@@ -181,6 +191,7 @@ class Executor:
181
191
  job=self.job_instance,
182
192
  param_source=self.param_source,
183
193
  callback=self.mlistener.address,
194
+ log_base=self.log_base,
184
195
  )
185
196
  p = ctx.Process(target=entrypoint, kwargs={"runnerContext": runnerContext})
186
197
  p.start()
@@ -222,17 +233,17 @@ class Executor:
222
233
  procFail = lambda ex: ex is not None and ex != 0
223
234
  for k, e in self.workers.items():
224
235
  if e is None:
225
- ValueError(f"process on {k} is not alive")
236
+ raise ValueError(f"process on {k} is not alive")
226
237
  elif procFail(e.exitcode):
227
- ValueError(
238
+ raise ValueError(
228
239
  f"process on {k} failed to terminate correctly: {e.pid} -> {e.exitcode}"
229
240
  )
230
241
  if procFail(self.shm_process.exitcode):
231
- ValueError(
242
+ raise ValueError(
232
243
  f"shm server {self.shm_process.pid} failed with {self.shm_process.exitcode}"
233
244
  )
234
245
  if procFail(self.data_server.exitcode):
235
- ValueError(
246
+ raise ValueError(
236
247
  f"data server {self.data_server.pid} failed with {self.data_server.exitcode}"
237
248
  )
238
249
  if self.heartbeat_watcher.is_breach() > 0:
@@ -17,7 +17,7 @@ import zmq
17
17
 
18
18
  import cascade.executor.serde as serde
19
19
  from cascade.executor.comms import callback
20
- from cascade.executor.config import logging_config
20
+ from cascade.executor.config import logging_config, logging_config_filehandler
21
21
  from cascade.executor.msg import (
22
22
  BackboneAddress,
23
23
  DatasetPublished,
@@ -44,6 +44,7 @@ class RunnerContext:
44
44
  job: JobInstance
45
45
  callback: BackboneAddress
46
46
  param_source: dict[TaskId, dict[int | str, DatasetId]]
47
+ log_base: str | None
47
48
 
48
49
  def project(self, taskSequence: TaskSequence) -> ExecutionContext:
49
50
  schema_lookup: dict[DatasetId, str] = {}
@@ -92,7 +93,11 @@ def execute_sequence(
92
93
 
93
94
 
94
95
  def entrypoint(runnerContext: RunnerContext):
95
- logging.config.dictConfig(logging_config)
96
+ if runnerContext.log_base:
97
+ log_path = f"{runnerContext.log_base}.{runnerContext.workerId.worker}"
98
+ logging.config.dictConfig(logging_config_filehandler(log_path))
99
+ else:
100
+ logging.config.dictConfig(logging_config)
96
101
  ctx = zmq.Context()
97
102
  socket = ctx.socket(zmq.PULL)
98
103
  socket.bind(worker_address(runnerContext.workerId))
@@ -10,13 +10,17 @@ import logging.config
10
10
 
11
11
  import fire
12
12
 
13
- from cascade.executor.config import logging_config
13
+ from cascade.executor.config import logging_config, logging_config_filehandler
14
14
  from cascade.gateway.server import serve
15
15
 
16
16
 
17
- def main(url: str) -> None:
18
- logging.config.dictConfig(logging_config)
19
- serve(url)
17
+ def main(url: str, log_base: str | None = None) -> None:
18
+ if log_base:
19
+ log_path = f"{log_base}/gateway.txt"
20
+ logging.config.dictConfig(logging_config_filehandler(log_path))
21
+ else:
22
+ logging.config.dictConfig(logging_config)
23
+ serve(url, log_base)
20
24
 
21
25
 
22
26
  if __name__ == "__main__":
cascade/gateway/router.py CHANGED
@@ -43,7 +43,9 @@ class Job:
43
43
  local_job_port = 12345
44
44
 
45
45
 
46
- def _spawn_local(job_spec: JobSpec, addr: str, job_id: JobId) -> subprocess.Popen:
46
+ def _spawn_local(
47
+ job_spec: JobSpec, addr: str, job_id: JobId, log_base: str | None
48
+ ) -> subprocess.Popen:
47
49
  base = [
48
50
  "python",
49
51
  "-m",
@@ -68,11 +70,15 @@ def _spawn_local(job_spec: JobSpec, addr: str, job_id: JobId) -> subprocess.Pope
68
70
  f"{job_spec.hosts}",
69
71
  ]
70
72
  report = ["--report_address", f"{addr},{job_id}"]
73
+ if log_base:
74
+ logs = ["--log_base", f"{log_base}/job.{job_id}"]
75
+ else:
76
+ logs = []
71
77
  global local_job_port
72
78
  portBase = ["--port_base", str(local_job_port)]
73
79
  local_job_port += 1 + job_spec.hosts * job_spec.workers_per_host * 10
74
80
  return subprocess.Popen(
75
- base + infra + report + portBase, env={**os.environ, **job_spec.envvars}
81
+ base + infra + report + portBase + logs, env={**os.environ, **job_spec.envvars}
76
82
  )
77
83
 
78
84
 
@@ -105,18 +111,23 @@ def _spawn_slurm(job_spec: JobSpec, addr: str, job_id: JobId) -> subprocess.Pope
105
111
  )
106
112
 
107
113
 
108
- def _spawn_subprocess(job_spec: JobSpec, addr: str, job_id: JobId) -> subprocess.Popen:
114
+ def _spawn_subprocess(
115
+ job_spec: JobSpec, addr: str, job_id: JobId, log_base: str | None
116
+ ) -> subprocess.Popen:
109
117
  if job_spec.use_slurm:
118
+ if log_base is not None:
119
+ raise ValueError(f"unexpected {log_base=}")
110
120
  return _spawn_slurm(job_spec, addr, job_id)
111
121
  else:
112
- return _spawn_local(job_spec, addr, job_id)
122
+ return _spawn_local(job_spec, addr, job_id, log_base)
113
123
 
114
124
 
115
125
  class JobRouter:
116
- def __init__(self, poller: zmq.Poller):
126
+ def __init__(self, poller: zmq.Poller, log_base: str | None):
117
127
  self.poller = poller
118
128
  self.jobs: dict[str, Job] = {}
119
129
  self.procs: dict[str, subprocess.Popen] = {}
130
+ self.log_base = log_base
120
131
 
121
132
  def spawn_job(self, job_spec: JobSpec) -> JobId:
122
133
  job_id = next_uuid(self.jobs.keys(), lambda: str(uuid.uuid4()))
@@ -131,7 +142,9 @@ class JobRouter:
131
142
  logger.debug(f"will spawn job {job_id} and listen on {full_addr}")
132
143
  self.poller.register(socket, flags=zmq.POLLIN)
133
144
  self.jobs[job_id] = Job(socket, JobProgressStarted, -1, {})
134
- self.procs[job_id] = _spawn_subprocess(job_spec, full_addr, job_id)
145
+ self.procs[job_id] = _spawn_subprocess(
146
+ job_spec, full_addr, job_id, self.log_base
147
+ )
135
148
  return job_id
136
149
 
137
150
  def progress_of(
cascade/gateway/server.py CHANGED
@@ -79,14 +79,14 @@ def handle_controller(socket: zmq.Socket, jobs: JobRouter) -> None:
79
79
  jobs.put_result(report.job_id, dataset_id, result)
80
80
 
81
81
 
82
- def serve(url: str) -> None:
82
+ def serve(url: str, log_base: str | None = None) -> None:
83
83
  ctx = get_context()
84
84
  poller = zmq.Poller()
85
85
 
86
86
  fe = ctx.socket(zmq.REP)
87
87
  fe.bind(url)
88
88
  poller.register(fe, flags=zmq.POLLIN)
89
- jobs = JobRouter(poller)
89
+ jobs = JobRouter(poller, log_base)
90
90
 
91
91
  logger.debug("entering recv loop")
92
92
  is_break = False
@@ -1,2 +1,2 @@
1
1
  # Do not change! Do not track in version control!
2
- __version__ = "0.3.5"
2
+ __version__ = "0.3.6"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: earthkit-workflows
3
- Version: 0.3.5
3
+ Version: 0.3.6
4
4
  Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License-Expression: Apache-2.0
@@ -1,7 +1,7 @@
1
1
  cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
4
- cascade/benchmarks/__main__.py,sha256=euamIHoDdqo2VC6gBUoOWuzlK1DttYCprDBWAVKGjnA,6986
4
+ cascade/benchmarks/__main__.py,sha256=LyYwIAGLgZD4Fq7Kxb3vFXu3oDqA6MzzICH9h-bpLL8,7517
5
5
  cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
6
6
  cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
7
7
  cascade/benchmarks/job1.py,sha256=NY1k9PvkUZODCIDO_zSNwC9sFiMYpBwOaWB7FMSkt2o,4594
@@ -17,20 +17,20 @@ cascade/executor/bridge.py,sha256=vrs-5_Qt2mgkAD7Mzi43Xt_q7tpXX6i1UOPfqZSxHfs,81
17
17
  cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
18
18
  cascade/executor/config.py,sha256=rA4WeCNbdJJ3FdOKJ6WN3_VUorYW3cqdMfKUYPSyj0Y,1471
19
19
  cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
20
- cascade/executor/executor.py,sha256=KAwCJbv-kO1itPSggEKfVszZYMHEpr7bwgubmW3Qlds,12822
20
+ cascade/executor/executor.py,sha256=SqMVM8BvCNM2r2Zbg9kxSxwFADAaoBU7nCMtfzktsgI,13282
21
21
  cascade/executor/msg.py,sha256=QW7Me-8Sin-x-f4M4bzvO7_av8MRkjnabQN6Ch3x22c,4230
22
22
  cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
23
23
  cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
24
- cascade/executor/runner/entrypoint.py,sha256=kpZ0stgWn5JforZhJVjVCt5RZOgrAL0DBo4A33hauz4,5886
24
+ cascade/executor/runner/entrypoint.py,sha256=paTrkURhI0Vvxb36BUO8QMohIXJdTDVI03o2vNr8VN8,6135
25
25
  cascade/executor/runner/memory.py,sha256=EhFhZIFiDo1wDiNuw2gpeUi15yAVDW0hxD7cvX0m0Ho,5299
26
26
  cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
27
27
  cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
28
28
  cascade/gateway/__init__.py,sha256=1EzMKdLFXEucj0YWOlyVqLx4suOntitwM03T_rRubIk,829
29
- cascade/gateway/__main__.py,sha256=OcT5Amo0tE1-3StHuTOeQCaABmMBO3XLDK4t4b8oeeQ,647
29
+ cascade/gateway/__main__.py,sha256=x6-DQin6ICvalHT9YcghGyVMoykEATOdN5ON9IeHPYA,862
30
30
  cascade/gateway/api.py,sha256=-7HTUhK9idszVCwiVwyHMcNx7n6qRcyPWsLx2e19n3A,2511
31
31
  cascade/gateway/client.py,sha256=1p4Tvrf-BH0LQHOES5rY1z3JNIfmXcqWG2kYl4rpcE0,4061
32
- cascade/gateway/router.py,sha256=KBlw5U-QOcZ8QN6Ls3WLS7gO0X2apupAWX6pS7A7mrs,7323
33
- cascade/gateway/server.py,sha256=srqmtOuzzon5GV5jR1wUVh744Ct336BQs5Gzd3mhsPA,3733
32
+ cascade/gateway/router.py,sha256=iN-dc3L46aEy0EV57NNKYwaqIu0Au9kImu1pg-UbxwE,7680
33
+ cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
34
34
  cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
35
35
  cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
36
36
  cascade/low/core.py,sha256=txya9rgks2b1ze9yLvFvrZCs8sCCtDUlfNwz4sHgybM,5994
@@ -53,7 +53,7 @@ cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
53
53
  cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
54
54
  cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
55
55
  earthkit/workflows/__init__.py,sha256=f17AdiV9g4eRN8m4dUnSU58RoLRqk1e6iMRrQiBUSKk,1880
56
- earthkit/workflows/_version.py,sha256=y917q-_1kG-gTBVkeELzqyHU_FN3NUWwP8pqRDzG4Yw,72
56
+ earthkit/workflows/_version.py,sha256=RCglqs61OUYaUdhn0AKIjoOZXQBCSTS0C7aXjmtZiuA,72
57
57
  earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
58
58
  earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
59
59
  earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -83,8 +83,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
83
83
  earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
84
84
  earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
85
85
  earthkit/workflows/plugins/__init__.py,sha256=WcX4qbEhgTXabIbogydtzNmZ2tB_SuW6NzNkOYQfS-Y,61
86
- earthkit_workflows-0.3.5.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
87
- earthkit_workflows-0.3.5.dist-info/METADATA,sha256=o-BNwkFAKcA9NyyryCTovDEeMs1uNNde9IyChoUq_Lc,1571
88
- earthkit_workflows-0.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
- earthkit_workflows-0.3.5.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
90
- earthkit_workflows-0.3.5.dist-info/RECORD,,
86
+ earthkit_workflows-0.3.6.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
87
+ earthkit_workflows-0.3.6.dist-info/METADATA,sha256=yu-WoCd76So0jElLV9J1woPnwCYcJHoRla8bga7qsgM,1571
88
+ earthkit_workflows-0.3.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
+ earthkit_workflows-0.3.6.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
90
+ earthkit_workflows-0.3.6.dist-info/RECORD,,