earthkit-workflows 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
cascade/benchmarks/__main__.py CHANGED
@@ -26,7 +26,9 @@ import logging.config
26
26
  import multiprocessing
27
27
  import os
28
28
  import subprocess
29
+ import sys
29
30
  from concurrent.futures import ThreadPoolExecutor
31
+ from socket import getfqdn
30
32
  from time import perf_counter_ns
31
33
 
32
34
  import fire
@@ -77,13 +79,17 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:
77
79
  import cascade.benchmarks.matmul as matmul
78
80
 
79
81
  return matmul.get_job()
82
+ elif benchmark.startswith("dist"):
83
+ import cascade.benchmarks.dist as dist
84
+
85
+ return dist.get_job()
80
86
  else:
81
87
  raise NotImplementedError(benchmark)
82
88
  else:
83
89
  raise TypeError("specified neither benchmark name nor job instance")
84
90
 
85
91
 
86
- def get_gpu_count() -> int:
92
+ def get_cuda_count() -> int:
87
93
  try:
88
94
  if "CUDA_VISIBLE_DEVICES" in os.environ:
89
95
  # TODO we dont want to just count, we want to actually use literally these ids
@@ -101,12 +107,22 @@ def get_gpu_count() -> int:
101
107
  if "GPU" in l
102
108
  )
103
109
  except:
104
- # TODO support macos
105
110
  logger.exception("unable to determine available gpus")
106
111
  gpus = 0
107
112
  return gpus
108
113
 
109
114
 
115
+ def get_gpu_count(host_idx: int, worker_count: int) -> int:
116
+ if sys.platform == "darwin":
117
+ # we should inspect some gpu capabilities details to prevent overcommit
118
+ return worker_count
119
+ else:
120
+ if host_idx == 0:
121
+ return get_cuda_count()
122
+ else:
123
+ return 0
124
+
125
+
110
126
  def launch_executor(
111
127
  job_instance: JobInstance,
112
128
  controller_address: BackboneAddress,
@@ -116,6 +132,7 @@ def launch_executor(
116
132
  shm_vol_gb: int | None,
117
133
  gpu_count: int,
118
134
  log_base: str | None,
135
+ url_base: str,
119
136
  ):
120
137
  if log_base is not None:
121
138
  log_base = f"{log_base}.host{i}"
@@ -123,19 +140,25 @@ def launch_executor(
123
140
  logging.config.dictConfig(logging_config_filehandler(log_path))
124
141
  else:
125
142
  logging.config.dictConfig(logging_config)
126
- logger.info(f"will set {gpu_count} gpus on host {i}")
127
- os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
128
- executor = Executor(
129
- job_instance,
130
- controller_address,
131
- workers_per_host,
132
- f"h{i}",
133
- portBase,
134
- shm_vol_gb,
135
- log_base,
136
- )
137
- executor.register()
138
- executor.recv_loop()
143
+ try:
144
+ logger.info(f"will set {gpu_count} gpus on host {i}")
145
+ os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
146
+ executor = Executor(
147
+ job_instance,
148
+ controller_address,
149
+ workers_per_host,
150
+ f"h{i}",
151
+ portBase,
152
+ shm_vol_gb,
153
+ log_base,
154
+ url_base,
155
+ )
156
+ executor.register()
157
+ executor.recv_loop()
158
+ except Exception:
159
+ # NOTE we log this to get the stacktrace into the logfile
160
+ logger.exception("executor failure")
161
+ raise
139
162
 
140
163
 
141
164
  def run_locally(
@@ -151,24 +174,47 @@ def run_locally(
151
174
  logging.config.dictConfig(logging_config_filehandler(log_path))
152
175
  else:
153
176
  logging.config.dictConfig(logging_config)
177
+ logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
154
178
  launch = perf_counter_ns()
155
- preschedule = precompute(job)
156
179
  c = f"tcp://localhost:{portBase}"
157
180
  m = f"tcp://localhost:{portBase+1}"
158
181
  ps = []
159
- for i, executor in enumerate(range(hosts)):
160
- if i == 0:
161
- gpu_count = get_gpu_count()
162
- else:
163
- gpu_count = 0
164
- # NOTE forkserver/spawn seem to forget venv, we need fork
165
- p = multiprocessing.get_context("fork").Process(
166
- target=launch_executor,
167
- args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count, log_base),
168
- )
169
- p.start()
170
- ps.append(p)
171
182
  try:
183
+ # executors forking
184
+ for i, executor in enumerate(range(hosts)):
185
+ gpu_count = get_gpu_count(i, workers)
186
+ # NOTE forkserver/spawn seem to forget venv, we need fork
187
+ logger.debug(f"forking into executor on host {i}")
188
+ p = multiprocessing.get_context("fork").Process(
189
+ target=launch_executor,
190
+ args=(
191
+ job,
192
+ c,
193
+ workers,
194
+ portBase + 1 + i * 10,
195
+ i,
196
+ None,
197
+ gpu_count,
198
+ log_base,
199
+ "tcp://localhost",
200
+ ),
201
+ )
202
+ p.start()
203
+ ps.append(p)
204
+
205
+ # compute preschedule
206
+ preschedule = precompute(job)
207
+
208
+ # check processes started healthy
209
+ for i, p in enumerate(ps):
210
+ if not p.is_alive():
211
+ # TODO ideally we would somehow connect this with the Register message
212
+ # consumption in the Controller -- but there we don't assume that
213
+ # executors are on the same physical host
214
+ raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
215
+
216
+ # start bridge itself
217
+ logger.debug("starting bridge")
172
218
  b = Bridge(c, hosts)
173
219
  start = perf_counter_ns()
174
220
  run(job, b, preschedule, report_address=report_address)
@@ -176,7 +222,9 @@ def run_locally(
176
222
  print(
177
223
  f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
178
224
  )
179
- except:
225
+ except Exception:
226
+ # NOTE we log this to get the stacktrace into the logfile
227
+ logger.exception("controller failure, proceed with executor shutdown")
180
228
  for p in ps:
181
229
  if p.is_alive():
182
230
  callback(m, ExecutorShutdown())
@@ -238,7 +286,7 @@ def main_dist(
238
286
  f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
239
287
  )
240
288
  else:
241
- gpu_count = get_gpu_count()
289
+ gpu_count = get_gpu_count(0, workers_per_host)
242
290
  launch_executor(
243
291
  jobInstance,
244
292
  controller_url,
@@ -247,6 +295,7 @@ def main_dist(
247
295
  idx,
248
296
  shm_vol_gb,
249
297
  gpu_count,
298
+ f"tcp://{getfqdn()}",
250
299
  )
251
300
 
252
301
 
cascade/benchmarks/dist.py ADDED
@@ -0,0 +1,123 @@
1
+ """Demonstrates gang scheduling capabilities, ie, multiple nodes capable of mutual communication.
2
+
3
+ The job is a source -> (dist group) -> sink, where:
4
+ source just returns an int,
5
+ dist group is L nodes to be scheduled as a single gang
6
+ rank=0 node broadcasts a buffer containing the node's input
7
+ each node returns its input multiplied by broadcasted buffer
8
+ sink returns the sum of all inputs
9
+
10
+ There are multiple implementations of that:
11
+ torch
12
+ jax (actually does a mesh-shard global sum instead of broadcast -- the point is to showcase dist init)
13
+ """
14
+
15
+ import os
16
+
17
+ from cascade.low.builders import JobBuilder, TaskBuilder
18
+ from cascade.low.core import JobInstance, SchedulingConstraint
19
+
20
+
21
+ def source_func() -> int:
22
+ return 42
23
+
24
+
25
+ def dist_func_torch(a: int) -> int:
26
+ import datetime as dt
27
+
28
+ import numpy as np
29
+ import torch.distributed as dist
30
+
31
+ world_size = int(os.environ["CASCADE_GANG_WORLD_SIZE"])
32
+ rank = int(os.environ["CASCADE_GANG_RANK"])
33
+ coordinator = os.environ["CASCADE_GANG_COORDINATOR"]
34
+ print(f"starting with envvars: {rank=}/{world_size=}, {coordinator=}")
35
+ dist.init_process_group(
36
+ backend="gloo",
37
+ init_method=coordinator,
38
+ timeout=dt.timedelta(minutes=1),
39
+ world_size=world_size,
40
+ rank=rank,
41
+ )
42
+ group_ranks = np.arange(world_size, dtype=int)
43
+ group = dist.new_group(group_ranks)
44
+
45
+ if rank == 0:
46
+ buf = [a]
47
+ dist.broadcast_object_list(buf, src=0, group=group)
48
+ print("broadcast ok")
49
+ else:
50
+ buf = np.array([0], dtype=np.uint64)
51
+ dist.broadcast_object_list(buf, src=0, group=group)
52
+ print(f"broadcast recevied {buf}")
53
+
54
+ return a * buf[0]
55
+
56
+
57
+ def dist_func_jax(a: int) -> int:
58
+ world_size = int(os.environ["CASCADE_GANG_WORLD_SIZE"])
59
+ rank = int(os.environ["CASCADE_GANG_RANK"])
60
+ coordinator = os.environ["CASCADE_GANG_COORDINATOR"]
61
+ os.environ["JAX_NUM_CPU_DEVICES"] = "1"
62
+ os.environ["JAX_PLATFORM_NAME"] = "cpu"
63
+ os.environ["JAX_PLATFORMS"] = "cpu"
64
+ import jax
65
+ import jax.numpy as jp
66
+
67
+ jax.config.update("jax_platforms", "cpu")
68
+ jax.config.update("jax_platform_name", "cpu")
69
+ # NOTE neither of the above seems to actually help with an init error message :(
70
+ print(f"starting with envvars: {rank=}/{world_size=}, {coordinator=}")
71
+ if coordinator.startswith("tcp://"):
72
+ coordinator = coordinator[len("tcp://") :]
73
+ jax.distributed.initialize(coordinator, num_processes=world_size, process_id=rank)
74
+ assert jax.device_count() == world_size
75
+
76
+ mesh = jax.make_mesh((world_size,), ("i",))
77
+ global_data = jp.arange(world_size)
78
+ sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("i"))
79
+ global_array = jax.device_put(global_data, sharding)
80
+ result = jp.sum(global_array)
81
+ print(f"worker {rank}# got result {result=}")
82
+ return a + result
83
+
84
+
85
+ def build_dist_func(impl: str):
86
+ if impl == "torch":
87
+ return dist_func_torch
88
+ elif impl == "jax":
89
+ return dist_func_jax
90
+ else:
91
+ raise NotImplementedError(impl)
92
+
93
+
94
+ def sink_func(**kwargs) -> int:
95
+ c = 0
96
+ for _, v in kwargs.items():
97
+ c += v
98
+ print(f"sink accumulated {c}")
99
+ return c
100
+
101
+
102
+ def get_job() -> JobInstance:
103
+ source_node = TaskBuilder.from_callable(source_func)
104
+ sink_node = TaskBuilder.from_callable(sink_func)
105
+ job = JobBuilder().with_node("source", source_node).with_node("sink", sink_node)
106
+ L = int(os.environ["DIST_L"])
107
+ IMPL = os.environ["DIST_IMPL"]
108
+ node = TaskBuilder.from_callable(build_dist_func(IMPL))
109
+
110
+ for i in range(L):
111
+ job = (
112
+ job.with_node(f"proc{i}", node)
113
+ .with_edge("source", f"proc{i}", "a")
114
+ .with_edge(f"proc{i}", "sink", f"v{i}")
115
+ )
116
+ job.nodes["sink"].definition.input_schema[
117
+ f"v{i}"
118
+ ] = "int" # TODO put some allow_kw into TaskDefinition instead to allow this
119
+
120
+ job = job.build().get_or_raise()
121
+ job.ext_outputs = list(job.outputs_of("sink"))
122
+ job.constraints = [SchedulingConstraint(gang=[f"proc{i}" for i in range(L)])]
123
+ return job
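For reference, a minimal sketch of driving the new benchmark directly, outside the fire-based CLI in cascade/benchmarks/__main__.py, assuming cascade.benchmarks.dist is importable as added above. The gang size and implementation are chosen via the DIST_L and DIST_IMPL environment variables that get_job() reads; the task names in the comments follow from DIST_L=2.

```python
# Illustrative sketch only, not part of the package.
import os

os.environ["DIST_L"] = "2"         # number of proc* nodes in the gang
os.environ["DIST_IMPL"] = "torch"  # or "jax"

from cascade.benchmarks.dist import get_job

job = get_job()
print(sorted(job.tasks))        # ['proc0', 'proc1', 'sink', 'source']
print(job.constraints[0].gang)  # ['proc0', 'proc1'] -- scheduled as a single gang
```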
cascade/controller/act.py CHANGED
@@ -51,6 +51,7 @@ def act(bridge: Bridge, assignment: Assignment) -> None:
51
51
  worker=assignment.worker,
52
52
  tasks=assignment.tasks,
53
53
  publish=assignment.outputs,
54
+ extra_env=assignment.extra_env,
54
55
  )
55
56
 
56
57
  for task in assignment.tasks:
cascade/controller/impl.py CHANGED
@@ -43,6 +43,11 @@ def run(
43
43
  reporter = Reporter(report_address)
44
44
 
45
45
  try:
46
+ total_gpus = sum(worker.gpu for worker in env.workers.values())
47
+ needs_gpus = any(task.definition.needs_gpu for task in job.tasks.values())
48
+ if needs_gpus and total_gpus == 0:
49
+ raise ValueError("environment contains no gpu yet job demands one")
50
+
46
51
  while (
47
52
  state.has_awaitable()
48
53
  or context.has_awaitable()
cascade/controller/notify.py CHANGED
@@ -22,6 +22,7 @@ from cascade.low.core import DatasetId, HostId, WorkerId
22
22
  from cascade.low.execution_context import DatasetStatus, JobExecutionContext
23
23
  from cascade.low.func import assert_never
24
24
  from cascade.low.tracing import TaskLifecycle, TransmitLifecycle, mark
25
+ from cascade.scheduler.api import gang_check_ready
25
26
  from cascade.scheduler.assign import set_worker2task_overhead
26
27
  from cascade.scheduler.core import Schedule
27
28
 
@@ -67,6 +68,7 @@ def consider_computable(
67
68
  # NOTE this is a task newly made computable, so we need to calc
68
69
  # `overhead` for all hosts/workers assigned to the component
69
70
  set_worker2task_overhead(schedule, context, worker, child_task)
71
+ gang_check_ready(child_task, component.gang_preparation)
70
72
 
71
73
 
72
74
  # TODO refac less explicit mutation of context, use class methods
cascade/executor/bridge.py CHANGED
@@ -46,7 +46,7 @@ class Bridge:
46
46
  self.transmit_idx_counter = 0
47
47
  self.sender = ReliableSender(self.mlistener.address, resend_grace_ms)
48
48
  registered = 0
49
- self.environment = Environment(workers={})
49
+ self.environment = Environment(workers={}, host_url_base={})
50
50
  logger.debug("about to start receiving registrations")
51
51
  registration_grace = time.time_ns() + 3 * 60 * 1_000_000_000
52
52
  while registered < expected_executors:
@@ -69,6 +69,7 @@ class Bridge:
69
69
  self.environment.workers[worker.worker_id] = Worker(
70
70
  cpu=worker.cpu, gpu=worker.gpu, memory_mb=worker.memory_mb
71
71
  )
72
+ self.environment.host_url_base[message.host] = message.url_base
72
73
  registered += 1
73
74
  self.heartbeat_checker[message.host] = GraceWatcher(
74
75
  2 * executor_heartbeat_grace_ms
cascade/executor/config.py CHANGED
@@ -21,12 +21,14 @@ logging_config = {
21
21
  "forecastbox.worker": {"level": "DEBUG"},
22
22
  "forecastbox.executor": {"level": "DEBUG"},
23
23
  "cascade": {"level": "INFO"},
24
+ "cascade.benchmarks": {"level": "DEBUG"},
24
25
  "cascade.low": {"level": "DEBUG"},
25
26
  "cascade.shm": {"level": "DEBUG"},
26
27
  "cascade.controller": {"level": "DEBUG"},
27
28
  "cascade.executor": {"level": "DEBUG"},
28
29
  "cascade.scheduler": {"level": "DEBUG"},
29
30
  "cascade.gateway": {"level": "DEBUG"},
31
+ "earthkit.workflows": {"level": "DEBUG"},
30
32
  "httpcore": {"level": "ERROR"},
31
33
  "httpx": {"level": "ERROR"},
32
34
  "": {"level": "WARNING", "handlers": ["default"]},
cascade/executor/executor.py CHANGED
@@ -69,8 +69,9 @@ class Executor:
69
69
  workers: int,
70
70
  host: HostId,
71
71
  portBase: int,
72
- shm_vol_gb: int | None = None,
73
- log_base: str | None = None,
72
+ shm_vol_gb: int | None,
73
+ log_base: str | None,
74
+ url_base: str,
74
75
  ) -> None:
75
76
  self.job_instance = job_instance
76
77
  self.param_source = param_source(job_instance.edges)
@@ -85,6 +86,7 @@ class Executor:
85
86
  self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
86
87
 
87
88
  self.terminating = False
89
+ logger.debug("register terminate function")
88
90
  atexit.register(self.terminate)
89
91
  # NOTE following inits are with potential side effects
90
92
  self.mlistener = Listener(address_of(portBase))
@@ -98,6 +100,7 @@ class Executor:
98
100
  shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
99
101
  else:
100
102
  shm_logging = logging_config
103
+ logger.debug("about to fork into shm process")
101
104
  self.shm_process = ctx.Process(
102
105
  target=shm_server,
103
106
  args=(
@@ -113,6 +116,7 @@ class Executor:
113
116
  dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
114
117
  else:
115
118
  dsr_logging = logging_config
119
+ logger.debug("about to fork into data server")
116
120
  self.data_server = ctx.Process(
117
121
  target=start_data_server,
118
122
  args=(
@@ -138,6 +142,7 @@ class Executor:
138
142
  )
139
143
  for idx, worker_id in enumerate(self.workers.keys())
140
144
  ],
145
+ url_base=url_base,
141
146
  )
142
147
  logger.debug("constructed executor")
143
148
 
cascade/executor/msg.py CHANGED
@@ -71,6 +71,7 @@ class TaskSequence:
71
71
  worker: WorkerId # worker for running those tasks
72
72
  tasks: list[TaskId] # to be executed in the given order
73
73
  publish: set[DatasetId] # set of outputs to be published
74
+ extra_env: list[tuple[str, str]] # extra env var to set
74
75
 
75
76
 
76
77
  @dataclass(frozen=True)
@@ -147,6 +148,7 @@ class ExecutorRegistration:
147
148
  host: HostId
148
149
  maddress: BackboneAddress
149
150
  daddress: BackboneAddress
151
+ url_base: str # used for eg dist comms init
150
152
  workers: list[Worker]
151
153
 
152
154
 
cascade/executor/runner/entrypoint.py CHANGED
@@ -11,6 +11,7 @@
11
11
  import logging
12
12
  import logging.config
13
13
  import os
14
+ import sys
14
15
  from dataclasses import dataclass
15
16
 
16
17
  import zmq
@@ -98,12 +99,17 @@ def execute_sequence(
98
99
  ) -> None:
99
100
  taskId: TaskId | None = None
100
101
  try:
102
+ for key, value in taskSequence.extra_env.items():
103
+ os.environ[key] = value
101
104
  executionContext = runnerContext.project(taskSequence)
102
105
  for taskId in taskSequence.tasks:
103
106
  pckg.extend(executionContext.tasks[taskId].definition.environment)
104
107
  run(taskId, executionContext, memory)
105
108
  if Config.posttask_flush:
106
109
  memory.flush()
110
+ for key in taskSequence.extra_env.keys():
111
+ # NOTE we should in principle restore the previous value, but we dont expect collisions
112
+ del os.environ[key]
107
113
  except Exception as e:
108
114
  logger.exception("runner failure, about to report")
109
115
  callback(
@@ -129,11 +135,15 @@ def entrypoint(runnerContext: RunnerContext):
129
135
  label("worker", repr(runnerContext.workerId))
130
136
  worker_num = runnerContext.workerId.worker_num()
131
137
  gpus = int(os.environ.get("CASCADE_GPU_COUNT", "0"))
132
- os.environ["CUDA_VISIBLE_DEVICES"] = (
133
- ",".join(str(worker_num)) if worker_num < gpus else ""
134
- )
135
- # NOTE check any(task.definition.needs_gpu) anywhere?
136
- # TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho
138
+ if sys.platform != "darwin":
139
+ os.environ["CUDA_VISIBLE_DEVICES"] = (
140
+ str(worker_num) if worker_num < gpus else ""
141
+ )
142
+ # NOTE check any(task.definition.needs_gpu) anywhere?
143
+ # TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho
144
+ else:
145
+ if gpus != 1:
146
+ logger.warning("unexpected absence of gpu on darwin")
137
147
 
138
148
  for serdeTypeEnc, (serdeSer, serdeDes) in runnerContext.job.serdes.items():
139
149
  serde.SerdeRegistry.register(type_dec(serdeTypeEnc), serdeSer, serdeDes)
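One behavioural detail worth noting in the entrypoint hunk above: 0.4.0 built the value as `",".join(str(worker_num))`, which for a two-digit worker number such as 12 yields "1,2", while 0.4.2 uses `str(worker_num)` directly, so each worker below the GPU count is pinned to exactly one device. A minimal sketch of the rule (the helper name is illustrative, not part of the package):

```python
# Sketch of the CUDA pinning rule from runner/entrypoint.py; helper name is illustrative.
def cuda_visible_devices(worker_num: int, gpus: int) -> str:
    # workers 0..gpus-1 each get their own device id, the rest see no GPU
    return str(worker_num) if worker_num < gpus else ""

assert [cuda_visible_devices(w, gpus=2) for w in range(4)] == ["0", "1", "", ""]
```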
cascade/low/core.py CHANGED
@@ -106,15 +106,26 @@ def type_enc(t: Type) -> str:
106
106
  return b64encode(cloudpickle.dumps(t)).decode("ascii")
107
107
 
108
108
 
109
+ class SchedulingConstraint(BaseModel):
110
+ gang: list[TaskId] = Field(
111
+ description="this set of TaskIds must be started at the same time, with ranks and address list as envvar",
112
+ )
113
+
114
+
109
115
  class JobInstance(BaseModel):
110
116
  tasks: dict[TaskId, TaskInstance]
111
117
  edges: list[Task2TaskEdge]
112
118
  serdes: dict[str, tuple[str, str]] = Field(
113
- {},
119
+ default_factory=lambda: {},
114
120
  description="for each Type with custom serde, add entry here. The string is fully qualified name of the ser/des functions",
115
121
  )
116
122
  ext_outputs: list[DatasetId] = Field(
117
- [], description="ids to externally materialize"
123
+ default_factory=lambda: [],
124
+ description="ids to externally materialize",
125
+ )
126
+ constraints: list[SchedulingConstraint] = Field(
127
+ default_factory=lambda: [],
128
+ description="constraints for the scheduler such as gangs",
118
129
  )
119
130
 
120
131
  def outputs_of(self, task_id: TaskId) -> set[DatasetId]:
@@ -157,6 +168,7 @@ class Worker(BaseModel):
157
168
 
158
169
  class Environment(BaseModel):
159
170
  workers: dict[WorkerId, Worker]
171
+ host_url_base: dict[HostId, str]
160
172
 
161
173
 
162
174
  class TaskExecutionRecord(BaseModel):
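The new SchedulingConstraint is a small pydantic model attached to JobInstance.constraints; a sketch of declaring a gang (task ids are illustrative):

```python
from cascade.low.core import SchedulingConstraint

# the three tasks must be launched together; each receives its rank, the world size
# and a shared coordinator address via CASCADE_GANG_* environment variables
gang = SchedulingConstraint(gang=["proc0", "proc1", "proc2"])
assert gang.gang == ["proc0", "proc1", "proc2"]
# a JobInstance built elsewhere would carry it as: job.constraints = [gang]
```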
cascade/scheduler/api.py CHANGED
@@ -22,6 +22,7 @@ from cascade.scheduler.core import (
22
22
  Assignment,
23
23
  ComponentId,
24
24
  ComponentSchedule,
25
+ GangPreparation,
25
26
  Preschedule,
26
27
  Schedule,
27
28
  )
@@ -29,12 +30,60 @@ from cascade.scheduler.core import (
29
30
  logger = logging.getLogger(__name__)
30
31
 
31
32
 
33
+ def gang_check_ready(task: TaskId, gang_prep: GangPreparation):
34
+ """When a task becomes computable, mutate the gang_prep to possibly
35
+ transition some gangs to `ready`
36
+ """
37
+ for gang in gang_prep.lookup[task]:
38
+ if gang not in gang_prep.countdown:
39
+ raise ValueError(
40
+ f"after {task=} marked computable, {gang=} not found -- double compuptable mark?"
41
+ )
42
+ remaining = gang_prep.countdown[gang]
43
+ if task not in remaining:
44
+ raise ValueError(
45
+ f"after {task=} marked computable, {gang=} does not have it in {remaining=}. Invalid gang?"
46
+ )
47
+ remaining.remove(task)
48
+ if not remaining:
49
+ logger.debug(f"gang just became ready {gang=}")
50
+ gang_prep.ready.append(gang)
51
+ gang_prep.countdown.pop(gang)
52
+
53
+
32
54
  def init_schedule(preschedule: Preschedule, context: JobExecutionContext) -> Schedule:
33
55
  components: list[ComponentSchedule] = []
34
56
  ts2component: dict[TaskId, ComponentId] = {}
35
57
 
58
+ gangs = [
59
+ frozenset(constraint.gang) for constraint in context.job_instance.constraints
60
+ ]
61
+
36
62
  computable = 0
37
63
  for componentId, precomponent in enumerate(preschedule.components):
64
+ # gang preparation
65
+ tasks = set(precomponent.nodes)
66
+ lookup = defaultdict(list)
67
+ countdown = {}
68
+ i = 0
69
+ while i < len(gangs):
70
+ if not gangs[i].issubset(tasks):
71
+ i += 1
72
+ continue
73
+ gang = gangs.pop(i)
74
+ countdown[gang] = set(gang)
75
+ for e in gang:
76
+ lookup[e].append(gang)
77
+
78
+ gang_preparation = GangPreparation(
79
+ ready=[],
80
+ lookup=lookup,
81
+ countdown=countdown,
82
+ )
83
+ for source in precomponent.sources:
84
+ gang_check_ready(source, gang_preparation)
85
+
86
+ # component itself
38
87
  component = ComponentSchedule(
39
88
  core=precomponent,
40
89
  weight=precomponent.weight(),
@@ -45,12 +94,18 @@ def init_schedule(preschedule: Preschedule, context: JobExecutionContext) -> Sch
45
94
  task: {inp for inp in context.edge_i[task]}
46
95
  for task in precomponent.nodes
47
96
  },
97
+ gang_preparation=gang_preparation,
48
98
  )
49
99
  components.append(component)
50
100
  computable += len(precomponent.sources)
51
101
  for task in precomponent.nodes:
52
102
  ts2component[task] = componentId
53
103
 
104
+ if gangs:
105
+ for gang in gangs:
106
+ logger.error(f"a gang not part of a component: {gang}")
107
+ raise ValueError(f"a total of {len(gangs)} were not a subcomponent")
108
+
54
109
  return Schedule(
55
110
  components=components,
56
111
  ts2component=ts2component,
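A sketch of the bookkeeping gang_check_ready performs, assuming both names are importable as shown in this diff (gang_check_ready from cascade.scheduler.api, GangPreparation from cascade.scheduler.core); the task ids are illustrative:

```python
from collections import defaultdict

from cascade.scheduler.api import gang_check_ready
from cascade.scheduler.core import GangPreparation

gang = frozenset({"proc0", "proc1"})
prep = GangPreparation(
    ready=[],
    countdown={gang: set(gang)},  # members still waiting to become computable
    lookup=defaultdict(list, {"proc0": [gang], "proc1": [gang]}),
)

gang_check_ready("proc0", prep)
assert prep.ready == []            # proc1 is still outstanding
gang_check_ready("proc1", prep)
assert prep.ready == [gang]        # whole gang is now eligible for assignment
assert gang not in prep.countdown
```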
cascade/scheduler/assign.py CHANGED
@@ -18,7 +18,13 @@ from typing import Iterable, Iterator
18
18
  from cascade.low.core import DatasetId, HostId, TaskId, WorkerId
19
19
  from cascade.low.execution_context import DatasetStatus, JobExecutionContext
20
20
  from cascade.low.tracing import Microtrace, trace
21
- from cascade.scheduler.core import Assignment, ComponentCore, ComponentId, Schedule
21
+ from cascade.scheduler.core import (
22
+ Assignment,
23
+ ComponentCore,
24
+ ComponentId,
25
+ ComponentSchedule,
26
+ Schedule,
27
+ )
22
28
 
23
29
  logger = logging.getLogger(__name__)
24
30
 
@@ -92,9 +98,148 @@ def build_assignment(
92
98
  tasks=assigned,
93
99
  prep=prep,
94
100
  outputs=trimmed_outputs,
101
+ extra_env={},
95
102
  )
96
103
 
97
104
 
105
+ def _postproc_assignment(
106
+ assignment: Assignment,
107
+ component: ComponentSchedule,
108
+ schedule: Schedule,
109
+ context: JobExecutionContext,
110
+ ) -> None:
111
+ for assigned in assignment.tasks:
112
+ if assigned in component.computable:
113
+ component.computable.pop(assigned)
114
+ component.worker2task_values.remove(assigned)
115
+ schedule.computable -= 1
116
+ else:
117
+ # shortcut for fused-in tasks
118
+ component.is_computable_tracker[assigned] = set()
119
+ context.idle_workers.remove(assignment.worker)
120
+ component.weight -= len(assignment.tasks)
121
+
122
+
123
+ # TODO this is not particularly systematic! We cant bind dynamically at the host as we send this
124
+ # in advance, so we need to hardcode. Ideally we centrallize all port opening into a single module,
125
+ # in particular unify this with the portBase from benchmarks/__main__ and then derived ports from
126
+ # executor/executor.py etc. As is, we have a single global variable that we increment, to ensure
127
+ # no port collision happens gang-wise -- we dont really expect many gangs per a workflow
128
+ gang_port = 12355
129
+
130
+
131
+ def _try_assign_gang(
132
+ schedule: Schedule,
133
+ gang: list[frozenset[TaskId]],
134
+ workers: list[WorkerId],
135
+ component_id: ComponentId,
136
+ context: JobExecutionContext,
137
+ fail_acc: list[frozenset[TaskId]],
138
+ ) -> Iterator[Assignment]:
139
+ """We greedily assign by descending worker-task distance"""
140
+ global gang_port
141
+ if len(gang) > len(workers):
142
+ logger.debug(f"not enough workers ({len(workers)}) for {gang=}")
143
+ fail_acc.append(gang)
144
+ return
145
+ start = perf_counter_ns()
146
+ component = schedule.components[component_id]
147
+ gpu_tasks: set[TaskId] = set()
148
+ cpu_tasks: set[TaskId] = set()
149
+ gpu_workers: set[WorkerId] = set()
150
+ cpu_workers: set[WorkerId] = set()
151
+ for task in gang:
152
+ if context.job_instance.tasks[task].definition.needs_gpu:
153
+ gpu_tasks.add(task)
154
+ else:
155
+ cpu_tasks.add(task)
156
+ for worker in workers:
157
+ if context.environment.workers[worker].gpu > 0:
158
+ gpu_workers.add(worker)
159
+ else:
160
+ cpu_workers.add(worker)
161
+ if len(gpu_tasks) > len(gpu_workers):
162
+ logger.debug(f"not enough gpu workers ({len(workers)}) for {gang=}")
163
+ fail_acc.append(gang)
164
+ end = perf_counter_ns()
165
+ trace(Microtrace.ctrl_assign, end - start)
166
+ return
167
+
168
+ world_size = len(gang)
169
+ rank = 0
170
+ coordinator = None
171
+
172
+ # similarly to _assignment_heuristic, a greedy algorithm
173
+ candidates = [
174
+ (schedule.worker2task_overhead[w][t], component.core.value[t], w, t)
175
+ for w in gpu_workers
176
+ for t in gpu_tasks
177
+ ]
178
+ candidates.sort(key=lambda e: (e[0], e[1]))
179
+ for _, _, worker, task in candidates:
180
+ if task in gpu_tasks and worker in gpu_workers:
181
+ if task not in component.computable:
182
+ # it may be that some fusing for previous task already assigned this
183
+ continue
184
+ end = perf_counter_ns()
185
+ trace(Microtrace.ctrl_assign, end - start)
186
+ assignment = build_assignment(worker, task, context, component.core)
187
+ if not coordinator:
188
+ coordinator = (
189
+ f"{context.environment.host_url_base[worker.host]}:{gang_port}"
190
+ )
191
+ assignment.extra_env["CASCADE_GANG_WORLD_SIZE"] = str(world_size)
192
+ assignment.extra_env["CASCADE_GANG_RANK"] = str(rank)
193
+ assignment.extra_env["CASCADE_GANG_COORDINATOR"] = coordinator
194
+ rank += 1
195
+ yield assignment
196
+ start = perf_counter_ns()
197
+ _postproc_assignment(assignment, component, schedule, context)
198
+ gpu_tasks.remove(task)
199
+ gpu_workers.remove(worker)
200
+ if gpu_tasks:
201
+ raise ValueError(
202
+ f"expected to assign all gang gpu tasks, yet {gpu_tasks} remain"
203
+ )
204
+
205
+ all_workers = cpu_workers.union(gpu_workers)
206
+ candidates = [
207
+ (schedule.worker2task_overhead[w][t], component.core.value[t], w, t)
208
+ for w in all_workers
209
+ for t in cpu_tasks
210
+ ]
211
+ candidates.sort(key=lambda e: (e[0], e[1]))
212
+ for _, _, worker, task in candidates:
213
+ if task in cpu_tasks and worker in all_workers:
214
+ if task not in component.computable:
215
+ # it may be that some fusing for previous task already assigned this
216
+ continue
217
+ end = perf_counter_ns()
218
+ trace(Microtrace.ctrl_assign, end - start)
219
+ assignment = build_assignment(worker, task, context, component.core)
220
+ if not coordinator:
221
+ coordinator = (
222
+ f"{context.environment.host_url_base[worker.host]}:{gang_port}"
223
+ )
224
+ assignment.extra_env["CASCADE_GANG_WORLD_SIZE"] = str(world_size)
225
+ assignment.extra_env["CASCADE_GANG_RANK"] = str(rank)
226
+ assignment.extra_env["CASCADE_GANG_COORDINATOR"] = coordinator
227
+ rank += 1
228
+ yield assignment
229
+ start = perf_counter_ns()
230
+ _postproc_assignment(assignment, component, schedule, context)
231
+ cpu_tasks.remove(task)
232
+ all_workers.remove(worker)
233
+ if cpu_tasks:
234
+ raise ValueError(
235
+ f"expected to assign all gang cpu tasks, yet {cpu_tasks} remain"
236
+ )
237
+
238
+ end = perf_counter_ns()
239
+ trace(Microtrace.ctrl_assign, end - start)
240
+ gang_port += 1
241
+
242
+
98
243
  def _assignment_heuristic(
99
244
  schedule: Schedule,
100
245
  tasks: list[TaskId],
@@ -106,18 +251,6 @@ def _assignment_heuristic(
106
251
  start = perf_counter_ns()
107
252
  component = schedule.components[component_id]
108
253
 
109
- def postproc_assignment(assignment: Assignment) -> None:
110
- for assigned in assignment.tasks:
111
- if assigned in component.computable:
112
- component.computable.pop(assigned)
113
- component.worker2task_values.remove(assigned)
114
- schedule.computable -= 1
115
- else:
116
- # shortcut for fused-in tasks
117
- component.is_computable_tracker[assigned] = set()
118
- context.idle_workers.remove(worker)
119
- component.weight -= len(assignment.tasks)
120
-
121
254
  # first, attempt optimum-distance assignment
122
255
  unassigned: list[TaskId] = []
123
256
  for task in tasks:
@@ -133,7 +266,7 @@ def _assignment_heuristic(
133
266
  assignment = build_assignment(worker, task, context, component.core)
134
267
  yield assignment
135
268
  start = perf_counter_ns()
136
- postproc_assignment(assignment)
269
+ _postproc_assignment(assignment, component, schedule, context)
137
270
  workers.pop(idx)
138
271
  was_assigned = True
139
272
  break
@@ -159,7 +292,7 @@ def _assignment_heuristic(
159
292
  assignment = build_assignment(worker, task, context, component.core)
160
293
  yield assignment
161
294
  start = perf_counter_ns()
162
- postproc_assignment(assignment)
295
+ _postproc_assignment(assignment, component, schedule, context)
163
296
  remaining_t.remove(task)
164
297
  remaining_w.remove(worker)
165
298
 
@@ -173,29 +306,63 @@ def assign_within_component(
173
306
  component_id: ComponentId,
174
307
  context: JobExecutionContext,
175
308
  ) -> Iterator[Assignment]:
176
- """We first handle tasks requiring a gpu, then tasks whose child requires a gpu, last cpu only tasks, using the same algorithm for either case"""
177
- # TODO employ a more systematic solution and handle all multicriterially at once -- ideally together with adding support for multi-gpu-groups
178
- # NOTE this is getting even more important as we started considering gpu fused distance
179
- # NOTE the concept of "strategic wait" is completely missing here (eg dont assign a gpu worker to a cpu task because there will come a gpu task in a few secs)
309
+ """We hardcode order of handling task groups:
310
+ 1/ ready gangs,
311
+ 2/ tasks requiring a gpu,
312
+ 3/ tasks whose fusable child requires a gpu,
313
+ 4/ all other tasks,
314
+ using the same algorithm for cases 2-4 and a naive for case 1
315
+ """
316
+ # TODO rework into a more systematic multicriterial opt solution that is able to consider all groups
317
+ # at once, using a generic value/cost framework and matching algorithm. It should additionally be able
318
+ # to issue a "strategic wait" command -- eg if we could assign a task to an idle worker with high cost,
319
+ # or wait until a better-equipped busy worker finished, etc.
320
+ component = schedule.components[component_id]
321
+
322
+ # gangs
323
+ fail_acc: list[frozenset[TaskId]] = []
324
+ for gang in component.gang_preparation.ready:
325
+ logger.debug(f"trying to assign a {gang=}")
326
+ yield from _try_assign_gang(
327
+ schedule, gang, list(context.idle_workers), component_id, context, fail_acc
328
+ )
329
+ component.gang_preparation.ready = fail_acc
330
+
331
+ # the other cases: build them first
180
332
  cpu_t: list[TaskId] = []
181
333
  gpu_t: list[TaskId] = []
182
334
  opu_t: list[TaskId] = []
183
- component = schedule.components[component_id]
184
335
  for task in component.computable.keys():
185
- if context.job_instance.tasks[task].definition.needs_gpu:
336
+ if component.gang_preparation.lookup[task]:
337
+ # no gang participation in single-task scheduling
338
+ continue
339
+ elif context.job_instance.tasks[task].definition.needs_gpu:
186
340
  gpu_t.append(task)
187
341
  elif component.core.gpu_fused_distance[task] is not None:
188
342
  opu_t.append(task)
189
343
  else:
190
344
  cpu_t.append(task)
345
+
346
+ # tasks immediately needing a gpu
191
347
  eligible_w = [
192
- worker for worker in workers if context.environment.workers[worker].gpu > 0
348
+ worker
349
+ for worker in workers
350
+ if context.environment.workers[worker].gpu > 0
351
+ and worker in context.idle_workers
193
352
  ]
353
+ logger.debug(
354
+ f"considering {len(gpu_t)}# gpu tasks, {len(opu_t)}# maybe-gpu tasks, {len(cpu_t)}# cpu tasks, with {len(workers)}# workers out of which {len(eligible_w)} have gpu"
355
+ )
194
356
  yield from _assignment_heuristic(schedule, gpu_t, eligible_w, component_id, context)
357
+ # tasks whose fusing opportunity needs a gpu
195
358
  eligible_w = [worker for worker in eligible_w if worker in context.idle_workers]
196
359
  yield from _assignment_heuristic(schedule, opu_t, eligible_w, component_id, context)
360
+ # remaining tasks
197
361
  eligible_w = [worker for worker in workers if worker in context.idle_workers]
198
- yield from _assignment_heuristic(schedule, cpu_t, eligible_w, component_id, context)
362
+ u_opu_t = [task for task in opu_t if task in component.computable]
363
+ yield from _assignment_heuristic(
364
+ schedule, cpu_t + u_opu_t, eligible_w, component_id, context
365
+ )
199
366
 
200
367
 
201
368
  def update_worker2task_distance(
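Tying the scheduler side to the dist benchmark above: _try_assign_gang gives every gang member the same coordinator address (the host_url_base of the first assigned worker plus the module-level gang_port, which starts at 12355) and its own rank. A purely illustrative sketch of the resulting extra_env for a 3-task gang whose first worker sits on a host registered with url_base "tcp://nodeA" (host name and numbers are assumptions, not taken from the package):

```python
# Illustrative only -- mirrors the env vars _try_assign_gang attaches to each Assignment.
world_size = 3
coordinator = "tcp://nodeA:12355"  # first assigned worker's host_url_base + gang_port
for rank in range(world_size):
    extra_env = {
        "CASCADE_GANG_WORLD_SIZE": str(world_size),
        "CASCADE_GANG_RANK": str(rank),
        "CASCADE_GANG_COORDINATOR": coordinator,  # identical for every rank
    }
    print(extra_env)
```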
cascade/scheduler/core.py CHANGED
@@ -44,6 +44,19 @@ Worker2TaskDistance = dict[WorkerId, dict[TaskId, int]]
44
44
  ComponentId = int
45
45
 
46
46
 
47
+ @dataclass
48
+ class GangPreparation:
49
+ ready: list[
50
+ frozenset[TaskId]
51
+ ] # used by scheduler to see if any gangs can be assigned/started
52
+ countdown: dict[
53
+ frozenset[TaskId], set[TaskId]
54
+ ] # used to check after a task completion whether a gang can be moved to ready
55
+ lookup: dict[
56
+ TaskId, list[frozenset[TaskId]]
57
+ ] # used to decrease countdown after a task completion
58
+
59
+
47
60
  @dataclass
48
61
  class ComponentSchedule:
49
62
  core: ComponentCore
@@ -58,6 +71,7 @@ class ComponentSchedule:
58
71
  worker2task_distance: Worker2TaskDistance
59
72
  # eligible values -- a cached value. Used when migrating new workers to the component, inserted whenever a parent of this task gets `preparing`, removed when this task is made computable
60
73
  worker2task_values: set[TaskId]
74
+ gang_preparation: GangPreparation
61
75
 
62
76
 
63
77
  @dataclass
@@ -79,3 +93,4 @@ class Assignment:
79
93
  tasks: list[TaskId]
80
94
  prep: list[tuple[DatasetId, HostId]]
81
95
  outputs: set[DatasetId]
96
+ extra_env: list[tuple[str, str]]
cascade/scheduler/precompute.py CHANGED
@@ -112,6 +112,7 @@
112
112
  edge_i: dict[TaskId, set[TaskId]],
113
113
  edge_o: dict[TaskId, set[TaskId]],
114
114
  needs_gpu: set[TaskId],
115
+ gangs: set[TaskId],
115
116
  ) -> ComponentCore:
116
117
  nodes, sources = plain_component
117
118
  logger.debug(
@@ -170,7 +171,7 @@ def _enrich(
170
171
  while layer:
171
172
  gpu_distance = None
172
173
  head = layer.pop(0)
173
- if head in fused:
174
+ if head in fused or head in gangs:
174
175
  continue
175
176
  chain = []
176
177
  fused.add(head)
@@ -183,7 +184,7 @@ def _enrich(
183
184
  gpu_fused_distance[head] = gpu_distance
184
185
  found = False
185
186
  for edge in edge_i[head]:
186
- if edge not in fused:
187
+ if edge not in fused and edge not in gangs:
187
188
  chain.insert(0, head)
188
189
  head = edge
189
190
  fused.add(head)
@@ -222,11 +223,16 @@ def precompute(job_instance: JobInstance) -> Preschedule:
222
223
  for task_id, task in job_instance.tasks.items()
223
224
  if task.definition.needs_gpu
224
225
  }
226
+ gangs = {
227
+ task_id
228
+ for constraint in job_instance.constraints
229
+ for task_id in constraint.gang
230
+ }
225
231
 
226
232
  with ThreadPoolExecutor(max_workers=4) as tp:
227
233
  # TODO if coptrs is not used, then this doesnt make sense
228
234
  f = lambda plain_component: timer(_enrich, Microtrace.presched_enrich)(
229
- plain_component, edge_i_proj, edge_o_proj, needs_gpu
235
+ plain_component, edge_i_proj, edge_o_proj, needs_gpu, gangs
230
236
  )
231
237
  plain_components = (
232
238
  plain_component
cascade/shm/server.py CHANGED
@@ -115,5 +115,5 @@ def entrypoint(
115
115
  server.start()
116
116
  except Exception as e:
117
117
  # we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
118
- logger.warning(f"shutdown issue: {e}")
118
+ logger.warning(f"shutdown issue: {repr(e)}")
119
119
  server.atexit(0, None)
earthkit/workflows/_version.py CHANGED
@@ -1,2 +1,2 @@
1
1
  # Do not change! Do not track in version control!
2
- __version__ = "0.4.0"
2
+ __version__ = "0.4.2"
earthkit/workflows/backends/__init__.py CHANGED
@@ -7,37 +7,52 @@
7
7
  # nor does it submit to any jurisdiction.
8
8
 
9
9
  import functools
10
- import warnings
11
- from typing import Callable
10
+ import logging
11
+ from typing import Callable, Union
12
12
 
13
13
  import xarray as xr
14
14
 
15
15
  from .arrayapi import ArrayAPIBackend
16
16
  from .xarray import XArrayBackend
17
17
 
18
+ logger = logging.getLogger(__name__)
19
+
20
+
18
21
  BACKENDS = {
19
22
  xr.DataArray: XArrayBackend,
20
23
  xr.Dataset: XArrayBackend,
21
- "default": ArrayAPIBackend,
24
+ object: ArrayAPIBackend,
22
25
  }
23
26
 
24
27
 
25
28
  def register(type, backend):
26
29
  if type in BACKENDS:
27
- warnings.warn(
30
+ logger.warning(
28
31
  f"Overwriting backend for {type}. Existing backend {BACKENDS[type]}."
29
32
  )
30
33
  BACKENDS[type] = backend
31
34
 
32
35
 
36
+ def _get_backend(obj_type: type) -> Union[type, None]:
37
+ return BACKENDS.get(obj_type, None)
38
+
39
+
33
40
  def array_module(*arrays):
34
- # Only deduce type from first element to allow for mixed types
35
- # but this means the first argument needs to specify the correct module
41
+ """Return the backend module for the given arrays."""
42
+ # Checks all bases of the first array type for a registered backend.
43
+ # If no backend is found, it will traverse the hierarchy of types
44
+ # until it finds a registered backend or reaches the base object type.
45
+ if not arrays:
46
+ raise ValueError("No arrays provided to determine backend.")
36
47
  array_type = type(arrays[0])
37
- backend = BACKENDS.get(array_type, None)
38
- if backend is None:
39
- # Fall back on array API
40
- backend = BACKENDS["default"]
48
+ while True:
49
+ backend = _get_backend(array_type)
50
+ if backend is not None:
51
+ break
52
+ # If no backend found, try the next type in the hierarchy
53
+ array_type = array_type.__bases__[0]
54
+
55
+ logger.debug(f"Using backend {backend} for {array_type}")
41
56
  return backend
42
57
 
43
58
 
@@ -201,5 +216,6 @@ try:
201
216
 
202
217
  BACKENDS[SimpleFieldList] = FieldListBackend
203
218
  BACKENDS[FieldList] = FieldListBackend
219
+
204
220
  except ImportError:
205
- warnings.warn("earthkit could not be imported, FieldList not supported.")
221
+ logger.warning("earthkit could not be imported, FieldList not supported.")
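The rewritten array_module now resolves a backend by walking the first base class of the input's type until a registered entry is found (only `__bases__[0]` is followed), which is why the old `"default"` key became the `object` key. A minimal sketch, assuming the backend classes from the imports above are exposed by their submodules:

```python
import numpy as np
import xarray as xr

from earthkit.workflows.backends import array_module
from earthkit.workflows.backends.arrayapi import ArrayAPIBackend
from earthkit.workflows.backends.xarray import XArrayBackend

# ndarray is not registered, so lookup walks ndarray -> object -> ArrayAPIBackend
assert array_module(np.zeros(3)) is ArrayAPIBackend
# DataArray is registered directly
assert array_module(xr.DataArray(np.zeros(3))) is XArrayBackend

# unregistered subclasses of known types now resolve via the same base-class walk
class MyArray(np.ndarray): ...
assert array_module(np.zeros(3).view(MyArray)) is ArrayAPIBackend
```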
earthkit_workflows-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: earthkit-workflows
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License-Expression: Apache-2.0
earthkit_workflows-0.4.2.dist-info/RECORD CHANGED
@@ -1,28 +1,29 @@
1
1
  cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
4
- cascade/benchmarks/__main__.py,sha256=n0RX44Sj_j6InFNKCjwXRVWKTYznMsrPBdf8kwGKhjM,8065
4
+ cascade/benchmarks/__main__.py,sha256=g03xRzp58dXLHDj8kTPyPnbBOS5sRIAMTthFtFjDRbs,9876
5
5
  cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
6
+ cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
6
7
  cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
7
8
  cascade/benchmarks/job1.py,sha256=MOcZZYgf36MzHCjtby0lQyenM1ODUlagG8wtt2CbpnI,4640
8
9
  cascade/benchmarks/matmul.py,sha256=5STuvPY6Q37E2pKRCde9dQjL5M6tx7tkES9cBLZ6eK4,1972
9
10
  cascade/benchmarks/plotting.py,sha256=vSz9HHbqZwMXHpBUS-In6xsXGgK7QIoQTTiYfSwYwZs,4428
10
11
  cascade/benchmarks/reporting.py,sha256=MejaM-eekbMYLAnuBxGv_t4dR1ODJs4Rpc0fiZSGjyw,5410
11
12
  cascade/controller/__init__.py,sha256=p4C2p3S_0nUGamP9Mi6cSa5bvpiWbI6sVWtGhFnNqjw,1278
12
- cascade/controller/act.py,sha256=POzWwIlnp26hCY78Gp-ZMvCO6iXGOfA7TJUrEWrheyw,2941
13
+ cascade/controller/act.py,sha256=WHIsk4H-Bbyl_DABX2VWhyKy_cNnp12x1nilatPCL8I,2981
13
14
  cascade/controller/core.py,sha256=NqvZ5g5GNphwOpzdXbCI0_fxIzzmO97_n2xZKswK72Q,3589
14
- cascade/controller/impl.py,sha256=sLllTM509obsBHWbNtJ_Zu8Q6IJDG2IZOw0E08LDVfg,3247
15
- cascade/controller/notify.py,sha256=-FgHCsEVlghXuHX3_8Okyt_yL1AMj57ZBBHXiebX_Ys,5757
15
+ cascade/controller/impl.py,sha256=9jdTikYO8OkaNIfzatyr3Mhai5EfEhaeii9GaF9cQw4,3526
16
+ cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,5881
16
17
  cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
17
- cascade/executor/bridge.py,sha256=vrs-5_Qt2mgkAD7Mzi43Xt_q7tpXX6i1UOPfqZSxHfs,8169
18
+ cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
18
19
  cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
19
- cascade/executor/config.py,sha256=rA4WeCNbdJJ3FdOKJ6WN3_VUorYW3cqdMfKUYPSyj0Y,1471
20
+ cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
20
21
  cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
21
- cascade/executor/executor.py,sha256=SqMVM8BvCNM2r2Zbg9kxSxwFADAaoBU7nCMtfzktsgI,13282
22
- cascade/executor/msg.py,sha256=QW7Me-8Sin-x-f4M4bzvO7_av8MRkjnabQN6Ch3x22c,4230
22
+ cascade/executor/executor.py,sha256=egPhfqhzYfeM77Hu10-mGHNVsQAdqmZOA7hmjFP1Q8M,13484
23
+ cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
23
24
  cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
24
25
  cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
25
- cascade/executor/runner/entrypoint.py,sha256=e_MWYTSQroGMkgMddrqtn5DEqUeN-svC565TlOrv5iA,7598
26
+ cascade/executor/runner/entrypoint.py,sha256=32i2U4fmEvQnsV1MTl0Xf8mK_1nbk1BEVJqIidd6MRM,8042
26
27
  cascade/executor/runner/memory.py,sha256=jkAV9T7-imciVcGvkV7OhRfosEpOQJU1OME7z-4ztAs,6371
27
28
  cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
28
29
  cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
@@ -34,17 +35,17 @@ cascade/gateway/router.py,sha256=iN-dc3L46aEy0EV57NNKYwaqIu0Au9kImu1pg-UbxwE,768
34
35
  cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
35
36
  cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
36
37
  cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
37
- cascade/low/core.py,sha256=txya9rgks2b1ze9yLvFvrZCs8sCCtDUlfNwz4sHgybM,5994
38
+ cascade/low/core.py,sha256=_3x4ka_pmCgZbfwFeyhq8S4M6wmh0s24VRCLhk5yQFM,6444
38
39
  cascade/low/execution_context.py,sha256=cdDJLYhreo4T7t4qXgFBosncubZpTrm0hELo7q4miqo,6640
39
40
  cascade/low/func.py,sha256=ihL5n3cK-IJnATgP4Dub2m-Mp_jHMxJzCA1v4uMEsi8,5211
40
41
  cascade/low/into.py,sha256=QvjrcBuHfu7qpEkeB0EJu1EAaRxOEZskUnyjkRJ_9gA,3391
41
42
  cascade/low/tracing.py,sha256=qvGVKB1huwcYoyvMYN-2wQ92pLQTErocTjpIjWv9glA,4511
42
43
  cascade/low/views.py,sha256=UwafO2EQHre17GjG8hdzO8b6qBRtTRtDlhOc1pTf8Io,1822
43
44
  cascade/scheduler/__init__.py,sha256=VT2qQ0gOQWHC4-T0FcCs59w8WZ94j2nUn7tiGm5XepA,1148
44
- cascade/scheduler/api.py,sha256=uyRslN3ZNXOZNax27pQOrczeo9-2zTxal7-xYAPCDgI,5911
45
- cascade/scheduler/assign.py,sha256=XRTu3wEK2FYM-4Y_Gp4_O6h2wr6LSUa7e05DTwPHRcs,12250
46
- cascade/scheduler/core.py,sha256=XtXpfq6gtE8FS1BQd0ku0uQOrJpe1_CzzuBd98W6y7g,2891
47
- cascade/scheduler/precompute.py,sha256=QmZgriwfb07LViMztZogX5DOC1L4dCTbZJNGuFvFS9A,8513
45
+ cascade/scheduler/api.py,sha256=UuomWS2ISuDw-ngFFUKLyucygpTWF0EBW8ZuF91EUBU,7778
46
+ cascade/scheduler/assign.py,sha256=gpOLL22-k3ah4gihiztIGMX2uF0RdJ5AtJ8fOCJUviE,18362
47
+ cascade/scheduler/core.py,sha256=umORLC6SDeOyS4z8nQuVFkDukBJ96JfH4hdLSj6Km20,3378
48
+ cascade/scheduler/precompute.py,sha256=AhTn8RgnU4XuV_WAgbVXz9z0YRpNS6LCY1dJeHdTfCc,8709
48
49
  cascade/shm/__init__.py,sha256=R9QgGSnsl_YDjFjAUQkoleM_5yGM37ce9S8a4ReA1mE,3854
49
50
  cascade/shm/algorithms.py,sha256=SGxnJF4ovUaywTunMJWkG77l5DN-jXx7HgABt3sRJXM,2356
50
51
  cascade/shm/api.py,sha256=a_KrjyELsDms0Di0ThHsZe7MfmNEkekflmjXAQ1_Qws,6040
@@ -52,9 +53,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
52
53
  cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
53
54
  cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
54
55
  cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
55
- cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
56
+ cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
56
57
  earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
57
- earthkit/workflows/_version.py,sha256=-UXII43tJWWG-Bw3-ObfEfbloOAVS2Clozd55E6zYvA,72
58
+ earthkit/workflows/_version.py,sha256=nkd71CReR3pz5TZ9pcVgB2cP1MDj4YK6VH9UGJYzXDM,72
58
59
  earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
59
60
  earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
60
61
  earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -63,7 +64,7 @@ earthkit/workflows/taskgraph.py,sha256=RsT1Qlng1uPZSaSBNqE8vFsoI5J8DDcQl468YPX-k
63
64
  earthkit/workflows/transformers.py,sha256=BsUUvnG-UyerT3XUYcHc1qJkSsLc0ZX3Zxqq70tJWLU,2105
64
65
  earthkit/workflows/utility.py,sha256=ygqn1s846WQbo7HGY46Z8N1AXrDFGwyygSgsv4YnGJ8,1344
65
66
  earthkit/workflows/visualise.py,sha256=WbqJWvn648B7Qo3VCKJyoJzU6Mgvv0p3UWZb0lf01m8,2290
66
- earthkit/workflows/backends/__init__.py,sha256=XMJM2OL55bnWOSB_g4nzoY9dgBfnh250d8nLBOCj0MA,6013
67
+ earthkit/workflows/backends/__init__.py,sha256=6ONg-EdNODiqeBZqyosI5iq1UfZfaOLqhAo8l8_wn9o,6519
67
68
  earthkit/workflows/backends/arrayapi.py,sha256=QfUsTlYuFH3CroWdcf_XBcLnt2znMcS1HwNNEe8J0qU,2279
68
69
  earthkit/workflows/backends/earthkit.py,sha256=rZURJf6FLKcCjJkyWgOf6NqKjPZjSNX09dV_SicIlss,8958
69
70
  earthkit/workflows/backends/xarray.py,sha256=4pnnPgIug4DmvhigkU0JsituvdvspuVA_vxbIsrq8-A,6762
@@ -84,8 +85,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
84
85
  earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
85
86
  earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
86
87
  earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
87
- earthkit_workflows-0.4.0.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
88
- earthkit_workflows-0.4.0.dist-info/METADATA,sha256=GUxPv5SDQH-BE7InVU4Yy0MheZaSXdD1ys1seH-vPO4,1571
89
- earthkit_workflows-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
90
- earthkit_workflows-0.4.0.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
91
- earthkit_workflows-0.4.0.dist-info/RECORD,,
88
+ earthkit_workflows-0.4.2.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
89
+ earthkit_workflows-0.4.2.dist-info/METADATA,sha256=LWW-xDc0sq8cOdu6IpY335_MSFfe7Lmg1SHYT9cXjWA,1571
90
+ earthkit_workflows-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
+ earthkit_workflows-0.4.2.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
92
+ earthkit_workflows-0.4.2.dist-info/RECORD,,