earthkit-workflows 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
cascade/benchmarks/__main__.py CHANGED
@@ -26,7 +26,9 @@ import logging.config
26
26
  import multiprocessing
27
27
  import os
28
28
  import subprocess
29
+ import sys
29
30
  from concurrent.futures import ThreadPoolExecutor
31
+ from socket import getfqdn
30
32
  from time import perf_counter_ns
31
33
 
32
34
  import fire
@@ -77,13 +79,17 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:
77
79
  import cascade.benchmarks.matmul as matmul
78
80
 
79
81
  return matmul.get_job()
82
+ elif benchmark.startswith("dist"):
83
+ import cascade.benchmarks.dist as dist
84
+
85
+ return dist.get_job()
80
86
  else:
81
87
  raise NotImplementedError(benchmark)
82
88
  else:
83
89
  raise TypeError("specified neither benchmark name nor job instance")
84
90
 
85
91
 
86
- def get_gpu_count() -> int:
92
+ def get_cuda_count() -> int:
87
93
  try:
88
94
  if "CUDA_VISIBLE_DEVICES" in os.environ:
89
95
  # TODO we dont want to just count, we want to actually use literally these ids
@@ -101,12 +107,22 @@ def get_gpu_count() -> int:
101
107
  if "GPU" in l
102
108
  )
103
109
  except:
104
- # TODO support macos
105
110
  logger.exception("unable to determine available gpus")
106
111
  gpus = 0
107
112
  return gpus
108
113
 
109
114
 
115
+ def get_gpu_count(host_idx: int, worker_count: int) -> int:
116
+ if sys.platform == "darwin":
117
+ # we should inspect some gpu capabilities details to prevent overcommit
118
+ return worker_count
119
+ else:
120
+ if host_idx == 0:
121
+ return get_cuda_count()
122
+ else:
123
+ return 0
124
+
125
+
110
126
  def launch_executor(
111
127
  job_instance: JobInstance,
112
128
  controller_address: BackboneAddress,
@@ -116,6 +132,7 @@ def launch_executor(
116
132
  shm_vol_gb: int | None,
117
133
  gpu_count: int,
118
134
  log_base: str | None,
135
+ url_base: str,
119
136
  ):
120
137
  if log_base is not None:
121
138
  log_base = f"{log_base}.host{i}"
@@ -123,19 +140,25 @@ def launch_executor(
123
140
  logging.config.dictConfig(logging_config_filehandler(log_path))
124
141
  else:
125
142
  logging.config.dictConfig(logging_config)
126
- logger.info(f"will set {gpu_count} gpus on host {i}")
127
- os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
128
- executor = Executor(
129
- job_instance,
130
- controller_address,
131
- workers_per_host,
132
- f"h{i}",
133
- portBase,
134
- shm_vol_gb,
135
- log_base,
136
- )
137
- executor.register()
138
- executor.recv_loop()
143
+ try:
144
+ logger.info(f"will set {gpu_count} gpus on host {i}")
145
+ os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
146
+ executor = Executor(
147
+ job_instance,
148
+ controller_address,
149
+ workers_per_host,
150
+ f"h{i}",
151
+ portBase,
152
+ shm_vol_gb,
153
+ log_base,
154
+ url_base,
155
+ )
156
+ executor.register()
157
+ executor.recv_loop()
158
+ except Exception:
159
+ # NOTE we log this to get the stacktrace into the logfile
160
+ logger.exception("executor failure")
161
+ raise
139
162
 
140
163
 
141
164
  def run_locally(
@@ -151,24 +174,47 @@ def run_locally(
151
174
  logging.config.dictConfig(logging_config_filehandler(log_path))
152
175
  else:
153
176
  logging.config.dictConfig(logging_config)
177
+ logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
154
178
  launch = perf_counter_ns()
155
- preschedule = precompute(job)
156
179
  c = f"tcp://localhost:{portBase}"
157
180
  m = f"tcp://localhost:{portBase+1}"
158
181
  ps = []
159
- for i, executor in enumerate(range(hosts)):
160
- if i == 0:
161
- gpu_count = get_gpu_count()
162
- else:
163
- gpu_count = 0
164
- # NOTE forkserver/spawn seem to forget venv, we need fork
165
- p = multiprocessing.get_context("fork").Process(
166
- target=launch_executor,
167
- args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count, log_base),
168
- )
169
- p.start()
170
- ps.append(p)
171
182
  try:
183
+ # executors forking
184
+ for i, executor in enumerate(range(hosts)):
185
+ gpu_count = get_gpu_count(i, workers)
186
+ # NOTE forkserver/spawn seem to forget venv, we need fork
187
+ logger.debug(f"forking into executor on host {i}")
188
+ p = multiprocessing.get_context("fork").Process(
189
+ target=launch_executor,
190
+ args=(
191
+ job,
192
+ c,
193
+ workers,
194
+ portBase + 1 + i * 10,
195
+ i,
196
+ None,
197
+ gpu_count,
198
+ log_base,
199
+ "tcp://localhost",
200
+ ),
201
+ )
202
+ p.start()
203
+ ps.append(p)
204
+
205
+ # compute preschedule
206
+ preschedule = precompute(job)
207
+
208
+ # check processes started healthy
209
+ for i, p in enumerate(ps):
210
+ if not p.is_alive():
211
+ # TODO ideally we would somehow connect this with the Register message
212
+ # consumption in the Controller -- but there we don't assume that
213
+ # executors are on the same physical host
214
+ raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
215
+
216
+ # start bridge itself
217
+ logger.debug("starting bridge")
172
218
  b = Bridge(c, hosts)
173
219
  start = perf_counter_ns()
174
220
  run(job, b, preschedule, report_address=report_address)
@@ -176,7 +222,9 @@ def run_locally(
176
222
  print(
177
223
  f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
178
224
  )
179
- except:
225
+ except Exception:
226
+ # NOTE we log this to get the stacktrace into the logfile
227
+ logger.exception("controller failure, proceed with executor shutdown")
180
228
  for p in ps:
181
229
  if p.is_alive():
182
230
  callback(m, ExecutorShutdown())
@@ -238,7 +286,7 @@ def main_dist(
238
286
  f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
239
287
  )
240
288
  else:
241
- gpu_count = get_gpu_count()
289
+ gpu_count = get_gpu_count(0, workers_per_host)
242
290
  launch_executor(
243
291
  jobInstance,
244
292
  controller_url,
@@ -247,6 +295,7 @@ def main_dist(
247
295
  idx,
248
296
  shm_vol_gb,
249
297
  gpu_count,
298
+ f"tcp://{getfqdn()}",
250
299
  )
251
300
 
252
301
 
cascade/benchmarks/dist.py ADDED
@@ -0,0 +1,123 @@
1
+ """Demonstrates gang scheduling capabilities, ie, multiple nodes capable of mutual communication.
2
+
3
+ The job is a source -> (dist group) -> sink, where:
4
+ source just returns an int,
5
+ dist group is L nodes to be scheduled as a single gang
6
+ rank=0 node broadcasts a buffer containing the node's input
7
+ each node returns its input multiplied by broadcasted buffer
8
+ sink returns the sum of all inputs
9
+
10
+ There are multiple implementations of that:
11
+ torch
12
+ jax (actually does a mesh-shard global sum instead of broadcast -- the point is to showcase dist init)
13
+ """
14
+
15
+ import os
16
+
17
+ from cascade.low.builders import JobBuilder, TaskBuilder
18
+ from cascade.low.core import JobInstance, SchedulingConstraint
19
+
20
+
21
+ def source_func() -> int:
22
+ return 42
23
+
24
+
25
+ def dist_func_torch(a: int) -> int:
26
+ import datetime as dt
27
+
28
+ import numpy as np
29
+ import torch.distributed as dist
30
+
31
+ world_size = int(os.environ["CASCADE_GANG_WORLD_SIZE"])
32
+ rank = int(os.environ["CASCADE_GANG_RANK"])
33
+ coordinator = os.environ["CASCADE_GANG_COORDINATOR"]
34
+ print(f"starting with envvars: {rank=}/{world_size=}, {coordinator=}")
35
+ dist.init_process_group(
36
+ backend="gloo",
37
+ init_method=coordinator,
38
+ timeout=dt.timedelta(minutes=1),
39
+ world_size=world_size,
40
+ rank=rank,
41
+ )
42
+ group_ranks = np.arange(world_size, dtype=int)
43
+ group = dist.new_group(group_ranks)
44
+
45
+ if rank == 0:
46
+ buf = [a]
47
+ dist.broadcast_object_list(buf, src=0, group=group)
48
+ print("broadcast ok")
49
+ else:
50
+ buf = np.array([0], dtype=np.uint64)
51
+ dist.broadcast_object_list(buf, src=0, group=group)
52
+ print(f"broadcast recevied {buf}")
53
+
54
+ return a * buf[0]
55
+
56
+
57
+ def dist_func_jax(a: int) -> int:
58
+ world_size = int(os.environ["CASCADE_GANG_WORLD_SIZE"])
59
+ rank = int(os.environ["CASCADE_GANG_RANK"])
60
+ coordinator = os.environ["CASCADE_GANG_COORDINATOR"]
61
+ os.environ["JAX_NUM_CPU_DEVICES"] = "1"
62
+ os.environ["JAX_PLATFORM_NAME"] = "cpu"
63
+ os.environ["JAX_PLATFORMS"] = "cpu"
64
+ import jax
65
+ import jax.numpy as jp
66
+
67
+ jax.config.update("jax_platforms", "cpu")
68
+ jax.config.update("jax_platform_name", "cpu")
69
+ # NOTE neither of the above seems to actually help with an init error message :(
70
+ print(f"starting with envvars: {rank=}/{world_size=}, {coordinator=}")
71
+ if coordinator.startswith("tcp://"):
72
+ coordinator = coordinator[len("tcp://") :]
73
+ jax.distributed.initialize(coordinator, num_processes=world_size, process_id=rank)
74
+ assert jax.device_count() == world_size
75
+
76
+ mesh = jax.make_mesh((world_size,), ("i",))
77
+ global_data = jp.arange(world_size)
78
+ sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("i"))
79
+ global_array = jax.device_put(global_data, sharding)
80
+ result = jp.sum(global_array)
81
+ print(f"worker {rank}# got result {result=}")
82
+ return a + result
83
+
84
+
85
+ def build_dist_func(impl: str):
86
+ if impl == "torch":
87
+ return dist_func_torch
88
+ elif impl == "jax":
89
+ return dist_func_jax
90
+ else:
91
+ raise NotImplementedError(impl)
92
+
93
+
94
+ def sink_func(**kwargs) -> int:
95
+ c = 0
96
+ for _, v in kwargs.items():
97
+ c += v
98
+ print(f"sink accumulated {c}")
99
+ return c
100
+
101
+
102
+ def get_job() -> JobInstance:
103
+ source_node = TaskBuilder.from_callable(source_func)
104
+ sink_node = TaskBuilder.from_callable(sink_func)
105
+ job = JobBuilder().with_node("source", source_node).with_node("sink", sink_node)
106
+ L = int(os.environ["DIST_L"])
107
+ IMPL = os.environ["DIST_IMPL"]
108
+ node = TaskBuilder.from_callable(build_dist_func(IMPL))
109
+
110
+ for i in range(L):
111
+ job = (
112
+ job.with_node(f"proc{i}", node)
113
+ .with_edge("source", f"proc{i}", "a")
114
+ .with_edge(f"proc{i}", "sink", f"v{i}")
115
+ )
116
+ job.nodes["sink"].definition.input_schema[
117
+ f"v{i}"
118
+ ] = "int" # TODO put some allow_kw into TaskDefinition instead to allow this
119
+
120
+ job = job.build().get_or_raise()
121
+ job.ext_outputs = list(job.outputs_of("sink"))
122
+ job.constraints = [SchedulingConstraint(gang=[f"proc{i}" for i in range(L)])]
123
+ return job
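For reference, a minimal sketch of driving the new benchmark directly, outside the fire-based CLI in cascade/benchmarks/__main__.py, assuming cascade.benchmarks.dist is importable as added above. The gang size and implementation are chosen via the DIST_L and DIST_IMPL environment variables that get_job() reads; the task names in the comments follow from DIST_L=2.

```python
# Illustrative sketch only, not part of the package.
import os

os.environ["DIST_L"] = "2"         # number of proc* nodes in the gang
os.environ["DIST_IMPL"] = "torch"  # or "jax"

from cascade.benchmarks.dist import get_job

job = get_job()
print(sorted(job.tasks))        # ['proc0', 'proc1', 'sink', 'source']
print(job.constraints[0].gang)  # ['proc0', 'proc1'] -- scheduled as a single gang
```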
cascade/controller/act.py CHANGED
@@ -51,6 +51,7 @@ def act(bridge: Bridge, assignment: Assignment) -> None:
51
51
  worker=assignment.worker,
52
52
  tasks=assignment.tasks,
53
53
  publish=assignment.outputs,
54
+ extra_env=assignment.extra_env,
54
55
  )
55
56
 
56
57
  for task in assignment.tasks:
cascade/controller/impl.py CHANGED
@@ -43,6 +43,11 @@ def run(
43
43
  reporter = Reporter(report_address)
44
44
 
45
45
  try:
46
+ total_gpus = sum(worker.gpu for worker in env.workers.values())
47
+ needs_gpus = any(task.definition.needs_gpu for task in job.tasks.values())
48
+ if needs_gpus and total_gpus == 0:
49
+ raise ValueError("environment contains no gpu yet job demands one")
50
+
46
51
  while (
47
52
  state.has_awaitable()
48
53
  or context.has_awaitable()
cascade/controller/notify.py CHANGED
@@ -22,6 +22,7 @@ from cascade.low.core import DatasetId, HostId, WorkerId
22
22
  from cascade.low.execution_context import DatasetStatus, JobExecutionContext
23
23
  from cascade.low.func import assert_never
24
24
  from cascade.low.tracing import TaskLifecycle, TransmitLifecycle, mark
25
+ from cascade.scheduler.api import gang_check_ready
25
26
  from cascade.scheduler.assign import set_worker2task_overhead
26
27
  from cascade.scheduler.core import Schedule
27
28
 
@@ -67,6 +68,7 @@ def consider_computable(
67
68
  # NOTE this is a task newly made computable, so we need to calc
68
69
  # `overhead` for all hosts/workers assigned to the component
69
70
  set_worker2task_overhead(schedule, context, worker, child_task)
71
+ gang_check_ready(child_task, component.gang_preparation)
70
72
 
71
73
 
72
74
  # TODO refac less explicit mutation of context, use class methods
cascade/executor/bridge.py CHANGED
@@ -46,7 +46,7 @@ class Bridge:
46
46
  self.transmit_idx_counter = 0
47
47
  self.sender = ReliableSender(self.mlistener.address, resend_grace_ms)
48
48
  registered = 0
49
- self.environment = Environment(workers={})
49
+ self.environment = Environment(workers={}, host_url_base={})
50
50
  logger.debug("about to start receiving registrations")
51
51
  registration_grace = time.time_ns() + 3 * 60 * 1_000_000_000
52
52
  while registered < expected_executors:
@@ -69,6 +69,7 @@ class Bridge:
69
69
  self.environment.workers[worker.worker_id] = Worker(
70
70
  cpu=worker.cpu, gpu=worker.gpu, memory_mb=worker.memory_mb
71
71
  )
72
+ self.environment.host_url_base[message.host] = message.url_base
72
73
  registered += 1
73
74
  self.heartbeat_checker[message.host] = GraceWatcher(
74
75
  2 * executor_heartbeat_grace_ms
cascade/executor/config.py CHANGED
@@ -21,12 +21,14 @@ logging_config = {
21
21
  "forecastbox.worker": {"level": "DEBUG"},
22
22
  "forecastbox.executor": {"level": "DEBUG"},
23
23
  "cascade": {"level": "INFO"},
24
+ "cascade.benchmarks": {"level": "DEBUG"},
24
25
  "cascade.low": {"level": "DEBUG"},
25
26
  "cascade.shm": {"level": "DEBUG"},
26
27
  "cascade.controller": {"level": "DEBUG"},
27
28
  "cascade.executor": {"level": "DEBUG"},
28
29
  "cascade.scheduler": {"level": "DEBUG"},
29
30
  "cascade.gateway": {"level": "DEBUG"},
31
+ "earthkit.workflows": {"level": "DEBUG"},
30
32
  "httpcore": {"level": "ERROR"},
31
33
  "httpx": {"level": "ERROR"},
32
34
  "": {"level": "WARNING", "handlers": ["default"]},
cascade/executor/executor.py CHANGED
@@ -69,8 +69,9 @@ class Executor:
69
69
  workers: int,
70
70
  host: HostId,
71
71
  portBase: int,
72
- shm_vol_gb: int | None = None,
73
- log_base: str | None = None,
72
+ shm_vol_gb: int | None,
73
+ log_base: str | None,
74
+ url_base: str,
74
75
  ) -> None:
75
76
  self.job_instance = job_instance
76
77
  self.param_source = param_source(job_instance.edges)
@@ -85,6 +86,7 @@ class Executor:
85
86
  self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
86
87
 
87
88
  self.terminating = False
89
+ logger.debug("register terminate function")
88
90
  atexit.register(self.terminate)
89
91
  # NOTE following inits are with potential side effects
90
92
  self.mlistener = Listener(address_of(portBase))
@@ -98,6 +100,7 @@ class Executor:
98
100
  shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
99
101
  else:
100
102
  shm_logging = logging_config
103
+ logger.debug("about to fork into shm process")
101
104
  self.shm_process = ctx.Process(
102
105
  target=shm_server,
103
106
  args=(
@@ -113,6 +116,7 @@ class Executor:
113
116
  dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
114
117
  else:
115
118
  dsr_logging = logging_config
119
+ logger.debug("about to fork into data server")
116
120
  self.data_server = ctx.Process(
117
121
  target=start_data_server,
118
122
  args=(
@@ -138,6 +142,7 @@ class Executor:
138
142
  )
139
143
  for idx, worker_id in enumerate(self.workers.keys())
140
144
  ],
145
+ url_base=url_base,
141
146
  )
142
147
  logger.debug("constructed executor")
143
148
 
cascade/executor/msg.py CHANGED
@@ -71,6 +71,7 @@ class TaskSequence:
71
71
  worker: WorkerId # worker for running those tasks
72
72
  tasks: list[TaskId] # to be executed in the given order
73
73
  publish: set[DatasetId] # set of outputs to be published
74
+ extra_env: list[tuple[str, str]] # extra env var to set
74
75
 
75
76
 
76
77
  @dataclass(frozen=True)
@@ -147,6 +148,7 @@ class ExecutorRegistration:
147
148
  host: HostId
148
149
  maddress: BackboneAddress
149
150
  daddress: BackboneAddress
151
+ url_base: str # used for eg dist comms init
150
152
  workers: list[Worker]
151
153
 
152
154
 
cascade/executor/runner/entrypoint.py CHANGED
@@ -11,6 +11,7 @@
11
11
  import logging
12
12
  import logging.config
13
13
  import os
14
+ import sys
14
15
  from dataclasses import dataclass
15
16
 
16
17
  import zmq
@@ -98,12 +99,17 @@ def execute_sequence(
98
99
  ) -> None:
99
100
  taskId: TaskId | None = None
100
101
  try:
102
+ for key, value in taskSequence.extra_env.items():
103
+ os.environ[key] = value
101
104
  executionContext = runnerContext.project(taskSequence)
102
105
  for taskId in taskSequence.tasks:
103
106
  pckg.extend(executionContext.tasks[taskId].definition.environment)
104
107
  run(taskId, executionContext, memory)
105
108
  if Config.posttask_flush:
106
109
  memory.flush()
110
+ for key in taskSequence.extra_env.keys():
111
+ # NOTE we should in principle restore the previous value, but we dont expect collisions
112
+ del os.environ[key]
107
113
  except Exception as e:
108
114
  logger.exception("runner failure, about to report")
109
115
  callback(
@@ -129,11 +135,15 @@ def entrypoint(runnerContext: RunnerContext):
129
135
  label("worker", repr(runnerContext.workerId))
130
136
  worker_num = runnerContext.workerId.worker_num()
131
137
  gpus = int(os.environ.get("CASCADE_GPU_COUNT", "0"))
132
- os.environ["CUDA_VISIBLE_DEVICES"] = (
133
- ",".join(str(worker_num)) if worker_num < gpus else ""
134
- )
135
- # NOTE check any(task.definition.needs_gpu) anywhere?
136
- # TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho
138
+ if sys.platform != "darwin":
139
+ os.environ["CUDA_VISIBLE_DEVICES"] = (
140
+ str(worker_num) if worker_num < gpus else ""
141
+ )
142
+ # NOTE check any(task.definition.needs_gpu) anywhere?
143
+ # TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho
144
+ else:
145
+ if gpus != 1:
146
+ logger.warning("unexpected absence of gpu on darwin")
137
147
 
138
148
  for serdeTypeEnc, (serdeSer, serdeDes) in runnerContext.job.serdes.items():
139
149
  serde.SerdeRegistry.register(type_dec(serdeTypeEnc), serdeSer, serdeDes)
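One behavioural detail worth noting in the entrypoint hunk above: 0.4.0 built the value as `",".join(str(worker_num))`, which for a two-digit worker number such as 12 yields "1,2", while 0.4.2 uses `str(worker_num)` directly, so each worker below the GPU count is pinned to exactly one device. A minimal sketch of the rule (the helper name is illustrative, not part of the package):

```python
# Sketch of the CUDA pinning rule from runner/entrypoint.py; helper name is illustrative.
def cuda_visible_devices(worker_num: int, gpus: int) -> str:
    # workers 0..gpus-1 each get their own device id, the rest see no GPU
    return str(worker_num) if worker_num < gpus else ""

assert [cuda_visible_devices(w, gpus=2) for w in range(4)] == ["0", "1", "", ""]
```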
cascade/low/core.py CHANGED
@@ -106,15 +106,26 @@ def type_enc(t: Type) -> str:
106
106
  return b64encode(cloudpickle.dumps(t)).decode("ascii")
107
107
 
108
108
 
109
+ class SchedulingConstraint(BaseModel):
110
+ gang: list[TaskId] = Field(
111
+ description="this set of TaskIds must be started at the same time, with ranks and address list as envvar",
112
+ )
113
+
114
+
109
115
  class JobInstance(BaseModel):
110
116
  tasks: dict[TaskId, TaskInstance]
111
117
  edges: list[Task2TaskEdge]
112
118
  serdes: dict[str, tuple[str, str]] = Field(
113
- {},
119
+ default_factory=lambda: {},
114
120
  description="for each Type with custom serde, add entry here. The string is fully qualified name of the ser/des functions",
115
121
  )
116
122
  ext_outputs: list[DatasetId] = Field(
117
- [], description="ids to externally materialize"
123
+ default_factory=lambda: [],
124
+ description="ids to externally materialize",
125
+ )
126
+ constraints: list[SchedulingConstraint] = Field(
127
+ default_factory=lambda: [],
128
+ description="constraints for the scheduler such as gangs",
118
129
  )
119
130
 
120
131
  def outputs_of(self, task_id: TaskId) -> set[DatasetId]:
@@ -157,6 +168,7 @@ class Worker(BaseModel):
157
168
 
158
169
  class Environment(BaseModel):
159
170
  workers: dict[WorkerId, Worker]
171
+ host_url_base: dict[HostId, str]
160
172
 
161
173
 
162
174
  class TaskExecutionRecord(BaseModel):
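The new SchedulingConstraint is a small pydantic model attached to JobInstance.constraints; a sketch of declaring a gang (task ids are illustrative):

```python
from cascade.low.core import SchedulingConstraint

# the three tasks must be launched together; each receives its rank, the world size
# and a shared coordinator address via CASCADE_GANG_* environment variables
gang = SchedulingConstraint(gang=["proc0", "proc1", "proc2"])
assert gang.gang == ["proc0", "proc1", "proc2"]
# a JobInstance built elsewhere would carry it as: job.constraints = [gang]
```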
cascade/scheduler/api.py CHANGED
@@ -22,6 +22,7 @@ from cascade.scheduler.core import (
22
22
  Assignment,
23
23
  ComponentId,
24
24
  ComponentSchedule,
25
+ GangPreparation,
25
26
  Preschedule,
26
27
  Schedule,
27
28
  )
@@ -29,12 +30,60 @@ from cascade.scheduler.core import (
29
30
  logger = logging.getLogger(__name__)
30
31
 
31
32
 
33
+ def gang_check_ready(task: TaskId, gang_prep: GangPreparation):
34
+ """When a task becomes computable, mutate the gang_prep to possibly
35
+ transition some gangs to `ready`
36
+ """
37
+ for gang in gang_prep.lookup[task]:
38
+ if gang not in gang_prep.countdown:
39
+ raise ValueError(
40
+ f"after {task=} marked computable, {gang=} not found -- double compuptable mark?"
41
+ )
42
+ remaining = gang_prep.countdown[gang]
43
+ if task not in remaining:
44
+ raise ValueError(
45
+ f"after {task=} marked computable, {gang=} does not have it in {remaining=}. Invalid gang?"
46
+ )
47
+ remaining.remove(task)
48
+ if not remaining:
49
+ logger.debug(f"gang just became ready {gang=}")
50
+ gang_prep.ready.append(gang)
51
+ gang_prep.countdown.pop(gang)
52
+
53
+
32
54
  def init_schedule(preschedule: Preschedule, context: JobExecutionContext) -> Schedule:
33
55
  components: list[ComponentSchedule] = []
34
56
  ts2component: dict[TaskId, ComponentId] = {}
35
57
 
58
+ gangs = [
59
+ frozenset(constraint.gang) for constraint in context.job_instance.constraints
60
+ ]
61
+
36
62
  computable = 0
37
63
  for componentId, precomponent in enumerate(preschedule.components):
64
+ # gang preparation
65
+ tasks = set(precomponent.nodes)
66
+ lookup = defaultdict(list)
67
+ countdown = {}
68
+ i = 0
69
+ while i < len(gangs):
70
+ if not gangs[i].issubset(tasks):
71
+ i += 1
72
+ continue
73
+ gang = gangs.pop(i)
74
+ countdown[gang] = set(gang)
75
+ for e in gang:
76
+ lookup[e].append(gang)
77
+
78
+ gang_preparation = GangPreparation(
79
+ ready=[],
80
+ lookup=lookup,
81
+ countdown=countdown,
82
+ )
83
+ for source in precomponent.sources:
84
+ gang_check_ready(source, gang_preparation)
85
+
86
+ # component itself
38
87
  component = ComponentSchedule(
39
88
  core=precomponent,
40
89
  weight=precomponent.weight(),
@@ -45,12 +94,18 @@ def init_schedule(preschedule: Preschedule, context: JobExecutionContext) -> Sch
45
94
  task: {inp for inp in context.edge_i[task]}
46
95
  for task in precomponent.nodes
47
96
  },
97
+ gang_preparation=gang_preparation,
48
98
  )
49
99
  components.append(component)
50
100
  computable += len(precomponent.sources)
51
101
  for task in precomponent.nodes:
52
102
  ts2component[task] = componentId
53
103
 
104
+ if gangs:
105
+ for gang in gangs:
106
+ logger.error(f"a gang not part of a component: {gang}")
107
+ raise ValueError(f"a total of {len(gangs)} were not a subcomponent")
108
+
54
109
  return Schedule(
55
110
  components=components,
56
111
  ts2component=ts2component,
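A sketch of the bookkeeping gang_check_ready performs, assuming both names are importable as shown in this diff (gang_check_ready from cascade.scheduler.api, GangPreparation from cascade.scheduler.core); the task ids are illustrative:

```python
from collections import defaultdict

from cascade.scheduler.api import gang_check_ready
from cascade.scheduler.core import GangPreparation

gang = frozenset({"proc0", "proc1"})
prep = GangPreparation(
    ready=[],
    countdown={gang: set(gang)},  # members still waiting to become computable
    lookup=defaultdict(list, {"proc0": [gang], "proc1": [gang]}),
)

gang_check_ready("proc0", prep)
assert prep.ready == []            # proc1 is still outstanding
gang_check_ready("proc1", prep)
assert prep.ready == [gang]        # whole gang is now eligible for assignment
assert gang not in prep.countdown
```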
cascade/scheduler/assign.py CHANGED
@@ -18,7 +18,13 @@ from typing import Iterable, Iterator
18
18
  from cascade.low.core import DatasetId, HostId, TaskId, WorkerId
19
19
  from cascade.low.execution_context import DatasetStatus, JobExecutionContext
20
20
  from cascade.low.tracing import Microtrace, trace
21
- from cascade.scheduler.core import Assignment, ComponentCore, ComponentId, Schedule
21
+ from cascade.scheduler.core import (
22
+ Assignment,
23
+ ComponentCore,
24
+ ComponentId,
25
+ ComponentSchedule,
26
+ Schedule,
27
+ )
22
28
 
23
29
  logger = logging.getLogger(__name__)
24
30
 
@@ -92,9 +98,148 @@ def build_assignment(
92
98
  tasks=assigned,
93
99
  prep=prep,
94
100
  outputs=trimmed_outputs,
101
+ extra_env={},
95
102
  )
96
103
 
97
104
 
105
+ def _postproc_assignment(
106
+ assignment: Assignment,
107
+ component: ComponentSchedule,
108
+ schedule: Schedule,
109
+ context: JobExecutionContext,
110
+ ) -> None:
111
+ for assigned in assignment.tasks:
112
+ if assigned in component.computable:
113
+ component.computable.pop(assigned)
114
+ component.worker2task_values.remove(assigned)
115
+ schedule.computable -= 1
116
+ else:
117
+ # shortcut for fused-in tasks
118
+ component.is_computable_tracker[assigned] = set()
119
+ context.idle_workers.remove(assignment.worker)
120
+ component.weight -= len(assignment.tasks)
121
+
122
+
123
+ # TODO this is not particularly systematic! We cant bind dynamically at the host as we send this
124
+ # in advance, so we need to hardcode. Ideally we centrallize all port opening into a single module,
125
+ # in particular unify this with the portBase from benchmarks/__main__ and then derived ports from
126
+ # executor/executor.py etc. As is, we have a single global variable that we increment, to ensure
127
+ # no port collision happens gang-wise -- we dont really expect many gangs per a workflow
128
+ gang_port = 12355
129
+
130
+
131
+ def _try_assign_gang(
132
+ schedule: Schedule,
133
+ gang: list[frozenset[TaskId]],
134
+ workers: list[WorkerId],
135
+ component_id: ComponentId,
136
+ context: JobExecutionContext,
137
+ fail_acc: list[frozenset[TaskId]],
138
+ ) -> Iterator[Assignment]:
139
+ """We greedily assign by descending worker-task distance"""
140
+ global gang_port
141
+ if len(gang) > len(workers):
142
+ logger.debug(f"not enough workers ({len(workers)}) for {gang=}")
143
+ fail_acc.append(gang)
144
+ return
145
+ start = perf_counter_ns()
146
+ component = schedule.components[component_id]
147
+ gpu_tasks: set[TaskId] = set()
148
+ cpu_tasks: set[TaskId] = set()
149
+ gpu_workers: set[WorkerId] = set()
150
+ cpu_workers: set[WorkerId] = set()
151
+ for task in gang:
152
+ if context.job_instance.tasks[task].definition.needs_gpu:
153
+ gpu_tasks.add(task)
154
+ else:
155
+ cpu_tasks.add(task)
156
+ for worker in workers:
157
+ if context.environment.workers[worker].gpu > 0:
158
+ gpu_workers.add(worker)
159
+ else:
160
+ cpu_workers.add(worker)
161
+ if len(gpu_tasks) > len(gpu_workers):
162
+ logger.debug(f"not enough gpu workers ({len(workers)}) for {gang=}")
163
+ fail_acc.append(gang)
164
+ end = perf_counter_ns()
165
+ trace(Microtrace.ctrl_assign, end - start)
166
+ return
167
+
168
+ world_size = len(gang)
169
+ rank = 0
170
+ coordinator = None
171
+
172
+ # similarly to _assignment_heuristic, a greedy algorithm
173
+ candidates = [
174
+ (schedule.worker2task_overhead[w][t], component.core.value[t], w, t)
175
+ for w in gpu_workers
176
+ for t in gpu_tasks
177
+ ]
178
+ candidates.sort(key=lambda e: (e[0], e[1]))
179
+ for _, _, worker, task in candidates:
180
+ if task in gpu_tasks and worker in gpu_workers:
181
+ if task not in component.computable:
182
+ # it may be that some fusing for previous task already assigned this
183
+ continue
184
+ end = perf_counter_ns()
185
+ trace(Microtrace.ctrl_assign, end - start)
186
+ assignment = build_assignment(worker, task, context, component.core)
187
+ if not coordinator:
188
+ coordinator = (
189
+ f"{context.environment.host_url_base[worker.host]}:{gang_port}"
190
+ )
191
+ assignment.extra_env["CASCADE_GANG_WORLD_SIZE"] = str(world_size)
192
+ assignment.extra_env["CASCADE_GANG_RANK"] = str(rank)
193
+ assignment.extra_env["CASCADE_GANG_COORDINATOR"] = coordinator
194
+ rank += 1
195
+ yield assignment
196
+ start = perf_counter_ns()
197
+ _postproc_assignment(assignment, component, schedule, context)
198
+ gpu_tasks.remove(task)
199
+ gpu_workers.remove(worker)
200
+ if gpu_tasks:
201
+ raise ValueError(
202
+ f"expected to assign all gang gpu tasks, yet {gpu_tasks} remain"
203
+ )
204
+
205
+ all_workers = cpu_workers.union(gpu_workers)
206
+ candidates = [
207
+ (schedule.worker2task_overhead[w][t], component.core.value[t], w, t)
208
+ for w in all_workers
209
+ for t in cpu_tasks
210
+ ]
211
+ candidates.sort(key=lambda e: (e[0], e[1]))
212
+ for _, _, worker, task in candidates:
213
+ if task in cpu_tasks and worker in all_workers:
214
+ if task not in component.computable:
215
+ # it may be that some fusing for previous task already assigned this
216
+ continue
217
+ end = perf_counter_ns()
218
+ trace(Microtrace.ctrl_assign, end - start)
219
+ assignment = build_assignment(worker, task, context, component.core)
220
+ if not coordinator:
221
+ coordinator = (
222
+ f"{context.environment.host_url_base[worker.host]}:{gang_port}"
223
+ )
224
+ assignment.extra_env["CASCADE_GANG_WORLD_SIZE"] = str(world_size)
225
+ assignment.extra_env["CASCADE_GANG_RANK"] = str(rank)
226
+ assignment.extra_env["CASCADE_GANG_COORDINATOR"] = coordinator
227
+ rank += 1
228
+ yield assignment
229
+ start = perf_counter_ns()
230
+ _postproc_assignment(assignment, component, schedule, context)
231
+ cpu_tasks.remove(task)
232
+ all_workers.remove(worker)
233
+ if cpu_tasks:
234
+ raise ValueError(
235
+ f"expected to assign all gang cpu tasks, yet {cpu_tasks} remain"
236
+ )
237
+
238
+ end = perf_counter_ns()
239
+ trace(Microtrace.ctrl_assign, end - start)
240
+ gang_port += 1
241
+
242
+
98
243
  def _assignment_heuristic(
99
244
  schedule: Schedule,
100
245
  tasks: list[TaskId],
@@ -106,18 +251,6 @@ def _assignment_heuristic(
106
251
  start = perf_counter_ns()
107
252
  component = schedule.components[component_id]
108
253
 
109
- def postproc_assignment(assignment: Assignment) -> None:
110
- for assigned in assignment.tasks:
111
- if assigned in component.computable:
112
- component.computable.pop(assigned)
113
- component.worker2task_values.remove(assigned)
114
- schedule.computable -= 1
115
- else:
116
- # shortcut for fused-in tasks
117
- component.is_computable_tracker[assigned] = set()
118
- context.idle_workers.remove(worker)
119
- component.weight -= len(assignment.tasks)
120
-
121
254
  # first, attempt optimum-distance assignment
122
255
  unassigned: list[TaskId] = []
123
256
  for task in tasks:
@@ -133,7 +266,7 @@ def _assignment_heuristic(
133
266
  assignment = build_assignment(worker, task, context, component.core)
134
267
  yield assignment
135
268
  start = perf_counter_ns()
136
- postproc_assignment(assignment)
269
+ _postproc_assignment(assignment, component, schedule, context)
137
270
  workers.pop(idx)
138
271
  was_assigned = True
139
272
  break
@@ -159,7 +292,7 @@ def _assignment_heuristic(
159
292
  assignment = build_assignment(worker, task, context, component.core)
160
293
  yield assignment
161
294
  start = perf_counter_ns()
162
- postproc_assignment(assignment)
295
+ _postproc_assignment(assignment, component, schedule, context)
163
296
  remaining_t.remove(task)
164
297
  remaining_w.remove(worker)
165
298
 
@@ -173,29 +306,63 @@ def assign_within_component(
173
306
  component_id: ComponentId,
174
307
  context: JobExecutionContext,
175
308
  ) -> Iterator[Assignment]:
176
- """We first handle tasks requiring a gpu, then tasks whose child requires a gpu, last cpu only tasks, using the same algorithm for either case"""
177
- # TODO employ a more systematic solution and handle all multicriterially at once -- ideally together with adding support for multi-gpu-groups
178
- # NOTE this is getting even more important as we started considering gpu fused distance
179
- # NOTE the concept of "strategic wait" is completely missing here (eg dont assign a gpu worker to a cpu task because there will come a gpu task in a few secs)
309
+ """We hardcode order of handling task groups:
310
+ 1/ ready gangs,
311
+ 2/ tasks requiring a gpu,
312
+ 3/ tasks whose fusable child requires a gpu,
313
+ 4/ all other tasks,
314
+ using the same algorithm for cases 2-4 and a naive for case 1
315
+ """
316
+ # TODO rework into a more systematic multicriterial opt solution that is able to consider all groups
317
+ # at once, using a generic value/cost framework and matching algorithm. It should additionally be able
318
+ # to issue a "strategic wait" command -- eg if we could assign a task to an idle worker with high cost,
319
+ # or wait until a better-equipped busy worker finished, etc.
320
+ component = schedule.components[component_id]
321
+
322
+ # gangs
323
+ fail_acc: list[frozenset[TaskId]] = []
324
+ for gang in component.gang_preparation.ready:
325
+ logger.debug(f"trying to assign a {gang=}")
326
+ yield from _try_assign_gang(
327
+ schedule, gang, list(context.idle_workers), component_id, context, fail_acc
328
+ )
329
+ component.gang_preparation.ready = fail_acc
330
+
331
+ # the other cases: build them first
180
332
  cpu_t: list[TaskId] = []
181
333
  gpu_t: list[TaskId] = []
182
334
  opu_t: list[TaskId] = []
183
- component = schedule.components[component_id]
184
335
  for task in component.computable.keys():
185
- if context.job_instance.tasks[task].definition.needs_gpu:
336
+ if component.gang_preparation.lookup[task]:
337
+ # no gang participation in single-task scheduling
338
+ continue
339
+ elif context.job_instance.tasks[task].definition.needs_gpu:
186
340
  gpu_t.append(task)
187
341
  elif component.core.gpu_fused_distance[task] is not None:
188
342
  opu_t.append(task)
189
343
  else:
190
344
  cpu_t.append(task)
345
+
346
+ # tasks immediately needing a gpu
191
347
  eligible_w = [
192
- worker for worker in workers if context.environment.workers[worker].gpu > 0
348
+ worker
349
+ for worker in workers
350
+ if context.environment.workers[worker].gpu > 0
351
+ and worker in context.idle_workers
193
352
  ]
353
+ logger.debug(
354
+ f"considering {len(gpu_t)}# gpu tasks, {len(opu_t)}# maybe-gpu tasks, {len(cpu_t)}# cpu tasks, with {len(workers)}# workers out of which {len(eligible_w)} have gpu"
355
+ )
194
356
  yield from _assignment_heuristic(schedule, gpu_t, eligible_w, component_id, context)
357
+ # tasks whose fusing opportunity needs a gpu
195
358
  eligible_w = [worker for worker in eligible_w if worker in context.idle_workers]
196
359
  yield from _assignment_heuristic(schedule, opu_t, eligible_w, component_id, context)
360
+ # remaining tasks
197
361
  eligible_w = [worker for worker in workers if worker in context.idle_workers]
198
- yield from _assignment_heuristic(schedule, cpu_t, eligible_w, component_id, context)
362
+ u_opu_t = [task for task in opu_t if task in component.computable]
363
+ yield from _assignment_heuristic(
364
+ schedule, cpu_t + u_opu_t, eligible_w, component_id, context
365
+ )
199
366
 
200
367
 
201
368
  def update_worker2task_distance(
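Tying the scheduler side to the dist benchmark above: _try_assign_gang gives every gang member the same coordinator address (the host_url_base of the first assigned worker plus the module-level gang_port, which starts at 12355) and its own rank. A purely illustrative sketch of the resulting extra_env for a 3-task gang whose first worker sits on a host registered with url_base "tcp://nodeA" (host name and numbers are assumptions, not taken from the package):

```python
# Illustrative only -- mirrors the env vars _try_assign_gang attaches to each Assignment.
world_size = 3
coordinator = "tcp://nodeA:12355"  # first assigned worker's host_url_base + gang_port
for rank in range(world_size):
    extra_env = {
        "CASCADE_GANG_WORLD_SIZE": str(world_size),
        "CASCADE_GANG_RANK": str(rank),
        "CASCADE_GANG_COORDINATOR": coordinator,  # identical for every rank
    }
    print(extra_env)
```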
cascade/scheduler/core.py CHANGED
@@ -44,6 +44,19 @@ Worker2TaskDistance = dict[WorkerId, dict[TaskId, int]]
44
44
  ComponentId = int
45
45
 
46
46
 
47
+ @dataclass
48
+ class GangPreparation:
49
+ ready: list[
50
+ frozenset[TaskId]
51
+ ] # used by scheduler to see if any gangs can be assigned/started
52
+ countdown: dict[
53
+ frozenset[TaskId], set[TaskId]
54
+ ] # used to check after a task completion whether a gang can be moved to ready
55
+ lookup: dict[
56
+ TaskId, list[frozenset[TaskId]]
57
+ ] # used to decrease countdown after a task completion
58
+
59
+
47
60
  @dataclass
48
61
  class ComponentSchedule:
49
62
  core: ComponentCore
@@ -58,6 +71,7 @@ class ComponentSchedule:
58
71
  worker2task_distance: Worker2TaskDistance
59
72
  # eligible values -- a cached value. Used when migrating new workers to the component, inserted whenever a parent of this task gets `preparing`, removed when this task is made computable
60
73
  worker2task_values: set[TaskId]
74
+ gang_preparation: GangPreparation
61
75
 
62
76
 
63
77
  @dataclass
@@ -79,3 +93,4 @@ class Assignment:
79
93
  tasks: list[TaskId]
80
94
  prep: list[tuple[DatasetId, HostId]]
81
95
  outputs: set[DatasetId]
96
+ extra_env: list[tuple[str, str]]
cascade/scheduler/precompute.py CHANGED
@@ -112,6 +112,7 @@
112
112
  edge_i: dict[TaskId, set[TaskId]],
113
113
  edge_o: dict[TaskId, set[TaskId]],
114
114
  needs_gpu: set[TaskId],
115
+ gangs: set[TaskId],
115
116
  ) -> ComponentCore:
116
117
  nodes, sources = plain_component
117
118
  logger.debug(
@@ -170,7 +171,7 @@ def _enrich(
170
171
  while layer:
171
172
  gpu_distance = None
172
173
  head = layer.pop(0)
173
- if head in fused:
174
+ if head in fused or head in gangs:
174
175
  continue
175
176
  chain = []
176
177
  fused.add(head)
@@ -183,7 +184,7 @@ def _enrich(
183
184
  gpu_fused_distance[head] = gpu_distance
184
185
  found = False
185
186
  for edge in edge_i[head]:
186
- if edge not in fused:
187
+ if edge not in fused and edge not in gangs:
187
188
  chain.insert(0, head)
188
189
  head = edge
189
190
  fused.add(head)
@@ -222,11 +223,16 @@ def precompute(job_instance: JobInstance) -> Preschedule:
222
223
  for task_id, task in job_instance.tasks.items()
223
224
  if task.definition.needs_gpu
224
225
  }
226
+ gangs = {
227
+ task_id
228
+ for constraint in job_instance.constraints
229
+ for task_id in constraint.gang
230
+ }
225
231
 
226
232
  with ThreadPoolExecutor(max_workers=4) as tp:
227
233
  # TODO if coptrs is not used, then this doesnt make sense
228
234
  f = lambda plain_component: timer(_enrich, Microtrace.presched_enrich)(
229
- plain_component, edge_i_proj, edge_o_proj, needs_gpu
235
+ plain_component, edge_i_proj, edge_o_proj, needs_gpu, gangs
230
236
  )
231
237
  plain_components = (
232
238
  plain_component
cascade/shm/server.py CHANGED
@@ -115,5 +115,5 @@ def entrypoint(
115
115
  server.start()
116
116
  except Exception as e:
117
117
  # we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
118
- logger.warning(f"shutdown issue: {e}")
118
+ logger.warning(f"shutdown issue: {repr(e)}")
119
119
  server.atexit(0, None)
earthkit/workflows/_version.py CHANGED
@@ -1,2 +1,2 @@
1
1
  # Do not change! Do not track in version control!
2
- __version__ = "0.4.0"
2
+ __version__ = "0.4.2"
earthkit/workflows/backends/__init__.py CHANGED
@@ -7,37 +7,52 @@
7
7
  # nor does it submit to any jurisdiction.
8
8
 
9
9
  import functools
10
- import warnings
11
- from typing import Callable
10
+ import logging
11
+ from typing import Callable, Union
12
12
 
13
13
  import xarray as xr
14
14
 
15
15
  from .arrayapi import ArrayAPIBackend
16
16
  from .xarray import XArrayBackend
17
17
 
18
+ logger = logging.getLogger(__name__)
19
+
20
+
18
21
  BACKENDS = {
19
22
  xr.DataArray: XArrayBackend,
20
23
  xr.Dataset: XArrayBackend,
21
- "default": ArrayAPIBackend,
24
+ object: ArrayAPIBackend,
22
25
  }
23
26
 
24
27
 
25
28
  def register(type, backend):
26
29
  if type in BACKENDS:
27
- warnings.warn(
30
+ logger.warning(
28
31
  f"Overwriting backend for {type}. Existing backend {BACKENDS[type]}."
29
32
  )
30
33
  BACKENDS[type] = backend
31
34
 
32
35
 
36
+ def _get_backend(obj_type: type) -> Union[type, None]:
37
+ return BACKENDS.get(obj_type, None)
38
+
39
+
33
40
  def array_module(*arrays):
34
- # Only deduce type from first element to allow for mixed types
35
- # but this means the first argument needs to specify the correct module
41
+ """Return the backend module for the given arrays."""
42
+ # Checks all bases of the first array type for a registered backend.
43
+ # If no backend is found, it will traverse the hierarchy of types
44
+ # until it finds a registered backend or reaches the base object type.
45
+ if not arrays:
46
+ raise ValueError("No arrays provided to determine backend.")
36
47
  array_type = type(arrays[0])
37
- backend = BACKENDS.get(array_type, None)
38
- if backend is None:
39
- # Fall back on array API
40
- backend = BACKENDS["default"]
48
+ while True:
49
+ backend = _get_backend(array_type)
50
+ if backend is not None:
51
+ break
52
+ # If no backend found, try the next type in the hierarchy
53
+ array_type = array_type.__bases__[0]
54
+
55
+ logger.debug(f"Using backend {backend} for {array_type}")
41
56
  return backend
42
57
 
43
58
 
@@ -201,5 +216,6 @@ try:
201
216
 
202
217
  BACKENDS[SimpleFieldList] = FieldListBackend
203
218
  BACKENDS[FieldList] = FieldListBackend
219
+
204
220
  except ImportError:
205
- warnings.warn("earthkit could not be imported, FieldList not supported.")
221
+ logger.warning("earthkit could not be imported, FieldList not supported.")
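The rewritten array_module now resolves a backend by walking the first base class of the input's type until a registered entry is found (only `__bases__[0]` is followed), which is why the old `"default"` key became the `object` key. A minimal sketch, assuming the backend classes from the imports above are exposed by their submodules:

```python
import numpy as np
import xarray as xr

from earthkit.workflows.backends import array_module
from earthkit.workflows.backends.arrayapi import ArrayAPIBackend
from earthkit.workflows.backends.xarray import XArrayBackend

# ndarray is not registered, so lookup walks ndarray -> object -> ArrayAPIBackend
assert array_module(np.zeros(3)) is ArrayAPIBackend
# DataArray is registered directly
assert array_module(xr.DataArray(np.zeros(3))) is XArrayBackend

# unregistered subclasses of known types now resolve via the same base-class walk
class MyArray(np.ndarray): ...
assert array_module(np.zeros(3).view(MyArray)) is ArrayAPIBackend
```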
earthkit_workflows-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: earthkit-workflows
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
5
5
  Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
6
6
  License-Expression: Apache-2.0
earthkit_workflows-0.4.2.dist-info/RECORD CHANGED
@@ -1,28 +1,29 @@
1
1
  cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
4
- cascade/benchmarks/__main__.py,sha256=n0RX44Sj_j6InFNKCjwXRVWKTYznMsrPBdf8kwGKhjM,8065
4
+ cascade/benchmarks/__main__.py,sha256=g03xRzp58dXLHDj8kTPyPnbBOS5sRIAMTthFtFjDRbs,9876
5
5
  cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
6
+ cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
6
7
  cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
7
8
  cascade/benchmarks/job1.py,sha256=MOcZZYgf36MzHCjtby0lQyenM1ODUlagG8wtt2CbpnI,4640
8
9
  cascade/benchmarks/matmul.py,sha256=5STuvPY6Q37E2pKRCde9dQjL5M6tx7tkES9cBLZ6eK4,1972
9
10
  cascade/benchmarks/plotting.py,sha256=vSz9HHbqZwMXHpBUS-In6xsXGgK7QIoQTTiYfSwYwZs,4428
10
11
  cascade/benchmarks/reporting.py,sha256=MejaM-eekbMYLAnuBxGv_t4dR1ODJs4Rpc0fiZSGjyw,5410
11
12
  cascade/controller/__init__.py,sha256=p4C2p3S_0nUGamP9Mi6cSa5bvpiWbI6sVWtGhFnNqjw,1278
12
- cascade/controller/act.py,sha256=POzWwIlnp26hCY78Gp-ZMvCO6iXGOfA7TJUrEWrheyw,2941
13
+ cascade/controller/act.py,sha256=WHIsk4H-Bbyl_DABX2VWhyKy_cNnp12x1nilatPCL8I,2981
13
14
  cascade/controller/core.py,sha256=NqvZ5g5GNphwOpzdXbCI0_fxIzzmO97_n2xZKswK72Q,3589
14
- cascade/controller/impl.py,sha256=sLllTM509obsBHWbNtJ_Zu8Q6IJDG2IZOw0E08LDVfg,3247
15
- cascade/controller/notify.py,sha256=-FgHCsEVlghXuHX3_8Okyt_yL1AMj57ZBBHXiebX_Ys,5757
15
+ cascade/controller/impl.py,sha256=9jdTikYO8OkaNIfzatyr3Mhai5EfEhaeii9GaF9cQw4,3526
16
+ cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,5881
16
17
  cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
17
- cascade/executor/bridge.py,sha256=vrs-5_Qt2mgkAD7Mzi43Xt_q7tpXX6i1UOPfqZSxHfs,8169
18
+ cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
18
19
  cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
19
- cascade/executor/config.py,sha256=rA4WeCNbdJJ3FdOKJ6WN3_VUorYW3cqdMfKUYPSyj0Y,1471
20
+ cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
20
21
  cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
21
- cascade/executor/executor.py,sha256=SqMVM8BvCNM2r2Zbg9kxSxwFADAaoBU7nCMtfzktsgI,13282
22
- cascade/executor/msg.py,sha256=QW7Me-8Sin-x-f4M4bzvO7_av8MRkjnabQN6Ch3x22c,4230
22
+ cascade/executor/executor.py,sha256=egPhfqhzYfeM77Hu10-mGHNVsQAdqmZOA7hmjFP1Q8M,13484
23
+ cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
23
24
  cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
24
25
  cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
25
- cascade/executor/runner/entrypoint.py,sha256=e_MWYTSQroGMkgMddrqtn5DEqUeN-svC565TlOrv5iA,7598
26
+ cascade/executor/runner/entrypoint.py,sha256=32i2U4fmEvQnsV1MTl0Xf8mK_1nbk1BEVJqIidd6MRM,8042
26
27
  cascade/executor/runner/memory.py,sha256=jkAV9T7-imciVcGvkV7OhRfosEpOQJU1OME7z-4ztAs,6371
27
28
  cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
28
29
  cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
@@ -34,17 +35,17 @@ cascade/gateway/router.py,sha256=iN-dc3L46aEy0EV57NNKYwaqIu0Au9kImu1pg-UbxwE,768
34
35
  cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
35
36
  cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
36
37
  cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
37
- cascade/low/core.py,sha256=txya9rgks2b1ze9yLvFvrZCs8sCCtDUlfNwz4sHgybM,5994
38
+ cascade/low/core.py,sha256=_3x4ka_pmCgZbfwFeyhq8S4M6wmh0s24VRCLhk5yQFM,6444
38
39
  cascade/low/execution_context.py,sha256=cdDJLYhreo4T7t4qXgFBosncubZpTrm0hELo7q4miqo,6640
39
40
  cascade/low/func.py,sha256=ihL5n3cK-IJnATgP4Dub2m-Mp_jHMxJzCA1v4uMEsi8,5211
40
41
  cascade/low/into.py,sha256=QvjrcBuHfu7qpEkeB0EJu1EAaRxOEZskUnyjkRJ_9gA,3391
41
42
  cascade/low/tracing.py,sha256=qvGVKB1huwcYoyvMYN-2wQ92pLQTErocTjpIjWv9glA,4511
42
43
  cascade/low/views.py,sha256=UwafO2EQHre17GjG8hdzO8b6qBRtTRtDlhOc1pTf8Io,1822
43
44
  cascade/scheduler/__init__.py,sha256=VT2qQ0gOQWHC4-T0FcCs59w8WZ94j2nUn7tiGm5XepA,1148
44
- cascade/scheduler/api.py,sha256=uyRslN3ZNXOZNax27pQOrczeo9-2zTxal7-xYAPCDgI,5911
45
- cascade/scheduler/assign.py,sha256=XRTu3wEK2FYM-4Y_Gp4_O6h2wr6LSUa7e05DTwPHRcs,12250
46
- cascade/scheduler/core.py,sha256=XtXpfq6gtE8FS1BQd0ku0uQOrJpe1_CzzuBd98W6y7g,2891
47
- cascade/scheduler/precompute.py,sha256=QmZgriwfb07LViMztZogX5DOC1L4dCTbZJNGuFvFS9A,8513
45
+ cascade/scheduler/api.py,sha256=UuomWS2ISuDw-ngFFUKLyucygpTWF0EBW8ZuF91EUBU,7778
46
+ cascade/scheduler/assign.py,sha256=gpOLL22-k3ah4gihiztIGMX2uF0RdJ5AtJ8fOCJUviE,18362
47
+ cascade/scheduler/core.py,sha256=umORLC6SDeOyS4z8nQuVFkDukBJ96JfH4hdLSj6Km20,3378
48
+ cascade/scheduler/precompute.py,sha256=AhTn8RgnU4XuV_WAgbVXz9z0YRpNS6LCY1dJeHdTfCc,8709
48
49
  cascade/shm/__init__.py,sha256=R9QgGSnsl_YDjFjAUQkoleM_5yGM37ce9S8a4ReA1mE,3854
49
50
  cascade/shm/algorithms.py,sha256=SGxnJF4ovUaywTunMJWkG77l5DN-jXx7HgABt3sRJXM,2356
50
51
  cascade/shm/api.py,sha256=a_KrjyELsDms0Di0ThHsZe7MfmNEkekflmjXAQ1_Qws,6040
@@ -52,9 +53,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
52
53
  cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
53
54
  cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
54
55
  cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
55
- cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
56
+ cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
56
57
  earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
57
- earthkit/workflows/_version.py,sha256=-UXII43tJWWG-Bw3-ObfEfbloOAVS2Clozd55E6zYvA,72
58
+ earthkit/workflows/_version.py,sha256=nkd71CReR3pz5TZ9pcVgB2cP1MDj4YK6VH9UGJYzXDM,72
58
59
  earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
59
60
  earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
60
61
  earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -63,7 +64,7 @@ earthkit/workflows/taskgraph.py,sha256=RsT1Qlng1uPZSaSBNqE8vFsoI5J8DDcQl468YPX-k
63
64
  earthkit/workflows/transformers.py,sha256=BsUUvnG-UyerT3XUYcHc1qJkSsLc0ZX3Zxqq70tJWLU,2105
64
65
  earthkit/workflows/utility.py,sha256=ygqn1s846WQbo7HGY46Z8N1AXrDFGwyygSgsv4YnGJ8,1344
65
66
  earthkit/workflows/visualise.py,sha256=WbqJWvn648B7Qo3VCKJyoJzU6Mgvv0p3UWZb0lf01m8,2290
66
- earthkit/workflows/backends/__init__.py,sha256=XMJM2OL55bnWOSB_g4nzoY9dgBfnh250d8nLBOCj0MA,6013
67
+ earthkit/workflows/backends/__init__.py,sha256=6ONg-EdNODiqeBZqyosI5iq1UfZfaOLqhAo8l8_wn9o,6519
67
68
  earthkit/workflows/backends/arrayapi.py,sha256=QfUsTlYuFH3CroWdcf_XBcLnt2znMcS1HwNNEe8J0qU,2279
68
69
  earthkit/workflows/backends/earthkit.py,sha256=rZURJf6FLKcCjJkyWgOf6NqKjPZjSNX09dV_SicIlss,8958
69
70
  earthkit/workflows/backends/xarray.py,sha256=4pnnPgIug4DmvhigkU0JsituvdvspuVA_vxbIsrq8-A,6762
@@ -84,8 +85,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
84
85
  earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
85
86
  earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
86
87
  earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
87
- earthkit_workflows-0.4.0.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
88
- earthkit_workflows-0.4.0.dist-info/METADATA,sha256=GUxPv5SDQH-BE7InVU4Yy0MheZaSXdD1ys1seH-vPO4,1571
89
- earthkit_workflows-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
90
- earthkit_workflows-0.4.0.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
91
- earthkit_workflows-0.4.0.dist-info/RECORD,,
88
+ earthkit_workflows-0.4.2.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
89
+ earthkit_workflows-0.4.2.dist-info/METADATA,sha256=LWW-xDc0sq8cOdu6IpY335_MSFfe7Lmg1SHYT9cXjWA,1571
90
+ earthkit_workflows-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
91
+ earthkit_workflows-0.4.2.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
92
+ earthkit_workflows-0.4.2.dist-info/RECORD,,