earthkit-workflows 0.3.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cascade/benchmarks/__main__.py +11 -1
- cascade/benchmarks/job1.py +2 -2
- cascade/benchmarks/matmul.py +73 -0
- cascade/executor/runner/entrypoint.py +29 -3
- cascade/executor/runner/memory.py +23 -4
- cascade/low/execution_context.py +6 -0
- cascade/scheduler/api.py +1 -1
- cascade/scheduler/assign.py +100 -56
- cascade/scheduler/core.py +4 -0
- cascade/scheduler/{graph.py → precompute.py} +95 -44
- earthkit/workflows/__init__.py +4 -0
- earthkit/workflows/_version.py +1 -1
- earthkit/workflows/plugins/__init__.py +4 -0
- {earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/METADATA +1 -1
- {earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/RECORD +18 -17
- {earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/WHEEL +0 -0
- {earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/top_level.txt +0 -0
cascade/benchmarks/__main__.py
CHANGED
@@ -41,7 +41,7 @@ from cascade.executor.executor import Executor
 from cascade.executor.msg import BackboneAddress, ExecutorShutdown
 from cascade.low.core import JobInstance
 from cascade.low.func import msum
-from cascade.scheduler.…
+from cascade.scheduler.precompute import precompute
 from earthkit.workflows.graph import Graph, deduplicate_nodes

 logger = logging.getLogger("cascade.benchmarks")
@@ -73,6 +73,10 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:
             import cascade.benchmarks.generators as generators

             return generators.get_job()
+        elif benchmark.startswith("matmul"):
+            import cascade.benchmarks.matmul as matmul
+
+            return matmul.get_job()
         else:
             raise NotImplementedError(benchmark)
     else:
@@ -81,6 +85,12 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:

 def get_gpu_count() -> int:
     try:
+        if "CUDA_VISIBLE_DEVICES" in os.environ:
+            # TODO we dont want to just count, we want to actually use literally these ids
+            # NOTE this is particularly useful for "" value -- careful when refactoring
+            visible = os.environ["CUDA_VISIBLE_DEVICES"]
+            visible_count = sum(1 for e in visible if e == ",") + (1 if visible else 0)
+            return visible_count
         gpus = sum(
             1
             for l in subprocess.run(
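The new branch in get_gpu_count derives the GPU count from CUDA_VISIBLE_DEVICES by counting commas and adding one for a non-empty string, so an empty string (which hides all devices) maps to zero. A minimal standalone sketch of that counting rule; the helper name below is illustrative and not part of the package:

    def count_visible_devices(visible: str) -> int:
        # commas separate device ids; an empty value means "no devices visible"
        return sum(1 for ch in visible if ch == ",") + (1 if visible else 0)

    assert count_visible_devices("") == 0
    assert count_visible_devices("0") == 1
    assert count_visible_devices("2,3") == 2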
cascade/benchmarks/job1.py
CHANGED
@@ -16,10 +16,10 @@ Controlled by env var params: JOB1_{DATA_ROOT, GRID, ...}, see below
 import os

 import earthkit.data
-from ppcascade.fluent import from_source
-from ppcascade.utils.window import Range

 from earthkit.workflows.fluent import Payload
+from earthkit.workflows.plugins.pproc.fluent import from_source
+from earthkit.workflows.plugins.pproc.utils.window import Range

 # *** PARAMS ***

cascade/benchmarks/matmul.py
ADDED
@@ -0,0 +1,73 @@
+import os
+from typing import Any
+
+import jax
+import jax.numpy as jp
+import jax.random as jr
+
+from cascade.low.builders import JobBuilder, TaskBuilder
+from cascade.low.core import JobInstance
+
+
+def get_funcs():
+    K = int(os.environ["MATMUL_K"])
+    size = (2**K, 2**K)
+    E = int(os.environ["MATMUL_E"])
+
+    def source() -> Any:
+        k0 = jr.key(0)
+        m = jr.uniform(key=k0, shape=size)
+        return m
+
+    def powr(m: Any) -> Any:
+        print(f"powr device is {m.device}")
+        return m**E * jp.percentile(m, 0.7)
+
+    return source, powr
+
+
+def get_job() -> JobInstance:
+    L = int(os.environ["MATMUL_L"])
+    # D = os.environ["MATMUL_D"]
+    # it would be tempting to with jax.default_device(jax.devices(D)):
+    # alas, it doesn't work because we can't inject this at deser time
+
+    source, powr = get_funcs()
+    source_node = TaskBuilder.from_callable(source)
+    if os.environ.get("CUDA_VISIBLE_DEVICES", "") != "":
+        source_node.definition.needs_gpu = True
+        # currently no need to set True downstream since scheduler prefers no transfer
+
+    job = JobBuilder().with_node("source", source_node)
+    prv = "source"
+    for i in range(L):
+        cur = f"pow{i}"
+        node = TaskBuilder.from_callable(powr)
+        job = job.with_node(cur, node).with_edge(prv, cur, 0)
+        prv = cur
+
+    job = job.build().get_or_raise()
+    job.ext_outputs = list(job.outputs_of(cur))
+    return job
+
+
+def execute_locally():
+    L = int(os.environ["MATMUL_L"])
+
+    source, powr = get_funcs()
+
+    device = "gpu" if os.environ.get("CUDA_VISIBLE_DEVICES", "") != "" else "cpu"
+    print(f"device is {device}")
+    with jax.default_device(jax.devices(device)[0]):
+        m0 = source()
+        for _ in range(L):
+            m0 = powr(m0)
+
+    from multiprocessing.shared_memory import SharedMemory
+
+    mem = SharedMemory("benchmark_tmp", create=True, size=m0.nbytes)
+    mem.buf[:] = m0.tobytes()
+
+
+if __name__ == "__main__":
+    execute_locally()
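The new matmul benchmark is configured entirely through environment variables: MATMUL_K sets the matrix shape to 2**K x 2**K, MATMUL_E the per-step elementwise exponent, MATMUL_L the length of the source -> pow0 -> ... chain, and CUDA_VISIBLE_DEVICES decides whether the source task is marked as needing a GPU. A hedged sketch of running the local (non-scheduled) path, assuming the wheel is installed:

    import os
    import subprocess

    env = dict(os.environ)
    env.update({
        "MATMUL_K": "10",            # 1024 x 1024 matrix
        "MATMUL_E": "3",             # exponent applied by each powr step
        "MATMUL_L": "5",             # chain length: pow0 .. pow4
        "CUDA_VISIBLE_DEVICES": "",  # empty string forces the cpu branch of execute_locally
    })
    # runs execute_locally() via the module's __main__ guard
    subprocess.run(["python", "-m", "cascade.benchmarks.matmul"], env=env, check=True)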
cascade/executor/runner/entrypoint.py
CHANGED
@@ -67,6 +67,25 @@ class RunnerContext:
    )


+class Config:
+    """Some parameters to drive behaviour. Currently not exposed externally -- no clear argument
+    that they should be. As is, just a means of code experimentation.
+    """
+
+    # flushing approach -- when we finish a computation of task sequence, there is a question what
+    # to do with the output. We could either publish & drop, or publish and retain in memory. The
+    # former is is slower -- if the next task sequence needs this output, it requires a fetch & deser
+    # from cashme. But the latter is more risky -- we effectively have the same dataset twice in
+    # system memory. The `posttask_flush` below goes the former way, the `pretask_flush` is a careful
+    # way of latter -- we drop the output from memory only if the *next* task sequence does not need
+    # it, ie, we retain a cache of age 1. We could ultimately have controller decide about this, or
+    # decide dynamically based on memory pressure -- but neither is easy.
+    posttask_flush = False  # after task is done, drop all outputs from memory
+    pretask_flush = (
+        True  # when we receive a task, we drop those in memory that wont be needed
+    )
+
+
 def worker_address(workerId: WorkerId) -> BackboneAddress:
     return f"ipc:///tmp/{repr(workerId)}.socket"

@@ -83,7 +102,8 @@ def execute_sequence(
        for taskId in taskSequence.tasks:
            pckg.extend(executionContext.tasks[taskId].definition.environment)
            run(taskId, executionContext, memory)
-
+            if Config.posttask_flush:
+                memory.flush()
    except Exception as e:
        logger.exception("runner failure, about to report")
        callback(
@@ -107,8 +127,11 @@ def entrypoint(runnerContext: RunnerContext):
        PackagesEnv() as pckg,
    ):
        label("worker", repr(runnerContext.workerId))
-
-        os.environ…
+        worker_num = runnerContext.workerId.worker_num()
+        gpus = int(os.environ.get("CASCADE_GPU_COUNT", "0"))
+        os.environ["CUDA_VISIBLE_DEVICES"] = (
+            ",".join(str(worker_num)) if worker_num < gpus else ""
+        )
        # NOTE check any(task.definition.needs_gpu) anywhere?
        # TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho

@@ -151,6 +174,9 @@ def entrypoint(runnerContext: RunnerContext):
                for key, _ in runnerContext.job.tasks[task].definition.output_schema
            }
            missing_ds = required - availab_ds
+            if Config.pretask_flush:
+                extraneous_ds = availab_ds - required
+                memory.flush(extraneous_ds)
            if missing_ds:
                waiting_ts = mDes
                for ds in availab_ds.intersection(required):
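The two Config flags trade memory for fetch cost: posttask_flush drops every output as soon as a task sequence finishes, while pretask_flush keeps outputs around and only drops the ones the next task sequence will not consume. A toy illustration of the pretask decision, using plain sets rather than the package's own types:

    available = {"a.out", "b.out", "c.out"}   # datasets currently held by the worker
    required = {"b.out", "d.in"}              # inputs of the incoming task sequence

    # pretask_flush drops only what the next sequence will not use, so "b.out"
    # survives and does not have to be re-fetched and deserialized from shm
    extraneous = available - required
    assert extraneous == {"a.out", "c.out"}   # this set is what gets passed to memory.flush(...)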
cascade/executor/runner/memory.py
CHANGED
@@ -51,7 +51,6 @@ class Memory(AbstractContextManager):
        else:
            outputValue = "ok"

-        # TODO how do we purge from here over time?
        self.local[outputId] = outputValue

        if isPublish:
@@ -68,6 +67,18 @@ class Memory(AbstractContextManager):
                self.callback,
                DatasetPublished(ds=outputId, origin=self.worker, transmit_idx=None),
            )
+        else:
+            # NOTE even if its not actually published, we send the message to allow for
+            # marking the task itself as completed -- its odd, but arguably better than
+            # introducing a TaskCompleted message. TODO we should fine-grain host-wide
+            # and worker-only publishes at the `controller.notify` level, to not cause
+            # incorrect shm.purge calls at worklow end, which log an annoying key error
+            logger.debug(f"fake publish of {outputId} for the sake of task completion")
+            shmid = ds2shmid(outputId)
+            callback(
+                self.callback,
+                DatasetPublished(ds=outputId, origin=self.worker, transmit_idx=None),
+            )

    def provide(self, inputId: DatasetId, annotation: str) -> Any:
        if inputId not in self.local:
@@ -85,18 +96,24 @@ class Memory(AbstractContextManager):

    def pop(self, ds: DatasetId) -> None:
        if ds in self.local:
+            logger.debug(f"popping local {ds}")
            val = self.local.pop(ds)  # noqa: F841
            del val
        if ds in self.bufs:
+            logger.debug(f"popping buf {ds}")
            buf = self.bufs.pop(ds)
            buf.close()

-    def flush(self) -> None:
-        # NOTE poor man's memory management -- just drop those locals that…
+    def flush(self, datasets: set[DatasetId] = set()) -> None:
+        # NOTE poor man's memory management -- just drop those locals that didn't come from cashme. Called
        # after every taskSequence. In principle, we could purge some locals earlier, and ideally scheduler
        # would invoke some targeted purges to also remove some published ones earlier (eg, they are still
        # needed somewhere but not here)
-        purgeable = […
+        purgeable = [
+            inputId
+            for inputId in self.local
+            if inputId not in self.bufs and (not datasets or inputId in datasets)
+        ]
        logger.debug(f"will flush {len(purgeable)} datasets")
        for inputId in purgeable:
            self.local.pop(inputId)
@@ -115,6 +132,8 @@ class Memory(AbstractContextManager):
            free, total = torch.cuda.mem_get_info()
            logger.debug(f"cuda mem avail post cache empty: {free/total:.2%}")
            if free / total < 0.8:
+                # NOTE this ofc makes low sense if there is any other application (like browser or ollama)
+                # that the user may be running
                logger.warning("cuda mem avail low despite cache empty!")
                logger.debug(torch.cuda.memory_summary())
        except ImportError:
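Memory.flush now takes an optional set of datasets: with the default empty set it keeps the previous behaviour (drop every local value that is not backed by a shared-memory buffer), while a non-empty set restricts the purge to the listed datasets. A small sketch of that selection rule with stand-in dictionaries:

    def purgeable_ids(local: dict, bufs: dict, datasets: set) -> list:
        # mirrors the comprehension in the new flush(): skip shm-backed entries,
        # and honour the optional dataset filter
        return [
            inputId
            for inputId in local
            if inputId not in bufs and (not datasets or inputId in datasets)
        ]

    local = {"x": 1, "y": 2, "z": 3}
    bufs = {"z": object()}                                    # "z" is shm-backed
    assert purgeable_ids(local, bufs, set()) == ["x", "y"]    # flush(): drop all plain locals
    assert purgeable_ids(local, bufs, {"y", "z"}) == ["y"]    # flush({"y", "z"}): targeted drop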
cascade/low/execution_context.py
CHANGED
@@ -108,6 +108,12 @@ class JobExecutionContext:
        self.idle_workers.add(worker)

    def dataset_preparing(self, dataset: DatasetId, worker: WorkerId) -> None:
+        # NOTE Currently this is invoked during `build_assignment`, as we need
+        # some state tranisition to allow fusing opportunities as well as
+        # preventing double transmits. This may not be the best idea, eg for long
+        # fusing chains -- instead, we may execute this transition at the time
+        # it actually happens, granularize the preparing state into
+        # (will_appear, is_appearing), etc
        # NOTE Currently, these `if`s are necessary because we issue transmit
        # command when host *has* DS but worker does *not*. This ends up no-op,
        # but we totally dont want host state to reset -- it wouldnt recover
cascade/scheduler/api.py
CHANGED
@@ -136,7 +136,7 @@ def plan(
    for task in assignment.tasks:
        for ds in assignment.outputs:
            children = context.edge_o[ds]
-            context.dataset_preparing(ds, assignment.worker)
+            # context.dataset_preparing(ds, assignment.worker) # happends during build already
            update_worker2task_distance(
                children, assignment.worker, schedule, context
            )
cascade/scheduler/assign.py
CHANGED
@@ -18,50 +18,80 @@ from typing import Iterable, Iterator
 from cascade.low.core import DatasetId, HostId, TaskId, WorkerId
 from cascade.low.execution_context import DatasetStatus, JobExecutionContext
 from cascade.low.tracing import Microtrace, trace
-from cascade.scheduler.core import Assignment, ComponentId, Schedule
+from cascade.scheduler.core import Assignment, ComponentCore, ComponentId, Schedule

 logger = logging.getLogger(__name__)


 def build_assignment(
-    worker: WorkerId, task: TaskId, context: JobExecutionContext
+    worker: WorkerId, task: TaskId, context: JobExecutionContext, core: ComponentCore
 ) -> Assignment:
     eligible_load = {DatasetStatus.preparing, DatasetStatus.available}
     eligible_transmit = {DatasetStatus.available}
     prep: list[tuple[DatasetId, HostId]] = []
-    [9 removed lines not shown in source]
+    if task in core.fusing_opportunities:
+        tasks = core.fusing_opportunities.pop(task)
+    else:
+        tasks = [task]
+    assigned = []
+    exhausted = False
+    at_worker = context.worker2ds[worker]
+    at_host = context.host2ds[worker.host]
+    worker_has_gpu = context.environment.workers[worker].gpu > 0
+    while tasks and not exhausted:
+        task = tasks[0]
+        if context.job_instance.tasks[task].definition.needs_gpu and not worker_has_gpu:
+            if not assigned:
+                raise ValueError(f"tried to assign gpu {task=} to non-gpu {worker=}")
            else:
-                [5 removed lines not shown in source]
-                prep.append((dataset, candidate))
-                # NOTE this is a slight hack, to prevent issuing further transmit commands of this ds to this host
-                # in this phase. A proper state transition happens later in the `plan` phase. We may want to instead
-                # create a new `transmit_queue` state field to capture this, and consume it later during plan
-                context.host2ds[worker.host][dataset] = DatasetStatus.preparing
-                context.ds2host[dataset][worker.host] = DatasetStatus.preparing
+                break
+        for dataset in context.edge_i[task]:
+            if at_worker.get(dataset, DatasetStatus.missing) not in eligible_load:
+                if at_host.get(dataset, DatasetStatus.missing) in eligible_load:
+                    prep.append((dataset, worker.host))
                else:
-                    [1 removed line not shown in source]
+                    if any(
+                        candidate := host
+                        for host, status in context.ds2host[dataset].items()
+                        if status in eligible_transmit
+                    ):
+                        prep.append((dataset, candidate))
+                        context.dataset_preparing(dataset, worker)
+                    else:
+                        # if we are dealing with the first task to assign, we don't expect to be here!
+                        if not assigned:
+                            raise ValueError(f"{dataset=} not found anywhere!")
+                        # if we are already trying some fusing opportunities, it is legit to not find the dataset anywhere
+                        else:
+                            # TODO rollback preps done for this one task
+                            exhausted = True
+                            break
+        if not exhausted:
+            assigned.append(tasks.pop(0))
+            for dataset in context.task_o[task]:
+                context.dataset_preparing(dataset, worker)
+
+    if len(tasks) > 1:
+        head = tasks[0]
+        if head in core.fusing_opportunities:
+            raise ValueError(f"double assignment to {head} in fusing opportunities!")
+        core.fusing_opportunities[head] = tasks
+
+    # trim for only the necessary ones -- that is, having any edge outside of this current assignment
+    all_outputs = {ds for task in assigned for ds in context.task_o[task]}
+    assigned_tasks = set(assigned)
+    trimmed_outputs = {
+        ds
+        for ds in all_outputs
+        if (context.edge_o[ds] - assigned_tasks)
+        or (ds in context.job_instance.ext_outputs)
+    }

     return Assignment(
         worker=worker,
-        tasks=…
-            task…
-        ],  # TODO eager fusing for outdeg=1? Or heuristic via ratio of outdeg vs workers@component?
+        tasks=assigned,
         prep=prep,
-        outputs=…
-            ds for ds in context.task_o[task]…
-        },
+        outputs=trimmed_outputs,
     )


@@ -72,27 +102,39 @@ def _assignment_heuristic(
    component_id: ComponentId,
    context: JobExecutionContext,
 ) -> Iterator[Assignment]:
-    """Finds a reasonable assignment within a single component. Does not migrate hosts to a different component"""
+    """Finds a reasonable assignment within a single component. Does not migrate hosts to a different component."""
    start = perf_counter_ns()
    component = schedule.components[component_id]

+    def postproc_assignment(assignment: Assignment) -> None:
+        for assigned in assignment.tasks:
+            if assigned in component.computable:
+                component.computable.pop(assigned)
+                component.worker2task_values.remove(assigned)
+                schedule.computable -= 1
+            else:
+                # shortcut for fused-in tasks
+                component.is_computable_tracker[assigned] = set()
+        context.idle_workers.remove(worker)
+        component.weight -= len(assignment.tasks)
+
    # first, attempt optimum-distance assignment
    unassigned: list[TaskId] = []
    for task in tasks:
+        if task not in component.computable:
+            # it may be that some fusing for previous task already assigned this
+            continue
        opt_dist = component.computable[task]
        was_assigned = False
        for idx, worker in enumerate(workers):
            if component.worker2task_distance[worker][task] == opt_dist:
                end = perf_counter_ns()
                trace(Microtrace.ctrl_assign, end - start)
-                [1 removed line not shown in source]
+                assignment = build_assignment(worker, task, context, component.core)
+                yield assignment
                start = perf_counter_ns()
+                postproc_assignment(assignment)
                workers.pop(idx)
-                component.computable.pop(task)
-                component.worker2task_values.remove(task)
-                component.weight -= 1
-                schedule.computable -= 1
-                context.idle_workers.remove(worker)
                was_assigned = True
                break
        if not was_assigned:
@@ -109,17 +151,17 @@ def _assignment_heuristic(
    candidates.sort(key=lambda e: (e[0], e[1]))
    for _, _, worker, task in candidates:
        if task in remaining_t and worker in remaining_w:
+            if task not in component.computable:
+                # it may be that some fusing for previous task already assigned this
+                continue
            end = perf_counter_ns()
            trace(Microtrace.ctrl_assign, end - start)
-            [1 removed line not shown in source]
+            assignment = build_assignment(worker, task, context, component.core)
+            yield assignment
            start = perf_counter_ns()
-            [1 removed line not shown in source]
-            component.worker2task_values.remove(task)
+            postproc_assignment(assignment)
            remaining_t.remove(task)
            remaining_w.remove(worker)
-            context.idle_workers.remove(worker)
-            schedule.computable -= 1
-            component.weight -= 1

    end = perf_counter_ns()
    trace(Microtrace.ctrl_assign, end - start)
@@ -131,27 +173,29 @@ def assign_within_component(
    component_id: ComponentId,
    context: JobExecutionContext,
 ) -> Iterator[Assignment]:
-    """We first handle gpu…
+    """We first handle tasks requiring a gpu, then tasks whose child requires a gpu, last cpu only tasks, using the same algorithm for either case"""
    # TODO employ a more systematic solution and handle all multicriterially at once -- ideally together with adding support for multi-gpu-groups
+    # NOTE this is getting even more important as we started considering gpu fused distance
+    # NOTE the concept of "strategic wait" is completely missing here (eg dont assign a gpu worker to a cpu task because there will come a gpu task in a few secs)
    cpu_t: list[TaskId] = []
    gpu_t: list[TaskId] = []
-    [2 removed lines not shown in source]
-    for task in…
+    opu_t: list[TaskId] = []
+    component = schedule.components[component_id]
+    for task in component.computable.keys():
        if context.job_instance.tasks[task].definition.needs_gpu:
            gpu_t.append(task)
+        elif component.core.gpu_fused_distance[task] is not None:
+            opu_t.append(task)
        else:
            cpu_t.append(task)
-    [1 removed line not shown in source]
-        if context.environment.workers[worker].gpu > 0…
-    [3 removed lines not shown in source]
-    yield from _assignment_heuristic(schedule,…
-    for worker in…
-        [1 removed line not shown in source]
-        cpu_w.append(worker)
-    yield from _assignment_heuristic(schedule, cpu_t, cpu_w, component_id, context)
+    eligible_w = [
+        worker for worker in workers if context.environment.workers[worker].gpu > 0
+    ]
+    yield from _assignment_heuristic(schedule, gpu_t, eligible_w, component_id, context)
+    eligible_w = [worker for worker in eligible_w if worker in context.idle_workers]
+    yield from _assignment_heuristic(schedule, opu_t, eligible_w, component_id, context)
+    eligible_w = [worker for worker in workers if worker in context.idle_workers]
+    yield from _assignment_heuristic(schedule, cpu_t, eligible_w, component_id, context)


 def update_worker2task_distance(
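assign_within_component now splits the computable tasks of a component into three buckets and feeds them to the same heuristic with progressively narrower worker pools: tasks that need a GPU, tasks with a GPU task downstream along their fusing chain (gpu_fused_distance set), and plain CPU tasks. A simplified sketch of the bucketing with a stand-in task record, not the package's own types:

    from dataclasses import dataclass

    @dataclass
    class TaskInfo:
        needs_gpu: bool
        gpu_fused_distance: int | None  # hops to a gpu task along the fused chain, None if no such task

    def bucket(tasks: dict[str, TaskInfo]) -> tuple[list[str], list[str], list[str]]:
        gpu_t, opu_t, cpu_t = [], [], []
        for name, info in tasks.items():
            if info.needs_gpu:
                gpu_t.append(name)                 # must go to a gpu worker
            elif info.gpu_fused_distance is not None:
                opu_t.append(name)                 # prefers a gpu worker so the fused chain stays local
            else:
                cpu_t.append(name)                 # any worker will do
        return gpu_t, opu_t, cpu_t

    gpu_t, opu_t, cpu_t = bucket({
        "infer": TaskInfo(True, 0),
        "preproc": TaskInfo(False, 1),
        "plot": TaskInfo(False, None),
    })
    assert (gpu_t, opu_t, cpu_t) == (["infer"], ["preproc"], ["plot"])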
cascade/scheduler/core.py
CHANGED
@@ -22,6 +22,10 @@ class ComponentCore:
    distance_matrix: Task2TaskDistance  # nearest common descendant
    value: TaskValue  # closer to a sink -> higher value
    depth: int  # maximum value
+    fusing_opportunities: dict[TaskId, list[TaskId]]
+    gpu_fused_distance: dict[
+        TaskId, int | None
+    ]  # closer to a gpu task -> lower value. Using fusing_opportunities paths only

    def weight(self) -> int:
        # TODO eventually replace with runtime sum or smth
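For intuition, here is the shape these two new fields take for a linear chain a -> b -> c in which only b needs a GPU; the values are hand-derived from the crawl added in precompute.py below, not taken from a real run:

    # keyed by the head of a fusable chain, tasks listed in execution order
    fusing_opportunities = {"a": ["a", "b", "c"]}
    # steps down the fused chain to the nearest gpu task; None if no gpu task lies downstream
    gpu_fused_distance = {"a": 1, "b": 0, "c": None}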
cascade/scheduler/{graph.py → precompute.py}
RENAMED
@@ -26,50 +26,55 @@ logger = logging.getLogger(__name__)
 PlainComponent = tuple[list[TaskId], list[TaskId]]  # nodes, sources


-def …
-    paths: Task2TaskDistance,
+def _nearest_common_descendant(
+    paths: Task2TaskDistance,
+    nodes: list[TaskId],
+    L: int,
+    parents: dict[TaskId, set[TaskId]],
+    children: dict[TaskId, set[TaskId]],
 ) -> Task2TaskDistance:
+    # well crawl through the graph starting from sinks
+    remaining_children = {v: len(children[v]) for v in nodes}
+    queue = [v for v in nodes if remaining_children[v] == 0]
+
+    # for each pair of vertices V & U, we store here their so-far-nearest common descendant D + max(dist(V, D), dist(U, D))
+    # we need to keep track of D while we build this to be able to recalculate, but we'll drop it in the end
+    result: dict[TaskId, dict[TaskId, tuple[TaskId, int]]] = {}
+    while queue:
+        v = queue.pop(0)
+        result[v] = {}
+        # for each u, do we have a common ancestor with it?
+        for u in nodes:
+            # if we are their ancestor then we are a common ancestor, though not necessarily the nearest one
+            if v in paths[u]:
+                result[v][u] = (v, paths[u][v])
+            # some of our children may have a common ancestor with u
+            for c in children[v]:
+                if u in result[c]:
+                    d = result[c][u][0]
+                    dist = max(paths[v][d], paths[u][d])
+                    if u not in result[v] or result[v][u][1] > dist:
+                        result[v][u] = (d, dist)
+        # identify whether any of our parents children were completely processed -- if yes,
+        # we can continue the crawl with them
+        for p in parents[v]:
+            remaining_children[p] -= 1
+            if remaining_children[p] == 0:
+                queue.append(p)
+
+    # just drop the D witness, and fill default L if no common ancestor whatsoever
    ncd: Task2TaskDistance = {}
-    [7 removed lines not shown in source]
-        i = 0
-        # TODO we convert from double dict to dict of tuples -- extend coptrs to support the other as well to get rid fo this
-        for a in paths.keys():
-            for b in paths[a].keys():
-                if a not in d1:
-                    d1[a] = i
-                    d2[i] = a
-                    i += 1
-                if b not in d1:
-                    d1[b] = i
-                    d2[i] = b
-                    i += 1
-                m[(d1[a], d1[b])] = paths[a][b]
-        ncdT: dict[tuple[int, int], int] = coptrs.nearest_common_descendant(m, L)
-        for (ai, bi), e in ncdT.items():
-            if d2[ai] not in ncd:
-                ncd[d2[ai]] = {}
-            ncd[d2[ai]][d2[bi]] = e
-    except ImportError:
-        logger.warning("coptrs not found, falling back to python")
-        for a in nodes:
-            ncd[a] = {}
-            for b in nodes:
-                if b == a:
-                    ncd[a][b] = 0
-                    continue
-                ncd[a][b] = L
-                for c in nodes:
-                    ncd[a][b] = min(ncd[a][b], max(paths[a][c], paths[b][c]))
+    for v in nodes:
+        ncd[v] = {}
+        for u in nodes:
+            if u in result[v]:
+                ncd[v][u] = result[v][u][1]
+            else:
+                ncd[v][u] = L
    return ncd


-def …
+def _decompose(
    nodes: list[TaskId],
    edge_i: dict[TaskId, set[TaskId]],
    edge_o: dict[TaskId, set[TaskId]],
@@ -102,10 +107,11 @@ def decompose(
    )


-def …
+def _enrich(
    plain_component: PlainComponent,
    edge_i: dict[TaskId, set[TaskId]],
    edge_o: dict[TaskId, set[TaskId]],
+    needs_gpu: set[TaskId],
 ) -> ComponentCore:
    nodes, sources = plain_component
    logger.debug(
@@ -148,7 +154,44 @@ def enrich(
                paths[v][desc] = min(paths[v][desc], dist + 1)
            value[v] = max(value[v], value[c] - 1)

-    [1 removed line not shown in source]
+    # calculate ncd
+    ncd = _nearest_common_descendant(paths, nodes, L, edge_i, edge_o)
+
+    # fusing opportunities
+    # TODO we just arbitrarily crawl down from sinks, until everything is
+    # decomposed into paths. A smarter approach would utilize profiling
+    # information such as dataset size, trying to fuse the large datasets
+    # first so that they end up on the longest paths
+    fusing_opportunities = {}
+    gpu_fused_distance = {}
+    fused = set()
+    while layers:
+        layer = layers.pop(0)
+        while layer:
+            gpu_distance = None
+            head = layer.pop(0)
+            if head in fused:
+                continue
+            chain = []
+            fused.add(head)
+            found = True
+            while found:
+                if head in needs_gpu:
+                    gpu_distance = 0
+                elif gpu_distance is not None:
+                    gpu_distance += 1
+                gpu_fused_distance[head] = gpu_distance
+                found = False
+                for edge in edge_i[head]:
+                    if edge not in fused:
+                        chain.insert(0, head)
+                        head = edge
+                        fused.add(head)
+                        found = True
+                        break
+            if len(chain) > 0:
+                chain.insert(0, head)
+                fusing_opportunities[head] = chain

    return ComponentCore(
        nodes=nodes,
@@ -156,6 +199,8 @@ def enrich(
        distance_matrix=ncd,
        value=value,
        depth=L,
+        fusing_opportunities=fusing_opportunities,
+        gpu_fused_distance=gpu_fused_distance,
    )


@@ -172,14 +217,20 @@ def precompute(job_instance: JobInstance) -> Preschedule:
    for vert, inps in edge_i.items():
        edge_i_proj[vert] = {dataset.task for dataset in inps}

+    needs_gpu = {
+        task_id
+        for task_id, task in job_instance.tasks.items()
+        if task.definition.needs_gpu
+    }
+
    with ThreadPoolExecutor(max_workers=4) as tp:
        # TODO if coptrs is not used, then this doesnt make sense
-        f = lambda plain_component: timer(…
-            plain_component, edge_i_proj, edge_o_proj
+        f = lambda plain_component: timer(_enrich, Microtrace.presched_enrich)(
+            plain_component, edge_i_proj, edge_o_proj, needs_gpu
        )
        plain_components = (
            plain_component
-            for plain_component in timer(…
+            for plain_component in timer(_decompose, Microtrace.presched_decompose)(
                list(job_instance.tasks.keys()),
                edge_i_proj,
                edge_o_proj,
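The pure-python _nearest_common_descendant replaces the coptrs-backed implementation and its python fallback. For every pair (v, u) it records the minimum over common descendants d of max(dist(v, d), dist(u, d)), defaulting to the component depth L when the pair has no common descendant. A hand-worked check of that metric on a small diamond DAG (a hypothetical component, not code from the package):

    L = 4
    dist = {  # pairwise reachability distances, including dist[x][x] = 0
        "a": {"a": 0, "b": 1, "c": 1, "d": 2},
        "b": {"b": 0, "d": 1},
        "c": {"c": 0, "d": 1},
        "d": {"d": 0},
    }

    def ncd(v: str, u: str) -> int:
        candidates = [max(dv, dist[u][d]) for d, dv in dist[v].items() if d in dist[u]]
        return min(candidates, default=L)

    assert ncd("b", "c") == 1  # b and c first meet at d, one hop from either side
    assert ncd("a", "d") == 2  # d itself is their nearest common descendant
    assert ncd("a", "b") == 1  # b is a descendant of both a (distance 1) and b (distance 0)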
earthkit/workflows/__init__.py
CHANGED
@@ -6,8 +6,12 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.

+import pkgutil
+
 import dill

+__path__ = pkgutil.extend_path(__path__, __name__)
+
 try:
     from ._version import __version__  # noqa: F401
 except ImportError:
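The pkgutil.extend_path call makes earthkit.workflows a pkgutil-style extensible package, so separately installed distributions can contribute submodules under the same import root; the new earthkit.workflows.plugins.pproc imports in job1.py presumably rely on a plugin distribution installed this way. A minimal sketch of inspecting contributed plugins, assuming this wheel is installed; the loop may print nothing if no plugin distribution is present:

    import pkgutil

    import earthkit.workflows.plugins as plugins

    # list whatever plugin packages other installed distributions have added
    # under the shared earthkit.workflows.plugins namespace
    for mod in pkgutil.iter_modules(plugins.__path__, plugins.__name__ + "."):
        print(mod.name)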
earthkit/workflows/_version.py
CHANGED
@@ -1,2 +1,2 @@
 # Do not change! Do not track in version control!
-__version__ = "0.3.6"
+__version__ = "0.4.0"
{earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: earthkit-workflows
-Version: 0.3.6
+Version: 0.4.0
 Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License-Expression: Apache-2.0
{earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,10 +1,11 @@
 cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
-cascade/benchmarks/__main__.py,sha256=…
+cascade/benchmarks/__main__.py,sha256=n0RX44Sj_j6InFNKCjwXRVWKTYznMsrPBdf8kwGKhjM,8065
 cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
 cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
-cascade/benchmarks/job1.py,sha256=…
+cascade/benchmarks/job1.py,sha256=MOcZZYgf36MzHCjtby0lQyenM1ODUlagG8wtt2CbpnI,4640
+cascade/benchmarks/matmul.py,sha256=5STuvPY6Q37E2pKRCde9dQjL5M6tx7tkES9cBLZ6eK4,1972
 cascade/benchmarks/plotting.py,sha256=vSz9HHbqZwMXHpBUS-In6xsXGgK7QIoQTTiYfSwYwZs,4428
 cascade/benchmarks/reporting.py,sha256=MejaM-eekbMYLAnuBxGv_t4dR1ODJs4Rpc0fiZSGjyw,5410
 cascade/controller/__init__.py,sha256=p4C2p3S_0nUGamP9Mi6cSa5bvpiWbI6sVWtGhFnNqjw,1278
@@ -21,8 +22,8 @@ cascade/executor/executor.py,sha256=SqMVM8BvCNM2r2Zbg9kxSxwFADAaoBU7nCMtfzktsgI,
 cascade/executor/msg.py,sha256=QW7Me-8Sin-x-f4M4bzvO7_av8MRkjnabQN6Ch3x22c,4230
 cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
 cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
-cascade/executor/runner/entrypoint.py,sha256=…
-cascade/executor/runner/memory.py,sha256=…
+cascade/executor/runner/entrypoint.py,sha256=e_MWYTSQroGMkgMddrqtn5DEqUeN-svC565TlOrv5iA,7598
+cascade/executor/runner/memory.py,sha256=jkAV9T7-imciVcGvkV7OhRfosEpOQJU1OME7z-4ztAs,6371
 cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
 cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
 cascade/gateway/__init__.py,sha256=1EzMKdLFXEucj0YWOlyVqLx4suOntitwM03T_rRubIk,829
@@ -34,16 +35,16 @@ cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,377
 cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
 cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
 cascade/low/core.py,sha256=txya9rgks2b1ze9yLvFvrZCs8sCCtDUlfNwz4sHgybM,5994
-cascade/low/execution_context.py,sha256=…
+cascade/low/execution_context.py,sha256=cdDJLYhreo4T7t4qXgFBosncubZpTrm0hELo7q4miqo,6640
 cascade/low/func.py,sha256=ihL5n3cK-IJnATgP4Dub2m-Mp_jHMxJzCA1v4uMEsi8,5211
 cascade/low/into.py,sha256=QvjrcBuHfu7qpEkeB0EJu1EAaRxOEZskUnyjkRJ_9gA,3391
 cascade/low/tracing.py,sha256=qvGVKB1huwcYoyvMYN-2wQ92pLQTErocTjpIjWv9glA,4511
 cascade/low/views.py,sha256=UwafO2EQHre17GjG8hdzO8b6qBRtTRtDlhOc1pTf8Io,1822
 cascade/scheduler/__init__.py,sha256=VT2qQ0gOQWHC4-T0FcCs59w8WZ94j2nUn7tiGm5XepA,1148
-cascade/scheduler/api.py,sha256=…
-cascade/scheduler/assign.py,sha256=…
-cascade/scheduler/core.py,sha256=…
-cascade/scheduler/…
+cascade/scheduler/api.py,sha256=uyRslN3ZNXOZNax27pQOrczeo9-2zTxal7-xYAPCDgI,5911
+cascade/scheduler/assign.py,sha256=XRTu3wEK2FYM-4Y_Gp4_O6h2wr6LSUa7e05DTwPHRcs,12250
+cascade/scheduler/core.py,sha256=XtXpfq6gtE8FS1BQd0ku0uQOrJpe1_CzzuBd98W6y7g,2891
+cascade/scheduler/precompute.py,sha256=QmZgriwfb07LViMztZogX5DOC1L4dCTbZJNGuFvFS9A,8513
 cascade/shm/__init__.py,sha256=R9QgGSnsl_YDjFjAUQkoleM_5yGM37ce9S8a4ReA1mE,3854
 cascade/shm/algorithms.py,sha256=SGxnJF4ovUaywTunMJWkG77l5DN-jXx7HgABt3sRJXM,2356
 cascade/shm/api.py,sha256=a_KrjyELsDms0Di0ThHsZe7MfmNEkekflmjXAQ1_Qws,6040
@@ -52,8 +53,8 @@ cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
 cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
 cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
 cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
-earthkit/workflows/__init__.py,sha256…
-earthkit/workflows/_version.py,sha256…
+earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
+earthkit/workflows/_version.py,sha256=-UXII43tJWWG-Bw3-ObfEfbloOAVS2Clozd55E6zYvA,72
 earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
 earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
 earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -82,9 +83,9 @@ earthkit/workflows/graph/samplegraphs.py,sha256=GafOqOcM0QvVLe4w4qHKFhBLXwr3PBrn
 earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESsMDAM,4215
 earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
 earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
-earthkit/workflows/plugins/__init__.py,sha256=…
-earthkit_workflows-0.…
-earthkit_workflows-0.…
-earthkit_workflows-0.…
-earthkit_workflows-0.…
-earthkit_workflows-0.…
+earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
+earthkit_workflows-0.4.0.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
+earthkit_workflows-0.4.0.dist-info/METADATA,sha256=GUxPv5SDQH-BE7InVU4Yy0MheZaSXdD1ys1seH-vPO4,1571
+earthkit_workflows-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+earthkit_workflows-0.4.0.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
+earthkit_workflows-0.4.0.dist-info/RECORD,,

{earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/WHEEL
File without changes
{earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/licenses/LICENSE
File without changes
{earthkit_workflows-0.3.6.dist-info → earthkit_workflows-0.4.0.dist-info}/top_level.txt
File without changes