earthkit-workflows 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cascade/benchmarks/__main__.py +29 -5
- cascade/benchmarks/job1.py +2 -2
- cascade/benchmarks/matmul.py +73 -0
- cascade/executor/executor.py +18 -7
- cascade/executor/runner/entrypoint.py +36 -5
- cascade/executor/runner/memory.py +23 -4
- cascade/gateway/__main__.py +8 -4
- cascade/gateway/router.py +19 -6
- cascade/gateway/server.py +2 -2
- cascade/low/execution_context.py +6 -0
- cascade/scheduler/api.py +1 -1
- cascade/scheduler/assign.py +100 -56
- cascade/scheduler/core.py +4 -0
- cascade/scheduler/{graph.py → precompute.py} +95 -44
- earthkit/workflows/__init__.py +4 -0
- earthkit/workflows/_version.py +1 -1
- earthkit/workflows/plugins/__init__.py +4 -0
- {earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/METADATA +1 -1
- {earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/RECORD +22 -21
- {earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/WHEEL +0 -0
- {earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/top_level.txt +0 -0
cascade/benchmarks/__main__.py
CHANGED
@@ -36,12 +36,12 @@ import cascade.low.into
 from cascade.controller.impl import run
 from cascade.executor.bridge import Bridge
 from cascade.executor.comms import callback
-from cascade.executor.config import logging_config
+from cascade.executor.config import logging_config, logging_config_filehandler
 from cascade.executor.executor import Executor
 from cascade.executor.msg import BackboneAddress, ExecutorShutdown
 from cascade.low.core import JobInstance
 from cascade.low.func import msum
-from cascade.scheduler.
+from cascade.scheduler.precompute import precompute
 from earthkit.workflows.graph import Graph, deduplicate_nodes

 logger = logging.getLogger("cascade.benchmarks")
@@ -73,6 +73,10 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:
             import cascade.benchmarks.generators as generators

             return generators.get_job()
+        elif benchmark.startswith("matmul"):
+            import cascade.benchmarks.matmul as matmul
+
+            return matmul.get_job()
         else:
             raise NotImplementedError(benchmark)
     else:
@@ -81,6 +85,12 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:

 def get_gpu_count() -> int:
     try:
+        if "CUDA_VISIBLE_DEVICES" in os.environ:
+            # TODO we dont want to just count, we want to actually use literally these ids
+            # NOTE this is particularly useful for "" value -- careful when refactoring
+            visible = os.environ["CUDA_VISIBLE_DEVICES"]
+            visible_count = sum(1 for e in visible if e == ",") + (1 if visible else 0)
+            return visible_count
         gpus = sum(
             1
             for l in subprocess.run(
@@ -105,8 +115,14 @@ def launch_executor(
     i: int,
     shm_vol_gb: int | None,
     gpu_count: int,
+    log_base: str | None,
 ):
-
+    if log_base is not None:
+        log_base = f"{log_base}.host{i}"
+        log_path = f"{log_base}.txt"
+        logging.config.dictConfig(logging_config_filehandler(log_path))
+    else:
+        logging.config.dictConfig(logging_config)
     logger.info(f"will set {gpu_count} gpus on host {i}")
     os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
     executor = Executor(
@@ -116,6 +132,7 @@ def launch_executor(
         f"h{i}",
         portBase,
         shm_vol_gb,
+        log_base,
     )
     executor.register()
     executor.recv_loop()
@@ -126,9 +143,14 @@ def run_locally(
     hosts: int,
     workers: int,
     portBase: int = 12345,
+    log_base: str | None = None,
     report_address: str | None = None,
 ):
-
+    if log_base is not None:
+        log_path = f"{log_base}.controller.txt"
+        logging.config.dictConfig(logging_config_filehandler(log_path))
+    else:
+        logging.config.dictConfig(logging_config)
     launch = perf_counter_ns()
     preschedule = precompute(job)
     c = f"tcp://localhost:{portBase}"
@@ -142,7 +164,7 @@ def run_locally(
         # NOTE forkserver/spawn seem to forget venv, we need fork
         p = multiprocessing.get_context("fork").Process(
             target=launch_executor,
-            args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count),
+            args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count, log_base),
         )
         p.start()
         ps.append(p)
@@ -172,6 +194,7 @@ def main_local(
     job: str | None = None,
     instance: str | None = None,
     port_base: int = 12345,
+    log_base: str | None = None,
 ) -> None:
     jobInstance = get_job(job, instance)
     run_locally(
@@ -180,6 +203,7 @@ def main_local(
         workers_per_host,
         report_address=report_address,
         portBase=port_base,
+        log_base=log_base,
     )

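The new CUDA_VISIBLE_DEVICES branch in get_gpu_count counts ids rather than querying hardware, so an explicitly empty value yields zero GPUs. A minimal standalone sketch of that counting rule (not part of the wheel; the helper name is made up):

    # Counting rule copied from the hunk above: commas + 1 for a non-empty string.
    def visible_gpu_count(visible: str) -> int:
        return sum(1 for e in visible if e == ",") + (1 if visible else 0)

    assert visible_gpu_count("") == 0        # "" deliberately hides all GPUs
    assert visible_gpu_count("0") == 1
    assert visible_gpu_count("0,1,3") == 3   # ids are only counted, not yet used directly (see TODO)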
|
cascade/benchmarks/job1.py
CHANGED
@@ -16,10 +16,10 @@ Controlled by env var params: JOB1_{DATA_ROOT, GRID, ...}, see below
 import os

 import earthkit.data
-from ppcascade.fluent import from_source
-from ppcascade.utils.window import Range

 from earthkit.workflows.fluent import Payload
+from earthkit.workflows.plugins.pproc.fluent import from_source
+from earthkit.workflows.plugins.pproc.utils.window import Range

 # *** PARAMS ***

cascade/benchmarks/matmul.py
ADDED
@@ -0,0 +1,73 @@
+import os
+from typing import Any
+
+import jax
+import jax.numpy as jp
+import jax.random as jr
+
+from cascade.low.builders import JobBuilder, TaskBuilder
+from cascade.low.core import JobInstance
+
+
+def get_funcs():
+    K = int(os.environ["MATMUL_K"])
+    size = (2**K, 2**K)
+    E = int(os.environ["MATMUL_E"])
+
+    def source() -> Any:
+        k0 = jr.key(0)
+        m = jr.uniform(key=k0, shape=size)
+        return m
+
+    def powr(m: Any) -> Any:
+        print(f"powr device is {m.device}")
+        return m**E * jp.percentile(m, 0.7)
+
+    return source, powr
+
+
+def get_job() -> JobInstance:
+    L = int(os.environ["MATMUL_L"])
+    # D = os.environ["MATMUL_D"]
+    # it would be tempting to with jax.default_device(jax.devices(D)):
+    # alas, it doesn't work because we can't inject this at deser time
+
+    source, powr = get_funcs()
+    source_node = TaskBuilder.from_callable(source)
+    if os.environ.get("CUDA_VISIBLE_DEVICES", "") != "":
+        source_node.definition.needs_gpu = True
+        # currently no need to set True downstream since scheduler prefers no transfer
+
+    job = JobBuilder().with_node("source", source_node)
+    prv = "source"
+    for i in range(L):
+        cur = f"pow{i}"
+        node = TaskBuilder.from_callable(powr)
+        job = job.with_node(cur, node).with_edge(prv, cur, 0)
+        prv = cur
+
+    job = job.build().get_or_raise()
+    job.ext_outputs = list(job.outputs_of(cur))
+    return job
+
+
+def execute_locally():
+    L = int(os.environ["MATMUL_L"])
+
+    source, powr = get_funcs()
+
+    device = "gpu" if os.environ.get("CUDA_VISIBLE_DEVICES", "") != "" else "cpu"
+    print(f"device is {device}")
+    with jax.default_device(jax.devices(device)[0]):
+        m0 = source()
+        for _ in range(L):
+            m0 = powr(m0)
+
+    from multiprocessing.shared_memory import SharedMemory
+
+    mem = SharedMemory("benchmark_tmp", create=True, size=m0.nbytes)
+    mem.buf[:] = m0.tobytes()
+
+
+if __name__ == "__main__":
+    execute_locally()
cascade/executor/executor.py
CHANGED
@@ -28,7 +28,7 @@ import cascade.shm.client as shm_client
 from cascade.executor.comms import GraceWatcher, Listener, ReliableSender, callback
 from cascade.executor.comms import default_message_resend_ms as resend_grace_ms
 from cascade.executor.comms import default_timeout_ms as comms_default_timeout_ms
-from cascade.executor.config import logging_config
+from cascade.executor.config import logging_config, logging_config_filehandler
 from cascade.executor.data_server import start_data_server
 from cascade.executor.msg import (
     Ack,
@@ -70,6 +70,7 @@ class Executor:
         host: HostId,
         portBase: int,
         shm_vol_gb: int | None = None,
+        log_base: str | None = None,
     ) -> None:
         self.job_instance = job_instance
         self.param_source = param_source(job_instance.edges)
@@ -78,6 +79,7 @@ class Executor:
         self.workers: dict[WorkerId, BaseProcess | None] = {
             WorkerId(host, f"w{i}"): None for i in range(workers)
         }
+        self.log_base = log_base

         self.datasets: set[DatasetId] = set()
         self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
@@ -92,17 +94,25 @@ class Executor:
         shm_port = portBase + 2
         shm_api.publish_client_port(shm_port)
         ctx = get_context("fork")
+        if log_base:
+            shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
+        else:
+            shm_logging = logging_config
         self.shm_process = ctx.Process(
             target=shm_server,
             args=(
                 shm_port,
                 shm_vol_gb * (1024**3) if shm_vol_gb else None,
-
+                shm_logging,
                 f"sCasc{host}",
             ),
         )
         self.shm_process.start()
         self.daddress = address_of(portBase + 1)
+        if log_base:
+            dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
+        else:
+            dsr_logging = logging_config
         self.data_server = ctx.Process(
             target=start_data_server,
             args=(
@@ -110,7 +120,7 @@ class Executor:
                 self.daddress,
                 self.host,
                 shm_port,
-
+                dsr_logging,
             ),
         )
         self.data_server.start()
@@ -181,6 +191,7 @@ class Executor:
             job=self.job_instance,
             param_source=self.param_source,
             callback=self.mlistener.address,
+            log_base=self.log_base,
         )
         p = ctx.Process(target=entrypoint, kwargs={"runnerContext": runnerContext})
         p.start()
@@ -222,17 +233,17 @@ class Executor:
         procFail = lambda ex: ex is not None and ex != 0
         for k, e in self.workers.items():
             if e is None:
-                ValueError(f"process on {k} is not alive")
+                raise ValueError(f"process on {k} is not alive")
             elif procFail(e.exitcode):
-                ValueError(
+                raise ValueError(
                     f"process on {k} failed to terminate correctly: {e.pid} -> {e.exitcode}"
                 )
         if procFail(self.shm_process.exitcode):
-            ValueError(
+            raise ValueError(
                 f"shm server {self.shm_process.pid} failed with {self.shm_process.exitcode}"
             )
         if procFail(self.data_server.exitcode):
-            ValueError(
+            raise ValueError(
                 f"data server {self.data_server.pid} failed with {self.data_server.exitcode}"
            )
         if self.heartbeat_watcher.is_breach() > 0:
cascade/executor/runner/entrypoint.py
CHANGED
@@ -17,7 +17,7 @@ import zmq

 import cascade.executor.serde as serde
 from cascade.executor.comms import callback
-from cascade.executor.config import logging_config
+from cascade.executor.config import logging_config, logging_config_filehandler
 from cascade.executor.msg import (
     BackboneAddress,
     DatasetPublished,
@@ -44,6 +44,7 @@ class RunnerContext:
     job: JobInstance
     callback: BackboneAddress
     param_source: dict[TaskId, dict[int | str, DatasetId]]
+    log_base: str | None

     def project(self, taskSequence: TaskSequence) -> ExecutionContext:
         schema_lookup: dict[DatasetId, str] = {}
@@ -66,6 +67,25 @@ class RunnerContext:
         )


+class Config:
+    """Some parameters to drive behaviour. Currently not exposed externally -- no clear argument
+    that they should be. As is, just a means of code experimentation.
+    """
+
+    # flushing approach -- when we finish a computation of task sequence, there is a question what
+    # to do with the output. We could either publish & drop, or publish and retain in memory. The
+    # former is is slower -- if the next task sequence needs this output, it requires a fetch & deser
+    # from cashme. But the latter is more risky -- we effectively have the same dataset twice in
+    # system memory. The `posttask_flush` below goes the former way, the `pretask_flush` is a careful
+    # way of latter -- we drop the output from memory only if the *next* task sequence does not need
+    # it, ie, we retain a cache of age 1. We could ultimately have controller decide about this, or
+    # decide dynamically based on memory pressure -- but neither is easy.
+    posttask_flush = False  # after task is done, drop all outputs from memory
+    pretask_flush = (
+        True  # when we receive a task, we drop those in memory that wont be needed
+    )
+
+
 def worker_address(workerId: WorkerId) -> BackboneAddress:
     return f"ipc:///tmp/{repr(workerId)}.socket"

@@ -82,7 +102,8 @@ def execute_sequence(
         for taskId in taskSequence.tasks:
             pckg.extend(executionContext.tasks[taskId].definition.environment)
             run(taskId, executionContext, memory)
-
+        if Config.posttask_flush:
+            memory.flush()
     except Exception as e:
         logger.exception("runner failure, about to report")
         callback(
@@ -92,7 +113,11 @@ def execute_sequence(


 def entrypoint(runnerContext: RunnerContext):
-
+    if runnerContext.log_base:
+        log_path = f"{runnerContext.log_base}.{runnerContext.workerId.worker}"
+        logging.config.dictConfig(logging_config_filehandler(log_path))
+    else:
+        logging.config.dictConfig(logging_config)
     ctx = zmq.Context()
     socket = ctx.socket(zmq.PULL)
     socket.bind(worker_address(runnerContext.workerId))
@@ -102,8 +127,11 @@ def entrypoint(runnerContext: RunnerContext):
         PackagesEnv() as pckg,
     ):
         label("worker", repr(runnerContext.workerId))
-
-        os.environ
+        worker_num = runnerContext.workerId.worker_num()
+        gpus = int(os.environ.get("CASCADE_GPU_COUNT", "0"))
+        os.environ["CUDA_VISIBLE_DEVICES"] = (
+            ",".join(str(worker_num)) if worker_num < gpus else ""
+        )
         # NOTE check any(task.definition.needs_gpu) anywhere?
         # TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho

@@ -146,6 +174,9 @@ def entrypoint(runnerContext: RunnerContext):
                 for key, _ in runnerContext.job.tasks[task].definition.output_schema
             }
             missing_ds = required - availab_ds
+            if Config.pretask_flush:
+                extraneous_ds = availab_ds - required
+                memory.flush(extraneous_ds)
             if missing_ds:
                 waiting_ts = mDes
                 for ds in availab_ds.intersection(required):
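For orientation, a simplified model of the per-worker pinning added to entrypoint (worker numbers and GPU counts below are made up): workers whose number is below CASCADE_GPU_COUNT get their own device id, the rest get an empty CUDA_VISIBLE_DEVICES and stay on cpu.

    def pinned_devices(worker_num: int, gpu_count: int) -> str:
        # mirrors the single-digit case of the hunk above; multi-gpu groups are not modelled
        return str(worker_num) if worker_num < gpu_count else ""

    assert pinned_devices(0, 2) == "0"
    assert pinned_devices(1, 2) == "1"
    assert pinned_devices(3, 2) == ""  # cpu-only worker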
cascade/executor/runner/memory.py
CHANGED
@@ -51,7 +51,6 @@ class Memory(AbstractContextManager):
         else:
             outputValue = "ok"

-        # TODO how do we purge from here over time?
         self.local[outputId] = outputValue

         if isPublish:
@@ -68,6 +67,18 @@ class Memory(AbstractContextManager):
                 self.callback,
                 DatasetPublished(ds=outputId, origin=self.worker, transmit_idx=None),
             )
+        else:
+            # NOTE even if its not actually published, we send the message to allow for
+            # marking the task itself as completed -- its odd, but arguably better than
+            # introducing a TaskCompleted message. TODO we should fine-grain host-wide
+            # and worker-only publishes at the `controller.notify` level, to not cause
+            # incorrect shm.purge calls at worklow end, which log an annoying key error
+            logger.debug(f"fake publish of {outputId} for the sake of task completion")
+            shmid = ds2shmid(outputId)
+            callback(
+                self.callback,
+                DatasetPublished(ds=outputId, origin=self.worker, transmit_idx=None),
+            )

     def provide(self, inputId: DatasetId, annotation: str) -> Any:
         if inputId not in self.local:
@@ -85,18 +96,24 @@ class Memory(AbstractContextManager):

     def pop(self, ds: DatasetId) -> None:
         if ds in self.local:
+            logger.debug(f"popping local {ds}")
             val = self.local.pop(ds)  # noqa: F841
             del val
         if ds in self.bufs:
+            logger.debug(f"popping buf {ds}")
             buf = self.bufs.pop(ds)
             buf.close()

-    def flush(self) -> None:
-        # NOTE poor man's memory management -- just drop those locals that
+    def flush(self, datasets: set[DatasetId] = set()) -> None:
+        # NOTE poor man's memory management -- just drop those locals that didn't come from cashme. Called
         # after every taskSequence. In principle, we could purge some locals earlier, and ideally scheduler
         # would invoke some targeted purges to also remove some published ones earlier (eg, they are still
         # needed somewhere but not here)
-        purgeable = [
+        purgeable = [
+            inputId
+            for inputId in self.local
+            if inputId not in self.bufs and (not datasets or inputId in datasets)
+        ]
         logger.debug(f"will flush {len(purgeable)} datasets")
         for inputId in purgeable:
             self.local.pop(inputId)
@@ -115,6 +132,8 @@ class Memory(AbstractContextManager):
             free, total = torch.cuda.mem_get_info()
             logger.debug(f"cuda mem avail post cache empty: {free/total:.2%}")
             if free / total < 0.8:
+                # NOTE this ofc makes low sense if there is any other application (like browser or ollama)
+                # that the user may be running
                 logger.warning("cuda mem avail low despite cache empty!")
             logger.debug(torch.cuda.memory_summary())
         except ImportError:
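A toy model of the reworked Memory.flush (plain dicts, no cascade imports): without an argument it keeps the old behaviour of dropping every non-buffer local, while the new datasets argument restricts the purge to the extraneous set computed in the pretask_flush path of entrypoint.

    local = {"a": 1, "b": 2, "c": 3}
    bufs = {"c"}  # backed by shm buffers, never purged here

    def purgeable(datasets: set[str] = set()) -> list[str]:
        return [k for k in local if k not in bufs and (not datasets or k in datasets)]

    assert sorted(purgeable()) == ["a", "b"]  # old behaviour: drop all locals
    assert purgeable({"b"}) == ["b"]          # new: drop only what the next task sequence won't need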
cascade/gateway/__main__.py
CHANGED
@@ -10,13 +10,17 @@ import logging.config

 import fire

-from cascade.executor.config import logging_config
+from cascade.executor.config import logging_config, logging_config_filehandler
 from cascade.gateway.server import serve


-def main(url: str) -> None:
-
-
+def main(url: str, log_base: str | None = None) -> None:
+    if log_base:
+        log_path = f"{log_base}/gateway.txt"
+        logging.config.dictConfig(logging_config_filehandler(log_path))
+    else:
+        logging.config.dictConfig(logging_config)
+    serve(url, log_base)


 if __name__ == "__main__":
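A hedged usage sketch of the new log_base plumbing (the URL and directory are placeholders): with log_base set, the gateway itself logs to <log_base>/gateway.txt and each spawned job is started with --log_base <log_base>/job.<job_id>, as wired through router.py below.

    from cascade.gateway.server import serve

    serve("tcp://0.0.0.0:5555", log_base="/tmp/cascade-logs")  # omit log_base to keep the default stderr logging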
cascade/gateway/router.py
CHANGED
@@ -43,7 +43,9 @@ class Job:
 local_job_port = 12345


-def _spawn_local(
+def _spawn_local(
+    job_spec: JobSpec, addr: str, job_id: JobId, log_base: str | None
+) -> subprocess.Popen:
     base = [
         "python",
         "-m",
@@ -68,11 +70,15 @@ def _spawn_local(job_spec: JobSpec, addr: str, job_id: JobId) -> subprocess.Pope
         f"{job_spec.hosts}",
     ]
     report = ["--report_address", f"{addr},{job_id}"]
+    if log_base:
+        logs = ["--log_base", f"{log_base}/job.{job_id}"]
+    else:
+        logs = []
     global local_job_port
     portBase = ["--port_base", str(local_job_port)]
     local_job_port += 1 + job_spec.hosts * job_spec.workers_per_host * 10
     return subprocess.Popen(
-        base + infra + report + portBase, env={**os.environ, **job_spec.envvars}
+        base + infra + report + portBase + logs, env={**os.environ, **job_spec.envvars}
     )


@@ -105,18 +111,23 @@ def _spawn_slurm(job_spec: JobSpec, addr: str, job_id: JobId) -> subprocess.Pope
     )


-def _spawn_subprocess(
+def _spawn_subprocess(
+    job_spec: JobSpec, addr: str, job_id: JobId, log_base: str | None
+) -> subprocess.Popen:
     if job_spec.use_slurm:
+        if log_base is not None:
+            raise ValueError(f"unexpected {log_base=}")
         return _spawn_slurm(job_spec, addr, job_id)
     else:
-        return _spawn_local(job_spec, addr, job_id)
+        return _spawn_local(job_spec, addr, job_id, log_base)


 class JobRouter:
-    def __init__(self, poller: zmq.Poller):
+    def __init__(self, poller: zmq.Poller, log_base: str | None):
         self.poller = poller
         self.jobs: dict[str, Job] = {}
         self.procs: dict[str, subprocess.Popen] = {}
+        self.log_base = log_base

     def spawn_job(self, job_spec: JobSpec) -> JobId:
         job_id = next_uuid(self.jobs.keys(), lambda: str(uuid.uuid4()))
@@ -131,7 +142,9 @@ class JobRouter:
         logger.debug(f"will spawn job {job_id} and listen on {full_addr}")
         self.poller.register(socket, flags=zmq.POLLIN)
         self.jobs[job_id] = Job(socket, JobProgressStarted, -1, {})
-        self.procs[job_id] = _spawn_subprocess(
+        self.procs[job_id] = _spawn_subprocess(
+            job_spec, full_addr, job_id, self.log_base
+        )
         return job_id

     def progress_of(
cascade/gateway/server.py
CHANGED
@@ -79,14 +79,14 @@ def handle_controller(socket: zmq.Socket, jobs: JobRouter) -> None:
         jobs.put_result(report.job_id, dataset_id, result)


-def serve(url: str) -> None:
+def serve(url: str, log_base: str | None = None) -> None:
     ctx = get_context()
     poller = zmq.Poller()

     fe = ctx.socket(zmq.REP)
     fe.bind(url)
     poller.register(fe, flags=zmq.POLLIN)
-    jobs = JobRouter(poller)
+    jobs = JobRouter(poller, log_base)

     logger.debug("entering recv loop")
     is_break = False
cascade/low/execution_context.py
CHANGED
@@ -108,6 +108,12 @@ class JobExecutionContext:
         self.idle_workers.add(worker)

     def dataset_preparing(self, dataset: DatasetId, worker: WorkerId) -> None:
+        # NOTE Currently this is invoked during `build_assignment`, as we need
+        # some state tranisition to allow fusing opportunities as well as
+        # preventing double transmits. This may not be the best idea, eg for long
+        # fusing chains -- instead, we may execute this transition at the time
+        # it actually happens, granularize the preparing state into
+        # (will_appear, is_appearing), etc
         # NOTE Currently, these `if`s are necessary because we issue transmit
         # command when host *has* DS but worker does *not*. This ends up no-op,
         # but we totally dont want host state to reset -- it wouldnt recover
cascade/scheduler/api.py
CHANGED
@@ -136,7 +136,7 @@ def plan(
     for task in assignment.tasks:
         for ds in assignment.outputs:
             children = context.edge_o[ds]
-            context.dataset_preparing(ds, assignment.worker)
+            # context.dataset_preparing(ds, assignment.worker)  # happends during build already
             update_worker2task_distance(
                 children, assignment.worker, schedule, context
             )
cascade/scheduler/assign.py
CHANGED
@@ -18,50 +18,80 @@ from typing import Iterable, Iterator
 from cascade.low.core import DatasetId, HostId, TaskId, WorkerId
 from cascade.low.execution_context import DatasetStatus, JobExecutionContext
 from cascade.low.tracing import Microtrace, trace
-from cascade.scheduler.core import Assignment, ComponentId, Schedule
+from cascade.scheduler.core import Assignment, ComponentCore, ComponentId, Schedule

 logger = logging.getLogger(__name__)


 def build_assignment(
-    worker: WorkerId, task: TaskId, context: JobExecutionContext
+    worker: WorkerId, task: TaskId, context: JobExecutionContext, core: ComponentCore
 ) -> Assignment:
     eligible_load = {DatasetStatus.preparing, DatasetStatus.available}
     eligible_transmit = {DatasetStatus.available}
     prep: list[tuple[DatasetId, HostId]] = []
-
-
-
-
-
-
-
-
-
+    if task in core.fusing_opportunities:
+        tasks = core.fusing_opportunities.pop(task)
+    else:
+        tasks = [task]
+    assigned = []
+    exhausted = False
+    at_worker = context.worker2ds[worker]
+    at_host = context.host2ds[worker.host]
+    worker_has_gpu = context.environment.workers[worker].gpu > 0
+    while tasks and not exhausted:
+        task = tasks[0]
+        if context.job_instance.tasks[task].definition.needs_gpu and not worker_has_gpu:
+            if not assigned:
+                raise ValueError(f"tried to assign gpu {task=} to non-gpu {worker=}")
             else:
-
-
-
-
-
-                prep.append((dataset, candidate))
-                # NOTE this is a slight hack, to prevent issuing further transmit commands of this ds to this host
-                # in this phase. A proper state transition happens later in the `plan` phase. We may want to instead
-                # create a new `transmit_queue` state field to capture this, and consume it later during plan
-                context.host2ds[worker.host][dataset] = DatasetStatus.preparing
-                context.ds2host[dataset][worker.host] = DatasetStatus.preparing
+                break
+        for dataset in context.edge_i[task]:
+            if at_worker.get(dataset, DatasetStatus.missing) not in eligible_load:
+                if at_host.get(dataset, DatasetStatus.missing) in eligible_load:
+                    prep.append((dataset, worker.host))
                 else:
-
+                    if any(
+                        candidate := host
+                        for host, status in context.ds2host[dataset].items()
+                        if status in eligible_transmit
+                    ):
+                        prep.append((dataset, candidate))
+                        context.dataset_preparing(dataset, worker)
+                    else:
+                        # if we are dealing with the first task to assign, we don't expect to be here!
+                        if not assigned:
+                            raise ValueError(f"{dataset=} not found anywhere!")
+                        # if we are already trying some fusing opportunities, it is legit to not find the dataset anywhere
+                        else:
+                            # TODO rollback preps done for this one task
+                            exhausted = True
+                            break
+        if not exhausted:
+            assigned.append(tasks.pop(0))
+            for dataset in context.task_o[task]:
+                context.dataset_preparing(dataset, worker)
+
+    if len(tasks) > 1:
+        head = tasks[0]
+        if head in core.fusing_opportunities:
+            raise ValueError(f"double assignment to {head} in fusing opportunities!")
+        core.fusing_opportunities[head] = tasks
+
+    # trim for only the necessary ones -- that is, having any edge outside of this current assignment
+    all_outputs = {ds for task in assigned for ds in context.task_o[task]}
+    assigned_tasks = set(assigned)
+    trimmed_outputs = {
+        ds
+        for ds in all_outputs
+        if (context.edge_o[ds] - assigned_tasks)
+        or (ds in context.job_instance.ext_outputs)
+    }

     return Assignment(
         worker=worker,
-        tasks=
-            task
-        ],  # TODO eager fusing for outdeg=1? Or heuristic via ratio of outdeg vs workers@component?
+        tasks=assigned,
         prep=prep,
-        outputs=
-            ds for ds in context.task_o[task]
-        },
+        outputs=trimmed_outputs,
     )


@@ -72,27 +102,39 @@ def _assignment_heuristic(
     component_id: ComponentId,
     context: JobExecutionContext,
 ) -> Iterator[Assignment]:
-    """Finds a reasonable assignment within a single component. Does not migrate hosts to a different component"""
+    """Finds a reasonable assignment within a single component. Does not migrate hosts to a different component."""
     start = perf_counter_ns()
     component = schedule.components[component_id]

+    def postproc_assignment(assignment: Assignment) -> None:
+        for assigned in assignment.tasks:
+            if assigned in component.computable:
+                component.computable.pop(assigned)
+                component.worker2task_values.remove(assigned)
+                schedule.computable -= 1
+            else:
+                # shortcut for fused-in tasks
+                component.is_computable_tracker[assigned] = set()
+        context.idle_workers.remove(worker)
+        component.weight -= len(assignment.tasks)
+
     # first, attempt optimum-distance assignment
     unassigned: list[TaskId] = []
     for task in tasks:
+        if task not in component.computable:
+            # it may be that some fusing for previous task already assigned this
+            continue
         opt_dist = component.computable[task]
         was_assigned = False
         for idx, worker in enumerate(workers):
             if component.worker2task_distance[worker][task] == opt_dist:
                 end = perf_counter_ns()
                 trace(Microtrace.ctrl_assign, end - start)
-
+                assignment = build_assignment(worker, task, context, component.core)
+                yield assignment
                 start = perf_counter_ns()
+                postproc_assignment(assignment)
                 workers.pop(idx)
-                component.computable.pop(task)
-                component.worker2task_values.remove(task)
-                component.weight -= 1
-                schedule.computable -= 1
-                context.idle_workers.remove(worker)
                 was_assigned = True
                 break
         if not was_assigned:
@@ -109,17 +151,17 @@ def _assignment_heuristic(
     candidates.sort(key=lambda e: (e[0], e[1]))
     for _, _, worker, task in candidates:
         if task in remaining_t and worker in remaining_w:
+            if task not in component.computable:
+                # it may be that some fusing for previous task already assigned this
+                continue
             end = perf_counter_ns()
             trace(Microtrace.ctrl_assign, end - start)
-
+            assignment = build_assignment(worker, task, context, component.core)
+            yield assignment
             start = perf_counter_ns()
-
-            component.worker2task_values.remove(task)
+            postproc_assignment(assignment)
             remaining_t.remove(task)
             remaining_w.remove(worker)
-            context.idle_workers.remove(worker)
-            schedule.computable -= 1
-            component.weight -= 1

     end = perf_counter_ns()
     trace(Microtrace.ctrl_assign, end - start)
@@ -131,27 +173,29 @@ def assign_within_component(
     component_id: ComponentId,
     context: JobExecutionContext,
 ) -> Iterator[Assignment]:
-    """We first handle gpu
+    """We first handle tasks requiring a gpu, then tasks whose child requires a gpu, last cpu only tasks, using the same algorithm for either case"""
     # TODO employ a more systematic solution and handle all multicriterially at once -- ideally together with adding support for multi-gpu-groups
+    # NOTE this is getting even more important as we started considering gpu fused distance
+    # NOTE the concept of "strategic wait" is completely missing here (eg dont assign a gpu worker to a cpu task because there will come a gpu task in a few secs)
     cpu_t: list[TaskId] = []
     gpu_t: list[TaskId] = []
-
-
-    for task in
+    opu_t: list[TaskId] = []
+    component = schedule.components[component_id]
+    for task in component.computable.keys():
         if context.job_instance.tasks[task].definition.needs_gpu:
             gpu_t.append(task)
+        elif component.core.gpu_fused_distance[task] is not None:
+            opu_t.append(task)
         else:
             cpu_t.append(task)
-
-    if context.environment.workers[worker].gpu > 0
-
-
-
-    yield from _assignment_heuristic(schedule,
-    for worker in
-
-        cpu_w.append(worker)
-    yield from _assignment_heuristic(schedule, cpu_t, cpu_w, component_id, context)
+    eligible_w = [
+        worker for worker in workers if context.environment.workers[worker].gpu > 0
+    ]
+    yield from _assignment_heuristic(schedule, gpu_t, eligible_w, component_id, context)
+    eligible_w = [worker for worker in eligible_w if worker in context.idle_workers]
+    yield from _assignment_heuristic(schedule, opu_t, eligible_w, component_id, context)
+    eligible_w = [worker for worker in workers if worker in context.idle_workers]
+    yield from _assignment_heuristic(schedule, cpu_t, eligible_w, component_id, context)


 def update_worker2task_distance(
cascade/scheduler/core.py
CHANGED
@@ -22,6 +22,10 @@ class ComponentCore:
     distance_matrix: Task2TaskDistance  # nearest common descendant
     value: TaskValue  # closer to a sink -> higher value
     depth: int  # maximum value
+    fusing_opportunities: dict[TaskId, list[TaskId]]
+    gpu_fused_distance: dict[
+        TaskId, int | None
+    ]  # closer to a gpu task -> lower value. Using fusing_opportunities paths only

     def weight(self) -> int:
         # TODO eventually replace with runtime sum or smth
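Illustrative shapes for the two new fields, with made-up task ids and semantics as read from this hunk and from precompute.py below: a fusing opportunity maps the head of a chain to the whole chain handed out in a single assignment, and gpu_fused_distance records how many hops along that chain separate a task from the nearest gpu task (None if there is none).

    fusing_opportunities = {
        "decode": ["decode", "regrid", "infer", "encode"],  # head -> full fused chain
    }
    gpu_fused_distance = {
        "decode": 2,     # two hops above the gpu task "infer" on its fused path
        "regrid": 1,
        "infer": 0,      # needs_gpu itself
        "encode": None,  # no gpu task further down its fused path
    }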
cascade/scheduler/{graph.py → precompute.py}
RENAMED
@@ -26,50 +26,55 @@ logger = logging.getLogger(__name__)
 PlainComponent = tuple[list[TaskId], list[TaskId]]  # nodes, sources


-def
-    paths: Task2TaskDistance,
+def _nearest_common_descendant(
+    paths: Task2TaskDistance,
+    nodes: list[TaskId],
+    L: int,
+    parents: dict[TaskId, set[TaskId]],
+    children: dict[TaskId, set[TaskId]],
 ) -> Task2TaskDistance:
+    # well crawl through the graph starting from sinks
+    remaining_children = {v: len(children[v]) for v in nodes}
+    queue = [v for v in nodes if remaining_children[v] == 0]
+
+    # for each pair of vertices V & U, we store here their so-far-nearest common descendant D + max(dist(V, D), dist(U, D))
+    # we need to keep track of D while we build this to be able to recalculate, but we'll drop it in the end
+    result: dict[TaskId, dict[TaskId, tuple[TaskId, int]]] = {}
+    while queue:
+        v = queue.pop(0)
+        result[v] = {}
+        # for each u, do we have a common ancestor with it?
+        for u in nodes:
+            # if we are their ancestor then we are a common ancestor, though not necessarily the nearest one
+            if v in paths[u]:
+                result[v][u] = (v, paths[u][v])
+            # some of our children may have a common ancestor with u
+            for c in children[v]:
+                if u in result[c]:
+                    d = result[c][u][0]
+                    dist = max(paths[v][d], paths[u][d])
+                    if u not in result[v] or result[v][u][1] > dist:
+                        result[v][u] = (d, dist)
+        # identify whether any of our parents children were completely processed -- if yes,
+        # we can continue the crawl with them
+        for p in parents[v]:
+            remaining_children[p] -= 1
+            if remaining_children[p] == 0:
+                queue.append(p)
+
+    # just drop the D witness, and fill default L if no common ancestor whatsoever
     ncd: Task2TaskDistance = {}
-
-
-
-
-
-
-
-        i = 0
-        # TODO we convert from double dict to dict of tuples -- extend coptrs to support the other as well to get rid fo this
-        for a in paths.keys():
-            for b in paths[a].keys():
-                if a not in d1:
-                    d1[a] = i
-                    d2[i] = a
-                    i += 1
-                if b not in d1:
-                    d1[b] = i
-                    d2[i] = b
-                    i += 1
-                m[(d1[a], d1[b])] = paths[a][b]
-        ncdT: dict[tuple[int, int], int] = coptrs.nearest_common_descendant(m, L)
-        for (ai, bi), e in ncdT.items():
-            if d2[ai] not in ncd:
-                ncd[d2[ai]] = {}
-            ncd[d2[ai]][d2[bi]] = e
-    except ImportError:
-        logger.warning("coptrs not found, falling back to python")
-        for a in nodes:
-            ncd[a] = {}
-            for b in nodes:
-                if b == a:
-                    ncd[a][b] = 0
-                    continue
-                ncd[a][b] = L
-                for c in nodes:
-                    ncd[a][b] = min(ncd[a][b], max(paths[a][c], paths[b][c]))
+    for v in nodes:
+        ncd[v] = {}
+        for u in nodes:
+            if u in result[v]:
+                ncd[v][u] = result[v][u][1]
+            else:
+                ncd[v][u] = L
     return ncd


-def
+def _decompose(
     nodes: list[TaskId],
     edge_i: dict[TaskId, set[TaskId]],
     edge_o: dict[TaskId, set[TaskId]],
@@ -102,10 +107,11 @@ def decompose(
     )


-def
+def _enrich(
     plain_component: PlainComponent,
     edge_i: dict[TaskId, set[TaskId]],
     edge_o: dict[TaskId, set[TaskId]],
+    needs_gpu: set[TaskId],
 ) -> ComponentCore:
     nodes, sources = plain_component
     logger.debug(
@@ -148,7 +154,44 @@ def enrich(
                 paths[v][desc] = min(paths[v][desc], dist + 1)
             value[v] = max(value[v], value[c] - 1)

-
+    # calculate ncd
+    ncd = _nearest_common_descendant(paths, nodes, L, edge_i, edge_o)
+
+    # fusing opportunities
+    # TODO we just arbitrarily crawl down from sinks, until everything is
+    # decomposed into paths. A smarter approach would utilize profiling
+    # information such as dataset size, trying to fuse the large datasets
+    # first so that they end up on the longest paths
+    fusing_opportunities = {}
+    gpu_fused_distance = {}
+    fused = set()
+    while layers:
+        layer = layers.pop(0)
+        while layer:
+            gpu_distance = None
+            head = layer.pop(0)
+            if head in fused:
+                continue
+            chain = []
+            fused.add(head)
+            found = True
+            while found:
+                if head in needs_gpu:
+                    gpu_distance = 0
+                elif gpu_distance is not None:
+                    gpu_distance += 1
+                gpu_fused_distance[head] = gpu_distance
+                found = False
+                for edge in edge_i[head]:
+                    if edge not in fused:
+                        chain.insert(0, head)
+                        head = edge
+                        fused.add(head)
+                        found = True
+                        break
+            if len(chain) > 0:
+                chain.insert(0, head)
+                fusing_opportunities[head] = chain

     return ComponentCore(
         nodes=nodes,
@@ -156,6 +199,8 @@ def enrich(
         distance_matrix=ncd,
         value=value,
         depth=L,
+        fusing_opportunities=fusing_opportunities,
+        gpu_fused_distance=gpu_fused_distance,
     )


@@ -172,14 +217,20 @@ def precompute(job_instance: JobInstance) -> Preschedule:
     for vert, inps in edge_i.items():
         edge_i_proj[vert] = {dataset.task for dataset in inps}

+    needs_gpu = {
+        task_id
+        for task_id, task in job_instance.tasks.items()
+        if task.definition.needs_gpu
+    }
+
     with ThreadPoolExecutor(max_workers=4) as tp:
         # TODO if coptrs is not used, then this doesnt make sense
-        f = lambda plain_component: timer(
-            plain_component, edge_i_proj, edge_o_proj
+        f = lambda plain_component: timer(_enrich, Microtrace.presched_enrich)(
+            plain_component, edge_i_proj, edge_o_proj, needs_gpu
         )
         plain_components = (
             plain_component
-            for plain_component in timer(
+            for plain_component in timer(_decompose, Microtrace.presched_decompose)(
                 list(job_instance.tasks.keys()),
                 edge_i_proj,
                 edge_o_proj,
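A tiny worked example of the quantity _nearest_common_descendant now computes in pure Python (a brute-force re-derivation on a made-up diamond DAG, not a call into cascade): for a pair (v, u) it is the smallest max(dist(v, d), dist(u, d)) over common descendants d, falling back to the depth L when the pair shares no descendant.

    nodes = ["a", "b", "c", "d"]  # edges: a -> b, a -> c, b -> d, c -> d
    dist = {                      # dist[v][d] = hops from v down to its descendant d
        "a": {"a": 0, "b": 1, "c": 1, "d": 2},
        "b": {"b": 0, "d": 1},
        "c": {"c": 0, "d": 1},
        "d": {"d": 0},
    }
    L = 3                         # fallback depth when no common descendant exists

    def ncd(v: str, u: str) -> int:
        common = set(dist[v]) & set(dist[u])
        return min((max(dist[v][d], dist[u][d]) for d in common), default=L)

    assert ncd("b", "c") == 1     # they first meet at d
    assert ncd("a", "d") == 2     # d itself is their common descendant, 2 hops from a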
earthkit/workflows/__init__.py
CHANGED
@@ -6,8 +6,12 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.

+import pkgutil
+
 import dill

+__path__ = pkgutil.extend_path(__path__, __name__)
+
 try:
     from ._version import __version__  # noqa: F401
 except ImportError:
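The pkgutil.extend_path call turns earthkit.workflows into a package whose subpackages may come from several installed distributions -- presumably what allows the earthkit.workflows.plugins.pproc imports used by job1.py above to ship in a separate wheel. A quick way to observe the effect (output shape is indicative only):

    import earthkit.workflows as ekw

    print(ekw.__path__)  # with a plugin distribution installed, more than one directory can appear here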
earthkit/workflows/_version.py
CHANGED
@@ -1,2 +1,2 @@
 # Do not change! Do not track in version control!
-__version__ = "0.3.5"
+__version__ = "0.4.0"
{earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: earthkit-workflows
-Version: 0.3.5
+Version: 0.4.0
 Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License-Expression: Apache-2.0
{earthkit_workflows-0.3.5.dist-info → earthkit_workflows-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,10 +1,11 @@
 cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
-cascade/benchmarks/__main__.py,sha256=
+cascade/benchmarks/__main__.py,sha256=n0RX44Sj_j6InFNKCjwXRVWKTYznMsrPBdf8kwGKhjM,8065
 cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
 cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
-cascade/benchmarks/job1.py,sha256=
+cascade/benchmarks/job1.py,sha256=MOcZZYgf36MzHCjtby0lQyenM1ODUlagG8wtt2CbpnI,4640
+cascade/benchmarks/matmul.py,sha256=5STuvPY6Q37E2pKRCde9dQjL5M6tx7tkES9cBLZ6eK4,1972
 cascade/benchmarks/plotting.py,sha256=vSz9HHbqZwMXHpBUS-In6xsXGgK7QIoQTTiYfSwYwZs,4428
 cascade/benchmarks/reporting.py,sha256=MejaM-eekbMYLAnuBxGv_t4dR1ODJs4Rpc0fiZSGjyw,5410
 cascade/controller/__init__.py,sha256=p4C2p3S_0nUGamP9Mi6cSa5bvpiWbI6sVWtGhFnNqjw,1278
@@ -17,33 +18,33 @@ cascade/executor/bridge.py,sha256=vrs-5_Qt2mgkAD7Mzi43Xt_q7tpXX6i1UOPfqZSxHfs,81
 cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
 cascade/executor/config.py,sha256=rA4WeCNbdJJ3FdOKJ6WN3_VUorYW3cqdMfKUYPSyj0Y,1471
 cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
-cascade/executor/executor.py,sha256=
+cascade/executor/executor.py,sha256=SqMVM8BvCNM2r2Zbg9kxSxwFADAaoBU7nCMtfzktsgI,13282
 cascade/executor/msg.py,sha256=QW7Me-8Sin-x-f4M4bzvO7_av8MRkjnabQN6Ch3x22c,4230
 cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
 cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
-cascade/executor/runner/entrypoint.py,sha256=
-cascade/executor/runner/memory.py,sha256=
+cascade/executor/runner/entrypoint.py,sha256=e_MWYTSQroGMkgMddrqtn5DEqUeN-svC565TlOrv5iA,7598
+cascade/executor/runner/memory.py,sha256=jkAV9T7-imciVcGvkV7OhRfosEpOQJU1OME7z-4ztAs,6371
 cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
 cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
 cascade/gateway/__init__.py,sha256=1EzMKdLFXEucj0YWOlyVqLx4suOntitwM03T_rRubIk,829
-cascade/gateway/__main__.py,sha256=
+cascade/gateway/__main__.py,sha256=x6-DQin6ICvalHT9YcghGyVMoykEATOdN5ON9IeHPYA,862
 cascade/gateway/api.py,sha256=-7HTUhK9idszVCwiVwyHMcNx7n6qRcyPWsLx2e19n3A,2511
 cascade/gateway/client.py,sha256=1p4Tvrf-BH0LQHOES5rY1z3JNIfmXcqWG2kYl4rpcE0,4061
-cascade/gateway/router.py,sha256=
-cascade/gateway/server.py,sha256=
+cascade/gateway/router.py,sha256=iN-dc3L46aEy0EV57NNKYwaqIu0Au9kImu1pg-UbxwE,7680
+cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
 cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
 cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
 cascade/low/core.py,sha256=txya9rgks2b1ze9yLvFvrZCs8sCCtDUlfNwz4sHgybM,5994
-cascade/low/execution_context.py,sha256=
+cascade/low/execution_context.py,sha256=cdDJLYhreo4T7t4qXgFBosncubZpTrm0hELo7q4miqo,6640
 cascade/low/func.py,sha256=ihL5n3cK-IJnATgP4Dub2m-Mp_jHMxJzCA1v4uMEsi8,5211
 cascade/low/into.py,sha256=QvjrcBuHfu7qpEkeB0EJu1EAaRxOEZskUnyjkRJ_9gA,3391
 cascade/low/tracing.py,sha256=qvGVKB1huwcYoyvMYN-2wQ92pLQTErocTjpIjWv9glA,4511
 cascade/low/views.py,sha256=UwafO2EQHre17GjG8hdzO8b6qBRtTRtDlhOc1pTf8Io,1822
 cascade/scheduler/__init__.py,sha256=VT2qQ0gOQWHC4-T0FcCs59w8WZ94j2nUn7tiGm5XepA,1148
-cascade/scheduler/api.py,sha256=
-cascade/scheduler/assign.py,sha256=
-cascade/scheduler/core.py,sha256=
-cascade/scheduler/
+cascade/scheduler/api.py,sha256=uyRslN3ZNXOZNax27pQOrczeo9-2zTxal7-xYAPCDgI,5911
+cascade/scheduler/assign.py,sha256=XRTu3wEK2FYM-4Y_Gp4_O6h2wr6LSUa7e05DTwPHRcs,12250
+cascade/scheduler/core.py,sha256=XtXpfq6gtE8FS1BQd0ku0uQOrJpe1_CzzuBd98W6y7g,2891
+cascade/scheduler/precompute.py,sha256=QmZgriwfb07LViMztZogX5DOC1L4dCTbZJNGuFvFS9A,8513
 cascade/shm/__init__.py,sha256=R9QgGSnsl_YDjFjAUQkoleM_5yGM37ce9S8a4ReA1mE,3854
 cascade/shm/algorithms.py,sha256=SGxnJF4ovUaywTunMJWkG77l5DN-jXx7HgABt3sRJXM,2356
 cascade/shm/api.py,sha256=a_KrjyELsDms0Di0ThHsZe7MfmNEkekflmjXAQ1_Qws,6040
@@ -52,8 +53,8 @@ cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
 cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
 cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
 cascade/shm/server.py,sha256=5Ub9bnBmDto9BwfjX3h3sJeiLzZN4lawgtLfvK-vcMU,5036
-earthkit/workflows/__init__.py,sha256
-earthkit/workflows/_version.py,sha256
+earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
+earthkit/workflows/_version.py,sha256=-UXII43tJWWG-Bw3-ObfEfbloOAVS2Clozd55E6zYvA,72
 earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
 earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
 earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -82,9 +83,9 @@ earthkit/workflows/graph/samplegraphs.py,sha256=GafOqOcM0QvVLe4w4qHKFhBLXwr3PBrn
 earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESsMDAM,4215
 earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
 earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
-earthkit/workflows/plugins/__init__.py,sha256=
-earthkit_workflows-0.
-earthkit_workflows-0.
-earthkit_workflows-0.
-earthkit_workflows-0.
-earthkit_workflows-0.
+earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
+earthkit_workflows-0.4.0.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
+earthkit_workflows-0.4.0.dist-info/METADATA,sha256=GUxPv5SDQH-BE7InVU4Yy0MheZaSXdD1ys1seH-vPO4,1571
+earthkit_workflows-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+earthkit_workflows-0.4.0.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
+earthkit_workflows-0.4.0.dist-info/RECORD,,
File without changes
File without changes
File without changes