earthkit-workflows 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cascade/benchmarks/__main__.py +79 -30
- cascade/benchmarks/dist.py +123 -0
- cascade/controller/act.py +1 -0
- cascade/controller/impl.py +5 -0
- cascade/controller/notify.py +2 -0
- cascade/executor/bridge.py +2 -1
- cascade/executor/config.py +2 -0
- cascade/executor/executor.py +7 -2
- cascade/executor/msg.py +2 -0
- cascade/executor/runner/entrypoint.py +15 -5
- cascade/low/core.py +14 -2
- cascade/scheduler/api.py +55 -0
- cascade/scheduler/assign.py +190 -23
- cascade/scheduler/core.py +15 -0
- cascade/scheduler/precompute.py +9 -3
- cascade/shm/server.py +1 -1
- earthkit/workflows/_version.py +1 -1
- earthkit/workflows/backends/__init__.py +27 -11
- {earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/METADATA +1 -1
- {earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/RECORD +23 -22
- {earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/WHEEL +0 -0
- {earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/top_level.txt +0 -0
cascade/benchmarks/__main__.py
CHANGED
@@ -26,7 +26,9 @@ import logging.config
 import multiprocessing
 import os
 import subprocess
+import sys
 from concurrent.futures import ThreadPoolExecutor
+from socket import getfqdn
 from time import perf_counter_ns
 
 import fire
@@ -77,13 +79,17 @@ def get_job(benchmark: str | None, instance_path: str | None) -> JobInstance:
             import cascade.benchmarks.matmul as matmul
 
             return matmul.get_job()
+        elif benchmark.startswith("dist"):
+            import cascade.benchmarks.dist as dist
+
+            return dist.get_job()
         else:
             raise NotImplementedError(benchmark)
     else:
         raise TypeError("specified neither benchmark name nor job instance")
 
 
-def get_gpu_count() -> int:
+def get_cuda_count() -> int:
     try:
         if "CUDA_VISIBLE_DEVICES" in os.environ:
             # TODO we dont want to just count, we want to actually use literally these ids
@@ -101,12 +107,22 @@ def get_gpu_count() -> int:
             if "GPU" in l
         )
     except:
-        # TODO support macos
         logger.exception("unable to determine available gpus")
         gpus = 0
     return gpus
 
 
+def get_gpu_count(host_idx: int, worker_count: int) -> int:
+    if sys.platform == "darwin":
+        # we should inspect some gpu capabilities details to prevent overcommit
+        return worker_count
+    else:
+        if host_idx == 0:
+            return get_cuda_count()
+        else:
+            return 0
+
+
 def launch_executor(
     job_instance: JobInstance,
     controller_address: BackboneAddress,
@@ -116,6 +132,7 @@ def launch_executor(
     shm_vol_gb: int | None,
     gpu_count: int,
     log_base: str | None,
+    url_base: str,
 ):
     if log_base is not None:
         log_base = f"{log_base}.host{i}"
@@ -123,19 +140,25 @@ def launch_executor(
         logging.config.dictConfig(logging_config_filehandler(log_path))
     else:
         logging.config.dictConfig(logging_config)
-    … (13 removed lines not shown in the source diff)
+    try:
+        logger.info(f"will set {gpu_count} gpus on host {i}")
+        os.environ["CASCADE_GPU_COUNT"] = str(gpu_count)
+        executor = Executor(
+            job_instance,
+            controller_address,
+            workers_per_host,
+            f"h{i}",
+            portBase,
+            shm_vol_gb,
+            log_base,
+            url_base,
+        )
+        executor.register()
+        executor.recv_loop()
+    except Exception:
+        # NOTE we log this to get the stacktrace into the logfile
+        logger.exception("executor failure")
+        raise
 
 
 def run_locally(
@@ -151,24 +174,47 @@ def run_locally(
         logging.config.dictConfig(logging_config_filehandler(log_path))
     else:
         logging.config.dictConfig(logging_config)
+    logger.debug(f"local run starting with {hosts=} and {workers=} on {portBase=}")
     launch = perf_counter_ns()
-    preschedule = precompute(job)
     c = f"tcp://localhost:{portBase}"
     m = f"tcp://localhost:{portBase+1}"
     ps = []
-    for i, executor in enumerate(range(hosts)):
-        if i == 0:
-            gpu_count = get_gpu_count()
-        else:
-            gpu_count = 0
-        # NOTE forkserver/spawn seem to forget venv, we need fork
-        p = multiprocessing.get_context("fork").Process(
-            target=launch_executor,
-            args=(job, c, workers, portBase + 1 + i * 10, i, None, gpu_count, log_base),
-        )
-        p.start()
-        ps.append(p)
     try:
+        # executors forking
+        for i, executor in enumerate(range(hosts)):
+            gpu_count = get_gpu_count(i, workers)
+            # NOTE forkserver/spawn seem to forget venv, we need fork
+            logger.debug(f"forking into executor on host {i}")
+            p = multiprocessing.get_context("fork").Process(
+                target=launch_executor,
+                args=(
+                    job,
+                    c,
+                    workers,
+                    portBase + 1 + i * 10,
+                    i,
+                    None,
+                    gpu_count,
+                    log_base,
+                    "tcp://localhost",
+                ),
+            )
+            p.start()
+            ps.append(p)
+
+        # compute preschedule
+        preschedule = precompute(job)
+
+        # check processes started healthy
+        for i, p in enumerate(ps):
+            if not p.is_alive():
+                # TODO ideally we would somehow connect this with the Register message
+                # consumption in the Controller -- but there we don't assume that
+                # executors are on the same physical host
+                raise ValueError(f"executor {i} failed to live due to {p.exitcode}")
+
+        # start bridge itself
+        logger.debug("starting bridge")
         b = Bridge(c, hosts)
         start = perf_counter_ns()
         run(job, b, preschedule, report_address=report_address)
@@ -176,7 +222,9 @@ def run_locally(
         print(
             f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
         )
-    except:
+    except Exception:
+        # NOTE we log this to get the stacktrace into the logfile
+        logger.exception("controller failure, proceed with executor shutdown")
         for p in ps:
             if p.is_alive():
                 callback(m, ExecutorShutdown())
@@ -238,7 +286,7 @@ def main_dist(
             f"compute took {(end-start)/1e9:.3f}s, including startup {(end-launch)/1e9:.3f}s"
         )
     else:
-        gpu_count = get_gpu_count()
+        gpu_count = get_gpu_count(0, workers_per_host)
         launch_executor(
             jobInstance,
             controller_url,
@@ -247,6 +295,7 @@ def main_dist(
             idx,
             shm_vol_gb,
             gpu_count,
+            f"tcp://{getfqdn()}",
         )
 
 
cascade/benchmarks/dist.py
ADDED
@@ -0,0 +1,123 @@
+"""Demonstrates gang scheduling capabilities, ie, multiple nodes capable of mutual communication.
+
+The job is a source -> (dist group) -> sink, where:
+source just returns an int,
+dist group is L nodes to be scheduled as a single gang
+rank=0 node broadcasts a buffer containing the node's input
+each node returns its input multiplied by broadcasted buffer
+sink returns the sum of all inputs
+
+There are multiple implementations of that:
+torch
+jax (actually does a mesh-shard global sum instead of broadcast -- the point is to showcase dist init)
+"""
+
+import os
+
+from cascade.low.builders import JobBuilder, TaskBuilder
+from cascade.low.core import JobInstance, SchedulingConstraint
+
+
+def source_func() -> int:
+    return 42
+
+
+def dist_func_torch(a: int) -> int:
+    import datetime as dt
+
+    import numpy as np
+    import torch.distributed as dist
+
+    world_size = int(os.environ["CASCADE_GANG_WORLD_SIZE"])
+    rank = int(os.environ["CASCADE_GANG_RANK"])
+    coordinator = os.environ["CASCADE_GANG_COORDINATOR"]
+    print(f"starting with envvars: {rank=}/{world_size=}, {coordinator=}")
+    dist.init_process_group(
+        backend="gloo",
+        init_method=coordinator,
+        timeout=dt.timedelta(minutes=1),
+        world_size=world_size,
+        rank=rank,
+    )
+    group_ranks = np.arange(world_size, dtype=int)
+    group = dist.new_group(group_ranks)
+
+    if rank == 0:
+        buf = [a]
+        dist.broadcast_object_list(buf, src=0, group=group)
+        print("broadcast ok")
+    else:
+        buf = np.array([0], dtype=np.uint64)
+        dist.broadcast_object_list(buf, src=0, group=group)
+        print(f"broadcast recevied {buf}")
+
+    return a * buf[0]
+
+
+def dist_func_jax(a: int) -> int:
+    world_size = int(os.environ["CASCADE_GANG_WORLD_SIZE"])
+    rank = int(os.environ["CASCADE_GANG_RANK"])
+    coordinator = os.environ["CASCADE_GANG_COORDINATOR"]
+    os.environ["JAX_NUM_CPU_DEVICES"] = "1"
+    os.environ["JAX_PLATFORM_NAME"] = "cpu"
+    os.environ["JAX_PLATFORMS"] = "cpu"
+    import jax
+    import jax.numpy as jp
+
+    jax.config.update("jax_platforms", "cpu")
+    jax.config.update("jax_platform_name", "cpu")
+    # NOTE neither of the above seems to actually help with an init error message :(
+    print(f"starting with envvars: {rank=}/{world_size=}, {coordinator=}")
+    if coordinator.startswith("tcp://"):
+        coordinator = coordinator[len("tcp://") :]
+    jax.distributed.initialize(coordinator, num_processes=world_size, process_id=rank)
+    assert jax.device_count() == world_size
+
+    mesh = jax.make_mesh((world_size,), ("i",))
+    global_data = jp.arange(world_size)
+    sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("i"))
+    global_array = jax.device_put(global_data, sharding)
+    result = jp.sum(global_array)
+    print(f"worker {rank}# got result {result=}")
+    return a + result
+
+
+def build_dist_func(impl: str):
+    if impl == "torch":
+        return dist_func_torch
+    elif impl == "jax":
+        return dist_func_jax
+    else:
+        raise NotImplementedError(impl)
+
+
+def sink_func(**kwargs) -> int:
+    c = 0
+    for _, v in kwargs.items():
+        c += v
+    print(f"sink accumulated {c}")
+    return c
+
+
+def get_job() -> JobInstance:
+    source_node = TaskBuilder.from_callable(source_func)
+    sink_node = TaskBuilder.from_callable(sink_func)
+    job = JobBuilder().with_node("source", source_node).with_node("sink", sink_node)
+    L = int(os.environ["DIST_L"])
+    IMPL = os.environ["DIST_IMPL"]
+    node = TaskBuilder.from_callable(build_dist_func(IMPL))
+
+    for i in range(L):
+        job = (
+            job.with_node(f"proc{i}", node)
+            .with_edge("source", f"proc{i}", "a")
+            .with_edge(f"proc{i}", "sink", f"v{i}")
+        )
+        job.nodes["sink"].definition.input_schema[
+            f"v{i}"
+        ] = "int"  # TODO put some allow_kw into TaskDefinition instead to allow this
+
+    job = job.build().get_or_raise()
+    job.ext_outputs = list(job.outputs_of("sink"))
+    job.constraints = [SchedulingConstraint(gang=[f"proc{i}" for i in range(L)])]
+    return job
cascade/controller/act.py
CHANGED
cascade/controller/impl.py
CHANGED
@@ -43,6 +43,11 @@ def run(
     reporter = Reporter(report_address)
 
     try:
+        total_gpus = sum(worker.gpu for worker in env.workers.values())
+        needs_gpus = any(task.definition.needs_gpu for task in job.tasks.values())
+        if needs_gpus and total_gpus == 0:
+            raise ValueError("environment contains no gpu yet job demands one")
+
         while (
             state.has_awaitable()
             or context.has_awaitable()
cascade/controller/notify.py
CHANGED
@@ -22,6 +22,7 @@ from cascade.low.core import DatasetId, HostId, WorkerId
 from cascade.low.execution_context import DatasetStatus, JobExecutionContext
 from cascade.low.func import assert_never
 from cascade.low.tracing import TaskLifecycle, TransmitLifecycle, mark
+from cascade.scheduler.api import gang_check_ready
 from cascade.scheduler.assign import set_worker2task_overhead
 from cascade.scheduler.core import Schedule
 
@@ -67,6 +68,7 @@ def consider_computable(
         # NOTE this is a task newly made computable, so we need to calc
         # `overhead` for all hosts/workers assigned to the component
         set_worker2task_overhead(schedule, context, worker, child_task)
+        gang_check_ready(child_task, component.gang_preparation)
 
 
 # TODO refac less explicit mutation of context, use class methods
cascade/executor/bridge.py
CHANGED
@@ -46,7 +46,7 @@ class Bridge:
         self.transmit_idx_counter = 0
         self.sender = ReliableSender(self.mlistener.address, resend_grace_ms)
         registered = 0
-        self.environment = Environment(workers={})
+        self.environment = Environment(workers={}, host_url_base={})
         logger.debug("about to start receiving registrations")
         registration_grace = time.time_ns() + 3 * 60 * 1_000_000_000
         while registered < expected_executors:
@@ -69,6 +69,7 @@ class Bridge:
                 self.environment.workers[worker.worker_id] = Worker(
                     cpu=worker.cpu, gpu=worker.gpu, memory_mb=worker.memory_mb
                 )
+                self.environment.host_url_base[message.host] = message.url_base
                 registered += 1
                 self.heartbeat_checker[message.host] = GraceWatcher(
                     2 * executor_heartbeat_grace_ms
cascade/executor/config.py
CHANGED
@@ -21,12 +21,14 @@ logging_config = {
         "forecastbox.worker": {"level": "DEBUG"},
         "forecastbox.executor": {"level": "DEBUG"},
         "cascade": {"level": "INFO"},
+        "cascade.benchmarks": {"level": "DEBUG"},
         "cascade.low": {"level": "DEBUG"},
         "cascade.shm": {"level": "DEBUG"},
         "cascade.controller": {"level": "DEBUG"},
         "cascade.executor": {"level": "DEBUG"},
         "cascade.scheduler": {"level": "DEBUG"},
         "cascade.gateway": {"level": "DEBUG"},
+        "earthkit.workflows": {"level": "DEBUG"},
         "httpcore": {"level": "ERROR"},
         "httpx": {"level": "ERROR"},
         "": {"level": "WARNING", "handlers": ["default"]},
cascade/executor/executor.py
CHANGED
@@ -69,8 +69,9 @@ class Executor:
         workers: int,
         host: HostId,
         portBase: int,
-        shm_vol_gb: int | None
-        log_base: str | None
+        shm_vol_gb: int | None,
+        log_base: str | None,
+        url_base: str,
     ) -> None:
         self.job_instance = job_instance
         self.param_source = param_source(job_instance.edges)
@@ -85,6 +86,7 @@ class Executor:
         self.heartbeat_watcher = GraceWatcher(grace_ms=heartbeat_grace_ms)
 
         self.terminating = False
+        logger.debug("register terminate function")
         atexit.register(self.terminate)
         # NOTE following inits are with potential side effects
         self.mlistener = Listener(address_of(portBase))
@@ -98,6 +100,7 @@ class Executor:
             shm_logging = logging_config_filehandler(f"{log_base}.shm.txt")
         else:
             shm_logging = logging_config
+        logger.debug("about to fork into shm process")
         self.shm_process = ctx.Process(
             target=shm_server,
             args=(
@@ -113,6 +116,7 @@ class Executor:
             dsr_logging = logging_config_filehandler(f"{log_base}.dsr.txt")
         else:
             dsr_logging = logging_config
+        logger.debug("about to fork into data server")
         self.data_server = ctx.Process(
             target=start_data_server,
             args=(
@@ -138,6 +142,7 @@ class Executor:
                 )
                 for idx, worker_id in enumerate(self.workers.keys())
             ],
+            url_base=url_base,
         )
         logger.debug("constructed executor")
 
CHANGED
|
@@ -71,6 +71,7 @@ class TaskSequence:
|
|
|
71
71
|
worker: WorkerId # worker for running those tasks
|
|
72
72
|
tasks: list[TaskId] # to be executed in the given order
|
|
73
73
|
publish: set[DatasetId] # set of outputs to be published
|
|
74
|
+
extra_env: list[tuple[str, str]] # extra env var to set
|
|
74
75
|
|
|
75
76
|
|
|
76
77
|
@dataclass(frozen=True)
|
|
@@ -147,6 +148,7 @@ class ExecutorRegistration:
|
|
|
147
148
|
host: HostId
|
|
148
149
|
maddress: BackboneAddress
|
|
149
150
|
daddress: BackboneAddress
|
|
151
|
+
url_base: str # used for eg dist comms init
|
|
150
152
|
workers: list[Worker]
|
|
151
153
|
|
|
152
154
|
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
import logging
|
|
12
12
|
import logging.config
|
|
13
13
|
import os
|
|
14
|
+
import sys
|
|
14
15
|
from dataclasses import dataclass
|
|
15
16
|
|
|
16
17
|
import zmq
|
|
@@ -98,12 +99,17 @@ def execute_sequence(
|
|
|
98
99
|
) -> None:
|
|
99
100
|
taskId: TaskId | None = None
|
|
100
101
|
try:
|
|
102
|
+
for key, value in taskSequence.extra_env.items():
|
|
103
|
+
os.environ[key] = value
|
|
101
104
|
executionContext = runnerContext.project(taskSequence)
|
|
102
105
|
for taskId in taskSequence.tasks:
|
|
103
106
|
pckg.extend(executionContext.tasks[taskId].definition.environment)
|
|
104
107
|
run(taskId, executionContext, memory)
|
|
105
108
|
if Config.posttask_flush:
|
|
106
109
|
memory.flush()
|
|
110
|
+
for key in taskSequence.extra_env.keys():
|
|
111
|
+
# NOTE we should in principle restore the previous value, but we dont expect collisions
|
|
112
|
+
del os.environ[key]
|
|
107
113
|
except Exception as e:
|
|
108
114
|
logger.exception("runner failure, about to report")
|
|
109
115
|
callback(
|
|
@@ -129,11 +135,15 @@ def entrypoint(runnerContext: RunnerContext):
|
|
|
129
135
|
label("worker", repr(runnerContext.workerId))
|
|
130
136
|
worker_num = runnerContext.workerId.worker_num()
|
|
131
137
|
gpus = int(os.environ.get("CASCADE_GPU_COUNT", "0"))
|
|
132
|
-
|
|
133
|
-
"
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
138
|
+
if sys.platform != "darwin":
|
|
139
|
+
os.environ["CUDA_VISIBLE_DEVICES"] = (
|
|
140
|
+
str(worker_num) if worker_num < gpus else ""
|
|
141
|
+
)
|
|
142
|
+
# NOTE check any(task.definition.needs_gpu) anywhere?
|
|
143
|
+
# TODO configure OMP_NUM_THREADS, blas, mkl, etc -- not clear how tho
|
|
144
|
+
else:
|
|
145
|
+
if gpus != 1:
|
|
146
|
+
logger.warning("unexpected absence of gpu on darwin")
|
|
137
147
|
|
|
138
148
|
for serdeTypeEnc, (serdeSer, serdeDes) in runnerContext.job.serdes.items():
|
|
139
149
|
serde.SerdeRegistry.register(type_dec(serdeTypeEnc), serdeSer, serdeDes)
|
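
The device-pinning rule added to entrypoint() above is small enough to restate as a standalone sketch (illustration only, not the package's API): on non-darwin hosts, each worker whose number is below the host's CASCADE_GPU_COUNT gets exactly one device; the rest are pinned to none.

def visible_devices(worker_num: int, gpus: int) -> str:
    # mirrors the CUDA_VISIBLE_DEVICES value computed in entrypoint()
    return str(worker_num) if worker_num < gpus else ""

assert visible_devices(0, 2) == "0"
assert visible_devices(1, 2) == "1"
assert visible_devices(2, 2) == ""  # worker without a gpu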
cascade/low/core.py
CHANGED
@@ -106,15 +106,26 @@ def type_enc(t: Type) -> str:
     return b64encode(cloudpickle.dumps(t)).decode("ascii")
 
 
+class SchedulingConstraint(BaseModel):
+    gang: list[TaskId] = Field(
+        description="this set of TaskIds must be started at the same time, with ranks and address list as envvar",
+    )
+
+
 class JobInstance(BaseModel):
     tasks: dict[TaskId, TaskInstance]
     edges: list[Task2TaskEdge]
     serdes: dict[str, tuple[str, str]] = Field(
-        {},
+        default_factory=lambda: {},
         description="for each Type with custom serde, add entry here. The string is fully qualified name of the ser/des functions",
     )
     ext_outputs: list[DatasetId] = Field(
-        [],
+        default_factory=lambda: [],
+        description="ids to externally materialize",
+    )
+    constraints: list[SchedulingConstraint] = Field(
+        default_factory=lambda: [],
+        description="constraints for the scheduler such as gangs",
     )
 
     def outputs_of(self, task_id: TaskId) -> set[DatasetId]:
@@ -157,6 +168,7 @@ class Worker(BaseModel):
 
 class Environment(BaseModel):
     workers: dict[WorkerId, Worker]
+    host_url_base: dict[HostId, str]
 
 
 class TaskExecutionRecord(BaseModel):
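
A short usage sketch of the new model (illustrative only; the task ids are made up): a SchedulingConstraint lists the tasks that must be started together, and a job carries any number of them in its new constraints field, exactly as benchmarks/dist.py does with its proc{i} nodes.

from cascade.low.core import SchedulingConstraint

gang = SchedulingConstraint(gang=["procA", "procB"])  # hypothetical task ids
# job.constraints = [gang]  # attached to a JobInstance built elsewhere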
cascade/scheduler/api.py
CHANGED
@@ -22,6 +22,7 @@ from cascade.scheduler.core import (
     Assignment,
     ComponentId,
     ComponentSchedule,
+    GangPreparation,
     Preschedule,
     Schedule,
 )
@@ -29,12 +30,60 @@ from cascade.scheduler.core import (
 logger = logging.getLogger(__name__)
 
 
+def gang_check_ready(task: TaskId, gang_prep: GangPreparation):
+    """When a task becomes computable, mutate the gang_prep to possibly
+    transition some gangs to `ready`
+    """
+    for gang in gang_prep.lookup[task]:
+        if gang not in gang_prep.countdown:
+            raise ValueError(
+                f"after {task=} marked computable, {gang=} not found -- double compuptable mark?"
+            )
+        remaining = gang_prep.countdown[gang]
+        if task not in remaining:
+            raise ValueError(
+                f"after {task=} marked computable, {gang=} does not have it in {remaining=}. Invalid gang?"
+            )
+        remaining.remove(task)
+        if not remaining:
+            logger.debug(f"gang just became ready {gang=}")
+            gang_prep.ready.append(gang)
+            gang_prep.countdown.pop(gang)
+
+
 def init_schedule(preschedule: Preschedule, context: JobExecutionContext) -> Schedule:
     components: list[ComponentSchedule] = []
     ts2component: dict[TaskId, ComponentId] = {}
 
+    gangs = [
+        frozenset(constraint.gang) for constraint in context.job_instance.constraints
+    ]
+
     computable = 0
     for componentId, precomponent in enumerate(preschedule.components):
+        # gang preparation
+        tasks = set(precomponent.nodes)
+        lookup = defaultdict(list)
+        countdown = {}
+        i = 0
+        while i < len(gangs):
+            if not gangs[i].issubset(tasks):
+                i += 1
+                continue
+            gang = gangs.pop(i)
+            countdown[gang] = set(gang)
+            for e in gang:
+                lookup[e].append(gang)
+
+        gang_preparation = GangPreparation(
+            ready=[],
+            lookup=lookup,
+            countdown=countdown,
+        )
+        for source in precomponent.sources:
+            gang_check_ready(source, gang_preparation)
+
+        # component itself
         component = ComponentSchedule(
             core=precomponent,
             weight=precomponent.weight(),
@@ -45,12 +94,18 @@ def init_schedule(preschedule: Preschedule, context: JobExecutionContext) -> Sch
                 task: {inp for inp in context.edge_i[task]}
                 for task in precomponent.nodes
             },
+            gang_preparation=gang_preparation,
         )
         components.append(component)
         computable += len(precomponent.sources)
         for task in precomponent.nodes:
             ts2component[task] = componentId
 
+    if gangs:
+        for gang in gangs:
+            logger.error(f"a gang not part of a component: {gang}")
+        raise ValueError(f"a total of {len(gangs)} were not a subcomponent")
+
     return Schedule(
         components=components,
         ts2component=ts2component,
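
The bookkeeping behind gang_check_ready above can be illustrated with a self-contained sketch (simplified stand-ins, no cascade imports): a gang moves from countdown to ready once every member task has been marked computable.

from collections import defaultdict
from dataclasses import dataclass, field


@dataclass
class MiniGangPrep:  # simplified stand-in for GangPreparation
    ready: list = field(default_factory=list)
    countdown: dict = field(default_factory=dict)
    lookup: dict = field(default_factory=lambda: defaultdict(list))


def check_ready(task, prep):  # simplified stand-in for gang_check_ready
    for gang in prep.lookup[task]:
        remaining = prep.countdown[gang]
        remaining.remove(task)
        if not remaining:
            prep.ready.append(gang)
            prep.countdown.pop(gang)


gang = frozenset({"proc0", "proc1"})
prep = MiniGangPrep(countdown={gang: set(gang)})
for t in gang:
    prep.lookup[t].append(gang)

check_ready("proc0", prep)  # one member still missing, gang not ready yet
check_ready("proc1", prep)  # last member arrived, gang becomes ready
assert prep.ready == [gang] and not prep.countdown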
cascade/scheduler/assign.py
CHANGED
@@ -18,7 +18,13 @@ from typing import Iterable, Iterator
 from cascade.low.core import DatasetId, HostId, TaskId, WorkerId
 from cascade.low.execution_context import DatasetStatus, JobExecutionContext
 from cascade.low.tracing import Microtrace, trace
-from cascade.scheduler.core import
+from cascade.scheduler.core import (
+    Assignment,
+    ComponentCore,
+    ComponentId,
+    ComponentSchedule,
+    Schedule,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -92,9 +98,148 @@ def build_assignment(
         tasks=assigned,
         prep=prep,
         outputs=trimmed_outputs,
+        extra_env={},
     )
 
 
+def _postproc_assignment(
+    assignment: Assignment,
+    component: ComponentSchedule,
+    schedule: Schedule,
+    context: JobExecutionContext,
+) -> None:
+    for assigned in assignment.tasks:
+        if assigned in component.computable:
+            component.computable.pop(assigned)
+            component.worker2task_values.remove(assigned)
+            schedule.computable -= 1
+        else:
+            # shortcut for fused-in tasks
+            component.is_computable_tracker[assigned] = set()
+    context.idle_workers.remove(assignment.worker)
+    component.weight -= len(assignment.tasks)
+
+
+# TODO this is not particularly systematic! We cant bind dynamically at the host as we send this
+# in advance, so we need to hardcode. Ideally we centrallize all port opening into a single module,
+# in particular unify this with the portBase from benchmarks/__main__ and then derived ports from
+# executor/executor.py etc. As is, we have a single global variable that we increment, to ensure
+# no port collision happens gang-wise -- we dont really expect many gangs per a workflow
gang_port = 12355
+
+
+def _try_assign_gang(
+    schedule: Schedule,
+    gang: list[frozenset[TaskId]],
+    workers: list[WorkerId],
+    component_id: ComponentId,
+    context: JobExecutionContext,
+    fail_acc: list[frozenset[TaskId]],
+) -> Iterator[Assignment]:
+    """We greedily assign by descending worker-task distance"""
+    global gang_port
+    if len(gang) > len(workers):
+        logger.debug(f"not enough workers ({len(workers)}) for {gang=}")
+        fail_acc.append(gang)
+        return
+    start = perf_counter_ns()
+    component = schedule.components[component_id]
+    gpu_tasks: set[TaskId] = set()
+    cpu_tasks: set[TaskId] = set()
+    gpu_workers: set[WorkerId] = set()
+    cpu_workers: set[WorkerId] = set()
+    for task in gang:
+        if context.job_instance.tasks[task].definition.needs_gpu:
+            gpu_tasks.add(task)
+        else:
+            cpu_tasks.add(task)
+    for worker in workers:
+        if context.environment.workers[worker].gpu > 0:
+            gpu_workers.add(worker)
+        else:
+            cpu_workers.add(worker)
+    if len(gpu_tasks) > len(gpu_workers):
+        logger.debug(f"not enough gpu workers ({len(workers)}) for {gang=}")
+        fail_acc.append(gang)
+        end = perf_counter_ns()
+        trace(Microtrace.ctrl_assign, end - start)
+        return
+
+    world_size = len(gang)
+    rank = 0
+    coordinator = None
+
+    # similarly to _assignment_heuristic, a greedy algorithm
+    candidates = [
+        (schedule.worker2task_overhead[w][t], component.core.value[t], w, t)
+        for w in gpu_workers
+        for t in gpu_tasks
+    ]
+    candidates.sort(key=lambda e: (e[0], e[1]))
+    for _, _, worker, task in candidates:
+        if task in gpu_tasks and worker in gpu_workers:
+            if task not in component.computable:
+                # it may be that some fusing for previous task already assigned this
+                continue
+            end = perf_counter_ns()
+            trace(Microtrace.ctrl_assign, end - start)
+            assignment = build_assignment(worker, task, context, component.core)
+            if not coordinator:
+                coordinator = (
+                    f"{context.environment.host_url_base[worker.host]}:{gang_port}"
+                )
+            assignment.extra_env["CASCADE_GANG_WORLD_SIZE"] = str(world_size)
+            assignment.extra_env["CASCADE_GANG_RANK"] = str(rank)
+            assignment.extra_env["CASCADE_GANG_COORDINATOR"] = coordinator
+            rank += 1
+            yield assignment
+            start = perf_counter_ns()
+            _postproc_assignment(assignment, component, schedule, context)
+            gpu_tasks.remove(task)
+            gpu_workers.remove(worker)
+    if gpu_tasks:
+        raise ValueError(
+            f"expected to assign all gang gpu tasks, yet {gpu_tasks} remain"
+        )
+
+    all_workers = cpu_workers.union(gpu_workers)
+    candidates = [
+        (schedule.worker2task_overhead[w][t], component.core.value[t], w, t)
+        for w in all_workers
+        for t in cpu_tasks
+    ]
+    candidates.sort(key=lambda e: (e[0], e[1]))
+    for _, _, worker, task in candidates:
+        if task in cpu_tasks and worker in all_workers:
+            if task not in component.computable:
+                # it may be that some fusing for previous task already assigned this
+                continue
+            end = perf_counter_ns()
+            trace(Microtrace.ctrl_assign, end - start)
+            assignment = build_assignment(worker, task, context, component.core)
+            if not coordinator:
+                coordinator = (
+                    f"{context.environment.host_url_base[worker.host]}:{gang_port}"
+                )
+            assignment.extra_env["CASCADE_GANG_WORLD_SIZE"] = str(world_size)
+            assignment.extra_env["CASCADE_GANG_RANK"] = str(rank)
+            assignment.extra_env["CASCADE_GANG_COORDINATOR"] = coordinator
+            rank += 1
+            yield assignment
+            start = perf_counter_ns()
+            _postproc_assignment(assignment, component, schedule, context)
+            cpu_tasks.remove(task)
+            all_workers.remove(worker)
+    if cpu_tasks:
+        raise ValueError(
+            f"expected to assign all gang cpu tasks, yet {cpu_tasks} remain"
+        )
+
+    end = perf_counter_ns()
+    trace(Microtrace.ctrl_assign, end - start)
+    gang_port += 1
+
+
 def _assignment_heuristic(
     schedule: Schedule,
     tasks: list[TaskId],
@@ -106,18 +251,6 @@ def _assignment_heuristic(
     start = perf_counter_ns()
     component = schedule.components[component_id]
 
-    def postproc_assignment(assignment: Assignment) -> None:
-        for assigned in assignment.tasks:
-            if assigned in component.computable:
-                component.computable.pop(assigned)
-                component.worker2task_values.remove(assigned)
-                schedule.computable -= 1
-            else:
-                # shortcut for fused-in tasks
-                component.is_computable_tracker[assigned] = set()
-        context.idle_workers.remove(worker)
-        component.weight -= len(assignment.tasks)
-
     # first, attempt optimum-distance assignment
     unassigned: list[TaskId] = []
     for task in tasks:
@@ -133,7 +266,7 @@ def _assignment_heuristic(
             assignment = build_assignment(worker, task, context, component.core)
             yield assignment
             start = perf_counter_ns()
-            … (1 removed line not shown in the source diff)
+            _postproc_assignment(assignment, component, schedule, context)
             workers.pop(idx)
             was_assigned = True
             break
@@ -159,7 +292,7 @@ def _assignment_heuristic(
             assignment = build_assignment(worker, task, context, component.core)
             yield assignment
             start = perf_counter_ns()
-            … (1 removed line not shown in the source diff)
+            _postproc_assignment(assignment, component, schedule, context)
             remaining_t.remove(task)
             remaining_w.remove(worker)
 
@@ -173,29 +306,63 @@ def assign_within_component(
     component_id: ComponentId,
     context: JobExecutionContext,
 ) -> Iterator[Assignment]:
-    """We
-    … (3 removed lines not shown in the source diff)
+    """We hardcode order of handling task groups:
+    1/ ready gangs,
+    2/ tasks requiring a gpu,
+    3/ tasks whose fusable child requires a gpu,
+    4/ all other tasks,
+    using the same algorithm for cases 2-4 and a naive for case 1
+    """
+    # TODO rework into a more systematic multicriterial opt solution that is able to consider all groups
+    # at once, using a generic value/cost framework and matching algorithm. It should additionally be able
+    # to issue a "strategic wait" command -- eg if we could assign a task to an idle worker with high cost,
+    # or wait until a better-equipped busy worker finished, etc.
+    component = schedule.components[component_id]
+
+    # gangs
+    fail_acc: list[frozenset[TaskId]] = []
+    for gang in component.gang_preparation.ready:
+        logger.debug(f"trying to assign a {gang=}")
+        yield from _try_assign_gang(
+            schedule, gang, list(context.idle_workers), component_id, context, fail_acc
+        )
+    component.gang_preparation.ready = fail_acc
+
+    # the other cases: build them first
     cpu_t: list[TaskId] = []
     gpu_t: list[TaskId] = []
     opu_t: list[TaskId] = []
-    component = schedule.components[component_id]
     for task in component.computable.keys():
-        if
+        if component.gang_preparation.lookup[task]:
+            # no gang participation in single-task scheduling
+            continue
+        elif context.job_instance.tasks[task].definition.needs_gpu:
             gpu_t.append(task)
         elif component.core.gpu_fused_distance[task] is not None:
             opu_t.append(task)
         else:
             cpu_t.append(task)
+
+    # tasks immediately needing a gpu
     eligible_w = [
-        worker
+        worker
+        for worker in workers
+        if context.environment.workers[worker].gpu > 0
+        and worker in context.idle_workers
     ]
+    logger.debug(
+        f"considering {len(gpu_t)}# gpu tasks, {len(opu_t)}# maybe-gpu tasks, {len(cpu_t)}# cpu tasks, with {len(workers)}# workers out of which {len(eligible_w)} have gpu"
+    )
     yield from _assignment_heuristic(schedule, gpu_t, eligible_w, component_id, context)
+    # tasks whose fusing opportunity needs a gpu
     eligible_w = [worker for worker in eligible_w if worker in context.idle_workers]
     yield from _assignment_heuristic(schedule, opu_t, eligible_w, component_id, context)
+    # remaining tasks
     eligible_w = [worker for worker in workers if worker in context.idle_workers]
-    … (1 removed line not shown in the source diff)
+    u_opu_t = [task for task in opu_t if task in component.computable]
+    yield from _assignment_heuristic(
+        schedule, cpu_t + u_opu_t, eligible_w, component_id, context
+    )
 
 
 def update_worker2task_distance(
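
For reference, the coordinator address that _try_assign_gang injects via CASCADE_GANG_COORDINATOR is simply the registering executor's url_base plus the module-level gang_port; a sketch with the values used by the local benchmark run:

host_url_base = "tcp://localhost"  # run_locally passes this; main_dist uses f"tcp://{getfqdn()}"
gang_port = 12355                  # module-level counter in assign.py, bumped once per gang

coordinator = f"{host_url_base}:{gang_port}"
assert coordinator == "tcp://localhost:12355"
# dist_func_torch hands this to init_process_group(init_method=...);
# dist_func_jax strips the "tcp://" prefix before jax.distributed.initialize(...)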
cascade/scheduler/core.py
CHANGED
@@ -44,6 +44,19 @@ Worker2TaskDistance = dict[WorkerId, dict[TaskId, int]]
 ComponentId = int
 
 
+@dataclass
+class GangPreparation:
+    ready: list[
+        frozenset[TaskId]
+    ]  # used by scheduler to see if any gangs can be assigned/started
+    countdown: dict[
+        frozenset[TaskId], set[TaskId]
+    ]  # used to check after a task completion whether a gang can be moved to ready
+    lookup: dict[
+        TaskId, list[frozenset[TaskId]]
+    ]  # used to decrease countdown after a task completion
+
+
 @dataclass
 class ComponentSchedule:
     core: ComponentCore
@@ -58,6 +71,7 @@ class ComponentSchedule:
     worker2task_distance: Worker2TaskDistance
     # eligible values -- a cached value. Used when migrating new workers to the component, inserted whenever a parent of this task gets `preparing`, removed when this task is made computable
     worker2task_values: set[TaskId]
+    gang_preparation: GangPreparation
 
 
 @dataclass
@@ -79,3 +93,4 @@ class Assignment:
     tasks: list[TaskId]
     prep: list[tuple[DatasetId, HostId]]
     outputs: set[DatasetId]
+    extra_env: list[tuple[str, str]]
cascade/scheduler/precompute.py
CHANGED
@@ -112,6 +112,7 @@ def _enrich(
     edge_i: dict[TaskId, set[TaskId]],
     edge_o: dict[TaskId, set[TaskId]],
     needs_gpu: set[TaskId],
+    gangs: set[TaskId],
 ) -> ComponentCore:
     nodes, sources = plain_component
     logger.debug(
@@ -170,7 +171,7 @@ def _enrich(
     while layer:
         gpu_distance = None
         head = layer.pop(0)
-        if head in fused:
+        if head in fused or head in gangs:
             continue
         chain = []
         fused.add(head)
@@ -183,7 +184,7 @@ def _enrich(
         gpu_fused_distance[head] = gpu_distance
         found = False
         for edge in edge_i[head]:
-            if edge not in fused:
+            if edge not in fused and edge not in gangs:
                 chain.insert(0, head)
                 head = edge
                 fused.add(head)
@@ -222,11 +223,16 @@ def precompute(job_instance: JobInstance) -> Preschedule:
         for task_id, task in job_instance.tasks.items()
         if task.definition.needs_gpu
     }
+    gangs = {
+        task_id
+        for constraint in job_instance.constraints
+        for task_id in constraint.gang
+    }
 
     with ThreadPoolExecutor(max_workers=4) as tp:
         # TODO if coptrs is not used, then this doesnt make sense
         f = lambda plain_component: timer(_enrich, Microtrace.presched_enrich)(
-            plain_component, edge_i_proj, edge_o_proj, needs_gpu
+            plain_component, edge_i_proj, edge_o_proj, needs_gpu, gangs
         )
         plain_components = (
             plain_component
cascade/shm/server.py
CHANGED
@@ -115,5 +115,5 @@ def entrypoint(
         server.start()
     except Exception as e:
         # we always get a Bad file descriptor due to sigterm handler calling sock close mid-read
-        logger.warning(f"shutdown issue: {e}")
+        logger.warning(f"shutdown issue: {repr(e)}")
     server.atexit(0, None)
earthkit/workflows/_version.py
CHANGED
@@ -1,2 +1,2 @@
 # Do not change! Do not track in version control!
-__version__ = "0.4.0"
+__version__ = "0.4.2"
earthkit/workflows/backends/__init__.py
CHANGED
@@ -7,37 +7,52 @@
 # nor does it submit to any jurisdiction.
 
 import functools
-import
-from typing import Callable
+import logging
+from typing import Callable, Union
 
 import xarray as xr
 
 from .arrayapi import ArrayAPIBackend
 from .xarray import XArrayBackend
 
+logger = logging.getLogger(__name__)
+
+
 BACKENDS = {
     xr.DataArray: XArrayBackend,
     xr.Dataset: XArrayBackend,
-    … (1 removed line not shown in the source diff)
+    object: ArrayAPIBackend,
 }
 
 
 def register(type, backend):
     if type in BACKENDS:
-        … (1 removed line not shown in the source diff)
+        logger.warning(
             f"Overwriting backend for {type}. Existing backend {BACKENDS[type]}."
         )
     BACKENDS[type] = backend
 
 
+def _get_backend(obj_type: type) -> Union[type, None]:
+    return BACKENDS.get(obj_type, None)
+
+
 def array_module(*arrays):
-    … (1 removed line not shown in the source diff)
-    #
+    """Return the backend module for the given arrays."""
+    # Checks all bases of the first array type for a registered backend.
+    # If no backend is found, it will traverse the hierarchy of types
+    # until it finds a registered backend or reaches the base object type.
+    if not arrays:
+        raise ValueError("No arrays provided to determine backend.")
     array_type = type(arrays[0])
-    … (4 removed lines not shown in the source diff)
+    while True:
+        backend = _get_backend(array_type)
+        if backend is not None:
+            break
+        # If no backend found, try the next type in the hierarchy
+        array_type = array_type.__bases__[0]
+
+    logger.debug(f"Using backend {backend} for {array_type}")
     return backend
 
 
@@ -201,5 +216,6 @@ try:
 
     BACKENDS[SimpleFieldList] = FieldListBackend
     BACKENDS[FieldList] = FieldListBackend
+
 except ImportError:
-    … (1 removed line not shown in the source diff)
+    logger.warning("earthkit could not be imported, FieldList not supported.")
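
A hedged usage sketch of the reworked lookup (assumes numpy and xarray are installed; MyArray is a hypothetical subclass, not part of the package): array_module now walks __bases__[0] until it hits a registered type, with the new object entry as the final fallback.

import numpy as np
import xarray as xr

from earthkit.workflows.backends import array_module

class MyArray(xr.DataArray):  # hypothetical subclass for illustration
    __slots__ = ()

print(array_module(xr.DataArray(np.zeros(3))))  # registered directly -> XArrayBackend
print(array_module(MyArray(np.zeros(3))))       # resolved via its base xr.DataArray
print(array_module(np.zeros(3)))                # falls through to object -> ArrayAPIBackend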
{earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: earthkit-workflows
-Version: 0.4.0
+Version: 0.4.2
 Summary: Earthkit Workflows is a Python library for declaring earthkit task DAGs, as well as scheduling and executing them on heterogeneous computing systems.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License-Expression: Apache-2.0
{earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/RECORD
CHANGED
@@ -1,28 +1,29 @@
 cascade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cascade/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cascade/benchmarks/__init__.py,sha256=Gu8kEApmJ2zsIhT2zpm1-6n84-OwWnz-0vO8UHYtBzo,528
-cascade/benchmarks/__main__.py,sha256=
+cascade/benchmarks/__main__.py,sha256=g03xRzp58dXLHDj8kTPyPnbBOS5sRIAMTthFtFjDRbs,9876
 cascade/benchmarks/anemoi.py,sha256=qtAI03HdtAmcksCgjIEZyNyUNzMp370KF4lAh5g4cOk,1077
+cascade/benchmarks/dist.py,sha256=ngXJJzegnMUVwDFPvGMG6997lamB-aSEHi74oBbayrE,4116
 cascade/benchmarks/generators.py,sha256=NK4fFisWsZdMkA2Auzrn-P7G5D9AKpo2JVnqXE44YT8,2169
 cascade/benchmarks/job1.py,sha256=MOcZZYgf36MzHCjtby0lQyenM1ODUlagG8wtt2CbpnI,4640
 cascade/benchmarks/matmul.py,sha256=5STuvPY6Q37E2pKRCde9dQjL5M6tx7tkES9cBLZ6eK4,1972
 cascade/benchmarks/plotting.py,sha256=vSz9HHbqZwMXHpBUS-In6xsXGgK7QIoQTTiYfSwYwZs,4428
 cascade/benchmarks/reporting.py,sha256=MejaM-eekbMYLAnuBxGv_t4dR1ODJs4Rpc0fiZSGjyw,5410
 cascade/controller/__init__.py,sha256=p4C2p3S_0nUGamP9Mi6cSa5bvpiWbI6sVWtGhFnNqjw,1278
-cascade/controller/act.py,sha256=
+cascade/controller/act.py,sha256=WHIsk4H-Bbyl_DABX2VWhyKy_cNnp12x1nilatPCL8I,2981
 cascade/controller/core.py,sha256=NqvZ5g5GNphwOpzdXbCI0_fxIzzmO97_n2xZKswK72Q,3589
-cascade/controller/impl.py,sha256=
-cascade/controller/notify.py,sha256
+cascade/controller/impl.py,sha256=9jdTikYO8OkaNIfzatyr3Mhai5EfEhaeii9GaF9cQw4,3526
+cascade/controller/notify.py,sha256=5eSPKcxqrv9kHy7St-iIm1NttsyzcvwLhZI5dvr4cEY,5881
 cascade/controller/report.py,sha256=FD-MAWZq6pwSw2CP2m4OUBw4hzrX46vKE_FZO5NpjDU,3670
-cascade/executor/bridge.py,sha256=
+cascade/executor/bridge.py,sha256=WDE-GM2Bv7nUk1-nV-otMGuaRYw1-Vmd7PWploXBp6Y,8267
 cascade/executor/comms.py,sha256=-9qrKwva6WXkHRQtzSnLFy5gB3bOWuxYJP5fL6Uavw8,8736
-cascade/executor/config.py,sha256=
+cascade/executor/config.py,sha256=8azy_sXdvDGO0zTNqA0pdtkXsyihM4FQ4U1W_3Dhua0,1571
 cascade/executor/data_server.py,sha256=xLIbLkWn8PnJl4lMP8ADHa2S0EgPwr0-bH7_Sib_Y70,13701
-cascade/executor/executor.py,sha256=
-cascade/executor/msg.py,sha256=
+cascade/executor/executor.py,sha256=egPhfqhzYfeM77Hu10-mGHNVsQAdqmZOA7hmjFP1Q8M,13484
+cascade/executor/msg.py,sha256=7HI0rKeCRaV1ONR4HWEa64nHbu-p6-QdBwJNitmst48,4340
 cascade/executor/serde.py,sha256=z6klTOZqW_BVGrbIRNz4FN0_XTfRiKBRQuvgsQIuyAo,2827
 cascade/executor/runner/__init__.py,sha256=30BM80ZyA7w3IrGiKKLSFuhRehbR2Mm99OJ8q5PJ63c,1547
-cascade/executor/runner/entrypoint.py,sha256=
+cascade/executor/runner/entrypoint.py,sha256=32i2U4fmEvQnsV1MTl0Xf8mK_1nbk1BEVJqIidd6MRM,8042
 cascade/executor/runner/memory.py,sha256=jkAV9T7-imciVcGvkV7OhRfosEpOQJU1OME7z-4ztAs,6371
 cascade/executor/runner/packages.py,sha256=OZjEOvKy8LQ2uguGZU1L7TVYz1415JOUGySRfU_D_sc,2513
 cascade/executor/runner/runner.py,sha256=zqpkvxdWLbwyUFaUbZmSj0KQEBNRpmF8gwVotiaamhc,4870
@@ -34,17 +35,17 @@ cascade/gateway/router.py,sha256=iN-dc3L46aEy0EV57NNKYwaqIu0Au9kImu1pg-UbxwE,768
 cascade/gateway/server.py,sha256=tsOyKtVFs5EZmWrjKdi9JwWxK0DG207oSa9OQ-4zN3M,3772
 cascade/low/__init__.py,sha256=5cw2taOGITK_gFbICftzK2YLdEAnLUY5OzblFzdHss4,769
 cascade/low/builders.py,sha256=_u5X8G_EF00hFt8Anv9AXo6yPf1O8MHDmqs2kKmREl0,7073
-cascade/low/core.py,sha256=
+cascade/low/core.py,sha256=_3x4ka_pmCgZbfwFeyhq8S4M6wmh0s24VRCLhk5yQFM,6444
 cascade/low/execution_context.py,sha256=cdDJLYhreo4T7t4qXgFBosncubZpTrm0hELo7q4miqo,6640
 cascade/low/func.py,sha256=ihL5n3cK-IJnATgP4Dub2m-Mp_jHMxJzCA1v4uMEsi8,5211
 cascade/low/into.py,sha256=QvjrcBuHfu7qpEkeB0EJu1EAaRxOEZskUnyjkRJ_9gA,3391
 cascade/low/tracing.py,sha256=qvGVKB1huwcYoyvMYN-2wQ92pLQTErocTjpIjWv9glA,4511
 cascade/low/views.py,sha256=UwafO2EQHre17GjG8hdzO8b6qBRtTRtDlhOc1pTf8Io,1822
 cascade/scheduler/__init__.py,sha256=VT2qQ0gOQWHC4-T0FcCs59w8WZ94j2nUn7tiGm5XepA,1148
-cascade/scheduler/api.py,sha256=
-cascade/scheduler/assign.py,sha256=
-cascade/scheduler/core.py,sha256=
-cascade/scheduler/precompute.py,sha256=
+cascade/scheduler/api.py,sha256=UuomWS2ISuDw-ngFFUKLyucygpTWF0EBW8ZuF91EUBU,7778
+cascade/scheduler/assign.py,sha256=gpOLL22-k3ah4gihiztIGMX2uF0RdJ5AtJ8fOCJUviE,18362
+cascade/scheduler/core.py,sha256=umORLC6SDeOyS4z8nQuVFkDukBJ96JfH4hdLSj6Km20,3378
+cascade/scheduler/precompute.py,sha256=AhTn8RgnU4XuV_WAgbVXz9z0YRpNS6LCY1dJeHdTfCc,8709
 cascade/shm/__init__.py,sha256=R9QgGSnsl_YDjFjAUQkoleM_5yGM37ce9S8a4ReA1mE,3854
 cascade/shm/algorithms.py,sha256=SGxnJF4ovUaywTunMJWkG77l5DN-jXx7HgABt3sRJXM,2356
 cascade/shm/api.py,sha256=a_KrjyELsDms0Di0ThHsZe7MfmNEkekflmjXAQ1_Qws,6040
@@ -52,9 +53,9 @@ cascade/shm/client.py,sha256=pnod_dmUROJZRtipCpoeCuAEuynW0IgSfgjrp21CH2M,5893
 cascade/shm/dataset.py,sha256=Z2ewpnW7mVDJB9GylIVoOWV0DYOF7FWLIXkIvV-Y7sI,12347
 cascade/shm/disk.py,sha256=Fdl_pKOseaXroRp01OwqWVsdI-sSmiFizIFCdxBuMWM,2653
 cascade/shm/func.py,sha256=ZWikgnSLCmbSoW2LDRJwtjxdwTxkR00OUHAsIRQ-ChE,638
-cascade/shm/server.py,sha256=
+cascade/shm/server.py,sha256=LnnNX0F6QJt5V_JLfmC3ZMHGNL5WpLY44wpB_pYDr7Y,5042
 earthkit/workflows/__init__.py,sha256=-p4anEn0YQbYWM2tbXb0Vc3wq4-m6kFhcNEgAVu5Jis,1948
-earthkit/workflows/_version.py,sha256
+earthkit/workflows/_version.py,sha256=nkd71CReR3pz5TZ9pcVgB2cP1MDj4YK6VH9UGJYzXDM,72
 earthkit/workflows/decorators.py,sha256=DM4QAtQ2glUUcDecwPkXcdlu4dio7MvgpcdmU5LYvD8,937
 earthkit/workflows/fluent.py,sha256=IN_sqwr7W8wbwP7wTOklgnjVe34IUCmv1ku-DWVTCJc,30179
 earthkit/workflows/mark.py,sha256=PdsXmRfhw1SyyJ74mzFPsLRqMCdlYv556fFX4bqlh9Y,1319
@@ -63,7 +64,7 @@ earthkit/workflows/taskgraph.py,sha256=RsT1Qlng1uPZSaSBNqE8vFsoI5J8DDcQl468YPX-k
 earthkit/workflows/transformers.py,sha256=BsUUvnG-UyerT3XUYcHc1qJkSsLc0ZX3Zxqq70tJWLU,2105
 earthkit/workflows/utility.py,sha256=ygqn1s846WQbo7HGY46Z8N1AXrDFGwyygSgsv4YnGJ8,1344
 earthkit/workflows/visualise.py,sha256=WbqJWvn648B7Qo3VCKJyoJzU6Mgvv0p3UWZb0lf01m8,2290
-earthkit/workflows/backends/__init__.py,sha256=
+earthkit/workflows/backends/__init__.py,sha256=6ONg-EdNODiqeBZqyosI5iq1UfZfaOLqhAo8l8_wn9o,6519
 earthkit/workflows/backends/arrayapi.py,sha256=QfUsTlYuFH3CroWdcf_XBcLnt2znMcS1HwNNEe8J0qU,2279
 earthkit/workflows/backends/earthkit.py,sha256=rZURJf6FLKcCjJkyWgOf6NqKjPZjSNX09dV_SicIlss,8958
 earthkit/workflows/backends/xarray.py,sha256=4pnnPgIug4DmvhigkU0JsituvdvspuVA_vxbIsrq8-A,6762
@@ -84,8 +85,8 @@ earthkit/workflows/graph/split.py,sha256=t-Sji5eZb01QO1szqmDNTodDDALqdo-0R0x1ESs
 earthkit/workflows/graph/transform.py,sha256=BZ8n7ePUnuGgoHkMqZC3SLzifu4oq6q6t6vka0khFtg,3842
 earthkit/workflows/graph/visit.py,sha256=MP-aFSqOl7aqJY2i7QTgY4epqb6yM7_lK3ofvOqfahw,1755
 earthkit/workflows/plugins/__init__.py,sha256=nhMAC0eMLxoJamjqB5Ns0OWy0OuxEJ_YvaDFGEQITls,129
-earthkit_workflows-0.4.
-earthkit_workflows-0.4.
-earthkit_workflows-0.4.
-earthkit_workflows-0.4.
-earthkit_workflows-0.4.
+earthkit_workflows-0.4.2.dist-info/licenses/LICENSE,sha256=73MJ7twXMKnWwmzmrMiFwUeY7c6JTvxphVggeUq9Sq4,11381
+earthkit_workflows-0.4.2.dist-info/METADATA,sha256=LWW-xDc0sq8cOdu6IpY335_MSFfe7Lmg1SHYT9cXjWA,1571
+earthkit_workflows-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+earthkit_workflows-0.4.2.dist-info/top_level.txt,sha256=oNrH3Km3hK5kDkTOiM-8G8OQglvZcy-gUKy7rlooWXs,17
+earthkit_workflows-0.4.2.dist-info/RECORD,,
{earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/WHEEL
File without changes
{earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/licenses/LICENSE
File without changes
{earthkit_workflows-0.4.0.dist-info → earthkit_workflows-0.4.2.dist-info}/top_level.txt
File without changes