xmanager-slurm 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/__init__.py +6 -2
- xm_slurm/api.py +301 -34
- xm_slurm/batching.py +4 -4
- xm_slurm/config.py +105 -55
- xm_slurm/constants.py +19 -0
- xm_slurm/contrib/__init__.py +0 -0
- xm_slurm/contrib/clusters/__init__.py +47 -13
- xm_slurm/contrib/clusters/drac.py +34 -16
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +34 -22
- xm_slurm/execution.py +305 -107
- xm_slurm/executors.py +8 -12
- xm_slurm/experiment.py +601 -168
- xm_slurm/experimental/parameter_controller.py +202 -0
- xm_slurm/job_blocks.py +7 -0
- xm_slurm/packageables.py +42 -20
- xm_slurm/packaging/{docker/local.py → docker.py} +135 -40
- xm_slurm/packaging/router.py +3 -1
- xm_slurm/packaging/utils.py +9 -81
- xm_slurm/resources.py +28 -4
- xm_slurm/scripts/_cloudpickle.py +28 -0
- xm_slurm/scripts/cli.py +52 -0
- xm_slurm/status.py +9 -0
- xm_slurm/templates/docker/mamba.Dockerfile +4 -2
- xm_slurm/templates/docker/python.Dockerfile +18 -10
- xm_slurm/templates/docker/uv.Dockerfile +35 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +5 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +1 -2
- xm_slurm/templates/slurm/job.bash.j2 +4 -3
- xm_slurm/types.py +23 -0
- xm_slurm/utils.py +18 -10
- xmanager_slurm-0.4.1.dist-info/METADATA +26 -0
- xmanager_slurm-0.4.1.dist-info/RECORD +44 -0
- {xmanager_slurm-0.3.2.dist-info → xmanager_slurm-0.4.1.dist-info}/WHEEL +1 -1
- xmanager_slurm-0.4.1.dist-info/entry_points.txt +2 -0
- xmanager_slurm-0.4.1.dist-info/licenses/LICENSE.md +227 -0
- xm_slurm/packaging/docker/__init__.py +0 -75
- xm_slurm/packaging/docker/abc.py +0 -112
- xm_slurm/packaging/docker/cloud.py +0 -503
- xm_slurm/templates/docker/pdm.Dockerfile +0 -31
- xmanager_slurm-0.3.2.dist-info/METADATA +0 -25
- xmanager_slurm-0.3.2.dist-info/RECORD +0 -38
|
@@ -1,503 +0,0 @@
|
|
|
1
|
-
import dataclasses
|
|
2
|
-
import functools
|
|
3
|
-
import hashlib
|
|
4
|
-
import io
|
|
5
|
-
import os
|
|
6
|
-
import pathlib
|
|
7
|
-
import tarfile
|
|
8
|
-
import tempfile
|
|
9
|
-
import threading
|
|
10
|
-
import time
|
|
11
|
-
from typing import Mapping, Sequence
|
|
12
|
-
|
|
13
|
-
try:
|
|
14
|
-
import google.api_core.exceptions
|
|
15
|
-
import google.protobuf.duration_pb2
|
|
16
|
-
import google_crc32c as crc32c
|
|
17
|
-
import pathspec
|
|
18
|
-
from google.cloud import iam_credentials, kms, storage
|
|
19
|
-
from google.cloud import logging as cloud_logging
|
|
20
|
-
from google.cloud.devtools import cloudbuild
|
|
21
|
-
from google.cloud.logging_v2.services import logging_service_v2
|
|
22
|
-
from google.logging.type import log_severity_pb2
|
|
23
|
-
except ImportError as ex:
|
|
24
|
-
raise ImportError(
|
|
25
|
-
"The `gcp` extra is required for the Google Cloud Builder. " "Install with `xm-slurm[gcp]`."
|
|
26
|
-
) from ex
|
|
27
|
-
|
|
28
|
-
import humanize
|
|
29
|
-
from xmanager import xm
|
|
30
|
-
from xmanager.cloud import auth
|
|
31
|
-
|
|
32
|
-
from xm_slurm import utils
|
|
33
|
-
from xm_slurm.console import console
|
|
34
|
-
from xm_slurm.executables import (
|
|
35
|
-
Dockerfile,
|
|
36
|
-
ImageURI,
|
|
37
|
-
RemoteImage,
|
|
38
|
-
RemoteRepositoryCredentials,
|
|
39
|
-
)
|
|
40
|
-
from xm_slurm.executors import SlurmSpec
|
|
41
|
-
from xm_slurm.packaging import utils as packaging_utils
|
|
42
|
-
from xm_slurm.packaging.docker.abc import (
|
|
43
|
-
DockerBakeCommand,
|
|
44
|
-
DockerClient,
|
|
45
|
-
DockerLoginCommand,
|
|
46
|
-
DockerPullCommand,
|
|
47
|
-
)
|
|
48
|
-
from xm_slurm.packaging.registry import IndexedContainer
|
|
49
|
-
|
|
50
|
-
_CLOUD_DOCKER_REGISTRY = "XM_SLURM_CLOUD_DOCKER_REGISTRY"
|
|
51
|
-
_CLOUD_DOCKER_USERNAME = "XM_SLURM_CLOUD_DOCKER_USERNAME"
|
|
52
|
-
_CLOUD_DOCKER_PASSWORD = "XM_SLURM_CLOUD_DOCKER_PASSWORD"
|
|
53
|
-
|
|
54
|
-
_GCP_BUILD_MACHINE = "XM_SLURM_GCP_BUILD_MACHINE"
|
|
55
|
-
_GCP_BUILD_TIMEOUT = "XM_SLURM_GCP_BUILD_TIMEOUT"
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def _tar_writestr(tar: tarfile.TarFile, name: str, data: str):
    """Add an in-memory text member named ``name`` to the open archive ``tar``.

    ``data`` is encoded with ``str.encode()``'s default codec and the member's
    size is set to the length of the encoded bytes.
    """
    payload = data.encode()
    member = tarfile.TarInfo(name)
    member.size = len(payload)
    tar.addfile(member, fileobj=io.BytesIO(payload))
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
class GoogleCloudRemoteDockerClient(DockerClient):
|
|
67
|
-
"""A Docker client that uses Google Cloud Build to build and push images."""
|
|
68
|
-
|
|
69
|
-
def __init__(self):
    """Resolve Google Cloud settings and create the API clients used by this builder.

    Credentials, project name, bucket and service account are read through
    ``xmanager.cloud.auth``; every client below is created with those same
    credentials.
    """
    credentials = auth.get_creds()
    project = auth.get_project_name()

    self.cloud_credentials = credentials
    self.cloud_project = project
    self.cloud_bucket = auth.get_bucket()
    self.cloud_service_account = auth.get_service_account()

    # One client per Google Cloud service touched by the build flow.
    self.cloud_storage_client = storage.Client(project=project, credentials=credentials)
    self.cloud_build_client = cloudbuild.CloudBuildClient(credentials=credentials)
    self.cloud_logging_client = logging_service_v2.LoggingServiceV2Client(
        credentials=credentials
    )
    self.cloud_credentials_client = iam_credentials.IAMCredentialsClient(
        credentials=credentials
    )
    self.cloud_kms_client = kms.KeyManagementServiceClient(credentials=credentials)

    # Credentials already handed out by `credentials()`, keyed by registry hostname.
    self._credentials_cache: dict[str, RemoteRepositoryCredentials] = {}
|
|
87
|
-
|
|
88
|
-
def credentials(self, hostname: str) -> RemoteRepositoryCredentials | None:
    """Return registry credentials for ``hostname``, or ``None`` when none apply.

    Resolution order:
      1. Static credentials from the ``XM_SLURM_CLOUD_DOCKER_*`` environment
         variables, used when all three are set (non-empty) and ``hostname``
         ends with the configured registry.
      2. ``None`` for any other hostname that does not end with ``gcr.io``.
      3. Credentials previously generated and cached for this hostname.
      4. A newly generated access token for the configured service account
         (one hour lifetime, read-only storage scope), which is then cached.
    """
    env_username = os.environ.get(_CLOUD_DOCKER_USERNAME, None)
    env_password = os.environ.get(_CLOUD_DOCKER_PASSWORD, None)
    env_registry = os.environ.get(_CLOUD_DOCKER_REGISTRY, None)
    if env_username and env_password and env_registry and hostname.endswith(env_registry):
        return RemoteRepositoryCredentials(username=env_username, password=env_password)

    if not hostname.endswith("gcr.io"):
        return None

    if hostname in self._credentials_cache:
        return self._credentials_cache[hostname]

    token_request = iam_credentials.GenerateAccessTokenRequest(
        name=f"projects/-/serviceAccounts/{self.cloud_service_account}",
        lifetime=google.protobuf.duration_pb2.Duration(seconds=3600),
        scope=[
            "https://www.googleapis.com/auth/devstorage.read_only",
        ],
    )
    token = self.cloud_credentials_client.generate_access_token(token_request)

    generated = RemoteRepositoryCredentials(
        username="oauth2accesstoken", password=token.access_token
    )
    self._credentials_cache[hostname] = generated
    return generated
|
|
117
|
-
|
|
118
|
-
def _upload_context_to_storage(
    self, archive_path: str | os.PathLike[str], destination_name: str | os.PathLike[str]
):
    """Upload the local archive at ``archive_path`` to the configured GCS bucket.

    The object is stored under ``destination_name`` in ``self.cloud_bucket``.
    """
    target_blob = self.cloud_storage_client.bucket(self.cloud_bucket).blob(destination_name)
    target_blob.upload_from_filename(archive_path)
|
|
125
|
-
|
|
126
|
-
def _encrypt_secret_env(
    self, secret_env: Mapping[str, str], *, key_id: str
) -> cloudbuild.Secret:
    """Encrypts the given secret environment using Cloud KMS.

    The key lives in the ``xmanager`` key ring of the project's ``global``
    location. Both the key ring and the crypto key are created when a lookup
    reports ``NotFound``.

    Args:
        secret_env: Mapping of environment variable name to plaintext value.
        key_id: Identifier of the crypto key used for encryption.

    Returns:
        A ``cloudbuild.Secret`` referencing the KMS key and carrying the
        ciphertext for every entry of ``secret_env``.

    Raises:
        RuntimeError: If either the request or the response fails its CRC32C
            integrity check.
    """
    key_location = f"projects/{self.cloud_project}/locations/global"
    key_ring_id = "xmanager"
    key_ring = f"{key_location}/keyRings/{key_ring_id}"
    key_name = f"{key_ring}/cryptoKeys/{key_id}"

    # Create the key ring and key if they don't exist
    try:
        self.cloud_kms_client.get_key_ring(kms.GetKeyRingRequest(name=key_ring))
    except google.api_core.exceptions.NotFound:
        self.cloud_kms_client.create_key_ring(
            kms.CreateKeyRingRequest(
                parent=key_location,
                key_ring_id=key_ring_id,
                key_ring=kms.KeyRing(name=key_ring),
            )
        )

    try:
        self.cloud_kms_client.get_crypto_key(kms.GetCryptoKeyRequest(name=key_name))
    except google.api_core.exceptions.NotFound:
        self.cloud_kms_client.create_crypto_key(
            kms.CreateCryptoKeyRequest(
                parent=key_ring,
                crypto_key_id=key_id,
                crypto_key=kms.CryptoKey(
                    purpose=kms.CryptoKey.CryptoKeyPurpose.ENCRYPT_DECRYPT,
                ),
            )
        )

    def _crc32c_digest(value: bytes) -> int:
        """Return the CRC32C checksum of ``value`` as an integer."""
        return int.from_bytes(crc32c.Checksum(value).digest(), "big")

    ciphers: dict[str, bytes] = {}
    for secret_name, secret in secret_env.items():
        secret_bytes = secret.encode("utf-8")
        response = self.cloud_kms_client.encrypt(
            kms.EncryptRequest(
                name=key_name,
                plaintext=secret_bytes,
                plaintext_crc32c=_crc32c_digest(secret_bytes),
            )
        )

        # Integrity checks in both directions: the server confirms the plaintext
        # checksum we sent, and we verify the checksum of the ciphertext we got.
        if not response.verified_plaintext_crc32c:
            raise RuntimeError(
                "The encryption request sent to the server was corrupted in-transit."
            )
        if response.ciphertext_crc32c != _crc32c_digest(response.ciphertext):
            # RuntimeError (not bare Exception) to match the request-side check.
            raise RuntimeError(
                "The encryption response received from the server was corrupted in-transit."
            )
        ciphers[secret_name] = response.ciphertext

    return cloudbuild.Secret(kms_key_name=key_name, secret_env=ciphers)
|
|
186
|
-
|
|
187
|
-
def _make_build_request(
    self,
    targets: Sequence[IndexedContainer],
    context_path: str | os.PathLike[str],
) -> cloudbuild.CreateBuildRequest:
    """Creates a Cloud Build request to build the given targets.

    The build runs, in order: an optional ``docker login`` (only when the
    ``XM_SLURM_CLOUD_DOCKER_*`` variables are all set), one ``docker pull`` per
    target tag, a single bake of all targets, and a best-effort removal of the
    uploaded context archive.

    Args:
        targets: Containers to build; each executor spec supplies the image tag.
        context_path: Object name of the context archive inside the bucket.

    Returns:
        The request to pass to ``CloudBuildClient.create_build``.
    """
    bake_command = DockerBakeCommand(
        targets=[
            packaging_utils.hash_digest(target.value.executable_spec) for target in targets
        ],
        pull=False,
        push=False,
        load=True,
    )
    steps = [
        *[
            cloudbuild.BuildStep(
                name="gcr.io/cloud-builders/docker",
                args=DockerPullCommand(image=target.value.executor_spec.tag)
                .to_args()
                .to_list(),
            )
            for target in targets
        ],
        cloudbuild.BuildStep(
            name="gcr.io/cloud-builders/docker",
            args=bake_command.to_args().to_list(),
        ),
        # Delete the context archive on success
        cloudbuild.BuildStep(
            name="gcr.io/cloud-builders/gsutil",
            args=["rm", "-a", "-f", f"gs://{self.cloud_bucket}/{context_path}"],
            allow_failure=True,
        ),
    ]
    secrets: list[cloudbuild.Secret] = []

    if (
        (username := os.environ.get(_CLOUD_DOCKER_USERNAME, None))
        and (password := os.environ.get(_CLOUD_DOCKER_PASSWORD, None))
        and (registry := os.environ.get(_CLOUD_DOCKER_REGISTRY, None))
    ):
        # The plaintext values never appear in the request: the step refers to
        # secret environment variables, and the values are sent KMS-encrypted.
        login_command = DockerLoginCommand(
            server=registry, username="$$DOCKER_USERNAME", password="$$DOCKER_PASSWORD"
        )
        # Login must come first so later pulls are authenticated.
        steps.insert(
            0,
            cloudbuild.BuildStep(
                name="gcr.io/cloud-builders/docker",
                args=["-c", f"docker {' '.join(login_command.to_args().to_list(escaper=str))}"],
                entrypoint="bash",
                secret_env=["DOCKER_USERNAME", "DOCKER_PASSWORD"],
                allow_failure=False,
            ),
        )
        secrets += [
            self._encrypt_secret_env(
                {"DOCKER_USERNAME": username}, key_id="dockerRegistryUsername"
            ),
            self._encrypt_secret_env(
                {"DOCKER_PASSWORD": password}, key_id="dockerRegistryPassword"
            ),
        ]

    # Unique tags in first-seen order, so the request is deterministic
    # (iterating a set gives an arbitrary order).
    image_tags = list(dict.fromkeys(target.value.executor_spec.tag for target in targets))

    return cloudbuild.CreateBuildRequest(
        project_id=self.cloud_project,
        build=cloudbuild.Build(
            source=cloudbuild.Source(
                storage_source=cloudbuild.StorageSource(
                    bucket=self.cloud_bucket,
                    object_=context_path,
                )
            ),
            timeout=google.protobuf.duration_pb2.Duration(
                seconds=int(os.environ.get(_GCP_BUILD_TIMEOUT, 1200))
            ),
            steps=steps,
            options=cloudbuild.BuildOptions(
                machine_type=os.environ.get(
                    _GCP_BUILD_MACHINE, cloudbuild.BuildOptions.MachineType.UNSPECIFIED
                ),
            ),
            images=image_tags,
            secrets=secrets,
        ),
    )
|
|
279
|
-
|
|
280
|
-
def _tail_logs(self, build_id: str, stop_event: threading.Event):
    """Stream Cloud Build log entries for ``build_id`` to the console.

    Intended to run in a background thread. Each entry is printed with a
    ``[BUILD]`` prefix and styled by its severity.

    Args:
        build_id: Identifier of the build whose log entries are tailed.
        stop_event: Set by the caller to end the request stream.
    """

    def request_generator():
        yield cloud_logging.types.TailLogEntriesRequest(
            resource_names=[f"projects/{self.cloud_project}"],
            filter=(
                f'logName="projects/{self.cloud_project}/logs/cloudbuild" AND '
                'resource.type="build" AND '
                f'resource.labels.build_id="{build_id}"'
            ),
        )
        # Keep the request stream open until the caller signals stop. Blocking
        # on the event replaces a 100 ms sleep/poll loop.
        stop_event.wait()

    stream = self.cloud_logging_client.tail_log_entries(request_generator())
    style_by_severity = {
        log_severity_pb2.DEFAULT: "",
        log_severity_pb2.DEBUG: "dim",
        log_severity_pb2.INFO: "bright_cyan",
        log_severity_pb2.NOTICE: "cyan",
        log_severity_pb2.WARNING: "yellow",
        log_severity_pb2.ERROR: "red",
        log_severity_pb2.CRITICAL: "bold red",
        log_severity_pb2.ALERT: "bold red",
        log_severity_pb2.EMERGENCY: "bold red",
    }

    for response in stream:
        for entry in response.entries:
            # NOTE(review): the payload is interpolated into a Rich markup string;
            # bracketed log text may be parsed as markup. Consider escaping it.
            console.print(
                f"[magenta][BUILD][/magenta] {entry.text_payload}",
                style=style_by_severity.get(entry.severity, ""),  # type: ignore
            )
|
|
312
|
-
|
|
313
|
-
def _wait_for_build(self, build_id: str, *, backoff: int = 5) -> dict[str, str]:
|
|
314
|
-
"""Waits for the given build to complete."""
|
|
315
|
-
stop_logging_event = threading.Event()
|
|
316
|
-
log_thread = threading.Thread(target=self._tail_logs, args=(build_id, stop_logging_event))
|
|
317
|
-
log_thread.start()
|
|
318
|
-
|
|
319
|
-
while True:
|
|
320
|
-
time.sleep(backoff)
|
|
321
|
-
result: cloudbuild.Build = self.cloud_build_client.get_build(
|
|
322
|
-
request=cloudbuild.GetBuildRequest(id=build_id, project_id=self.cloud_project)
|
|
323
|
-
)
|
|
324
|
-
|
|
325
|
-
# Stop logging if the build is no longer running
|
|
326
|
-
if result.status not in (
|
|
327
|
-
cloudbuild.Build.Status.QUEUED,
|
|
328
|
-
cloudbuild.Build.Status.WORKING,
|
|
329
|
-
):
|
|
330
|
-
stop_logging_event.set()
|
|
331
|
-
|
|
332
|
-
match result.status:
|
|
333
|
-
case cloudbuild.Build.Status.SUCCESS:
|
|
334
|
-
return {image.name: image.digest for image in result.results.images}
|
|
335
|
-
case cloudbuild.Build.Status.FAILURE:
|
|
336
|
-
console.log(
|
|
337
|
-
"Build FAILED. See logs for more information",
|
|
338
|
-
style="bold red",
|
|
339
|
-
)
|
|
340
|
-
raise RuntimeError("Build failed.")
|
|
341
|
-
case cloudbuild.Build.Status.QUEUED | cloudbuild.Build.Status.WORKING:
|
|
342
|
-
continue
|
|
343
|
-
case cloudbuild.Build.Status.CANCELLED:
|
|
344
|
-
console.log(
|
|
345
|
-
f"Cloud build tool internal error: {result.status}", style="bold red"
|
|
346
|
-
)
|
|
347
|
-
raise RuntimeError("Build cancelled.")
|
|
348
|
-
case cloudbuild.Build.Status.INTERNAL_ERROR:
|
|
349
|
-
console.log(
|
|
350
|
-
f"Cloud build tool internal error: {result.status}", style="bold red"
|
|
351
|
-
)
|
|
352
|
-
raise RuntimeError("Build internal error.")
|
|
353
|
-
case cloudbuild.Build.Status.TIMEOUT:
|
|
354
|
-
console.log("Build timed out after 1200 seconds.", style="bold red")
|
|
355
|
-
raise RuntimeError("Build timed out.")
|
|
356
|
-
|
|
357
|
-
def _resolve_ignore_pathspec(
    self, path: pathlib.Path, *, ignore_files: Sequence[str] = (".gitignore", ".dockerignore")
) -> pathspec.PathSpec:
    """Resolves the ignore list for the given context path.

    Args:
        path: Directory in which the ignore files are looked up.
        ignore_files: Names of ignore files to read, in order. Missing files
            are skipped.

    Returns:
        A ``PathSpec`` built from the concatenated lines of every ignore file
        found, interpreted as git wildmatch patterns.
    """
    ignore_patterns: list[str] = []
    for ignore_file in ignore_files:
        candidate = path / ignore_file
        # is_file() rather than exists(): a directory with that name cannot be read.
        if candidate.is_file():
            ignore_patterns.extend(candidate.read_text().splitlines())

    return pathspec.PathSpec.from_lines(
        pathspec.patterns.GitWildMatchPattern,  # type: ignore
        ignore_patterns,
    )
|
|
372
|
-
|
|
373
|
-
def _reroot_targets(
    self, targets: Sequence[IndexedContainer], source_path: pathlib.Path
) -> Sequence[IndexedContainer]:
    """Return copies of ``targets`` whose build contexts are relative to ``source_path``.

    The uploaded archive is rooted at ``source_path``, so each Dockerfile
    context has to be expressed relative to it.
    """

    def _reroot(target: IndexedContainer) -> IndexedContainer:
        spec: Dockerfile = target.value.executable_spec
        assert isinstance(spec, Dockerfile)
        relative_spec = dataclasses.replace(
            spec,
            context=spec.context.relative_to(source_path),
        )
        packageable = xm.Packageable(
            executable_spec=relative_spec,
            executor_spec=target.value.executor_spec,
            args=target.value.args,
            env_vars=target.value.env_vars,
        )
        return dataclasses.replace(target, value=packageable)

    return [_reroot(target) for target in targets]
|
|
401
|
-
|
|
402
|
-
def bake(self, *, targets: Sequence[IndexedContainer]) -> list[IndexedContainer[RemoteImage]]:
    """Builds the given targets and returns the digest for each image.

    Steps:
      1. Package the project root (minus ignored files) plus a rendered
         ``docker-bake.hcl`` into a gzip tarball and upload it to GCS under a
         content-addressed name.
      2. Schedule a Cloud Build for the uploaded context.
      3. Wait for the build to finish and collect the image digests.
      4. Wrap each target in a ``RemoteImage`` pinned to its digest.

    Args:
        targets: Containers to build; each must pair a ``Dockerfile`` executable
            spec with a ``SlurmSpec`` executor spec that has a tag.

    Returns:
        One ``IndexedContainer`` per target whose value is the built ``RemoteImage``.

    Raises:
        RuntimeError: If the remote build does not succeed.
    """
    # Step 1: Upload to GCS
    source_path = utils.find_project_root()
    targets = self._reroot_targets(targets, source_path)

    with tempfile.NamedTemporaryFile(suffix=".tar.gz") as tmpfile:
        console.print("Packaging context for upload...", style="blue")

        ignore_pathspec = self._resolve_ignore_pathspec(source_path)
        executors_by_executables = packaging_utils.collect_executors_by_executable(targets)
        with tarfile.open(tmpfile.name, "w:gz") as tar:
            for file in sorted(ignore_pathspec.match_tree_files(source_path, negate=True)):
                # Matched paths are relative to `source_path`, so anchor them
                # there instead of resolving against the current directory.
                file = (source_path / file).resolve()
                tar.add(file, file.relative_to(source_path))
            hcl = self._bake_template.render(
                executables=executors_by_executables,
                hash=packaging_utils.hash_digest,
            )
            _tar_writestr(tar, "docker-bake.hcl", hcl)

        archive_file_size = os.path.getsize(tmpfile.name)
        # Check file size of archive and warn if it's too large
        if archive_file_size > 1 * xm.GB:
            console.log(
                "WARNING: Context archive is larger than 1GB. "
                "This may cause slow builds. "
                "Try to avoid storing datasets or large files in the source directory. "
                "You may also ignore files by adding them to `.gitignore` or `.dockerignore`.",
                style="bold yellow",
            )

        # Name the uploaded object after the hash of the archive contents.
        hasher = hashlib.blake2s()
        tmpfile.seek(0)
        for chunk in iter(lambda: tmpfile.read(4096), b""):
            hasher.update(chunk)
        destination_path = pathlib.Path(f"blake2s:{hasher.hexdigest()}").with_suffix(".tar.gz")

        console.print(
            f"Sending build context ({humanize.naturalsize(archive_file_size)}) to GCS...",
            style="blue",
        )
        self._upload_context_to_storage(tmpfile.name, destination_path.as_posix())

    # Step 2: Schedule build
    create_build_op = self.cloud_build_client.create_build(
        self._make_build_request(targets, destination_path.as_posix())
    )
    build_metadata: cloudbuild.BuildOperationMetadata = create_build_op.metadata  # type: ignore
    build_id = build_metadata.build.id
    build_url = build_metadata.build.log_url
    console.print(f"Queued build with ID {build_id}...", style="blue")
    # Rich hyperlink markup is `[link=URL]text[/link]`.
    console.print(f"URL: [link={build_url}]{build_url}[/link]", markup=True, style="blue")

    # Step 3: Wait for build completion for digests & collect credentials
    console.print("Waiting for build agent...", style="blue")
    digest_by_image_names = self._wait_for_build(build_id)

    # Step 4: Construct new remote images
    images = []
    for target in targets:
        assert isinstance(target.value.executable_spec, Dockerfile)
        assert isinstance(target.value.executor_spec, SlurmSpec)
        assert target.value.executor_spec.tag

        # Pin the image reference to the digest reported by the build.
        uri = ImageURI(target.value.executor_spec.tag).with_digest(
            digest_by_image_names[target.value.executor_spec.tag]
        )

        images.append(
            dataclasses.replace(
                target,
                value=RemoteImage(  # type: ignore
                    image=str(uri),
                    workdir=target.value.executable_spec.workdir,
                    args=target.value.args,
                    env_vars=target.value.env_vars,
                    credentials=self.credentials(uri.domain),
                ),
            )
        )

    return images
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
# syntax=docker/dockerfile:1.4
# Two-stage image for PDM-managed projects: the builder stage installs the
# locked production dependencies, the final stage copies them next to the
# project sources.
ARG BASE_IMAGE

FROM $BASE_IMAGE AS builder

# git is installed in the builder stage only.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install -U pip setuptools wheel pysocks \
    && pip install pdm

# Copy only the dependency manifests so this layer is reused until they change.
COPY --link pyproject.toml pdm.lock /workspace/
WORKDIR /workspace

# Dependencies are installed into __pypackages__; the PDM cache is kept in a
# build cache mount.
RUN --mount=type=cache,target=/root/.cache/pdm mkdir __pypackages__ \
    && PDM_CACHE_DIR=/root/.cache/pdm pdm sync --prod --no-editable

FROM $BASE_IMAGE

ARG PYTHON_MAJOR
ARG PYTHON_MINOR

# Append the inherited PYTHONPATH only when it is set; a bare trailing ':'
# would add an empty entry to the module search path.
ENV PYTHONPATH=/workspace/pkgs${PYTHONPATH:+:${PYTHONPATH}}
COPY --link --from=builder /workspace/__pypackages__/$PYTHON_MAJOR.$PYTHON_MINOR/lib /workspace/pkgs

WORKDIR /workspace/src
COPY --link . /workspace/src

ENTRYPOINT ["python"]
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: xmanager-slurm
|
|
3
|
-
Version: 0.3.2
|
|
4
|
-
Summary: Slurm backend for XManager.
|
|
5
|
-
Author-Email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
|
|
6
|
-
License: MIT
|
|
7
|
-
Requires-Python: >=3.10
|
|
8
|
-
Requires-Dist: xmanager>=0.4.0
|
|
9
|
-
Requires-Dist: asyncssh>=2.13.2
|
|
10
|
-
Requires-Dist: humanize>=4.8.0
|
|
11
|
-
Requires-Dist: jinja2>=3.1.2
|
|
12
|
-
Requires-Dist: toml>=0.10.2
|
|
13
|
-
Requires-Dist: rich>=13.5.2
|
|
14
|
-
Requires-Dist: immutabledict>=3.0.0
|
|
15
|
-
Requires-Dist: backoff>=2.2.1
|
|
16
|
-
Requires-Dist: pathspec>=0.11.2; extra == "gcp"
|
|
17
|
-
Requires-Dist: google-cloud-storage>=2.11.0; extra == "gcp"
|
|
18
|
-
Requires-Dist: google-cloud-build>=3.20.0; extra == "gcp"
|
|
19
|
-
Requires-Dist: google-cloud-logging>=3.8.0; extra == "gcp"
|
|
20
|
-
Requires-Dist: google-cloud-iam>=2.12.2; extra == "gcp"
|
|
21
|
-
Requires-Dist: google-cloud-kms>=2.19.2; extra == "gcp"
|
|
22
|
-
Requires-Dist: google-crc32c>=1.5.0; extra == "gcp"
|
|
23
|
-
Requires-Dist: pytest>=7.4.3; extra == "test"
|
|
24
|
-
Provides-Extra: gcp
|
|
25
|
-
Provides-Extra: test
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
xm_slurm/__init__.py,sha256=J5FkaAXbcT7yWmcNgBq3mDLOmnNZW7c4WekSt7JVoFc,1019
|
|
2
|
-
xm_slurm/api.py,sha256=LhAnNfP_M62FEbTSa-NgToxi0OA8kCrUx2czbxfpOjw,9533
|
|
3
|
-
xm_slurm/batching.py,sha256=mGVvccehsC4dfjtg7QqrtxuoxYI_Fs8o1GLJIGVyvyo,4379
|
|
4
|
-
xm_slurm/config.py,sha256=St2bdp2m2pajGxivmsRkmZrKd-inBKF7Hn6oezTT9zM,4955
|
|
5
|
-
xm_slurm/console.py,sha256=UpMqeJ0C8i0pkue1AHnnyyX0bFJ9zZeJ7HBR6yhuA8A,54
|
|
6
|
-
xm_slurm/contrib/clusters/__init__.py,sha256=BM9W2iJE3616uwAUdj1uqcXF70zunXlxWHEXgLdH2xs,1600
|
|
7
|
-
xm_slurm/contrib/clusters/drac.py,sha256=Tl4Cv0DOHssJcnTU5c60w0Ye3dwlDZE4AkPtmew6-lQ,5201
|
|
8
|
-
xm_slurm/executables.py,sha256=G62nACUyki7Jwif3w6c6h3t88ebiv_w9ISYWeeAUVFo,5959
|
|
9
|
-
xm_slurm/execution.py,sha256=EDckbWYtBoMAc0yWU-BitFWGPOkA-AxoQRFv7XdUNcA,17998
|
|
10
|
-
xm_slurm/executors.py,sha256=vilogTjlxHLfZDms4aYOZWUW8w-2IdxU7xh-9vcW1Y0,4723
|
|
11
|
-
xm_slurm/experiment.py,sha256=y5K-kZgavRk022IGXw5bg7beX4cnU1JtUOeP9824sRA,25578
|
|
12
|
-
xm_slurm/job_blocks.py,sha256=1H1eZ5gbEGEoDYcoSh8S_gvp04MLXP7G128crDJlMYo,482
|
|
13
|
-
xm_slurm/packageables.py,sha256=mHG2ackEKh1Qg0HdIP72uDZt-9j4qvsW-l_WScdWMes,11375
|
|
14
|
-
xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
|
|
15
|
-
xm_slurm/packaging/docker/__init__.py,sha256=SQxaDtomYc4NwZ5lSoCiMoy2S2lRmc0sgwVMbENIatU,2474
|
|
16
|
-
xm_slurm/packaging/docker/abc.py,sha256=f8XvUA_FusIpXI45PR5isA2msxM003ycW5mWbAyiKfk,3830
|
|
17
|
-
xm_slurm/packaging/docker/cloud.py,sha256=8V3Hg_v_TLe6WoJkk3l6S-m7OgK2TNxwfK2glLSFN3o,21300
|
|
18
|
-
xm_slurm/packaging/docker/local.py,sha256=-_elHmU-v0R3XucNMpYF98HCQcMV0x_RTvkt1jLnes4,8008
|
|
19
|
-
xm_slurm/packaging/registry.py,sha256=GrdmQg9MgSo38OiqOzMKWSkQyBuyryOfc3zcdgZ4CUE,1148
|
|
20
|
-
xm_slurm/packaging/router.py,sha256=6qjtsy4BoYgSaQzC_pQSHVHeWcphG_xWVsWgW6ALC7U,2033
|
|
21
|
-
xm_slurm/packaging/utils.py,sha256=dCWAuUXT5COXGe1BQEW8luo5patxTeLSAGOWPR63iY8,6219
|
|
22
|
-
xm_slurm/resources.py,sha256=FqAULBhchu6z66On3SRDOJvXfs0sLGfBvcMGUUB3jxU,4835
|
|
23
|
-
xm_slurm/status.py,sha256=4BUBm-qwVWH_RJGzRxO8Eom5d92_cp8jydQVwqH8v6U,6653
|
|
24
|
-
xm_slurm/templates/docker/docker-bake.hcl.j2,sha256=ClsFpj91Mr1VfA8L6eqBG3HQz0Z8VenF6mEfmAhQgUo,1498
|
|
25
|
-
xm_slurm/templates/docker/mamba.Dockerfile,sha256=vsOYkm-T33C2RanwbdjJIUjhPJ_H1NDBeEACguQJZ8c,716
|
|
26
|
-
xm_slurm/templates/docker/pdm.Dockerfile,sha256=Yg5-lOXkNVJr0OER_yOnRvn9NlFLnt3RfdYfq4f0ilg,748
|
|
27
|
-
xm_slurm/templates/docker/python.Dockerfile,sha256=O6lHesmsLz7cX-efVQpsafEVYmbHPyV73xA9WbBKGKg,738
|
|
28
|
-
xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=CxtbxOJzd0Un-ApDO6T8JHuKlSv6uwwBFMJPeGjCKnk,1071
|
|
29
|
-
xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
|
|
30
|
-
xm_slurm/templates/slurm/job-array.bash.j2,sha256=d4twfV1PATGQwTIleFBUIGmMAIHH-F7RjBsdfaAIQko,599
|
|
31
|
-
xm_slurm/templates/slurm/job-group.bash.j2,sha256=UkjfBE7jg9mepcUWaHZEAjkiXsIM1j_sLxLzxkteD-Y,1120
|
|
32
|
-
xm_slurm/templates/slurm/job.bash.j2,sha256=EUeq3P2xqTIqlHi2SVhFBT7NL4lUj8okYUa3GnlaIIc,1852
|
|
33
|
-
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=dMntzelhs8DqKyIpO9S6wzMfH2PDevmgvyjCW8Xc2dY,3222
|
|
34
|
-
xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=xKXYFvQvazMx0PgvmlRXR6eecoiBUl8y52dIzQtWkBE,1469
|
|
35
|
-
xm_slurm/utils.py,sha256=PNd0vTn33UKm5LpC41TdO9QIFe21V5A0RbYEhQIMjrA,1930
|
|
36
|
-
xmanager_slurm-0.3.2.dist-info/METADATA,sha256=wQy1L_EjWG6foDa1iClRQT0-_iC6LMMh5E30_GHEePc,909
|
|
37
|
-
xmanager_slurm-0.3.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
|
|
38
|
-
xmanager_slurm-0.3.2.dist-info/RECORD,,
|