wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/importers/__init__.py +1 -4
- wandb/apis/importers/internals/internal.py +386 -0
- wandb/apis/importers/internals/protocols.py +125 -0
- wandb/apis/importers/internals/util.py +78 -0
- wandb/apis/importers/mlflow.py +125 -88
- wandb/apis/importers/validation.py +108 -0
- wandb/apis/importers/wandb.py +1604 -0
- wandb/apis/public/api.py +7 -10
- wandb/apis/public/artifacts.py +38 -0
- wandb/apis/public/files.py +11 -2
- wandb/apis/reports/v2/__init__.py +0 -19
- wandb/apis/reports/v2/expr_parsing.py +0 -1
- wandb/apis/reports/v2/interface.py +15 -18
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +52 -55
- wandb/integration/gym/__init__.py +2 -1
- wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
- wandb/integration/keras/keras.py +6 -4
- wandb/integration/kfp/kfp_patch.py +2 -2
- wandb/integration/openai/fine_tuning.py +1 -2
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +75 -31
- wandb/sdk/artifacts/artifact_manifest.py +5 -2
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
- wandb/sdk/artifacts/artifact_saver.py +19 -47
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
- wandb/sdk/artifacts/storage_policy.py +4 -1
- wandb/sdk/data_types/base_types/wb_value.py +1 -1
- wandb/sdk/data_types/image.py +2 -2
- wandb/sdk/interface/interface.py +49 -13
- wandb/sdk/interface/interface_shared.py +17 -11
- wandb/sdk/internal/file_stream.py +20 -1
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +3 -1
- wandb/sdk/internal/job_builder.py +49 -19
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/sender.py +96 -124
- wandb/sdk/internal/sender_config.py +197 -0
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +5 -3
- wandb/sdk/internal/update.py +1 -1
- wandb/sdk/launch/_launch.py +3 -3
- wandb/sdk/launch/_launch_add.py +28 -29
- wandb/sdk/launch/_project_spec.py +148 -136
- wandb/sdk/launch/agent/agent.py +3 -7
- wandb/sdk/launch/agent/config.py +0 -27
- wandb/sdk/launch/builder/build.py +54 -28
- wandb/sdk/launch/builder/docker_builder.py +4 -15
- wandb/sdk/launch/builder/kaniko_builder.py +72 -45
- wandb/sdk/launch/create_job.py +6 -40
- wandb/sdk/launch/loader.py +10 -0
- wandb/sdk/launch/registry/anon.py +29 -0
- wandb/sdk/launch/registry/local_registry.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/runner/local_container.py +15 -10
- wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
- wandb/sdk/launch/sweeps/scheduler.py +11 -3
- wandb/sdk/launch/utils.py +14 -0
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +4 -1
- wandb/sdk/lib/apikey.py +0 -5
- wandb/sdk/lib/config_util.py +0 -31
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +72 -0
- wandb/sdk/service/service.py +7 -2
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +12 -1
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +164 -110
- wandb/sdk/wandb_settings.py +58 -16
- wandb/testing/relay.py +5 -6
- wandb/util.py +50 -7
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
- wandb/apis/importers/base.py +0 -400
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
@@ -11,18 +11,13 @@ from wandb.sdk.launch.builder.build import registry_from_uri
|
|
11
11
|
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
12
12
|
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
13
13
|
|
14
|
-
from .._project_spec import (
|
15
|
-
EntryPoint,
|
16
|
-
LaunchProject,
|
17
|
-
create_metadata_file,
|
18
|
-
get_entry_point_command,
|
19
|
-
)
|
14
|
+
from .._project_spec import EntryPoint, LaunchProject
|
20
15
|
from ..errors import LaunchDockerError, LaunchError
|
16
|
+
from ..registry.anon import AnonynmousRegistry
|
21
17
|
from ..registry.local_registry import LocalRegistry
|
22
18
|
from ..utils import (
|
23
19
|
LOG_PREFIX,
|
24
20
|
event_loop_thread_exec,
|
25
|
-
sanitize_wandb_api_key,
|
26
21
|
warn_failed_packages_from_build_logs,
|
27
22
|
)
|
28
23
|
from .build import (
|
@@ -106,6 +101,8 @@ class DockerBuilder(AbstractBuilder):
|
|
106
101
|
"""Login to the registry."""
|
107
102
|
if isinstance(self.registry, LocalRegistry):
|
108
103
|
_logger.info(f"{LOG_PREFIX}No registry configured, skipping login.")
|
104
|
+
elif isinstance(self.registry, AnonynmousRegistry):
|
105
|
+
_logger.info(f"{LOG_PREFIX}Anonymous registry, skipping login.")
|
109
106
|
else:
|
110
107
|
username, password = await self.registry.get_username_password()
|
111
108
|
login = event_loop_thread_exec(docker.login)
|
@@ -155,14 +152,6 @@ class DockerBuilder(AbstractBuilder):
|
|
155
152
|
f"image {image_uri} does not already exist in repository, building."
|
156
153
|
)
|
157
154
|
|
158
|
-
entry_cmd = get_entry_point_command(entrypoint, launch_project.override_args)
|
159
|
-
|
160
|
-
create_metadata_file(
|
161
|
-
launch_project,
|
162
|
-
image_uri,
|
163
|
-
sanitize_wandb_api_key(" ".join(entry_cmd)),
|
164
|
-
dockerfile_str,
|
165
|
-
)
|
166
155
|
build_ctx_path = _create_docker_build_ctx(launch_project, dockerfile_str)
|
167
156
|
dockerfile = os.path.join(build_ctx_path, _WANDB_DOCKERFILE_NAME)
|
168
157
|
try:
|
@@ -3,6 +3,7 @@ import base64
|
|
3
3
|
import json
|
4
4
|
import logging
|
5
5
|
import os
|
6
|
+
import shutil
|
6
7
|
import tarfile
|
7
8
|
import tempfile
|
8
9
|
import time
|
@@ -23,17 +24,11 @@ from wandb.sdk.launch.registry.elastic_container_registry import (
|
|
23
24
|
from wandb.sdk.launch.registry.google_artifact_registry import GoogleArtifactRegistry
|
24
25
|
from wandb.util import get_module
|
25
26
|
|
26
|
-
from .._project_spec import (
|
27
|
-
EntryPoint,
|
28
|
-
LaunchProject,
|
29
|
-
create_metadata_file,
|
30
|
-
get_entry_point_command,
|
31
|
-
)
|
27
|
+
from .._project_spec import EntryPoint, LaunchProject
|
32
28
|
from ..errors import LaunchError
|
33
29
|
from ..utils import (
|
34
30
|
LOG_PREFIX,
|
35
31
|
get_kube_context_and_api_client,
|
36
|
-
sanitize_wandb_api_key,
|
37
32
|
warn_failed_packages_from_build_logs,
|
38
33
|
)
|
39
34
|
from .build import (
|
@@ -56,6 +51,14 @@ _logger = logging.getLogger(__name__)
|
|
56
51
|
_DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
|
57
52
|
|
58
53
|
SERVICE_ACCOUNT_NAME = os.environ.get("WANDB_LAUNCH_SERVICE_ACCOUNT_NAME", "default")
|
54
|
+
PVC_NAME = os.environ.get("WANDB_LAUNCH_KANIKO_PVC_NAME")
|
55
|
+
PVC_MOUNT_PATH = (
|
56
|
+
os.environ.get("WANDB_LAUNCH_KANIKO_PVC_MOUNT_PATH", "/kaniko").rstrip("/")
|
57
|
+
if PVC_NAME
|
58
|
+
else None
|
59
|
+
)
|
60
|
+
DOCKER_CONFIG_SECRET = os.environ.get("WANDB_LAUNCH_KANIKO_AUTH_SECRET")
|
61
|
+
|
59
62
|
|
60
63
|
if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
|
61
64
|
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
|
@@ -115,12 +118,6 @@ class KanikoBuilder(AbstractBuilder):
|
|
115
118
|
verify (bool, optional): Whether to verify the functionality of the builder.
|
116
119
|
Defaults to True.
|
117
120
|
"""
|
118
|
-
if build_context_store is None:
|
119
|
-
raise LaunchError(
|
120
|
-
"You are required to specify an external build "
|
121
|
-
"context store for Kaniko builds. Please specify a storage url "
|
122
|
-
"in the 'build-context-store' field of your builder config."
|
123
|
-
)
|
124
121
|
self.environment = environment
|
125
122
|
self.registry = registry
|
126
123
|
self.build_job_name = build_job_name
|
@@ -154,13 +151,16 @@ class KanikoBuilder(AbstractBuilder):
|
|
154
151
|
raise LaunchError(
|
155
152
|
"Builder config must include 'type':'kaniko' to create a KanikoBuilder."
|
156
153
|
)
|
157
|
-
build_context_store = config.get("build-context-store")
|
154
|
+
build_context_store = config.get("build-context-store", "")
|
158
155
|
if build_context_store is None:
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
156
|
+
if not PVC_MOUNT_PATH:
|
157
|
+
raise LaunchError(
|
158
|
+
"You must specify a build context store for kaniko builds. "
|
159
|
+
"You can set builder.build-context-store in your agent config "
|
160
|
+
"to a valid s3, gcs, or azure blog storage URI. Or, configure "
|
161
|
+
"a persistent volume claim through the agent helm chart: "
|
162
|
+
"https://github.com/wandb/helm-charts/tree/main/charts/launch-agent"
|
163
|
+
)
|
164
164
|
build_job_name = config.get("build-job-name", "wandb-launch-container-build")
|
165
165
|
secret_name = config.get("secret-name", "")
|
166
166
|
secret_key = config.get("secret-key", "")
|
@@ -170,6 +170,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
170
170
|
image_uri = config.get("destination")
|
171
171
|
if image_uri is not None:
|
172
172
|
registry = registry_from_uri(image_uri)
|
173
|
+
|
173
174
|
return cls(
|
174
175
|
environment,
|
175
176
|
registry,
|
@@ -186,9 +187,8 @@ class KanikoBuilder(AbstractBuilder):
|
|
186
187
|
Raises:
|
187
188
|
LaunchError: If the builder config is invalid.
|
188
189
|
"""
|
189
|
-
if self.
|
190
|
-
|
191
|
-
await self.environment.verify_storage_uri(self.build_context_store)
|
190
|
+
if self.build_context_store:
|
191
|
+
await self.environment.verify_storage_uri(self.build_context_store)
|
192
192
|
|
193
193
|
def login(self) -> None:
|
194
194
|
"""Login to the registry."""
|
@@ -197,8 +197,6 @@ class KanikoBuilder(AbstractBuilder):
|
|
197
197
|
async def _create_docker_ecr_config_map(
|
198
198
|
self, job_name: str, corev1_client: client.CoreV1Api, repository: str
|
199
199
|
) -> None:
|
200
|
-
if self.registry is None:
|
201
|
-
raise LaunchError("No registry specified for Kaniko build.")
|
202
200
|
username, password = await self.registry.get_username_password()
|
203
201
|
encoded = base64.b64encode(f"{username}:{password}".encode()).decode("utf-8")
|
204
202
|
ecr_config_map = client.V1ConfigMap(
|
@@ -235,11 +233,21 @@ class KanikoBuilder(AbstractBuilder):
|
|
235
233
|
with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
|
236
234
|
context_tgz.add(context_path, arcname=".")
|
237
235
|
context_file.close()
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
236
|
+
if PVC_MOUNT_PATH is None:
|
237
|
+
destination = f"{self.build_context_store}/{run_id}.tgz"
|
238
|
+
if self.environment is None:
|
239
|
+
raise LaunchError("No environment specified for Kaniko build.")
|
240
|
+
await self.environment.upload_file(context_file.name, destination)
|
241
|
+
return destination
|
242
|
+
else:
|
243
|
+
destination = f"{PVC_MOUNT_PATH}/{run_id}.tgz"
|
244
|
+
try:
|
245
|
+
shutil.copy(context_file.name, destination)
|
246
|
+
except Exception as e:
|
247
|
+
raise LaunchError(
|
248
|
+
f"Error copying build context to PVC mounted at {PVC_MOUNT_PATH}: {e}"
|
249
|
+
) from e
|
250
|
+
return f"tar:///context/{run_id}.tgz"
|
243
251
|
|
244
252
|
async def build_image(
|
245
253
|
self,
|
@@ -248,9 +256,6 @@ class KanikoBuilder(AbstractBuilder):
|
|
248
256
|
job_tracker: Optional[JobAndRunStatusTracker] = None,
|
249
257
|
) -> str:
|
250
258
|
await self.verify()
|
251
|
-
# TODO: this should probably throw an error if the registry is a local registry
|
252
|
-
if not self.registry:
|
253
|
-
raise LaunchError("No registry specified for Kaniko build.")
|
254
259
|
# kaniko builder doesn't seem to work with a custom user id, need more investigation
|
255
260
|
dockerfile_str = generate_dockerfile(
|
256
261
|
launch_project=launch_project,
|
@@ -262,7 +267,6 @@ class KanikoBuilder(AbstractBuilder):
|
|
262
267
|
image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
|
263
268
|
repo_uri = await self.registry.get_repo_uri()
|
264
269
|
image_uri = repo_uri + ":" + image_tag
|
265
|
-
|
266
270
|
if (
|
267
271
|
not launch_project.build_required()
|
268
272
|
and await self.registry.check_image_exists(image_uri)
|
@@ -271,16 +275,6 @@ class KanikoBuilder(AbstractBuilder):
|
|
271
275
|
|
272
276
|
_logger.info(f"Building image {image_uri}...")
|
273
277
|
|
274
|
-
entry_cmd = " ".join(
|
275
|
-
get_entry_point_command(entrypoint, launch_project.override_args)
|
276
|
-
)
|
277
|
-
|
278
|
-
create_metadata_file(
|
279
|
-
launch_project,
|
280
|
-
image_uri,
|
281
|
-
sanitize_wandb_api_key(entry_cmd),
|
282
|
-
sanitize_wandb_api_key(dockerfile_str),
|
283
|
-
)
|
284
278
|
context_path = _create_docker_build_ctx(launch_project, dockerfile_str)
|
285
279
|
run_id = launch_project.run_id
|
286
280
|
|
@@ -381,6 +375,20 @@ class KanikoBuilder(AbstractBuilder):
|
|
381
375
|
env = []
|
382
376
|
volume_mounts = []
|
383
377
|
volumes = []
|
378
|
+
|
379
|
+
if PVC_MOUNT_PATH:
|
380
|
+
volumes.append(
|
381
|
+
client.V1Volume(
|
382
|
+
name="kaniko-pvc",
|
383
|
+
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
|
384
|
+
claim_name=PVC_NAME
|
385
|
+
),
|
386
|
+
)
|
387
|
+
)
|
388
|
+
volume_mounts.append(
|
389
|
+
client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
|
390
|
+
)
|
391
|
+
|
384
392
|
if bool(self.secret_name) != bool(self.secret_key):
|
385
393
|
raise LaunchError(
|
386
394
|
"Both secret_name and secret_key or neither must be specified "
|
@@ -419,8 +427,27 @@ class KanikoBuilder(AbstractBuilder):
|
|
419
427
|
),
|
420
428
|
)
|
421
429
|
]
|
422
|
-
|
423
|
-
|
430
|
+
if DOCKER_CONFIG_SECRET:
|
431
|
+
volumes.append(
|
432
|
+
client.V1Volume(
|
433
|
+
name="kaniko-docker-config",
|
434
|
+
secret=client.V1SecretVolumeSource(
|
435
|
+
secret_name=DOCKER_CONFIG_SECRET,
|
436
|
+
items=[
|
437
|
+
client.V1KeyToPath(
|
438
|
+
key=".dockerconfigjson", path="config.json"
|
439
|
+
)
|
440
|
+
],
|
441
|
+
),
|
442
|
+
)
|
443
|
+
)
|
444
|
+
volume_mounts.append(
|
445
|
+
client.V1VolumeMount(
|
446
|
+
name="kaniko-docker-config",
|
447
|
+
mount_path="/kaniko/.docker",
|
448
|
+
)
|
449
|
+
)
|
450
|
+
elif self.secret_name and self.secret_key:
|
424
451
|
volumes += [
|
425
452
|
client.V1Volume(
|
426
453
|
name="docker-config",
|
wandb/sdk/launch/create_job.py
CHANGED
@@ -315,39 +315,6 @@ def _create_repo_metadata(
|
|
315
315
|
wandb.termerror(f"Entrypoint {entrypoint} not found in git repo")
|
316
316
|
return None
|
317
317
|
|
318
|
-
# check if requirements.txt exists
|
319
|
-
# start at the location of the python file and recurse up to the git root
|
320
|
-
entrypoint_dir = os.path.dirname(entrypoint)
|
321
|
-
if entrypoint_dir:
|
322
|
-
req_dir = os.path.join(local_dir, entrypoint_dir)
|
323
|
-
else:
|
324
|
-
req_dir = local_dir
|
325
|
-
|
326
|
-
# If there is a Dockerfile.wandb in the starting rec dir, don't require a requirements.txt
|
327
|
-
if os.path.exists(os.path.join(req_dir, "Dockerfile.wandb")):
|
328
|
-
wandb.termlog(
|
329
|
-
f"Using Dockerfile.wandb in {req_dir.replace(tempdir, '') or 'repository root'}"
|
330
|
-
)
|
331
|
-
else:
|
332
|
-
while (
|
333
|
-
not os.path.exists(os.path.join(req_dir, "requirements.txt"))
|
334
|
-
and req_dir != tempdir
|
335
|
-
):
|
336
|
-
req_dir = os.path.dirname(req_dir)
|
337
|
-
|
338
|
-
if not os.path.exists(os.path.join(req_dir, "requirements.txt")):
|
339
|
-
path_with_subdir = os.path.dirname(
|
340
|
-
os.path.join(path or "", entrypoint or "")
|
341
|
-
)
|
342
|
-
wandb.termerror(
|
343
|
-
f"Could not find requirements.txt file in git repo at {path_with_subdir}"
|
344
|
-
)
|
345
|
-
return None
|
346
|
-
|
347
|
-
wandb.termlog(
|
348
|
-
f"Using requirements.txt in {req_dir.replace(tempdir, '') or 'repository root'}"
|
349
|
-
)
|
350
|
-
|
351
318
|
metadata = {
|
352
319
|
"git": {
|
353
320
|
"commit": commit,
|
@@ -366,18 +333,16 @@ def _create_repo_metadata(
|
|
366
333
|
def _create_artifact_metadata(
|
367
334
|
path: str, entrypoint: str, runtime: Optional[str] = None
|
368
335
|
) -> Tuple[Dict[str, Any], List[str]]:
|
369
|
-
if not os.path.exists(path):
|
336
|
+
if not os.path.isdir(path):
|
370
337
|
wandb.termerror("Path must be a valid file or directory")
|
371
338
|
return {}, []
|
372
339
|
|
373
|
-
if not os.path.exists(os.path.join(path, "requirements.txt")):
|
374
|
-
wandb.termerror(f"Could not find requirements.txt file in: {path}")
|
375
|
-
return {}, []
|
376
|
-
|
377
340
|
# read local requirements.txt and dump to temp dir for builder
|
378
341
|
requirements = []
|
379
|
-
|
380
|
-
|
342
|
+
depspath = os.path.join(path, "requirements.txt")
|
343
|
+
if os.path.exists(depspath):
|
344
|
+
with open(depspath) as f:
|
345
|
+
requirements = f.read().splitlines()
|
381
346
|
|
382
347
|
if runtime:
|
383
348
|
python_version = _clean_python_version(runtime)
|
@@ -431,6 +396,7 @@ def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuild
|
|
431
396
|
settings.update({"files_dir": tmpdir, "job_source": job_source})
|
432
397
|
job_builder = JobBuilder(
|
433
398
|
settings=settings, # type: ignore
|
399
|
+
verbose=True,
|
434
400
|
)
|
435
401
|
# never allow notebook runs
|
436
402
|
job_builder._is_notebook_run = False
|
wandb/sdk/launch/loader.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Utilities for the agent."""
|
2
2
|
from typing import Any, Dict, Optional
|
3
3
|
|
4
|
+
import wandb
|
4
5
|
from wandb.apis.internal import Api
|
5
6
|
from wandb.docker import is_docker_installed
|
6
7
|
from wandb.sdk.launch.errors import LaunchError
|
@@ -87,6 +88,15 @@ def registry_from_config(
|
|
87
88
|
from .registry.local_registry import LocalRegistry
|
88
89
|
|
89
90
|
return LocalRegistry() # This is the default, dummy registry.
|
91
|
+
|
92
|
+
wandb.termwarn(
|
93
|
+
"The `registry` block of the launch agent config is being deprecated. "
|
94
|
+
"Please specify an image repository URI under the `builder.destination` "
|
95
|
+
"key of your launch agent config. See "
|
96
|
+
"https://docs.wandb.ai/guides/launch/setup-agent-advanced#agent-configuration "
|
97
|
+
"for more information."
|
98
|
+
)
|
99
|
+
|
90
100
|
registry_type = config.get("type")
|
91
101
|
if registry_type is None or registry_type == "local":
|
92
102
|
from .registry.local_registry import LocalRegistry
|
@@ -0,0 +1,29 @@
|
|
1
|
+
from typing import Tuple
|
2
|
+
|
3
|
+
from wandb.docker import is_docker_installed
|
4
|
+
from wandb.sdk.launch.utils import docker_image_exists
|
5
|
+
|
6
|
+
from .abstract import AbstractRegistry
|
7
|
+
|
8
|
+
|
9
|
+
class AnonynmousRegistry(AbstractRegistry):
|
10
|
+
def __init__(self, uri: str) -> None:
|
11
|
+
"""Initialize the registry."""
|
12
|
+
self.uri = uri
|
13
|
+
|
14
|
+
async def get_username_password(self) -> Tuple[str, str]:
|
15
|
+
"""Get the username and password for the registry."""
|
16
|
+
raise NotImplementedError("Anonymous registry does not require authentication")
|
17
|
+
|
18
|
+
async def get_repo_uri(self) -> str:
|
19
|
+
return self.uri
|
20
|
+
|
21
|
+
async def check_image_exists(self, image_uri: str) -> bool:
|
22
|
+
"""Check if an image exists in the registry."""
|
23
|
+
if not is_docker_installed():
|
24
|
+
return False
|
25
|
+
return docker_image_exists(image_uri)
|
26
|
+
|
27
|
+
@classmethod
|
28
|
+
def from_config(cls, config: dict) -> "AbstractRegistry":
|
29
|
+
return cls(uri=config["uri"])
|
@@ -2,6 +2,7 @@
|
|
2
2
|
import logging
|
3
3
|
from typing import Tuple
|
4
4
|
|
5
|
+
from wandb.docker import is_docker_installed
|
5
6
|
from wandb.sdk.launch.errors import LaunchError
|
6
7
|
from wandb.sdk.launch.utils import docker_image_exists
|
7
8
|
|
@@ -60,4 +61,6 @@ class LocalRegistry(AbstractRegistry):
|
|
60
61
|
Returns:
|
61
62
|
bool: True.
|
62
63
|
"""
|
63
|
-
|
64
|
+
if is_docker_installed():
|
65
|
+
return docker_image_exists(image_uri)
|
66
|
+
return False
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Implementation of KubernetesRunner class for wandb launch."""
|
2
2
|
import asyncio
|
3
3
|
import base64
|
4
|
+
import datetime
|
4
5
|
import json
|
5
6
|
import logging
|
6
7
|
import os
|
@@ -23,6 +24,7 @@ from wandb.sdk.launch.runner.kubernetes_monitor import (
|
|
23
24
|
CustomResource,
|
24
25
|
LaunchKubernetesMonitor,
|
25
26
|
)
|
27
|
+
from wandb.sdk.lib.retry import ExponentialBackoff, retry_async
|
26
28
|
from wandb.util import get_module
|
27
29
|
|
28
30
|
from .._project_spec import EntryPoint, LaunchProject
|
@@ -59,6 +61,7 @@ from kubernetes_asyncio.client.models.v1_secret import ( # type: ignore # noqa:
|
|
59
61
|
from kubernetes_asyncio.client.rest import ApiException # type: ignore # noqa: E402
|
60
62
|
|
61
63
|
TIMEOUT = 5
|
64
|
+
API_KEY_SECRET_MAX_RETRIES = 5
|
62
65
|
|
63
66
|
_logger = logging.getLogger(__name__)
|
64
67
|
|
@@ -421,8 +424,23 @@ class KubernetesRunner(AbstractRunner):
|
|
421
424
|
else:
|
422
425
|
secret_name += f"-{launch_project.run_id}"
|
423
426
|
|
424
|
-
|
425
|
-
|
427
|
+
def handle_exception(e):
|
428
|
+
wandb.termwarn(
|
429
|
+
f"Exception when ensuring Kubernetes API key secret: {e}. Retrying..."
|
430
|
+
)
|
431
|
+
|
432
|
+
api_key_secret = await retry_async(
|
433
|
+
backoff=ExponentialBackoff(
|
434
|
+
initial_sleep=datetime.timedelta(seconds=1),
|
435
|
+
max_sleep=datetime.timedelta(minutes=1),
|
436
|
+
max_retries=API_KEY_SECRET_MAX_RETRIES,
|
437
|
+
),
|
438
|
+
fn=ensure_api_key_secret,
|
439
|
+
on_exc=handle_exception,
|
440
|
+
core_api=core_api,
|
441
|
+
secret_name=secret_name,
|
442
|
+
namespace=namespace,
|
443
|
+
api_key=value,
|
426
444
|
)
|
427
445
|
env.append(
|
428
446
|
{
|
@@ -148,15 +148,14 @@ class LocalContainerRunner(AbstractRunner):
|
|
148
148
|
env_vars["WANDB_BASE_URL"] = "http://host.docker.internal:9001"
|
149
149
|
|
150
150
|
if launch_project.docker_image:
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
)
|
151
|
+
try:
|
152
|
+
pull_docker_image(image_uri)
|
153
|
+
except Exception as e:
|
154
|
+
wandb.termwarn(f"Error attempting to pull docker image {image_uri}")
|
155
|
+
if not docker_image_exists(image_uri):
|
156
|
+
raise LaunchError(
|
157
|
+
f"Failed to pull docker image {image_uri} with error: {e}"
|
158
|
+
)
|
160
159
|
|
161
160
|
assert launch_project.docker_image == image_uri
|
162
161
|
|
@@ -234,7 +233,13 @@ def _thread_process_runner(
|
|
234
233
|
if not chunk:
|
235
234
|
break
|
236
235
|
index = chunk.find(b"\r")
|
237
|
-
decoded_chunk = chunk.decode()
|
236
|
+
decoded_chunk = None
|
237
|
+
while not decoded_chunk:
|
238
|
+
try:
|
239
|
+
decoded_chunk = chunk.decode()
|
240
|
+
except UnicodeDecodeError:
|
241
|
+
# Multi-byte character cut off, try to get the rest of it
|
242
|
+
chunk += os.read(process.stdout.fileno(), 1) # type: ignore
|
238
243
|
if index != -1:
|
239
244
|
run._stdout += decoded_chunk
|
240
245
|
print(chunk.decode(), end="")
|
@@ -1,6 +1,7 @@
|
|
1
1
|
"""Abstract Scheduler class."""
|
2
2
|
import asyncio
|
3
3
|
import base64
|
4
|
+
import copy
|
4
5
|
import logging
|
5
6
|
import os
|
6
7
|
import socket
|
@@ -24,7 +25,10 @@ from wandb.sdk.launch.sweeps.utils import (
|
|
24
25
|
create_sweep_command_args,
|
25
26
|
make_launch_sweep_entrypoint,
|
26
27
|
)
|
27
|
-
from wandb.sdk.launch.utils import event_loop_thread_exec
|
28
|
+
from wandb.sdk.launch.utils import (
|
29
|
+
event_loop_thread_exec,
|
30
|
+
strip_resource_args_and_template_vars,
|
31
|
+
)
|
28
32
|
from wandb.sdk.lib.runid import generate_id
|
29
33
|
|
30
34
|
if TYPE_CHECKING:
|
@@ -658,7 +662,7 @@ class Scheduler(ABC):
|
|
658
662
|
pidx = entry_point.index("${program}")
|
659
663
|
entry_point[pidx] = self._sweep_config["program"]
|
660
664
|
|
661
|
-
launch_config = self._wandb_run.config.get("launch", {})
|
665
|
+
launch_config = copy.deepcopy(self._wandb_run.config.get("launch", {}))
|
662
666
|
if "overrides" not in launch_config:
|
663
667
|
launch_config["overrides"] = {"run_config": {}}
|
664
668
|
launch_config["overrides"]["run_config"].update(args["args_dict"])
|
@@ -694,11 +698,14 @@ class Scheduler(ABC):
|
|
694
698
|
)
|
695
699
|
|
696
700
|
# override resource and args of job
|
697
|
-
_job_launch_config = self._wandb_run.config.get("launch") or {}
|
701
|
+
_job_launch_config = copy.deepcopy(self._wandb_run.config.get("launch")) or {}
|
698
702
|
|
699
703
|
# default priority is "medium"
|
700
704
|
_priority = int(launch_config.get("priority", 2)) # type: ignore
|
701
705
|
|
706
|
+
# strip resource_args and template_variables from launch_config
|
707
|
+
strip_resource_args_and_template_vars(_job_launch_config)
|
708
|
+
|
702
709
|
run_id = run.id or generate_id()
|
703
710
|
queued_run = launch_add(
|
704
711
|
run_id=run_id,
|
@@ -712,6 +719,7 @@ class Scheduler(ABC):
|
|
712
719
|
project_queue=self._project_queue,
|
713
720
|
resource=_job_launch_config.get("resource"),
|
714
721
|
resource_args=_job_launch_config.get("resource_args"),
|
722
|
+
template_variables=_job_launch_config.get("template_variables"),
|
715
723
|
author=self._kwargs.get("author"),
|
716
724
|
sweep_id=self._sweep_id,
|
717
725
|
priority=_priority,
|
wandb/sdk/launch/utils.py
CHANGED
@@ -221,6 +221,17 @@ def get_default_entity(api: Api, launch_config: Optional[Dict[str, Any]]):
|
|
221
221
|
return config_entity or api.default_entity
|
222
222
|
|
223
223
|
|
224
|
+
def strip_resource_args_and_template_vars(launch_spec: Dict[str, Any]) -> None:
|
225
|
+
if launch_spec.get("resource_args", None) and launch_spec.get(
|
226
|
+
"template_variables", None
|
227
|
+
):
|
228
|
+
wandb.termwarn(
|
229
|
+
"Launch spec contains both resource_args and template_variables, "
|
230
|
+
"only one can be set. Using template_variables."
|
231
|
+
)
|
232
|
+
launch_spec.pop("resource_args")
|
233
|
+
|
234
|
+
|
224
235
|
def construct_launch_spec(
|
225
236
|
uri: Optional[str],
|
226
237
|
job: Optional[str],
|
@@ -298,6 +309,9 @@ def construct_launch_spec(
|
|
298
309
|
else:
|
299
310
|
launch_config["registry"] = {"url": repository}
|
300
311
|
|
312
|
+
# dont send both resource args and template variables
|
313
|
+
strip_resource_args_and_template_vars(launch_spec)
|
314
|
+
|
301
315
|
return launch_spec
|
302
316
|
|
303
317
|
|
wandb/sdk/lib/__init__.py
CHANGED
@@ -15,6 +15,7 @@ _Setting = Literal[
|
|
15
15
|
"_aws_lambda",
|
16
16
|
"_async_upload_concurrency_limit",
|
17
17
|
"_cli_only_mode",
|
18
|
+
"_code_path_local",
|
18
19
|
"_colab",
|
19
20
|
"_cuda",
|
20
21
|
"_disable_meta",
|
@@ -101,6 +102,7 @@ _Setting = Literal[
|
|
101
102
|
"entity",
|
102
103
|
"files_dir",
|
103
104
|
"force",
|
105
|
+
"fork_from",
|
104
106
|
"git_commit",
|
105
107
|
"git_remote",
|
106
108
|
"git_remote_url",
|
@@ -191,6 +193,8 @@ SETTINGS_TOPOLOGICALLY_SORTED: Final[Tuple[_Setting, ...]] = (
|
|
191
193
|
"run_id",
|
192
194
|
"start_method",
|
193
195
|
"_aws_lambda",
|
196
|
+
"program",
|
197
|
+
"_code_path_local",
|
194
198
|
"_colab",
|
195
199
|
"_disable_machine_info",
|
196
200
|
"_disable_meta",
|
@@ -227,7 +231,6 @@ SETTINGS_TOPOLOGICALLY_SORTED: Final[Tuple[_Setting, ...]] = (
|
|
227
231
|
"log_symlink_internal",
|
228
232
|
"log_symlink_user",
|
229
233
|
"log_user",
|
230
|
-
"program",
|
231
234
|
"project_url",
|
232
235
|
"resume_fname",
|
233
236
|
"run_url",
|
wandb/sdk/lib/apikey.py
CHANGED
@@ -179,11 +179,6 @@ def write_netrc(host: str, entity: str, key: str) -> Optional[bool]:
|
|
179
179
|
return None
|
180
180
|
try:
|
181
181
|
normalized_host = urlparse(host).netloc.split(":")[0]
|
182
|
-
if normalized_host != "localhost" and "." not in normalized_host:
|
183
|
-
wandb.termerror(
|
184
|
-
f"Host must be a url in the form https://some.address.com, received {host}"
|
185
|
-
)
|
186
|
-
return None
|
187
182
|
netrc_path = get_netrc_file_path()
|
188
183
|
wandb.termlog(
|
189
184
|
f"Appending key for {normalized_host} to your netrc file: {netrc_path}"
|