wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90)
  1. wandb/__init__.py +2 -2
  2. wandb/agents/pyagent.py +1 -1
  3. wandb/apis/importers/__init__.py +1 -4
  4. wandb/apis/importers/internals/internal.py +386 -0
  5. wandb/apis/importers/internals/protocols.py +125 -0
  6. wandb/apis/importers/internals/util.py +78 -0
  7. wandb/apis/importers/mlflow.py +125 -88
  8. wandb/apis/importers/validation.py +108 -0
  9. wandb/apis/importers/wandb.py +1604 -0
  10. wandb/apis/public/api.py +7 -10
  11. wandb/apis/public/artifacts.py +38 -0
  12. wandb/apis/public/files.py +11 -2
  13. wandb/apis/reports/v2/__init__.py +0 -19
  14. wandb/apis/reports/v2/expr_parsing.py +0 -1
  15. wandb/apis/reports/v2/interface.py +15 -18
  16. wandb/apis/reports/v2/internal.py +12 -45
  17. wandb/cli/cli.py +52 -55
  18. wandb/integration/gym/__init__.py +2 -1
  19. wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
  20. wandb/integration/keras/keras.py +6 -4
  21. wandb/integration/kfp/kfp_patch.py +2 -2
  22. wandb/integration/openai/fine_tuning.py +1 -2
  23. wandb/integration/ultralytics/callback.py +0 -1
  24. wandb/proto/v3/wandb_internal_pb2.py +332 -312
  25. wandb/proto/v3/wandb_settings_pb2.py +13 -3
  26. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  27. wandb/proto/v4/wandb_internal_pb2.py +316 -312
  28. wandb/proto/v4/wandb_settings_pb2.py +5 -3
  29. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  30. wandb/sdk/artifacts/artifact.py +75 -31
  31. wandb/sdk/artifacts/artifact_manifest.py +5 -2
  32. wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
  33. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
  34. wandb/sdk/artifacts/artifact_saver.py +19 -47
  35. wandb/sdk/artifacts/storage_handler.py +2 -1
  36. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
  37. wandb/sdk/artifacts/storage_policy.py +4 -1
  38. wandb/sdk/data_types/base_types/wb_value.py +1 -1
  39. wandb/sdk/data_types/image.py +2 -2
  40. wandb/sdk/interface/interface.py +49 -13
  41. wandb/sdk/interface/interface_shared.py +17 -11
  42. wandb/sdk/internal/file_stream.py +20 -1
  43. wandb/sdk/internal/handler.py +1 -4
  44. wandb/sdk/internal/internal_api.py +3 -1
  45. wandb/sdk/internal/job_builder.py +49 -19
  46. wandb/sdk/internal/profiler.py +1 -1
  47. wandb/sdk/internal/sender.py +96 -124
  48. wandb/sdk/internal/sender_config.py +197 -0
  49. wandb/sdk/internal/settings_static.py +9 -0
  50. wandb/sdk/internal/system/system_info.py +5 -3
  51. wandb/sdk/internal/update.py +1 -1
  52. wandb/sdk/launch/_launch.py +3 -3
  53. wandb/sdk/launch/_launch_add.py +28 -29
  54. wandb/sdk/launch/_project_spec.py +148 -136
  55. wandb/sdk/launch/agent/agent.py +3 -7
  56. wandb/sdk/launch/agent/config.py +0 -27
  57. wandb/sdk/launch/builder/build.py +54 -28
  58. wandb/sdk/launch/builder/docker_builder.py +4 -15
  59. wandb/sdk/launch/builder/kaniko_builder.py +72 -45
  60. wandb/sdk/launch/create_job.py +6 -40
  61. wandb/sdk/launch/loader.py +10 -0
  62. wandb/sdk/launch/registry/anon.py +29 -0
  63. wandb/sdk/launch/registry/local_registry.py +4 -1
  64. wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
  65. wandb/sdk/launch/runner/local_container.py +15 -10
  66. wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
  67. wandb/sdk/launch/sweeps/scheduler.py +11 -3
  68. wandb/sdk/launch/utils.py +14 -0
  69. wandb/sdk/lib/__init__.py +2 -5
  70. wandb/sdk/lib/_settings_toposort_generated.py +4 -1
  71. wandb/sdk/lib/apikey.py +0 -5
  72. wandb/sdk/lib/config_util.py +0 -31
  73. wandb/sdk/lib/filesystem.py +11 -1
  74. wandb/sdk/lib/run_moment.py +72 -0
  75. wandb/sdk/service/service.py +7 -2
  76. wandb/sdk/service/streams.py +1 -6
  77. wandb/sdk/verify/verify.py +2 -1
  78. wandb/sdk/wandb_init.py +12 -1
  79. wandb/sdk/wandb_login.py +43 -26
  80. wandb/sdk/wandb_run.py +164 -110
  81. wandb/sdk/wandb_settings.py +58 -16
  82. wandb/testing/relay.py +5 -6
  83. wandb/util.py +50 -7
  84. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
  85. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
  86. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
  87. wandb/apis/importers/base.py +0 -400
  88. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
  89. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
  90. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
@@ -11,18 +11,13 @@ from wandb.sdk.launch.builder.build import registry_from_uri
11
11
  from wandb.sdk.launch.environment.abstract import AbstractEnvironment
12
12
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
13
13
 
14
- from .._project_spec import (
15
- EntryPoint,
16
- LaunchProject,
17
- create_metadata_file,
18
- get_entry_point_command,
19
- )
14
+ from .._project_spec import EntryPoint, LaunchProject
20
15
  from ..errors import LaunchDockerError, LaunchError
16
+ from ..registry.anon import AnonynmousRegistry
21
17
  from ..registry.local_registry import LocalRegistry
22
18
  from ..utils import (
23
19
  LOG_PREFIX,
24
20
  event_loop_thread_exec,
25
- sanitize_wandb_api_key,
26
21
  warn_failed_packages_from_build_logs,
27
22
  )
28
23
  from .build import (
@@ -106,6 +101,8 @@ class DockerBuilder(AbstractBuilder):
106
101
  """Login to the registry."""
107
102
  if isinstance(self.registry, LocalRegistry):
108
103
  _logger.info(f"{LOG_PREFIX}No registry configured, skipping login.")
104
+ elif isinstance(self.registry, AnonynmousRegistry):
105
+ _logger.info(f"{LOG_PREFIX}Anonymous registry, skipping login.")
109
106
  else:
110
107
  username, password = await self.registry.get_username_password()
111
108
  login = event_loop_thread_exec(docker.login)
@@ -155,14 +152,6 @@ class DockerBuilder(AbstractBuilder):
155
152
  f"image {image_uri} does not already exist in repository, building."
156
153
  )
157
154
 
158
- entry_cmd = get_entry_point_command(entrypoint, launch_project.override_args)
159
-
160
- create_metadata_file(
161
- launch_project,
162
- image_uri,
163
- sanitize_wandb_api_key(" ".join(entry_cmd)),
164
- dockerfile_str,
165
- )
166
155
  build_ctx_path = _create_docker_build_ctx(launch_project, dockerfile_str)
167
156
  dockerfile = os.path.join(build_ctx_path, _WANDB_DOCKERFILE_NAME)
168
157
  try:
@@ -3,6 +3,7 @@ import base64
3
3
  import json
4
4
  import logging
5
5
  import os
6
+ import shutil
6
7
  import tarfile
7
8
  import tempfile
8
9
  import time
@@ -23,17 +24,11 @@ from wandb.sdk.launch.registry.elastic_container_registry import (
23
24
  from wandb.sdk.launch.registry.google_artifact_registry import GoogleArtifactRegistry
24
25
  from wandb.util import get_module
25
26
 
26
- from .._project_spec import (
27
- EntryPoint,
28
- LaunchProject,
29
- create_metadata_file,
30
- get_entry_point_command,
31
- )
27
+ from .._project_spec import EntryPoint, LaunchProject
32
28
  from ..errors import LaunchError
33
29
  from ..utils import (
34
30
  LOG_PREFIX,
35
31
  get_kube_context_and_api_client,
36
- sanitize_wandb_api_key,
37
32
  warn_failed_packages_from_build_logs,
38
33
  )
39
34
  from .build import (
@@ -56,6 +51,14 @@ _logger = logging.getLogger(__name__)
56
51
  _DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
57
52
 
58
53
  SERVICE_ACCOUNT_NAME = os.environ.get("WANDB_LAUNCH_SERVICE_ACCOUNT_NAME", "default")
54
+ PVC_NAME = os.environ.get("WANDB_LAUNCH_KANIKO_PVC_NAME")
55
+ PVC_MOUNT_PATH = (
56
+ os.environ.get("WANDB_LAUNCH_KANIKO_PVC_MOUNT_PATH", "/kaniko").rstrip("/")
57
+ if PVC_NAME
58
+ else None
59
+ )
60
+ DOCKER_CONFIG_SECRET = os.environ.get("WANDB_LAUNCH_KANIKO_AUTH_SECRET")
61
+
59
62
 
60
63
  if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
61
64
  with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
@@ -115,12 +118,6 @@ class KanikoBuilder(AbstractBuilder):
115
118
  verify (bool, optional): Whether to verify the functionality of the builder.
116
119
  Defaults to True.
117
120
  """
118
- if build_context_store is None:
119
- raise LaunchError(
120
- "You are required to specify an external build "
121
- "context store for Kaniko builds. Please specify a storage url "
122
- "in the 'build-context-store' field of your builder config."
123
- )
124
121
  self.environment = environment
125
122
  self.registry = registry
126
123
  self.build_job_name = build_job_name
@@ -154,13 +151,16 @@ class KanikoBuilder(AbstractBuilder):
154
151
  raise LaunchError(
155
152
  "Builder config must include 'type':'kaniko' to create a KanikoBuilder."
156
153
  )
157
- build_context_store = config.get("build-context-store")
154
+ build_context_store = config.get("build-context-store", "")
158
155
  if build_context_store is None:
159
- raise LaunchError(
160
- "You are required to specify an external build "
161
- "context store for Kaniko builds. Please specify a "
162
- "storage url in the 'build_context_store' field of your builder config."
163
- )
156
+ if not PVC_MOUNT_PATH:
157
+ raise LaunchError(
158
+ "You must specify a build context store for kaniko builds. "
159
+ "You can set builder.build-context-store in your agent config "
160
+ "to a valid s3, gcs, or azure blog storage URI. Or, configure "
161
+ "a persistent volume claim through the agent helm chart: "
162
+ "https://github.com/wandb/helm-charts/tree/main/charts/launch-agent"
163
+ )
164
164
  build_job_name = config.get("build-job-name", "wandb-launch-container-build")
165
165
  secret_name = config.get("secret-name", "")
166
166
  secret_key = config.get("secret-key", "")
@@ -170,6 +170,7 @@ class KanikoBuilder(AbstractBuilder):
170
170
  image_uri = config.get("destination")
171
171
  if image_uri is not None:
172
172
  registry = registry_from_uri(image_uri)
173
+
173
174
  return cls(
174
175
  environment,
175
176
  registry,
@@ -186,9 +187,8 @@ class KanikoBuilder(AbstractBuilder):
186
187
  Raises:
187
188
  LaunchError: If the builder config is invalid.
188
189
  """
189
- if self.environment is None:
190
- raise LaunchError("No environment specified for Kaniko build.")
191
- await self.environment.verify_storage_uri(self.build_context_store)
190
+ if self.build_context_store:
191
+ await self.environment.verify_storage_uri(self.build_context_store)
192
192
 
193
193
  def login(self) -> None:
194
194
  """Login to the registry."""
@@ -197,8 +197,6 @@ class KanikoBuilder(AbstractBuilder):
197
197
  async def _create_docker_ecr_config_map(
198
198
  self, job_name: str, corev1_client: client.CoreV1Api, repository: str
199
199
  ) -> None:
200
- if self.registry is None:
201
- raise LaunchError("No registry specified for Kaniko build.")
202
200
  username, password = await self.registry.get_username_password()
203
201
  encoded = base64.b64encode(f"{username}:{password}".encode()).decode("utf-8")
204
202
  ecr_config_map = client.V1ConfigMap(
@@ -235,11 +233,21 @@ class KanikoBuilder(AbstractBuilder):
235
233
  with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
236
234
  context_tgz.add(context_path, arcname=".")
237
235
  context_file.close()
238
- destination = f"{self.build_context_store}/{run_id}.tgz"
239
- if self.environment is None:
240
- raise LaunchError("No environment specified for Kaniko build.")
241
- await self.environment.upload_file(context_file.name, destination)
242
- return destination
236
+ if PVC_MOUNT_PATH is None:
237
+ destination = f"{self.build_context_store}/{run_id}.tgz"
238
+ if self.environment is None:
239
+ raise LaunchError("No environment specified for Kaniko build.")
240
+ await self.environment.upload_file(context_file.name, destination)
241
+ return destination
242
+ else:
243
+ destination = f"{PVC_MOUNT_PATH}/{run_id}.tgz"
244
+ try:
245
+ shutil.copy(context_file.name, destination)
246
+ except Exception as e:
247
+ raise LaunchError(
248
+ f"Error copying build context to PVC mounted at {PVC_MOUNT_PATH}: {e}"
249
+ ) from e
250
+ return f"tar:///context/{run_id}.tgz"
243
251
 
244
252
  async def build_image(
245
253
  self,
@@ -248,9 +256,6 @@ class KanikoBuilder(AbstractBuilder):
248
256
  job_tracker: Optional[JobAndRunStatusTracker] = None,
249
257
  ) -> str:
250
258
  await self.verify()
251
- # TODO: this should probably throw an error if the registry is a local registry
252
- if not self.registry:
253
- raise LaunchError("No registry specified for Kaniko build.")
254
259
  # kaniko builder doesn't seem to work with a custom user id, need more investigation
255
260
  dockerfile_str = generate_dockerfile(
256
261
  launch_project=launch_project,
@@ -262,7 +267,6 @@ class KanikoBuilder(AbstractBuilder):
262
267
  image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
263
268
  repo_uri = await self.registry.get_repo_uri()
264
269
  image_uri = repo_uri + ":" + image_tag
265
-
266
270
  if (
267
271
  not launch_project.build_required()
268
272
  and await self.registry.check_image_exists(image_uri)
@@ -271,16 +275,6 @@ class KanikoBuilder(AbstractBuilder):
271
275
 
272
276
  _logger.info(f"Building image {image_uri}...")
273
277
 
274
- entry_cmd = " ".join(
275
- get_entry_point_command(entrypoint, launch_project.override_args)
276
- )
277
-
278
- create_metadata_file(
279
- launch_project,
280
- image_uri,
281
- sanitize_wandb_api_key(entry_cmd),
282
- sanitize_wandb_api_key(dockerfile_str),
283
- )
284
278
  context_path = _create_docker_build_ctx(launch_project, dockerfile_str)
285
279
  run_id = launch_project.run_id
286
280
 
@@ -381,6 +375,20 @@ class KanikoBuilder(AbstractBuilder):
381
375
  env = []
382
376
  volume_mounts = []
383
377
  volumes = []
378
+
379
+ if PVC_MOUNT_PATH:
380
+ volumes.append(
381
+ client.V1Volume(
382
+ name="kaniko-pvc",
383
+ persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
384
+ claim_name=PVC_NAME
385
+ ),
386
+ )
387
+ )
388
+ volume_mounts.append(
389
+ client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
390
+ )
391
+
384
392
  if bool(self.secret_name) != bool(self.secret_key):
385
393
  raise LaunchError(
386
394
  "Both secret_name and secret_key or neither must be specified "
@@ -419,8 +427,27 @@ class KanikoBuilder(AbstractBuilder):
419
427
  ),
420
428
  )
421
429
  ]
422
-
423
- if self.secret_name and self.secret_key:
430
+ if DOCKER_CONFIG_SECRET:
431
+ volumes.append(
432
+ client.V1Volume(
433
+ name="kaniko-docker-config",
434
+ secret=client.V1SecretVolumeSource(
435
+ secret_name=DOCKER_CONFIG_SECRET,
436
+ items=[
437
+ client.V1KeyToPath(
438
+ key=".dockerconfigjson", path="config.json"
439
+ )
440
+ ],
441
+ ),
442
+ )
443
+ )
444
+ volume_mounts.append(
445
+ client.V1VolumeMount(
446
+ name="kaniko-docker-config",
447
+ mount_path="/kaniko/.docker",
448
+ )
449
+ )
450
+ elif self.secret_name and self.secret_key:
424
451
  volumes += [
425
452
  client.V1Volume(
426
453
  name="docker-config",
@@ -315,39 +315,6 @@ def _create_repo_metadata(
315
315
  wandb.termerror(f"Entrypoint {entrypoint} not found in git repo")
316
316
  return None
317
317
 
318
- # check if requirements.txt exists
319
- # start at the location of the python file and recurse up to the git root
320
- entrypoint_dir = os.path.dirname(entrypoint)
321
- if entrypoint_dir:
322
- req_dir = os.path.join(local_dir, entrypoint_dir)
323
- else:
324
- req_dir = local_dir
325
-
326
- # If there is a Dockerfile.wandb in the starting rec dir, don't require a requirements.txt
327
- if os.path.exists(os.path.join(req_dir, "Dockerfile.wandb")):
328
- wandb.termlog(
329
- f"Using Dockerfile.wandb in {req_dir.replace(tempdir, '') or 'repository root'}"
330
- )
331
- else:
332
- while (
333
- not os.path.exists(os.path.join(req_dir, "requirements.txt"))
334
- and req_dir != tempdir
335
- ):
336
- req_dir = os.path.dirname(req_dir)
337
-
338
- if not os.path.exists(os.path.join(req_dir, "requirements.txt")):
339
- path_with_subdir = os.path.dirname(
340
- os.path.join(path or "", entrypoint or "")
341
- )
342
- wandb.termerror(
343
- f"Could not find requirements.txt file in git repo at {path_with_subdir}"
344
- )
345
- return None
346
-
347
- wandb.termlog(
348
- f"Using requirements.txt in {req_dir.replace(tempdir, '') or 'repository root'}"
349
- )
350
-
351
318
  metadata = {
352
319
  "git": {
353
320
  "commit": commit,
@@ -366,18 +333,16 @@ def _create_repo_metadata(
366
333
  def _create_artifact_metadata(
367
334
  path: str, entrypoint: str, runtime: Optional[str] = None
368
335
  ) -> Tuple[Dict[str, Any], List[str]]:
369
- if not os.path.exists(path):
336
+ if not os.path.isdir(path):
370
337
  wandb.termerror("Path must be a valid file or directory")
371
338
  return {}, []
372
339
 
373
- if not os.path.exists(os.path.join(path, "requirements.txt")):
374
- wandb.termerror(f"Could not find requirements.txt file in: {path}")
375
- return {}, []
376
-
377
340
  # read local requirements.txt and dump to temp dir for builder
378
341
  requirements = []
379
- with open(os.path.join(path, "requirements.txt")) as f:
380
- requirements = f.read().splitlines()
342
+ depspath = os.path.join(path, "requirements.txt")
343
+ if os.path.exists(depspath):
344
+ with open(depspath) as f:
345
+ requirements = f.read().splitlines()
381
346
 
382
347
  if runtime:
383
348
  python_version = _clean_python_version(runtime)
@@ -431,6 +396,7 @@ def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuild
431
396
  settings.update({"files_dir": tmpdir, "job_source": job_source})
432
397
  job_builder = JobBuilder(
433
398
  settings=settings, # type: ignore
399
+ verbose=True,
434
400
  )
435
401
  # never allow notebook runs
436
402
  job_builder._is_notebook_run = False
@@ -1,6 +1,7 @@
1
1
  """Utilities for the agent."""
2
2
  from typing import Any, Dict, Optional
3
3
 
4
+ import wandb
4
5
  from wandb.apis.internal import Api
5
6
  from wandb.docker import is_docker_installed
6
7
  from wandb.sdk.launch.errors import LaunchError
@@ -87,6 +88,15 @@ def registry_from_config(
87
88
  from .registry.local_registry import LocalRegistry
88
89
 
89
90
  return LocalRegistry() # This is the default, dummy registry.
91
+
92
+ wandb.termwarn(
93
+ "The `registry` block of the launch agent config is being deprecated. "
94
+ "Please specify an image repository URI under the `builder.destination` "
95
+ "key of your launch agent config. See "
96
+ "https://docs.wandb.ai/guides/launch/setup-agent-advanced#agent-configuration "
97
+ "for more information."
98
+ )
99
+
90
100
  registry_type = config.get("type")
91
101
  if registry_type is None or registry_type == "local":
92
102
  from .registry.local_registry import LocalRegistry
@@ -0,0 +1,29 @@
1
+ from typing import Tuple
2
+
3
+ from wandb.docker import is_docker_installed
4
+ from wandb.sdk.launch.utils import docker_image_exists
5
+
6
+ from .abstract import AbstractRegistry
7
+
8
+
9
+ class AnonynmousRegistry(AbstractRegistry):
10
+ def __init__(self, uri: str) -> None:
11
+ """Initialize the registry."""
12
+ self.uri = uri
13
+
14
+ async def get_username_password(self) -> Tuple[str, str]:
15
+ """Get the username and password for the registry."""
16
+ raise NotImplementedError("Anonymous registry does not require authentication")
17
+
18
+ async def get_repo_uri(self) -> str:
19
+ return self.uri
20
+
21
+ async def check_image_exists(self, image_uri: str) -> bool:
22
+ """Check if an image exists in the registry."""
23
+ if not is_docker_installed():
24
+ return False
25
+ return docker_image_exists(image_uri)
26
+
27
+ @classmethod
28
+ def from_config(cls, config: dict) -> "AbstractRegistry":
29
+ return cls(uri=config["uri"])
@@ -2,6 +2,7 @@
2
2
  import logging
3
3
  from typing import Tuple
4
4
 
5
+ from wandb.docker import is_docker_installed
5
6
  from wandb.sdk.launch.errors import LaunchError
6
7
  from wandb.sdk.launch.utils import docker_image_exists
7
8
 
@@ -60,4 +61,6 @@ class LocalRegistry(AbstractRegistry):
60
61
  Returns:
61
62
  bool: Result of docker_image_exists(image_uri) when Docker is installed, otherwise False.
62
63
  """
63
- return docker_image_exists(image_uri)
64
+ if is_docker_installed():
65
+ return docker_image_exists(image_uri)
66
+ return False
@@ -1,6 +1,7 @@
1
1
  """Implementation of KubernetesRunner class for wandb launch."""
2
2
  import asyncio
3
3
  import base64
4
+ import datetime
4
5
  import json
5
6
  import logging
6
7
  import os
@@ -23,6 +24,7 @@ from wandb.sdk.launch.runner.kubernetes_monitor import (
23
24
  CustomResource,
24
25
  LaunchKubernetesMonitor,
25
26
  )
27
+ from wandb.sdk.lib.retry import ExponentialBackoff, retry_async
26
28
  from wandb.util import get_module
27
29
 
28
30
  from .._project_spec import EntryPoint, LaunchProject
@@ -59,6 +61,7 @@ from kubernetes_asyncio.client.models.v1_secret import ( # type: ignore # noqa:
59
61
  from kubernetes_asyncio.client.rest import ApiException # type: ignore # noqa: E402
60
62
 
61
63
  TIMEOUT = 5
64
+ API_KEY_SECRET_MAX_RETRIES = 5
62
65
 
63
66
  _logger = logging.getLogger(__name__)
64
67
 
@@ -421,8 +424,23 @@ class KubernetesRunner(AbstractRunner):
421
424
  else:
422
425
  secret_name += f"-{launch_project.run_id}"
423
426
 
424
- api_key_secret = await ensure_api_key_secret(
425
- core_api, secret_name, namespace, value
427
+ def handle_exception(e):
428
+ wandb.termwarn(
429
+ f"Exception when ensuring Kubernetes API key secret: {e}. Retrying..."
430
+ )
431
+
432
+ api_key_secret = await retry_async(
433
+ backoff=ExponentialBackoff(
434
+ initial_sleep=datetime.timedelta(seconds=1),
435
+ max_sleep=datetime.timedelta(minutes=1),
436
+ max_retries=API_KEY_SECRET_MAX_RETRIES,
437
+ ),
438
+ fn=ensure_api_key_secret,
439
+ on_exc=handle_exception,
440
+ core_api=core_api,
441
+ secret_name=secret_name,
442
+ namespace=namespace,
443
+ api_key=value,
426
444
  )
427
445
  env.append(
428
446
  {
@@ -148,15 +148,14 @@ class LocalContainerRunner(AbstractRunner):
148
148
  env_vars["WANDB_BASE_URL"] = "http://host.docker.internal:9001"
149
149
 
150
150
  if launch_project.docker_image:
151
- if image_uri.endswith(":latest") or not docker_image_exists(image_uri):
152
- try:
153
- pull_docker_image(image_uri)
154
- except Exception as e:
155
- wandb.termwarn(f"Error attempting to pull docker image {image_uri}")
156
- if not docker_image_exists(image_uri):
157
- raise LaunchError(
158
- f"Failed to pull docker image {image_uri} with error: {e}"
159
- )
151
+ try:
152
+ pull_docker_image(image_uri)
153
+ except Exception as e:
154
+ wandb.termwarn(f"Error attempting to pull docker image {image_uri}")
155
+ if not docker_image_exists(image_uri):
156
+ raise LaunchError(
157
+ f"Failed to pull docker image {image_uri} with error: {e}"
158
+ )
160
159
 
161
160
  assert launch_project.docker_image == image_uri
162
161
 
@@ -234,7 +233,13 @@ def _thread_process_runner(
234
233
  if not chunk:
235
234
  break
236
235
  index = chunk.find(b"\r")
237
- decoded_chunk = chunk.decode()
236
+ decoded_chunk = None
237
+ while not decoded_chunk:
238
+ try:
239
+ decoded_chunk = chunk.decode()
240
+ except UnicodeDecodeError:
241
+ # Multi-byte character cut off, try to get the rest of it
242
+ chunk += os.read(process.stdout.fileno(), 1) # type: ignore
238
243
  if index != -1:
239
244
  run._stdout += decoded_chunk
240
245
  print(chunk.decode(), end="")
@@ -239,7 +239,7 @@ class SageMakerRunner(AbstractRunner):
239
239
  launch_project,
240
240
  self._api,
241
241
  role_arn,
242
- launch_project.override_entrypoint,
242
+ entry_point,
243
243
  launch_project.override_args,
244
244
  MAX_ENV_LENGTHS[self.__class__.__name__],
245
245
  image_uri,
@@ -1,6 +1,7 @@
1
1
  """Abstract Scheduler class."""
2
2
  import asyncio
3
3
  import base64
4
+ import copy
4
5
  import logging
5
6
  import os
6
7
  import socket
@@ -24,7 +25,10 @@ from wandb.sdk.launch.sweeps.utils import (
24
25
  create_sweep_command_args,
25
26
  make_launch_sweep_entrypoint,
26
27
  )
27
- from wandb.sdk.launch.utils import event_loop_thread_exec
28
+ from wandb.sdk.launch.utils import (
29
+ event_loop_thread_exec,
30
+ strip_resource_args_and_template_vars,
31
+ )
28
32
  from wandb.sdk.lib.runid import generate_id
29
33
 
30
34
  if TYPE_CHECKING:
@@ -658,7 +662,7 @@ class Scheduler(ABC):
658
662
  pidx = entry_point.index("${program}")
659
663
  entry_point[pidx] = self._sweep_config["program"]
660
664
 
661
- launch_config = self._wandb_run.config.get("launch", {})
665
+ launch_config = copy.deepcopy(self._wandb_run.config.get("launch", {}))
662
666
  if "overrides" not in launch_config:
663
667
  launch_config["overrides"] = {"run_config": {}}
664
668
  launch_config["overrides"]["run_config"].update(args["args_dict"])
@@ -694,11 +698,14 @@ class Scheduler(ABC):
694
698
  )
695
699
 
696
700
  # override resource and args of job
697
- _job_launch_config = self._wandb_run.config.get("launch") or {}
701
+ _job_launch_config = copy.deepcopy(self._wandb_run.config.get("launch")) or {}
698
702
 
699
703
  # default priority is "medium"
700
704
  _priority = int(launch_config.get("priority", 2)) # type: ignore
701
705
 
706
+ # strip resource_args and template_variables from launch_config
707
+ strip_resource_args_and_template_vars(_job_launch_config)
708
+
702
709
  run_id = run.id or generate_id()
703
710
  queued_run = launch_add(
704
711
  run_id=run_id,
@@ -712,6 +719,7 @@ class Scheduler(ABC):
712
719
  project_queue=self._project_queue,
713
720
  resource=_job_launch_config.get("resource"),
714
721
  resource_args=_job_launch_config.get("resource_args"),
722
+ template_variables=_job_launch_config.get("template_variables"),
715
723
  author=self._kwargs.get("author"),
716
724
  sweep_id=self._sweep_id,
717
725
  priority=_priority,
wandb/sdk/launch/utils.py CHANGED
@@ -221,6 +221,17 @@ def get_default_entity(api: Api, launch_config: Optional[Dict[str, Any]]):
221
221
  return config_entity or api.default_entity
222
222
 
223
223
 
224
+ def strip_resource_args_and_template_vars(launch_spec: Dict[str, Any]) -> None:
225
+ if launch_spec.get("resource_args", None) and launch_spec.get(
226
+ "template_variables", None
227
+ ):
228
+ wandb.termwarn(
229
+ "Launch spec contains both resource_args and template_variables, "
230
+ "only one can be set. Using template_variables."
231
+ )
232
+ launch_spec.pop("resource_args")
233
+
234
+
224
235
  def construct_launch_spec(
225
236
  uri: Optional[str],
226
237
  job: Optional[str],
@@ -298,6 +309,9 @@ def construct_launch_spec(
298
309
  else:
299
310
  launch_config["registry"] = {"url": repository}
300
311
 
312
+ # don't send both resource args and template variables
313
+ strip_resource_args_and_template_vars(launch_spec)
314
+
301
315
  return launch_spec
302
316
 
303
317
 
wandb/sdk/lib/__init__.py CHANGED
@@ -1,8 +1,5 @@
1
1
  from . import lazyloader
2
2
  from .disabled import RunDisabled, SummaryDisabled
3
+ from .run_moment import RunMoment
3
4
 
4
- __all__ = (
5
- "lazyloader",
6
- "RunDisabled",
7
- "SummaryDisabled",
8
- )
5
+ __all__ = ("lazyloader", "RunDisabled", "SummaryDisabled", "RunMoment")
@@ -15,6 +15,7 @@ _Setting = Literal[
15
15
  "_aws_lambda",
16
16
  "_async_upload_concurrency_limit",
17
17
  "_cli_only_mode",
18
+ "_code_path_local",
18
19
  "_colab",
19
20
  "_cuda",
20
21
  "_disable_meta",
@@ -101,6 +102,7 @@ _Setting = Literal[
101
102
  "entity",
102
103
  "files_dir",
103
104
  "force",
105
+ "fork_from",
104
106
  "git_commit",
105
107
  "git_remote",
106
108
  "git_remote_url",
@@ -191,6 +193,8 @@ SETTINGS_TOPOLOGICALLY_SORTED: Final[Tuple[_Setting, ...]] = (
191
193
  "run_id",
192
194
  "start_method",
193
195
  "_aws_lambda",
196
+ "program",
197
+ "_code_path_local",
194
198
  "_colab",
195
199
  "_disable_machine_info",
196
200
  "_disable_meta",
@@ -227,7 +231,6 @@ SETTINGS_TOPOLOGICALLY_SORTED: Final[Tuple[_Setting, ...]] = (
227
231
  "log_symlink_internal",
228
232
  "log_symlink_user",
229
233
  "log_user",
230
- "program",
231
234
  "project_url",
232
235
  "resume_fname",
233
236
  "run_url",
wandb/sdk/lib/apikey.py CHANGED
@@ -179,11 +179,6 @@ def write_netrc(host: str, entity: str, key: str) -> Optional[bool]:
179
179
  return None
180
180
  try:
181
181
  normalized_host = urlparse(host).netloc.split(":")[0]
182
- if normalized_host != "localhost" and "." not in normalized_host:
183
- wandb.termerror(
184
- f"Host must be a url in the form https://some.address.com, received {host}"
185
- )
186
- return None
187
182
  netrc_path = get_netrc_file_path()
188
183
  wandb.termlog(
189
184
  f"Appending key for {normalized_host} to your netrc file: {netrc_path}"