wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. wandb/__init__.py +2 -2
  2. wandb/agents/pyagent.py +1 -1
  3. wandb/apis/importers/__init__.py +1 -4
  4. wandb/apis/importers/internals/internal.py +386 -0
  5. wandb/apis/importers/internals/protocols.py +125 -0
  6. wandb/apis/importers/internals/util.py +78 -0
  7. wandb/apis/importers/mlflow.py +125 -88
  8. wandb/apis/importers/validation.py +108 -0
  9. wandb/apis/importers/wandb.py +1604 -0
  10. wandb/apis/public/api.py +7 -10
  11. wandb/apis/public/artifacts.py +38 -0
  12. wandb/apis/public/files.py +11 -2
  13. wandb/apis/reports/v2/__init__.py +0 -19
  14. wandb/apis/reports/v2/expr_parsing.py +0 -1
  15. wandb/apis/reports/v2/interface.py +15 -18
  16. wandb/apis/reports/v2/internal.py +12 -45
  17. wandb/cli/cli.py +52 -55
  18. wandb/integration/gym/__init__.py +2 -1
  19. wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
  20. wandb/integration/keras/keras.py +6 -4
  21. wandb/integration/kfp/kfp_patch.py +2 -2
  22. wandb/integration/openai/fine_tuning.py +1 -2
  23. wandb/integration/ultralytics/callback.py +0 -1
  24. wandb/proto/v3/wandb_internal_pb2.py +332 -312
  25. wandb/proto/v3/wandb_settings_pb2.py +13 -3
  26. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  27. wandb/proto/v4/wandb_internal_pb2.py +316 -312
  28. wandb/proto/v4/wandb_settings_pb2.py +5 -3
  29. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  30. wandb/sdk/artifacts/artifact.py +75 -31
  31. wandb/sdk/artifacts/artifact_manifest.py +5 -2
  32. wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
  33. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
  34. wandb/sdk/artifacts/artifact_saver.py +19 -47
  35. wandb/sdk/artifacts/storage_handler.py +2 -1
  36. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
  37. wandb/sdk/artifacts/storage_policy.py +4 -1
  38. wandb/sdk/data_types/base_types/wb_value.py +1 -1
  39. wandb/sdk/data_types/image.py +2 -2
  40. wandb/sdk/interface/interface.py +49 -13
  41. wandb/sdk/interface/interface_shared.py +17 -11
  42. wandb/sdk/internal/file_stream.py +20 -1
  43. wandb/sdk/internal/handler.py +1 -4
  44. wandb/sdk/internal/internal_api.py +3 -1
  45. wandb/sdk/internal/job_builder.py +49 -19
  46. wandb/sdk/internal/profiler.py +1 -1
  47. wandb/sdk/internal/sender.py +96 -124
  48. wandb/sdk/internal/sender_config.py +197 -0
  49. wandb/sdk/internal/settings_static.py +9 -0
  50. wandb/sdk/internal/system/system_info.py +5 -3
  51. wandb/sdk/internal/update.py +1 -1
  52. wandb/sdk/launch/_launch.py +3 -3
  53. wandb/sdk/launch/_launch_add.py +28 -29
  54. wandb/sdk/launch/_project_spec.py +148 -136
  55. wandb/sdk/launch/agent/agent.py +3 -7
  56. wandb/sdk/launch/agent/config.py +0 -27
  57. wandb/sdk/launch/builder/build.py +54 -28
  58. wandb/sdk/launch/builder/docker_builder.py +4 -15
  59. wandb/sdk/launch/builder/kaniko_builder.py +72 -45
  60. wandb/sdk/launch/create_job.py +6 -40
  61. wandb/sdk/launch/loader.py +10 -0
  62. wandb/sdk/launch/registry/anon.py +29 -0
  63. wandb/sdk/launch/registry/local_registry.py +4 -1
  64. wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
  65. wandb/sdk/launch/runner/local_container.py +15 -10
  66. wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
  67. wandb/sdk/launch/sweeps/scheduler.py +11 -3
  68. wandb/sdk/launch/utils.py +14 -0
  69. wandb/sdk/lib/__init__.py +2 -5
  70. wandb/sdk/lib/_settings_toposort_generated.py +4 -1
  71. wandb/sdk/lib/apikey.py +0 -5
  72. wandb/sdk/lib/config_util.py +0 -31
  73. wandb/sdk/lib/filesystem.py +11 -1
  74. wandb/sdk/lib/run_moment.py +72 -0
  75. wandb/sdk/service/service.py +7 -2
  76. wandb/sdk/service/streams.py +1 -6
  77. wandb/sdk/verify/verify.py +2 -1
  78. wandb/sdk/wandb_init.py +12 -1
  79. wandb/sdk/wandb_login.py +43 -26
  80. wandb/sdk/wandb_run.py +164 -110
  81. wandb/sdk/wandb_settings.py +58 -16
  82. wandb/testing/relay.py +5 -6
  83. wandb/util.py +50 -7
  84. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
  85. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
  86. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
  87. wandb/apis/importers/base.py +0 -400
  88. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
  89. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
  90. {wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0
@@ -11,18 +11,13 @@ from wandb.sdk.launch.builder.build import registry_from_uri
11
11
  from wandb.sdk.launch.environment.abstract import AbstractEnvironment
12
12
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
13
13
 
14
- from .._project_spec import (
15
- EntryPoint,
16
- LaunchProject,
17
- create_metadata_file,
18
- get_entry_point_command,
19
- )
14
+ from .._project_spec import EntryPoint, LaunchProject
20
15
  from ..errors import LaunchDockerError, LaunchError
16
+ from ..registry.anon import AnonynmousRegistry
21
17
  from ..registry.local_registry import LocalRegistry
22
18
  from ..utils import (
23
19
  LOG_PREFIX,
24
20
  event_loop_thread_exec,
25
- sanitize_wandb_api_key,
26
21
  warn_failed_packages_from_build_logs,
27
22
  )
28
23
  from .build import (
@@ -106,6 +101,8 @@ class DockerBuilder(AbstractBuilder):
106
101
  """Login to the registry."""
107
102
  if isinstance(self.registry, LocalRegistry):
108
103
  _logger.info(f"{LOG_PREFIX}No registry configured, skipping login.")
104
+ elif isinstance(self.registry, AnonynmousRegistry):
105
+ _logger.info(f"{LOG_PREFIX}Anonymous registry, skipping login.")
109
106
  else:
110
107
  username, password = await self.registry.get_username_password()
111
108
  login = event_loop_thread_exec(docker.login)
@@ -155,14 +152,6 @@ class DockerBuilder(AbstractBuilder):
155
152
  f"image {image_uri} does not already exist in repository, building."
156
153
  )
157
154
 
158
- entry_cmd = get_entry_point_command(entrypoint, launch_project.override_args)
159
-
160
- create_metadata_file(
161
- launch_project,
162
- image_uri,
163
- sanitize_wandb_api_key(" ".join(entry_cmd)),
164
- dockerfile_str,
165
- )
166
155
  build_ctx_path = _create_docker_build_ctx(launch_project, dockerfile_str)
167
156
  dockerfile = os.path.join(build_ctx_path, _WANDB_DOCKERFILE_NAME)
168
157
  try:
@@ -3,6 +3,7 @@ import base64
3
3
  import json
4
4
  import logging
5
5
  import os
6
+ import shutil
6
7
  import tarfile
7
8
  import tempfile
8
9
  import time
@@ -23,17 +24,11 @@ from wandb.sdk.launch.registry.elastic_container_registry import (
23
24
  from wandb.sdk.launch.registry.google_artifact_registry import GoogleArtifactRegistry
24
25
  from wandb.util import get_module
25
26
 
26
- from .._project_spec import (
27
- EntryPoint,
28
- LaunchProject,
29
- create_metadata_file,
30
- get_entry_point_command,
31
- )
27
+ from .._project_spec import EntryPoint, LaunchProject
32
28
  from ..errors import LaunchError
33
29
  from ..utils import (
34
30
  LOG_PREFIX,
35
31
  get_kube_context_and_api_client,
36
- sanitize_wandb_api_key,
37
32
  warn_failed_packages_from_build_logs,
38
33
  )
39
34
  from .build import (
@@ -56,6 +51,14 @@ _logger = logging.getLogger(__name__)
56
51
  _DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
57
52
 
58
53
  SERVICE_ACCOUNT_NAME = os.environ.get("WANDB_LAUNCH_SERVICE_ACCOUNT_NAME", "default")
54
+ PVC_NAME = os.environ.get("WANDB_LAUNCH_KANIKO_PVC_NAME")
55
+ PVC_MOUNT_PATH = (
56
+ os.environ.get("WANDB_LAUNCH_KANIKO_PVC_MOUNT_PATH", "/kaniko").rstrip("/")
57
+ if PVC_NAME
58
+ else None
59
+ )
60
+ DOCKER_CONFIG_SECRET = os.environ.get("WANDB_LAUNCH_KANIKO_AUTH_SECRET")
61
+
59
62
 
60
63
  if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
61
64
  with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
@@ -115,12 +118,6 @@ class KanikoBuilder(AbstractBuilder):
115
118
  verify (bool, optional): Whether to verify the functionality of the builder.
116
119
  Defaults to True.
117
120
  """
118
- if build_context_store is None:
119
- raise LaunchError(
120
- "You are required to specify an external build "
121
- "context store for Kaniko builds. Please specify a storage url "
122
- "in the 'build-context-store' field of your builder config."
123
- )
124
121
  self.environment = environment
125
122
  self.registry = registry
126
123
  self.build_job_name = build_job_name
@@ -154,13 +151,16 @@ class KanikoBuilder(AbstractBuilder):
154
151
  raise LaunchError(
155
152
  "Builder config must include 'type':'kaniko' to create a KanikoBuilder."
156
153
  )
157
- build_context_store = config.get("build-context-store")
154
+ build_context_store = config.get("build-context-store", "")
158
155
  if build_context_store is None:
159
- raise LaunchError(
160
- "You are required to specify an external build "
161
- "context store for Kaniko builds. Please specify a "
162
- "storage url in the 'build_context_store' field of your builder config."
163
- )
156
+ if not PVC_MOUNT_PATH:
157
+ raise LaunchError(
158
+ "You must specify a build context store for kaniko builds. "
159
+ "You can set builder.build-context-store in your agent config "
160
+ "to a valid s3, gcs, or azure blog storage URI. Or, configure "
161
+ "a persistent volume claim through the agent helm chart: "
162
+ "https://github.com/wandb/helm-charts/tree/main/charts/launch-agent"
163
+ )
164
164
  build_job_name = config.get("build-job-name", "wandb-launch-container-build")
165
165
  secret_name = config.get("secret-name", "")
166
166
  secret_key = config.get("secret-key", "")
@@ -170,6 +170,7 @@ class KanikoBuilder(AbstractBuilder):
170
170
  image_uri = config.get("destination")
171
171
  if image_uri is not None:
172
172
  registry = registry_from_uri(image_uri)
173
+
173
174
  return cls(
174
175
  environment,
175
176
  registry,
@@ -186,9 +187,8 @@ class KanikoBuilder(AbstractBuilder):
186
187
  Raises:
187
188
  LaunchError: If the builder config is invalid.
188
189
  """
189
- if self.environment is None:
190
- raise LaunchError("No environment specified for Kaniko build.")
191
- await self.environment.verify_storage_uri(self.build_context_store)
190
+ if self.build_context_store:
191
+ await self.environment.verify_storage_uri(self.build_context_store)
192
192
 
193
193
  def login(self) -> None:
194
194
  """Login to the registry."""
@@ -197,8 +197,6 @@ class KanikoBuilder(AbstractBuilder):
197
197
  async def _create_docker_ecr_config_map(
198
198
  self, job_name: str, corev1_client: client.CoreV1Api, repository: str
199
199
  ) -> None:
200
- if self.registry is None:
201
- raise LaunchError("No registry specified for Kaniko build.")
202
200
  username, password = await self.registry.get_username_password()
203
201
  encoded = base64.b64encode(f"{username}:{password}".encode()).decode("utf-8")
204
202
  ecr_config_map = client.V1ConfigMap(
@@ -235,11 +233,21 @@ class KanikoBuilder(AbstractBuilder):
235
233
  with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
236
234
  context_tgz.add(context_path, arcname=".")
237
235
  context_file.close()
238
- destination = f"{self.build_context_store}/{run_id}.tgz"
239
- if self.environment is None:
240
- raise LaunchError("No environment specified for Kaniko build.")
241
- await self.environment.upload_file(context_file.name, destination)
242
- return destination
236
+ if PVC_MOUNT_PATH is None:
237
+ destination = f"{self.build_context_store}/{run_id}.tgz"
238
+ if self.environment is None:
239
+ raise LaunchError("No environment specified for Kaniko build.")
240
+ await self.environment.upload_file(context_file.name, destination)
241
+ return destination
242
+ else:
243
+ destination = f"{PVC_MOUNT_PATH}/{run_id}.tgz"
244
+ try:
245
+ shutil.copy(context_file.name, destination)
246
+ except Exception as e:
247
+ raise LaunchError(
248
+ f"Error copying build context to PVC mounted at {PVC_MOUNT_PATH}: {e}"
249
+ ) from e
250
+ return f"tar:///context/{run_id}.tgz"
243
251
 
244
252
  async def build_image(
245
253
  self,
@@ -248,9 +256,6 @@ class KanikoBuilder(AbstractBuilder):
248
256
  job_tracker: Optional[JobAndRunStatusTracker] = None,
249
257
  ) -> str:
250
258
  await self.verify()
251
- # TODO: this should probably throw an error if the registry is a local registry
252
- if not self.registry:
253
- raise LaunchError("No registry specified for Kaniko build.")
254
259
  # kaniko builder doesn't seem to work with a custom user id, need more investigation
255
260
  dockerfile_str = generate_dockerfile(
256
261
  launch_project=launch_project,
@@ -262,7 +267,6 @@ class KanikoBuilder(AbstractBuilder):
262
267
  image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
263
268
  repo_uri = await self.registry.get_repo_uri()
264
269
  image_uri = repo_uri + ":" + image_tag
265
-
266
270
  if (
267
271
  not launch_project.build_required()
268
272
  and await self.registry.check_image_exists(image_uri)
@@ -271,16 +275,6 @@ class KanikoBuilder(AbstractBuilder):
271
275
 
272
276
  _logger.info(f"Building image {image_uri}...")
273
277
 
274
- entry_cmd = " ".join(
275
- get_entry_point_command(entrypoint, launch_project.override_args)
276
- )
277
-
278
- create_metadata_file(
279
- launch_project,
280
- image_uri,
281
- sanitize_wandb_api_key(entry_cmd),
282
- sanitize_wandb_api_key(dockerfile_str),
283
- )
284
278
  context_path = _create_docker_build_ctx(launch_project, dockerfile_str)
285
279
  run_id = launch_project.run_id
286
280
 
@@ -381,6 +375,20 @@ class KanikoBuilder(AbstractBuilder):
381
375
  env = []
382
376
  volume_mounts = []
383
377
  volumes = []
378
+
379
+ if PVC_MOUNT_PATH:
380
+ volumes.append(
381
+ client.V1Volume(
382
+ name="kaniko-pvc",
383
+ persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
384
+ claim_name=PVC_NAME
385
+ ),
386
+ )
387
+ )
388
+ volume_mounts.append(
389
+ client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
390
+ )
391
+
384
392
  if bool(self.secret_name) != bool(self.secret_key):
385
393
  raise LaunchError(
386
394
  "Both secret_name and secret_key or neither must be specified "
@@ -419,8 +427,27 @@ class KanikoBuilder(AbstractBuilder):
419
427
  ),
420
428
  )
421
429
  ]
422
-
423
- if self.secret_name and self.secret_key:
430
+ if DOCKER_CONFIG_SECRET:
431
+ volumes.append(
432
+ client.V1Volume(
433
+ name="kaniko-docker-config",
434
+ secret=client.V1SecretVolumeSource(
435
+ secret_name=DOCKER_CONFIG_SECRET,
436
+ items=[
437
+ client.V1KeyToPath(
438
+ key=".dockerconfigjson", path="config.json"
439
+ )
440
+ ],
441
+ ),
442
+ )
443
+ )
444
+ volume_mounts.append(
445
+ client.V1VolumeMount(
446
+ name="kaniko-docker-config",
447
+ mount_path="/kaniko/.docker",
448
+ )
449
+ )
450
+ elif self.secret_name and self.secret_key:
424
451
  volumes += [
425
452
  client.V1Volume(
426
453
  name="docker-config",
@@ -315,39 +315,6 @@ def _create_repo_metadata(
315
315
  wandb.termerror(f"Entrypoint {entrypoint} not found in git repo")
316
316
  return None
317
317
 
318
- # check if requirements.txt exists
319
- # start at the location of the python file and recurse up to the git root
320
- entrypoint_dir = os.path.dirname(entrypoint)
321
- if entrypoint_dir:
322
- req_dir = os.path.join(local_dir, entrypoint_dir)
323
- else:
324
- req_dir = local_dir
325
-
326
- # If there is a Dockerfile.wandb in the starting rec dir, don't require a requirements.txt
327
- if os.path.exists(os.path.join(req_dir, "Dockerfile.wandb")):
328
- wandb.termlog(
329
- f"Using Dockerfile.wandb in {req_dir.replace(tempdir, '') or 'repository root'}"
330
- )
331
- else:
332
- while (
333
- not os.path.exists(os.path.join(req_dir, "requirements.txt"))
334
- and req_dir != tempdir
335
- ):
336
- req_dir = os.path.dirname(req_dir)
337
-
338
- if not os.path.exists(os.path.join(req_dir, "requirements.txt")):
339
- path_with_subdir = os.path.dirname(
340
- os.path.join(path or "", entrypoint or "")
341
- )
342
- wandb.termerror(
343
- f"Could not find requirements.txt file in git repo at {path_with_subdir}"
344
- )
345
- return None
346
-
347
- wandb.termlog(
348
- f"Using requirements.txt in {req_dir.replace(tempdir, '') or 'repository root'}"
349
- )
350
-
351
318
  metadata = {
352
319
  "git": {
353
320
  "commit": commit,
@@ -366,18 +333,16 @@ def _create_repo_metadata(
366
333
  def _create_artifact_metadata(
367
334
  path: str, entrypoint: str, runtime: Optional[str] = None
368
335
  ) -> Tuple[Dict[str, Any], List[str]]:
369
- if not os.path.exists(path):
336
+ if not os.path.isdir(path):
370
337
  wandb.termerror("Path must be a valid file or directory")
371
338
  return {}, []
372
339
 
373
- if not os.path.exists(os.path.join(path, "requirements.txt")):
374
- wandb.termerror(f"Could not find requirements.txt file in: {path}")
375
- return {}, []
376
-
377
340
  # read local requirements.txt and dump to temp dir for builder
378
341
  requirements = []
379
- with open(os.path.join(path, "requirements.txt")) as f:
380
- requirements = f.read().splitlines()
342
+ depspath = os.path.join(path, "requirements.txt")
343
+ if os.path.exists(depspath):
344
+ with open(depspath) as f:
345
+ requirements = f.read().splitlines()
381
346
 
382
347
  if runtime:
383
348
  python_version = _clean_python_version(runtime)
@@ -431,6 +396,7 @@ def _configure_job_builder_for_partial(tmpdir: str, job_source: str) -> JobBuild
431
396
  settings.update({"files_dir": tmpdir, "job_source": job_source})
432
397
  job_builder = JobBuilder(
433
398
  settings=settings, # type: ignore
399
+ verbose=True,
434
400
  )
435
401
  # never allow notebook runs
436
402
  job_builder._is_notebook_run = False
@@ -1,6 +1,7 @@
1
1
  """Utilities for the agent."""
2
2
  from typing import Any, Dict, Optional
3
3
 
4
+ import wandb
4
5
  from wandb.apis.internal import Api
5
6
  from wandb.docker import is_docker_installed
6
7
  from wandb.sdk.launch.errors import LaunchError
@@ -87,6 +88,15 @@ def registry_from_config(
87
88
  from .registry.local_registry import LocalRegistry
88
89
 
89
90
  return LocalRegistry() # This is the default, dummy registry.
91
+
92
+ wandb.termwarn(
93
+ "The `registry` block of the launch agent config is being deprecated. "
94
+ "Please specify an image repository URI under the `builder.destination` "
95
+ "key of your launch agent config. See "
96
+ "https://docs.wandb.ai/guides/launch/setup-agent-advanced#agent-configuration "
97
+ "for more information."
98
+ )
99
+
90
100
  registry_type = config.get("type")
91
101
  if registry_type is None or registry_type == "local":
92
102
  from .registry.local_registry import LocalRegistry
@@ -0,0 +1,29 @@
1
+ from typing import Tuple
2
+
3
+ from wandb.docker import is_docker_installed
4
+ from wandb.sdk.launch.utils import docker_image_exists
5
+
6
+ from .abstract import AbstractRegistry
7
+
8
+
9
+ class AnonynmousRegistry(AbstractRegistry):
10
+ def __init__(self, uri: str) -> None:
11
+ """Initialize the registry."""
12
+ self.uri = uri
13
+
14
+ async def get_username_password(self) -> Tuple[str, str]:
15
+ """Get the username and password for the registry."""
16
+ raise NotImplementedError("Anonymous registry does not require authentication")
17
+
18
+ async def get_repo_uri(self) -> str:
19
+ return self.uri
20
+
21
+ async def check_image_exists(self, image_uri: str) -> bool:
22
+ """Check if an image exists in the registry."""
23
+ if not is_docker_installed():
24
+ return False
25
+ return docker_image_exists(image_uri)
26
+
27
+ @classmethod
28
+ def from_config(cls, config: dict) -> "AbstractRegistry":
29
+ return cls(uri=config["uri"])
@@ -2,6 +2,7 @@
2
2
  import logging
3
3
  from typing import Tuple
4
4
 
5
+ from wandb.docker import is_docker_installed
5
6
  from wandb.sdk.launch.errors import LaunchError
6
7
  from wandb.sdk.launch.utils import docker_image_exists
7
8
 
@@ -60,4 +61,6 @@ class LocalRegistry(AbstractRegistry):
60
61
  Returns:
61
62
  bool: True.
62
63
  """
63
- return docker_image_exists(image_uri)
64
+ if is_docker_installed():
65
+ return docker_image_exists(image_uri)
66
+ return False
@@ -1,6 +1,7 @@
1
1
  """Implementation of KubernetesRunner class for wandb launch."""
2
2
  import asyncio
3
3
  import base64
4
+ import datetime
4
5
  import json
5
6
  import logging
6
7
  import os
@@ -23,6 +24,7 @@ from wandb.sdk.launch.runner.kubernetes_monitor import (
23
24
  CustomResource,
24
25
  LaunchKubernetesMonitor,
25
26
  )
27
+ from wandb.sdk.lib.retry import ExponentialBackoff, retry_async
26
28
  from wandb.util import get_module
27
29
 
28
30
  from .._project_spec import EntryPoint, LaunchProject
@@ -59,6 +61,7 @@ from kubernetes_asyncio.client.models.v1_secret import ( # type: ignore # noqa:
59
61
  from kubernetes_asyncio.client.rest import ApiException # type: ignore # noqa: E402
60
62
 
61
63
  TIMEOUT = 5
64
+ API_KEY_SECRET_MAX_RETRIES = 5
62
65
 
63
66
  _logger = logging.getLogger(__name__)
64
67
 
@@ -421,8 +424,23 @@ class KubernetesRunner(AbstractRunner):
421
424
  else:
422
425
  secret_name += f"-{launch_project.run_id}"
423
426
 
424
- api_key_secret = await ensure_api_key_secret(
425
- core_api, secret_name, namespace, value
427
+ def handle_exception(e):
428
+ wandb.termwarn(
429
+ f"Exception when ensuring Kubernetes API key secret: {e}. Retrying..."
430
+ )
431
+
432
+ api_key_secret = await retry_async(
433
+ backoff=ExponentialBackoff(
434
+ initial_sleep=datetime.timedelta(seconds=1),
435
+ max_sleep=datetime.timedelta(minutes=1),
436
+ max_retries=API_KEY_SECRET_MAX_RETRIES,
437
+ ),
438
+ fn=ensure_api_key_secret,
439
+ on_exc=handle_exception,
440
+ core_api=core_api,
441
+ secret_name=secret_name,
442
+ namespace=namespace,
443
+ api_key=value,
426
444
  )
427
445
  env.append(
428
446
  {
@@ -148,15 +148,14 @@ class LocalContainerRunner(AbstractRunner):
148
148
  env_vars["WANDB_BASE_URL"] = "http://host.docker.internal:9001"
149
149
 
150
150
  if launch_project.docker_image:
151
- if image_uri.endswith(":latest") or not docker_image_exists(image_uri):
152
- try:
153
- pull_docker_image(image_uri)
154
- except Exception as e:
155
- wandb.termwarn(f"Error attempting to pull docker image {image_uri}")
156
- if not docker_image_exists(image_uri):
157
- raise LaunchError(
158
- f"Failed to pull docker image {image_uri} with error: {e}"
159
- )
151
+ try:
152
+ pull_docker_image(image_uri)
153
+ except Exception as e:
154
+ wandb.termwarn(f"Error attempting to pull docker image {image_uri}")
155
+ if not docker_image_exists(image_uri):
156
+ raise LaunchError(
157
+ f"Failed to pull docker image {image_uri} with error: {e}"
158
+ )
160
159
 
161
160
  assert launch_project.docker_image == image_uri
162
161
 
@@ -234,7 +233,13 @@ def _thread_process_runner(
234
233
  if not chunk:
235
234
  break
236
235
  index = chunk.find(b"\r")
237
- decoded_chunk = chunk.decode()
236
+ decoded_chunk = None
237
+ while not decoded_chunk:
238
+ try:
239
+ decoded_chunk = chunk.decode()
240
+ except UnicodeDecodeError:
241
+ # Multi-byte character cut off, try to get the rest of it
242
+ chunk += os.read(process.stdout.fileno(), 1) # type: ignore
238
243
  if index != -1:
239
244
  run._stdout += decoded_chunk
240
245
  print(chunk.decode(), end="")
@@ -239,7 +239,7 @@ class SageMakerRunner(AbstractRunner):
239
239
  launch_project,
240
240
  self._api,
241
241
  role_arn,
242
- launch_project.override_entrypoint,
242
+ entry_point,
243
243
  launch_project.override_args,
244
244
  MAX_ENV_LENGTHS[self.__class__.__name__],
245
245
  image_uri,
@@ -1,6 +1,7 @@
1
1
  """Abstract Scheduler class."""
2
2
  import asyncio
3
3
  import base64
4
+ import copy
4
5
  import logging
5
6
  import os
6
7
  import socket
@@ -24,7 +25,10 @@ from wandb.sdk.launch.sweeps.utils import (
24
25
  create_sweep_command_args,
25
26
  make_launch_sweep_entrypoint,
26
27
  )
27
- from wandb.sdk.launch.utils import event_loop_thread_exec
28
+ from wandb.sdk.launch.utils import (
29
+ event_loop_thread_exec,
30
+ strip_resource_args_and_template_vars,
31
+ )
28
32
  from wandb.sdk.lib.runid import generate_id
29
33
 
30
34
  if TYPE_CHECKING:
@@ -658,7 +662,7 @@ class Scheduler(ABC):
658
662
  pidx = entry_point.index("${program}")
659
663
  entry_point[pidx] = self._sweep_config["program"]
660
664
 
661
- launch_config = self._wandb_run.config.get("launch", {})
665
+ launch_config = copy.deepcopy(self._wandb_run.config.get("launch", {}))
662
666
  if "overrides" not in launch_config:
663
667
  launch_config["overrides"] = {"run_config": {}}
664
668
  launch_config["overrides"]["run_config"].update(args["args_dict"])
@@ -694,11 +698,14 @@ class Scheduler(ABC):
694
698
  )
695
699
 
696
700
  # override resource and args of job
697
- _job_launch_config = self._wandb_run.config.get("launch") or {}
701
+ _job_launch_config = copy.deepcopy(self._wandb_run.config.get("launch")) or {}
698
702
 
699
703
  # default priority is "medium"
700
704
  _priority = int(launch_config.get("priority", 2)) # type: ignore
701
705
 
706
+ # strip resource_args and template_variables from launch_config
707
+ strip_resource_args_and_template_vars(_job_launch_config)
708
+
702
709
  run_id = run.id or generate_id()
703
710
  queued_run = launch_add(
704
711
  run_id=run_id,
@@ -712,6 +719,7 @@ class Scheduler(ABC):
712
719
  project_queue=self._project_queue,
713
720
  resource=_job_launch_config.get("resource"),
714
721
  resource_args=_job_launch_config.get("resource_args"),
722
+ template_variables=_job_launch_config.get("template_variables"),
715
723
  author=self._kwargs.get("author"),
716
724
  sweep_id=self._sweep_id,
717
725
  priority=_priority,
wandb/sdk/launch/utils.py CHANGED
@@ -221,6 +221,17 @@ def get_default_entity(api: Api, launch_config: Optional[Dict[str, Any]]):
221
221
  return config_entity or api.default_entity
222
222
 
223
223
 
224
+ def strip_resource_args_and_template_vars(launch_spec: Dict[str, Any]) -> None:
225
+ if launch_spec.get("resource_args", None) and launch_spec.get(
226
+ "template_variables", None
227
+ ):
228
+ wandb.termwarn(
229
+ "Launch spec contains both resource_args and template_variables, "
230
+ "only one can be set. Using template_variables."
231
+ )
232
+ launch_spec.pop("resource_args")
233
+
234
+
224
235
  def construct_launch_spec(
225
236
  uri: Optional[str],
226
237
  job: Optional[str],
@@ -298,6 +309,9 @@ def construct_launch_spec(
298
309
  else:
299
310
  launch_config["registry"] = {"url": repository}
300
311
 
312
+ # dont send both resource args and template variables
313
+ strip_resource_args_and_template_vars(launch_spec)
314
+
301
315
  return launch_spec
302
316
 
303
317
 
wandb/sdk/lib/__init__.py CHANGED
@@ -1,8 +1,5 @@
1
1
  from . import lazyloader
2
2
  from .disabled import RunDisabled, SummaryDisabled
3
+ from .run_moment import RunMoment
3
4
 
4
- __all__ = (
5
- "lazyloader",
6
- "RunDisabled",
7
- "SummaryDisabled",
8
- )
5
+ __all__ = ("lazyloader", "RunDisabled", "SummaryDisabled", "RunMoment")
@@ -15,6 +15,7 @@ _Setting = Literal[
15
15
  "_aws_lambda",
16
16
  "_async_upload_concurrency_limit",
17
17
  "_cli_only_mode",
18
+ "_code_path_local",
18
19
  "_colab",
19
20
  "_cuda",
20
21
  "_disable_meta",
@@ -101,6 +102,7 @@ _Setting = Literal[
101
102
  "entity",
102
103
  "files_dir",
103
104
  "force",
105
+ "fork_from",
104
106
  "git_commit",
105
107
  "git_remote",
106
108
  "git_remote_url",
@@ -191,6 +193,8 @@ SETTINGS_TOPOLOGICALLY_SORTED: Final[Tuple[_Setting, ...]] = (
191
193
  "run_id",
192
194
  "start_method",
193
195
  "_aws_lambda",
196
+ "program",
197
+ "_code_path_local",
194
198
  "_colab",
195
199
  "_disable_machine_info",
196
200
  "_disable_meta",
@@ -227,7 +231,6 @@ SETTINGS_TOPOLOGICALLY_SORTED: Final[Tuple[_Setting, ...]] = (
227
231
  "log_symlink_internal",
228
232
  "log_symlink_user",
229
233
  "log_user",
230
- "program",
231
234
  "project_url",
232
235
  "resume_fname",
233
236
  "run_url",
wandb/sdk/lib/apikey.py CHANGED
@@ -179,11 +179,6 @@ def write_netrc(host: str, entity: str, key: str) -> Optional[bool]:
179
179
  return None
180
180
  try:
181
181
  normalized_host = urlparse(host).netloc.split(":")[0]
182
- if normalized_host != "localhost" and "." not in normalized_host:
183
- wandb.termerror(
184
- f"Host must be a url in the form https://some.address.com, received {host}"
185
- )
186
- return None
187
182
  netrc_path = get_netrc_file_path()
188
183
  wandb.termlog(
189
184
  f"Appending key for {normalized_host} to your netrc file: {netrc_path}"