wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (156) hide show
  1. wandb/__init__.py +1 -1
  2. wandb/analytics/sentry.py +1 -0
  3. wandb/apis/importers/base.py +20 -5
  4. wandb/apis/importers/mlflow.py +7 -1
  5. wandb/apis/internal.py +12 -0
  6. wandb/apis/public.py +247 -1387
  7. wandb/apis/reports/_panels.py +58 -35
  8. wandb/beta/workflows.py +6 -7
  9. wandb/cli/cli.py +130 -60
  10. wandb/data_types.py +3 -1
  11. wandb/filesync/dir_watcher.py +21 -27
  12. wandb/filesync/step_checksum.py +8 -8
  13. wandb/filesync/step_prepare.py +23 -10
  14. wandb/filesync/step_upload.py +13 -13
  15. wandb/filesync/upload_job.py +4 -8
  16. wandb/integration/cohere/__init__.py +3 -0
  17. wandb/integration/cohere/cohere.py +21 -0
  18. wandb/integration/cohere/resolver.py +347 -0
  19. wandb/integration/gym/__init__.py +4 -6
  20. wandb/integration/huggingface/__init__.py +3 -0
  21. wandb/integration/huggingface/huggingface.py +18 -0
  22. wandb/integration/huggingface/resolver.py +213 -0
  23. wandb/integration/langchain/wandb_tracer.py +16 -179
  24. wandb/integration/openai/__init__.py +1 -3
  25. wandb/integration/openai/openai.py +11 -143
  26. wandb/integration/openai/resolver.py +111 -38
  27. wandb/integration/sagemaker/config.py +2 -2
  28. wandb/integration/tensorboard/log.py +4 -4
  29. wandb/old/settings.py +24 -7
  30. wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
  31. wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
  32. wandb/proto/wandb_deprecated.py +3 -1
  33. wandb/sdk/__init__.py +1 -1
  34. wandb/sdk/artifacts/__init__.py +0 -0
  35. wandb/sdk/artifacts/artifact.py +2101 -0
  36. wandb/sdk/artifacts/artifact_download_logger.py +42 -0
  37. wandb/sdk/artifacts/artifact_manifest.py +67 -0
  38. wandb/sdk/artifacts/artifact_manifest_entry.py +159 -0
  39. wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
  40. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +91 -0
  41. wandb/sdk/{internal → artifacts}/artifact_saver.py +6 -5
  42. wandb/sdk/artifacts/artifact_state.py +10 -0
  43. wandb/sdk/{interface/artifacts/artifact_cache.py → artifacts/artifacts_cache.py} +22 -12
  44. wandb/sdk/artifacts/exceptions.py +55 -0
  45. wandb/sdk/artifacts/storage_handler.py +59 -0
  46. wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
  47. wandb/sdk/artifacts/storage_handlers/azure_handler.py +192 -0
  48. wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
  49. wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
  50. wandb/sdk/artifacts/storage_handlers/local_file_handler.py +134 -0
  51. wandb/sdk/artifacts/storage_handlers/multi_handler.py +53 -0
  52. wandb/sdk/artifacts/storage_handlers/s3_handler.py +301 -0
  53. wandb/sdk/artifacts/storage_handlers/tracking_handler.py +67 -0
  54. wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +132 -0
  55. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
  56. wandb/sdk/artifacts/storage_layout.py +6 -0
  57. wandb/sdk/artifacts/storage_policies/__init__.py +0 -0
  58. wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +61 -0
  59. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +386 -0
  60. wandb/sdk/{interface/artifacts/artifact_storage.py → artifacts/storage_policy.py} +5 -57
  61. wandb/sdk/data_types/_dtypes.py +7 -12
  62. wandb/sdk/data_types/base_types/json_metadata.py +3 -2
  63. wandb/sdk/data_types/base_types/media.py +8 -8
  64. wandb/sdk/data_types/base_types/wb_value.py +12 -13
  65. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +5 -6
  66. wandb/sdk/data_types/helper_types/classes.py +6 -8
  67. wandb/sdk/data_types/helper_types/image_mask.py +5 -6
  68. wandb/sdk/data_types/histogram.py +4 -3
  69. wandb/sdk/data_types/html.py +3 -4
  70. wandb/sdk/data_types/image.py +11 -9
  71. wandb/sdk/data_types/molecule.py +5 -3
  72. wandb/sdk/data_types/object_3d.py +7 -5
  73. wandb/sdk/data_types/plotly.py +3 -2
  74. wandb/sdk/data_types/saved_model.py +11 -11
  75. wandb/sdk/data_types/trace_tree.py +5 -4
  76. wandb/sdk/data_types/utils.py +3 -5
  77. wandb/sdk/data_types/video.py +5 -4
  78. wandb/sdk/integration_utils/auto_logging.py +215 -0
  79. wandb/sdk/interface/interface.py +15 -15
  80. wandb/sdk/internal/file_pusher.py +8 -16
  81. wandb/sdk/internal/file_stream.py +5 -11
  82. wandb/sdk/internal/handler.py +13 -1
  83. wandb/sdk/internal/internal_api.py +287 -13
  84. wandb/sdk/internal/job_builder.py +119 -30
  85. wandb/sdk/internal/sender.py +6 -26
  86. wandb/sdk/internal/settings_static.py +2 -0
  87. wandb/sdk/internal/system/assets/__init__.py +2 -0
  88. wandb/sdk/internal/system/assets/gpu.py +42 -0
  89. wandb/sdk/internal/system/assets/gpu_amd.py +216 -0
  90. wandb/sdk/internal/system/env_probe_helpers.py +13 -0
  91. wandb/sdk/internal/system/system_info.py +3 -3
  92. wandb/sdk/internal/tb_watcher.py +32 -22
  93. wandb/sdk/internal/thread_local_settings.py +18 -0
  94. wandb/sdk/launch/_project_spec.py +57 -11
  95. wandb/sdk/launch/agent/agent.py +147 -65
  96. wandb/sdk/launch/agent/job_status_tracker.py +34 -0
  97. wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
  98. wandb/sdk/launch/builder/abstract.py +5 -1
  99. wandb/sdk/launch/builder/build.py +21 -18
  100. wandb/sdk/launch/builder/docker_builder.py +10 -4
  101. wandb/sdk/launch/builder/kaniko_builder.py +113 -23
  102. wandb/sdk/launch/builder/noop.py +6 -3
  103. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +46 -14
  104. wandb/sdk/launch/environment/aws_environment.py +3 -2
  105. wandb/sdk/launch/environment/azure_environment.py +124 -0
  106. wandb/sdk/launch/environment/gcp_environment.py +2 -4
  107. wandb/sdk/launch/environment/local_environment.py +1 -1
  108. wandb/sdk/launch/errors.py +19 -0
  109. wandb/sdk/launch/github_reference.py +32 -19
  110. wandb/sdk/launch/launch.py +3 -8
  111. wandb/sdk/launch/launch_add.py +6 -2
  112. wandb/sdk/launch/loader.py +21 -2
  113. wandb/sdk/launch/registry/azure_container_registry.py +132 -0
  114. wandb/sdk/launch/registry/elastic_container_registry.py +39 -5
  115. wandb/sdk/launch/registry/google_artifact_registry.py +68 -26
  116. wandb/sdk/launch/registry/local_registry.py +2 -1
  117. wandb/sdk/launch/runner/abstract.py +24 -3
  118. wandb/sdk/launch/runner/kubernetes_runner.py +479 -26
  119. wandb/sdk/launch/runner/local_container.py +103 -51
  120. wandb/sdk/launch/runner/local_process.py +1 -1
  121. wandb/sdk/launch/runner/sagemaker_runner.py +60 -10
  122. wandb/sdk/launch/runner/vertex_runner.py +10 -5
  123. wandb/sdk/launch/sweeps/__init__.py +7 -9
  124. wandb/sdk/launch/sweeps/scheduler.py +307 -77
  125. wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
  126. wandb/sdk/launch/sweeps/utils.py +82 -35
  127. wandb/sdk/launch/utils.py +89 -75
  128. wandb/sdk/lib/_settings_toposort_generated.py +7 -0
  129. wandb/sdk/lib/capped_dict.py +26 -0
  130. wandb/sdk/lib/{git.py → gitlib.py} +76 -59
  131. wandb/sdk/lib/hashutil.py +12 -4
  132. wandb/sdk/lib/paths.py +96 -8
  133. wandb/sdk/lib/sock_client.py +2 -2
  134. wandb/sdk/lib/timer.py +1 -0
  135. wandb/sdk/service/server.py +22 -9
  136. wandb/sdk/service/server_sock.py +1 -1
  137. wandb/sdk/service/service.py +27 -8
  138. wandb/sdk/verify/verify.py +4 -7
  139. wandb/sdk/wandb_config.py +2 -6
  140. wandb/sdk/wandb_init.py +57 -53
  141. wandb/sdk/wandb_require.py +7 -0
  142. wandb/sdk/wandb_run.py +61 -223
  143. wandb/sdk/wandb_settings.py +28 -4
  144. wandb/testing/relay.py +15 -2
  145. wandb/util.py +74 -36
  146. {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/METADATA +15 -9
  147. {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/RECORD +151 -116
  148. {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +1 -0
  149. wandb/integration/langchain/util.py +0 -191
  150. wandb/sdk/interface/artifacts/__init__.py +0 -33
  151. wandb/sdk/interface/artifacts/artifact.py +0 -615
  152. wandb/sdk/interface/artifacts/artifact_manifest.py +0 -131
  153. wandb/sdk/wandb_artifacts.py +0 -2226
  154. {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
  155. {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
  156. {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,45 @@
1
+ """Implementation of the run queue item file saver class."""
2
+
3
+ import os
4
+ import sys
5
+ from typing import List, Optional, Union
6
+
7
+ import wandb
8
+ from wandb.sdk.lib import RunDisabled
9
+ from wandb.sdk.wandb_run import Run
10
+
11
+ if sys.version_info >= (3, 8):
12
+ from typing import Literal
13
+ else:
14
+ from typing_extensions import Literal
15
+
16
+ FileSubtypes = Literal["warning", "error"]
17
+
18
+
19
class RunQueueItemFileSaver:
    """Write text for a run queue item under the agent run's files directory and save it to the run."""

    def __init__(
        self, agent_run: Optional[Union[Run, RunDisabled]], run_queue_item_id: str
    ):
        """Remember the agent run and the run queue item id.

        Args:
            agent_run: The agent's run. Files are only written when this is
                an actual ``Run`` instance.
            run_queue_item_id: Id used as the top-level directory for every
                file saved through this object.
        """
        self.run_queue_item_id = run_queue_item_id
        self.run = agent_run

    def save_contents(
        self, contents: str, fname: str, file_sub_type: FileSubtypes
    ) -> Optional[List[str]]:
        """Write ``contents`` to a file and save it to the agent run.

        The file is written to
        ``<files_dir>/<run_queue_item_id>/<file_sub_type>/<fname>`` and then
        passed to ``run.save`` with ``policy="now"``.

        Args:
            contents: Text to write.
            fname: Name of the file to create.
            file_sub_type: Sub-directory name, either "warning" or "error".

        Returns:
            A one-element list holding the path of the file relative to the
            run's files directory when ``run.save`` returns a list, otherwise
            ``None`` (also ``None`` when the agent has no ``Run``).
        """
        if not isinstance(self.run, Run):
            wandb.termwarn("Not saving file contents because agent has no run")
            return None

        root_dir = self.run._settings.files_dir
        saved_run_path = os.path.join(self.run_queue_item_id, file_sub_type, fname)
        local_path = os.path.join(root_dir, saved_run_path)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        # Pin the encoding: the platform default may be unable to encode
        # non-ASCII characters and would raise while writing.
        with open(local_path, "w", encoding="utf-8") as f:
            f.write(contents)

        res = self.run.save(local_path, base_path=root_dir, policy="now")
        if isinstance(res, list):
            return [saved_run_path]

        wandb.termwarn(
            f"Failed to save files for run queue item: {self.run_queue_item_id}"
        )
        return None
@@ -1,12 +1,15 @@
1
1
  """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
2
2
  from abc import ABC, abstractmethod
3
- from typing import Any, Dict
3
+ from typing import TYPE_CHECKING, Any, Dict, Optional
4
4
 
5
5
  from wandb.sdk.launch.environment.abstract import AbstractEnvironment
6
6
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
7
7
 
8
8
  from .._project_spec import EntryPoint, LaunchProject
9
9
 
10
+ if TYPE_CHECKING:
11
+ from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
12
+
10
13
 
11
14
  class AbstractBuilder(ABC):
12
15
  """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
@@ -63,6 +66,7 @@ class AbstractBuilder(ABC):
63
66
  self,
64
67
  launch_project: LaunchProject,
65
68
  entrypoint: EntryPoint,
69
+ job_tracker: Optional["JobAndRunStatusTracker"] = None,
66
70
  ) -> str:
67
71
  """Build the image for the given project.
68
72
 
@@ -28,13 +28,8 @@ from .._project_spec import (
28
28
  LaunchProject,
29
29
  fetch_and_validate_project,
30
30
  )
31
- from ..utils import (
32
- LAUNCH_CONFIG_FILE,
33
- LOG_PREFIX,
34
- ExecutionError,
35
- LaunchError,
36
- resolve_build_and_registry_config,
37
- )
31
+ from ..errors import ExecutionError, LaunchError
32
+ from ..utils import LAUNCH_CONFIG_FILE, LOG_PREFIX, resolve_build_and_registry_config
38
33
  from .abstract import AbstractBuilder
39
34
 
40
35
  _logger = logging.getLogger(__name__)
@@ -43,8 +38,6 @@ _logger = logging.getLogger(__name__)
43
38
  _GENERATED_DOCKERFILE_NAME = "Dockerfile.wandb-autogenerated"
44
39
  DEFAULT_ENTRYPOINT = "_wandb_default_entrypoint"
45
40
 
46
- DEFAULT_CUDA_VERSION = "10.0"
47
-
48
41
 
49
42
  def validate_docker_installation() -> None:
50
43
  """Verify if Docker is installed on host machine."""
@@ -108,8 +101,12 @@ FROM {py_base_image} as base
108
101
  """
109
102
 
110
103
  # this goes into base_setup in TEMPLATE
111
- CUDA_SETUP_TEMPLATE = """
112
- FROM {cuda_base_image} as base
104
+ ACCELERATOR_SETUP_TEMPLATE = """
105
+ FROM {accelerator_base_image} as base
106
+
107
+ # make non-interactive so build doesn't block on questions
108
+ ENV DEBIAN_FRONTEND=noninteractive
109
+
113
110
  # TODO: once NVIDIA their linux repository keys for all docker images
114
111
  RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$(cat /etc/os-release | grep ^ID= | cut -d "=" -f2 )$(cat /etc/os-release | grep ^VERSION_ID= | cut -d "=" -f2 | sed -e 's/[\".]//g' )/$(uname -i)/3bf863cc.pub
115
112
  RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/$(cat /etc/os-release | grep ^ID= | cut -d "=" -f2 )$(cat /etc/os-release | grep ^VERSION_ID= | cut -d "=" -f2 | sed -e 's/[\".]//g' )/$(uname -i)/7fa2af80.pub
@@ -189,12 +186,14 @@ def get_base_setup(
189
186
  ) -> str:
190
187
  """Fill in the Dockerfile templates for stage 2 of build.
191
188
 
192
- CPU version is built on python, GPU version is built on nvidia:cuda.
189
+ CPU version is built on python, Accelerator version is built on user provided.
193
190
  """
194
191
  python_base_image = f"python:{py_version}-buster"
195
- if launch_project.cuda_base_image:
196
- _logger.info(f"Using cuda base image: {launch_project.cuda_base_image}")
197
- # cuda image doesn't come with python tooling
192
+ if launch_project.accelerator_base_image:
193
+ _logger.info(
194
+ f"Using accelerator base image: {launch_project.accelerator_base_image}"
195
+ )
196
+ # accelerator base images don't come with python tooling
198
197
  if py_major == "2":
199
198
  python_packages = [
200
199
  f"python{py_version}",
@@ -209,8 +208,8 @@ def get_base_setup(
209
208
  "python3-pip",
210
209
  "python3-setuptools",
211
210
  ]
212
- base_setup = CUDA_SETUP_TEMPLATE.format(
213
- cuda_base_image=launch_project.cuda_base_image,
211
+ base_setup = ACCELERATOR_SETUP_TEMPLATE.format(
212
+ accelerator_base_image=launch_project.accelerator_base_image,
214
213
  python_packages=" \\\n".join(python_packages),
215
214
  py_version=py_version,
216
215
  )
@@ -246,6 +245,10 @@ def get_env_vars_dict(launch_project: LaunchProject, api: Api) -> Dict[str, str]
246
245
  env_vars["WANDB_NAME"] = launch_project.name
247
246
  if "author" in launch_project.launch_spec and not override_api_key:
248
247
  env_vars["WANDB_USERNAME"] = launch_project.launch_spec["author"]
248
+ if launch_project.sweep_id:
249
+ env_vars["WANDB_SWEEP_ID"] = launch_project.sweep_id
250
+ if launch_project.launch_spec.get("_resume_count"):
251
+ env_vars["WANDB_RESUME"] = "must"
249
252
 
250
253
  # TODO: handle env vars > 32760 characters
251
254
  env_vars["WANDB_CONFIG"] = json.dumps(launch_project.override_config)
@@ -288,7 +291,7 @@ def get_requirements_section(launch_project: LaunchProject, builder_type: str) -
288
291
  ):
289
292
  requirements_files += ["src/requirements.txt"]
290
293
  pip_install_line = "pip install -r requirements.txt"
291
- if launch_project.project_dir is not None and os.path.exists(
294
+ elif launch_project.project_dir is not None and os.path.exists(
292
295
  os.path.join(launch_project.project_dir, "requirements.frozen.txt")
293
296
  ):
294
297
  # if we have frozen requirements stored, copy those over and have them take precedence
@@ -1,10 +1,11 @@
1
1
  """Implementation of the docker builder."""
2
2
  import logging
3
3
  import os
4
- from typing import Any, Dict
4
+ from typing import Any, Dict, Optional
5
5
 
6
6
  import wandb
7
7
  import wandb.docker as docker
8
+ from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
8
9
  from wandb.sdk.launch.builder.abstract import AbstractBuilder
9
10
  from wandb.sdk.launch.environment.abstract import AbstractEnvironment
10
11
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
@@ -15,11 +16,10 @@ from .._project_spec import (
15
16
  create_metadata_file,
16
17
  get_entry_point_command,
17
18
  )
19
+ from ..errors import LaunchDockerError, LaunchError
18
20
  from ..registry.local_registry import LocalRegistry
19
21
  from ..utils import (
20
22
  LOG_PREFIX,
21
- LaunchDockerError,
22
- LaunchError,
23
23
  sanitize_wandb_api_key,
24
24
  warn_failed_packages_from_build_logs,
25
25
  )
@@ -112,6 +112,7 @@ class DockerBuilder(AbstractBuilder):
112
112
  self,
113
113
  launch_project: LaunchProject,
114
114
  entrypoint: EntryPoint,
115
+ job_tracker: Optional[JobAndRunStatusTracker] = None,
115
116
  ) -> str:
116
117
  """Build the image for the given project.
117
118
 
@@ -160,9 +161,14 @@ class DockerBuilder(AbstractBuilder):
160
161
  context_path=build_ctx_path,
161
162
  platform=self.config.get("platform"),
162
163
  )
163
- warn_failed_packages_from_build_logs(output, image_uri)
164
+
165
+ warn_failed_packages_from_build_logs(
166
+ output, image_uri, launch_project.api, job_tracker
167
+ )
164
168
 
165
169
  except docker.DockerError as e:
170
+ if job_tracker:
171
+ job_tracker.set_err_stage("build")
166
172
  raise LaunchDockerError(f"Error communicating with docker client: {e}")
167
173
 
168
174
  try:
@@ -1,15 +1,19 @@
1
1
  import base64
2
2
  import json
3
3
  import logging
4
+ import os
4
5
  import tarfile
5
6
  import tempfile
6
7
  import time
7
8
  from typing import Optional
8
9
 
9
10
  import wandb
11
+ from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
10
12
  from wandb.sdk.launch.builder.abstract import AbstractBuilder
11
13
  from wandb.sdk.launch.environment.abstract import AbstractEnvironment
14
+ from wandb.sdk.launch.environment.azure_environment import AzureEnvironment
12
15
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
16
+ from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
13
17
  from wandb.sdk.launch.registry.elastic_container_registry import (
14
18
  ElasticContainerRegistry,
15
19
  )
@@ -22,9 +26,9 @@ from .._project_spec import (
22
26
  create_metadata_file,
23
27
  get_entry_point_command,
24
28
  )
29
+ from ..errors import LaunchError
25
30
  from ..utils import (
26
31
  LOG_PREFIX,
27
- LaunchError,
28
32
  get_kube_context_and_api_client,
29
33
  sanitize_wandb_api_key,
30
34
  warn_failed_packages_from_build_logs,
@@ -47,13 +51,21 @@ _logger = logging.getLogger(__name__)
47
51
 
48
52
  _DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
49
53
 
54
+ SERVICE_ACCOUNT_NAME = os.environ.get("WANDB_LAUNCH_SERVICE_ACCOUNT_NAME", "default")
55
+
56
+ if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
57
+ with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
58
+ NAMESPACE = f.read().strip()
59
+ else:
60
+ NAMESPACE = "wandb"
61
+
50
62
 
51
63
  def _wait_for_completion(
52
64
  batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
53
65
  ) -> bool:
54
66
  start_time = time.time()
55
67
  while True:
56
- job = batch_client.read_namespaced_job_status(job_name, "wandb")
68
+ job = batch_client.read_namespaced_job_status(job_name, NAMESPACE)
57
69
  if job.status.succeeded is not None and job.status.succeeded >= 1:
58
70
  return True
59
71
  elif job.status.failed is not None and job.status.failed >= 1:
@@ -75,6 +87,7 @@ class KanikoBuilder(AbstractBuilder):
75
87
  build_context_store: str
76
88
  secret_name: Optional[str]
77
89
  secret_key: Optional[str]
90
+ image: str
78
91
 
79
92
  def __init__(
80
93
  self,
@@ -84,6 +97,7 @@ class KanikoBuilder(AbstractBuilder):
84
97
  build_context_store: str = "",
85
98
  secret_name: str = "",
86
99
  secret_key: str = "",
100
+ image: str = "gcr.io/kaniko-project/executor:v1.11.0",
87
101
  verify: bool = True,
88
102
  ):
89
103
  """Initialize a KanikoBuilder.
@@ -110,6 +124,7 @@ class KanikoBuilder(AbstractBuilder):
110
124
  self.build_context_store = build_context_store.rstrip("/")
111
125
  self.secret_name = secret_name
112
126
  self.secret_key = secret_key
127
+ self.image = image
113
128
  if verify:
114
129
  self.verify()
115
130
 
@@ -148,6 +163,7 @@ class KanikoBuilder(AbstractBuilder):
148
163
  build_job_name = config.get("build-job-name", "wandb-launch-container-build")
149
164
  secret_name = config.get("secret-name", "")
150
165
  secret_key = config.get("secret-key", "")
166
+ image = config.get("kaniko-image", "gcr.io/kaniko-project/executor:v1.11.0")
151
167
  return cls(
152
168
  environment,
153
169
  registry,
@@ -155,6 +171,7 @@ class KanikoBuilder(AbstractBuilder):
155
171
  build_job_name=build_job_name,
156
172
  secret_name=secret_name,
157
173
  secret_key=secret_key,
174
+ image=image,
158
175
  verify=verify,
159
176
  )
160
177
 
@@ -184,7 +201,7 @@ class KanikoBuilder(AbstractBuilder):
184
201
  kind="ConfigMap",
185
202
  metadata=client.V1ObjectMeta(
186
203
  name=f"docker-config-{job_name}",
187
- namespace="wandb",
204
+ namespace=NAMESPACE,
188
205
  ),
189
206
  data={
190
207
  "config.json": json.dumps(
@@ -193,13 +210,13 @@ class KanikoBuilder(AbstractBuilder):
193
210
  },
194
211
  immutable=True,
195
212
  )
196
- corev1_client.create_namespaced_config_map("wandb", ecr_config_map)
213
+ corev1_client.create_namespaced_config_map(NAMESPACE, ecr_config_map)
197
214
 
198
215
  def _delete_docker_ecr_config_map(
199
216
  self, job_name: str, client: client.CoreV1Api
200
217
  ) -> None:
201
218
  if self.secret_name:
202
- client.delete_namespaced_config_map(f"docker-config-{job_name}", "wandb")
219
+ client.delete_namespaced_config_map(f"docker-config-{job_name}", NAMESPACE)
203
220
 
204
221
  def _upload_build_context(self, run_id: str, context_path: str) -> str:
205
222
  # create a tar archive of the build context and upload it to s3
@@ -217,6 +234,7 @@ class KanikoBuilder(AbstractBuilder):
217
234
  self,
218
235
  launch_project: LaunchProject,
219
236
  entrypoint: EntryPoint,
237
+ job_tracker: Optional[JobAndRunStatusTracker] = None,
220
238
  ) -> str:
221
239
  # TODO: this should probably throw an error if the registry is a local registry
222
240
  if not self.registry:
@@ -252,35 +270,52 @@ class KanikoBuilder(AbstractBuilder):
252
270
  _, api_client = get_kube_context_and_api_client(
253
271
  kubernetes, launch_project.resource_args
254
272
  )
273
+ # TODO: use same client as kubernetes_runner.py
274
+ batch_v1 = client.BatchV1Api(api_client)
275
+ core_v1 = client.CoreV1Api(api_client)
276
+
255
277
  build_job_name = f"{self.build_job_name}-{run_id}"
256
278
 
257
279
  build_context = self._upload_build_context(run_id, context_path)
258
280
  build_job = self._create_kaniko_job(
259
- build_job_name,
260
- repo_uri,
261
- image_uri,
262
- build_context,
281
+ build_job_name, repo_uri, image_uri, build_context, core_v1
263
282
  )
264
283
  wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
265
284
 
266
- # TODO: use same client as kuberentes.py
267
- batch_v1 = client.BatchV1Api(api_client)
268
- core_v1 = client.CoreV1Api(api_client)
269
-
270
285
  try:
286
+ if isinstance(self.registry, AzureContainerRegistry):
287
+ dockerfile_config_map = client.V1ConfigMap(
288
+ metadata=client.V1ObjectMeta(
289
+ name=f"docker-config-{build_job_name}"
290
+ ),
291
+ data={
292
+ "config.json": json.dumps(
293
+ {
294
+ "credHelpers": {
295
+ f"{self.registry.registry_name}.azurecr.io": "acr-env"
296
+ }
297
+ }
298
+ )
299
+ },
300
+ )
301
+ core_v1.create_namespaced_config_map("wandb", dockerfile_config_map)
271
302
  # core_v1.create_namespaced_config_map("wandb", dockerfile_config_map)
272
303
  if self.secret_name:
273
304
  self._create_docker_ecr_config_map(build_job_name, core_v1, repo_uri)
274
- batch_v1.create_namespaced_job("wandb", build_job)
305
+ batch_v1.create_namespaced_job(NAMESPACE, build_job)
275
306
 
276
307
  # wait for triple the job deadline since it might take time to schedule
277
308
  if not _wait_for_completion(
278
309
  batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
279
310
  ):
311
+ if job_tracker:
312
+ job_tracker.set_err_stage("build")
280
313
  raise Exception(f"Failed to build image in kaniko for job {run_id}")
281
314
  try:
282
- logs = batch_v1.read_namespaced_job_log(build_job_name, "wandb")
283
- warn_failed_packages_from_build_logs(logs, image_uri)
315
+ logs = batch_v1.read_namespaced_job_log(build_job_name, NAMESPACE)
316
+ warn_failed_packages_from_build_logs(
317
+ logs, image_uri, launch_project.api, job_tracker
318
+ )
284
319
  except Exception as e:
285
320
  wandb.termwarn(
286
321
  f"{LOG_PREFIX}Failed to get logs for kaniko job {build_job_name}: {e}"
@@ -295,9 +330,13 @@ class KanikoBuilder(AbstractBuilder):
295
330
  try:
296
331
  # should we clean up the s3 build contexts? can set bucket level policy to auto deletion
297
332
  # core_v1.delete_namespaced_config_map(config_map_name, "wandb")
333
+ if isinstance(self.registry, AzureContainerRegistry):
334
+ core_v1.delete_namespaced_config_map(
335
+ f"docker-config-{build_job_name}", "wandb"
336
+ )
298
337
  if self.secret_name:
299
338
  self._delete_docker_ecr_config_map(build_job_name, core_v1)
300
- batch_v1.delete_namespaced_job(build_job_name, "wandb")
339
+ batch_v1.delete_namespaced_job(build_job_name, NAMESPACE)
301
340
  except Exception as e:
302
341
  raise LaunchError(f"Exception during Kubernetes resource clean up {e}")
303
342
 
@@ -309,6 +348,7 @@ class KanikoBuilder(AbstractBuilder):
309
348
  repository: str,
310
349
  image_tag: str,
311
350
  build_context_path: str,
351
+ core_client: client.CoreV1Api,
312
352
  ) -> "client.V1Job":
313
353
  env = []
314
354
  volume_mounts = []
@@ -325,6 +365,33 @@ class KanikoBuilder(AbstractBuilder):
325
365
  value=self.registry.environment.region,
326
366
  )
327
367
  ]
368
+ # TODO: Refactor all of this environment/registry
369
+ # specific stuff into methods of those classes.
370
+ if isinstance(self.environment, AzureEnvironment):
371
+ # Use the core api to check if the secret exists
372
+ try:
373
+ core_client.read_namespaced_secret(
374
+ "azure-storage-access-key",
375
+ "wandb",
376
+ )
377
+ except Exception as e:
378
+ raise LaunchError(
379
+ "Secret azure-storage-access-key does not exist in "
380
+ "namespace wandb. Please create it with the key password "
381
+ "set to your azure storage access key."
382
+ ) from e
383
+ env += [
384
+ client.V1EnvVar(
385
+ name="AZURE_STORAGE_ACCESS_KEY",
386
+ value_from=client.V1EnvVarSource(
387
+ secret_key_ref=client.V1SecretKeySelector(
388
+ name="azure-storage-access-key",
389
+ key="password",
390
+ )
391
+ ),
392
+ )
393
+ ]
394
+
328
395
  if self.secret_name and self.secret_key:
329
396
  volumes += [
330
397
  client.V1Volume(
@@ -379,31 +446,54 @@ class KanikoBuilder(AbstractBuilder):
379
446
  ),
380
447
  )
381
448
  ]
382
-
449
+ if isinstance(self.registry, AzureContainerRegistry):
450
+ # Add the docker config map
451
+ volume_mounts += [
452
+ client.V1VolumeMount(
453
+ name="docker-config", mount_path="/kaniko/.docker/"
454
+ ),
455
+ ]
456
+ volumes += [
457
+ client.V1Volume(
458
+ name="docker-config",
459
+ config_map=client.V1ConfigMapVolumeSource(
460
+ name=f"docker-config-{job_name}",
461
+ ),
462
+ ),
463
+ ]
464
+ # Kaniko doesn't want https:// at the beginning of the image tag.
465
+ destination = image_tag
466
+ if destination.startswith("https://"):
467
+ destination = destination.replace("https://", "")
383
468
  args = [
384
469
  f"--context={build_context_path}",
385
470
  "--dockerfile=Dockerfile.wandb-autogenerated",
386
- f"--destination={image_tag}",
471
+ f"--destination={destination}",
387
472
  "--cache=true",
388
- f"--cache-repo={repository}",
473
+ f"--cache-repo={repository.replace('https://', '')}",
389
474
  "--snapshotMode=redo",
390
475
  "--compressed-caching=false",
391
476
  ]
392
477
  container = client.V1Container(
393
478
  name="wandb-container-build",
394
- image="gcr.io/kaniko-project/executor:v1.8.0",
479
+ image=self.image,
395
480
  args=args,
396
481
  volume_mounts=volume_mounts,
397
482
  env=env if env else None,
398
483
  )
399
484
  # Create and configure a spec section
485
+ labels = {"wandb": "launch"}
486
+ # This label is required to enable azure workload identity.
487
+ if isinstance(self.registry, AzureContainerRegistry):
488
+ labels["azure.workload.identity/use"] = "true"
400
489
  template = client.V1PodTemplateSpec(
401
- metadata=client.V1ObjectMeta(labels={"wandb": "launch"}),
490
+ metadata=client.V1ObjectMeta(labels=labels),
402
491
  spec=client.V1PodSpec(
403
492
  restart_policy="Never",
404
493
  active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
405
494
  containers=[container],
406
495
  volumes=volumes,
496
+ service_account_name=SERVICE_ACCOUNT_NAME,
407
497
  ),
408
498
  )
409
499
  # Create the specification of job
@@ -412,7 +502,7 @@ class KanikoBuilder(AbstractBuilder):
412
502
  api_version="batch/v1",
413
503
  kind="Job",
414
504
  metadata=client.V1ObjectMeta(
415
- name=job_name, namespace="wandb", labels={"wandb": "launch"}
505
+ name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
416
506
  ),
417
507
  spec=spec,
418
508
  )
@@ -1,12 +1,13 @@
1
1
  """NoOp builder implementation."""
2
- from typing import Any, Dict
2
+ from typing import Any, Dict, Optional
3
3
 
4
4
  from wandb.sdk.launch.builder.abstract import AbstractBuilder
5
5
  from wandb.sdk.launch.environment.abstract import AbstractEnvironment
6
+ from wandb.sdk.launch.errors import LaunchError
6
7
  from wandb.sdk.launch.registry.abstract import AbstractRegistry
7
- from wandb.sdk.launch.utils import LaunchError
8
8
 
9
9
  from .._project_spec import EntryPoint, LaunchProject
10
+ from ..agent.job_status_tracker import JobAndRunStatusTracker
10
11
 
11
12
 
12
13
  class NoOpBuilder(AbstractBuilder):
@@ -21,7 +22,8 @@ class NoOpBuilder(AbstractBuilder):
21
22
  registry: AbstractRegistry,
22
23
  ) -> None:
23
24
  """Initialize a NoOpBuilder."""
24
- pass
25
+ self.environment = environment
26
+ self.registry = registry
25
27
 
26
28
  @classmethod
27
29
  def from_config(
@@ -42,6 +44,7 @@ class NoOpBuilder(AbstractBuilder):
42
44
  self,
43
45
  launch_project: LaunchProject,
44
46
  entrypoint: EntryPoint,
47
+ job_tracker: Optional[JobAndRunStatusTracker] = None,
45
48
  ) -> str:
46
49
  """Build the image.
47
50
 
@@ -1,5 +1,4 @@
1
1
  import json
2
- import multiprocessing
3
2
  import os
4
3
  import re
5
4
  import subprocess
@@ -8,7 +7,6 @@ from typing import List, Optional, Set
8
7
 
9
8
  FAILED_PACKAGES_PREFIX = "ERROR: Failed to install: "
10
9
  FAILED_PACKAGES_POSTFIX = ". During automated build process."
11
- CORES = multiprocessing.cpu_count()
12
10
  ONLY_INCLUDE = {x for x in os.getenv("WANDB_ONLY_INCLUDE", "").split(",") if x != ""}
13
11
  OPTS = []
14
12
  # If the builder doesn't support buildx no need to use the cache
@@ -52,8 +50,16 @@ def install_deps(
52
50
  if failed is None:
53
51
  failed = set()
54
52
  num_failed = len(failed)
53
+ current_pkg = None
55
54
  for line in e.output.decode("utf8").splitlines():
56
- if line.startswith("ERROR:"):
55
+ # Since the name of the package might not be on the same line as
56
+ # the error msg, keep track of the currently installing package
57
+ current_pkg = get_current_package(line, clean_deps, current_pkg)
58
+
59
+ if "error: subprocess-exited-with-error" in line:
60
+ if current_pkg is not None:
61
+ failed.add(current_pkg)
62
+ elif line.startswith("ERROR:"):
57
63
  clean_dep = find_package_in_error_string(clean_deps, line)
58
64
  if clean_dep is not None:
59
65
  if clean_dep in deps:
@@ -84,7 +90,6 @@ def main() -> None:
84
90
  with open("requirements.frozen.txt") as f:
85
91
  print("Installing frozen dependencies...")
86
92
  reqs = []
87
- failed: Set[str] = set()
88
93
  for req in f:
89
94
  if (
90
95
  len(ONLY_INCLUDE) == 0
@@ -109,15 +114,7 @@ def main() -> None:
109
114
  reqs.append(req.strip().replace(" ", ""))
110
115
  else:
111
116
  print(f"Ignoring requirement: {req} from frozen requirements")
112
- if len(reqs) >= CORES:
113
- deps_failed = install_deps(reqs, opts=OPTS)
114
- reqs = []
115
- if deps_failed is not None:
116
- failed = failed.union(deps_failed)
117
- if len(reqs) > 0:
118
- deps_failed = install_deps(reqs, opts=OPTS)
119
- if deps_failed is not None:
120
- failed = failed.union(deps_failed)
117
+ failed = install_deps(reqs, opts=OPTS) or set()
121
118
  with open("_wandb_bootstrap_errors.json", "w") as f:
122
119
  f.write(json.dumps({"pip": list(failed)}))
123
120
  if len(failed) > 0:
@@ -130,6 +127,41 @@ def main() -> None:
130
127
  print("No frozen requirements found")
131
128
 
132
129
 
130
def add_version_to_package_name(deps: List[str], package: str) -> Optional[str]:
    """Look up the pinned requirement string for a bare package name.

    Returns the first entry of ``deps`` whose text before ``==`` equals
    ``package`` (for example ``my-package`` -> ``my-package==1.0.0``), or
    ``None`` when no entry matches.
    """
    matching = (pinned for pinned in deps if pinned.split("==")[0] == package)
    return next(matching, None)
139
+
140
+
141
def get_current_package(
    line: str, deps: List[str], current_pkg: Optional[str]
) -> Optional[str]:
    """Try to pull a package name from a line of pip output.

    Used to keep track of what the currently-installing package is, in case
    an error message isn't on the same line as the package.

    Args:
        line: A single line of pip output.
        deps: Requirement strings used to map a bare package name back to
            its pinned form.
        current_pkg: The package tracked so far.

    Returns:
        The package named on ``line`` when one of the known patterns
        matches, otherwise ``current_pkg`` unchanged. For the two
        "finished with status 'error'" patterns the result is whatever
        ``add_version_to_package_name`` returns, which may be ``None``.
    """
    stripped = line.strip()
    failed_suffix = "finished with status 'error'"

    # "Collecting my-package==1.0.0"
    if line.startswith("Collecting"):
        tokens = line.split()
        # A bare "Collecting" names no package; keep the tracked package
        # instead of raising IndexError or returning an empty string.
        return tokens[1] if len(tokens) > 1 else current_pkg

    # "Building wheel for my-package (pyproject.toml): finished with status 'error'"
    if stripped.startswith("Building wheel") and stripped.endswith(failed_suffix):
        return add_version_to_package_name(deps, stripped.split(" ")[3])

    # "Running setup.py install for my-package: finished with status 'error'"
    if stripped.startswith("Running setup.py install") and stripped.endswith(
        failed_suffix
    ):
        # The name token carries a trailing ":" which is dropped here.
        return add_version_to_package_name(deps, stripped.split(" ")[4][:-1])

    return current_pkg
163
+
164
+
133
165
  # hacky way to get the name of the requirement that failed
134
166
  # attempt last word which is the name of the package often
135
167
  # fall back to checking all words in the line for the package name
@@ -143,7 +175,7 @@ def find_package_in_error_string(deps: List[str], line: str) -> Optional[str]:
143
175
  # contains a reference to another package in the deps
144
176
  # before the package that failed to install
145
177
  for word in line.split(" "):
146
- if word in deps:
178
+ if word.strip(",") in deps:
147
179
  return word
148
180
  # if we can't find the package, return None
149
181
  return None