wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. wandb/__init__.py +2 -3
  2. wandb/apis/__init__.py +1 -3
  3. wandb/apis/importers/__init__.py +4 -0
  4. wandb/apis/importers/base.py +312 -0
  5. wandb/apis/importers/mlflow.py +113 -0
  6. wandb/apis/internal.py +29 -2
  7. wandb/apis/normalize.py +6 -5
  8. wandb/apis/public.py +163 -180
  9. wandb/apis/reports/_templates.py +6 -12
  10. wandb/apis/reports/report.py +1 -1
  11. wandb/apis/reports/runset.py +1 -3
  12. wandb/apis/reports/util.py +12 -10
  13. wandb/beta/workflows.py +57 -34
  14. wandb/catboost/__init__.py +1 -2
  15. wandb/cli/cli.py +215 -133
  16. wandb/data_types.py +63 -56
  17. wandb/docker/__init__.py +78 -16
  18. wandb/docker/auth.py +21 -22
  19. wandb/env.py +0 -1
  20. wandb/errors/__init__.py +8 -116
  21. wandb/errors/term.py +1 -1
  22. wandb/fastai/__init__.py +1 -2
  23. wandb/filesync/dir_watcher.py +8 -5
  24. wandb/filesync/step_prepare.py +76 -75
  25. wandb/filesync/step_upload.py +1 -2
  26. wandb/integration/catboost/__init__.py +1 -3
  27. wandb/integration/catboost/catboost.py +8 -14
  28. wandb/integration/fastai/__init__.py +7 -13
  29. wandb/integration/gym/__init__.py +35 -4
  30. wandb/integration/keras/__init__.py +3 -3
  31. wandb/integration/keras/callbacks/metrics_logger.py +9 -8
  32. wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
  33. wandb/integration/keras/callbacks/tables_builder.py +31 -19
  34. wandb/integration/kfp/kfp_patch.py +20 -17
  35. wandb/integration/kfp/wandb_logging.py +1 -2
  36. wandb/integration/lightgbm/__init__.py +21 -19
  37. wandb/integration/prodigy/prodigy.py +6 -7
  38. wandb/integration/sacred/__init__.py +9 -12
  39. wandb/integration/sagemaker/__init__.py +1 -3
  40. wandb/integration/sagemaker/auth.py +0 -1
  41. wandb/integration/sagemaker/config.py +1 -1
  42. wandb/integration/sagemaker/resources.py +1 -1
  43. wandb/integration/sb3/sb3.py +8 -4
  44. wandb/integration/tensorboard/__init__.py +1 -3
  45. wandb/integration/tensorboard/log.py +8 -8
  46. wandb/integration/tensorboard/monkeypatch.py +11 -9
  47. wandb/integration/tensorflow/__init__.py +1 -3
  48. wandb/integration/xgboost/__init__.py +4 -6
  49. wandb/integration/yolov8/__init__.py +7 -0
  50. wandb/integration/yolov8/yolov8.py +250 -0
  51. wandb/jupyter.py +31 -35
  52. wandb/lightgbm/__init__.py +1 -2
  53. wandb/old/settings.py +2 -2
  54. wandb/plot/bar.py +1 -2
  55. wandb/plot/confusion_matrix.py +1 -3
  56. wandb/plot/histogram.py +1 -2
  57. wandb/plot/line.py +1 -2
  58. wandb/plot/line_series.py +4 -4
  59. wandb/plot/pr_curve.py +17 -20
  60. wandb/plot/roc_curve.py +1 -3
  61. wandb/plot/scatter.py +1 -2
  62. wandb/proto/v3/wandb_server_pb2.py +85 -39
  63. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  64. wandb/proto/v4/wandb_server_pb2.py +51 -39
  65. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  66. wandb/sdk/__init__.py +1 -3
  67. wandb/sdk/backend/backend.py +1 -1
  68. wandb/sdk/data_types/_dtypes.py +38 -30
  69. wandb/sdk/data_types/base_types/json_metadata.py +1 -3
  70. wandb/sdk/data_types/base_types/media.py +17 -17
  71. wandb/sdk/data_types/base_types/wb_value.py +33 -26
  72. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
  73. wandb/sdk/data_types/helper_types/classes.py +1 -1
  74. wandb/sdk/data_types/helper_types/image_mask.py +12 -12
  75. wandb/sdk/data_types/histogram.py +5 -4
  76. wandb/sdk/data_types/html.py +1 -2
  77. wandb/sdk/data_types/image.py +11 -11
  78. wandb/sdk/data_types/molecule.py +3 -6
  79. wandb/sdk/data_types/object_3d.py +1 -2
  80. wandb/sdk/data_types/plotly.py +1 -2
  81. wandb/sdk/data_types/saved_model.py +10 -8
  82. wandb/sdk/data_types/video.py +1 -1
  83. wandb/sdk/integration_utils/data_logging.py +5 -5
  84. wandb/sdk/interface/artifacts.py +288 -266
  85. wandb/sdk/interface/interface.py +2 -3
  86. wandb/sdk/interface/interface_grpc.py +1 -1
  87. wandb/sdk/interface/interface_queue.py +1 -1
  88. wandb/sdk/interface/interface_relay.py +1 -1
  89. wandb/sdk/interface/interface_shared.py +1 -2
  90. wandb/sdk/interface/interface_sock.py +1 -1
  91. wandb/sdk/interface/message_future.py +1 -1
  92. wandb/sdk/interface/message_future_poll.py +1 -1
  93. wandb/sdk/interface/router.py +1 -1
  94. wandb/sdk/interface/router_queue.py +1 -1
  95. wandb/sdk/interface/router_relay.py +1 -1
  96. wandb/sdk/interface/router_sock.py +1 -1
  97. wandb/sdk/interface/summary_record.py +1 -1
  98. wandb/sdk/internal/artifacts.py +1 -1
  99. wandb/sdk/internal/datastore.py +2 -3
  100. wandb/sdk/internal/file_pusher.py +5 -3
  101. wandb/sdk/internal/file_stream.py +22 -19
  102. wandb/sdk/internal/handler.py +5 -4
  103. wandb/sdk/internal/internal.py +1 -1
  104. wandb/sdk/internal/internal_api.py +115 -55
  105. wandb/sdk/internal/job_builder.py +1 -3
  106. wandb/sdk/internal/profiler.py +1 -1
  107. wandb/sdk/internal/progress.py +4 -6
  108. wandb/sdk/internal/sample.py +1 -3
  109. wandb/sdk/internal/sender.py +28 -16
  110. wandb/sdk/internal/settings_static.py +5 -5
  111. wandb/sdk/internal/system/assets/__init__.py +1 -0
  112. wandb/sdk/internal/system/assets/cpu.py +3 -9
  113. wandb/sdk/internal/system/assets/disk.py +2 -4
  114. wandb/sdk/internal/system/assets/gpu.py +6 -18
  115. wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
  116. wandb/sdk/internal/system/assets/interfaces.py +50 -22
  117. wandb/sdk/internal/system/assets/ipu.py +1 -3
  118. wandb/sdk/internal/system/assets/memory.py +7 -13
  119. wandb/sdk/internal/system/assets/network.py +4 -8
  120. wandb/sdk/internal/system/assets/open_metrics.py +283 -0
  121. wandb/sdk/internal/system/assets/tpu.py +1 -4
  122. wandb/sdk/internal/system/assets/trainium.py +26 -14
  123. wandb/sdk/internal/system/system_info.py +2 -3
  124. wandb/sdk/internal/system/system_monitor.py +52 -20
  125. wandb/sdk/internal/tb_watcher.py +12 -13
  126. wandb/sdk/launch/_project_spec.py +54 -65
  127. wandb/sdk/launch/agent/agent.py +374 -90
  128. wandb/sdk/launch/builder/abstract.py +61 -7
  129. wandb/sdk/launch/builder/build.py +81 -110
  130. wandb/sdk/launch/builder/docker_builder.py +181 -0
  131. wandb/sdk/launch/builder/kaniko_builder.py +419 -0
  132. wandb/sdk/launch/builder/noop.py +31 -12
  133. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
  134. wandb/sdk/launch/environment/abstract.py +28 -0
  135. wandb/sdk/launch/environment/aws_environment.py +276 -0
  136. wandb/sdk/launch/environment/gcp_environment.py +271 -0
  137. wandb/sdk/launch/environment/local_environment.py +65 -0
  138. wandb/sdk/launch/github_reference.py +3 -8
  139. wandb/sdk/launch/launch.py +38 -29
  140. wandb/sdk/launch/launch_add.py +6 -8
  141. wandb/sdk/launch/loader.py +230 -0
  142. wandb/sdk/launch/registry/abstract.py +54 -0
  143. wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
  144. wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
  145. wandb/sdk/launch/registry/local_registry.py +62 -0
  146. wandb/sdk/launch/runner/abstract.py +1 -16
  147. wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
  148. wandb/sdk/launch/runner/local_container.py +46 -22
  149. wandb/sdk/launch/runner/local_process.py +1 -4
  150. wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
  151. wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
  152. wandb/sdk/launch/sweeps/__init__.py +3 -2
  153. wandb/sdk/launch/sweeps/scheduler.py +132 -39
  154. wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
  155. wandb/sdk/launch/utils.py +101 -30
  156. wandb/sdk/launch/wandb_reference.py +2 -7
  157. wandb/sdk/lib/_settings_toposort_generate.py +166 -0
  158. wandb/sdk/lib/_settings_toposort_generated.py +201 -0
  159. wandb/sdk/lib/apikey.py +2 -4
  160. wandb/sdk/lib/config_util.py +4 -1
  161. wandb/sdk/lib/console.py +1 -3
  162. wandb/sdk/lib/deprecate.py +3 -3
  163. wandb/sdk/lib/file_stream_utils.py +7 -5
  164. wandb/sdk/lib/filenames.py +1 -1
  165. wandb/sdk/lib/filesystem.py +61 -5
  166. wandb/sdk/lib/git.py +1 -3
  167. wandb/sdk/lib/import_hooks.py +4 -7
  168. wandb/sdk/lib/ipython.py +8 -5
  169. wandb/sdk/lib/lazyloader.py +1 -3
  170. wandb/sdk/lib/mailbox.py +14 -4
  171. wandb/sdk/lib/proto_util.py +10 -5
  172. wandb/sdk/lib/redirect.py +15 -22
  173. wandb/sdk/lib/reporting.py +1 -3
  174. wandb/sdk/lib/retry.py +4 -5
  175. wandb/sdk/lib/runid.py +1 -3
  176. wandb/sdk/lib/server.py +15 -9
  177. wandb/sdk/lib/sock_client.py +1 -1
  178. wandb/sdk/lib/sparkline.py +1 -1
  179. wandb/sdk/lib/wburls.py +1 -1
  180. wandb/sdk/service/port_file.py +1 -2
  181. wandb/sdk/service/service.py +36 -13
  182. wandb/sdk/service/service_base.py +12 -1
  183. wandb/sdk/verify/verify.py +5 -7
  184. wandb/sdk/wandb_artifacts.py +142 -177
  185. wandb/sdk/wandb_config.py +5 -8
  186. wandb/sdk/wandb_helper.py +1 -1
  187. wandb/sdk/wandb_init.py +24 -13
  188. wandb/sdk/wandb_login.py +9 -9
  189. wandb/sdk/wandb_manager.py +39 -4
  190. wandb/sdk/wandb_metric.py +2 -6
  191. wandb/sdk/wandb_require.py +4 -15
  192. wandb/sdk/wandb_require_helpers.py +1 -9
  193. wandb/sdk/wandb_run.py +95 -141
  194. wandb/sdk/wandb_save.py +1 -3
  195. wandb/sdk/wandb_settings.py +149 -54
  196. wandb/sdk/wandb_setup.py +66 -46
  197. wandb/sdk/wandb_summary.py +13 -10
  198. wandb/sdk/wandb_sweep.py +6 -7
  199. wandb/sdk/wandb_watch.py +1 -1
  200. wandb/sklearn/calculate/confusion_matrix.py +1 -1
  201. wandb/sklearn/calculate/learning_curve.py +1 -1
  202. wandb/sklearn/calculate/summary_metrics.py +1 -3
  203. wandb/sklearn/plot/__init__.py +1 -1
  204. wandb/sklearn/plot/classifier.py +27 -18
  205. wandb/sklearn/plot/clusterer.py +4 -5
  206. wandb/sklearn/plot/regressor.py +4 -4
  207. wandb/sklearn/plot/shared.py +2 -2
  208. wandb/sync/__init__.py +1 -3
  209. wandb/sync/sync.py +4 -5
  210. wandb/testing/relay.py +11 -10
  211. wandb/trigger.py +1 -1
  212. wandb/util.py +106 -81
  213. wandb/viz.py +4 -4
  214. wandb/wandb_agent.py +50 -50
  215. wandb/wandb_controller.py +2 -3
  216. wandb/wandb_run.py +1 -2
  217. wandb/wandb_torch.py +1 -1
  218. wandb/xgboost/__init__.py +1 -2
  219. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
  220. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
  221. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
  222. wandb/sdk/launch/builder/docker.py +0 -80
  223. wandb/sdk/launch/builder/kaniko.py +0 -393
  224. wandb/sdk/launch/builder/loader.py +0 -32
  225. wandb/sdk/launch/runner/loader.py +0 -50
  226. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
  227. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
  228. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,419 @@
1
+ import base64
2
+ import json
3
+ import logging
4
+ import tarfile
5
+ import tempfile
6
+ import time
7
+ from typing import Optional
8
+
9
+ import wandb
10
+ from wandb.sdk.launch.builder.abstract import AbstractBuilder
11
+ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
12
+ from wandb.sdk.launch.registry.abstract import AbstractRegistry
13
+ from wandb.sdk.launch.registry.elastic_container_registry import (
14
+ ElasticContainerRegistry,
15
+ )
16
+ from wandb.sdk.launch.registry.google_artifact_registry import GoogleArtifactRegistry
17
+ from wandb.util import get_module
18
+
19
+ from .._project_spec import (
20
+ EntryPoint,
21
+ LaunchProject,
22
+ create_metadata_file,
23
+ get_entry_point_command,
24
+ )
25
+ from ..utils import (
26
+ LOG_PREFIX,
27
+ LaunchError,
28
+ get_kube_context_and_api_client,
29
+ sanitize_wandb_api_key,
30
+ warn_failed_packages_from_build_logs,
31
+ )
32
+ from .build import (
33
+ _create_docker_build_ctx,
34
+ generate_dockerfile,
35
+ image_tag_from_dockerfile_and_source,
36
+ )
37
+
38
+ get_module(
39
+ "kubernetes",
40
+ required="Kaniko builder requires the kubernetes package. Please install it with `pip install wandb[launch]`.",
41
+ )
42
+
43
+ import kubernetes # type: ignore # noqa: E402
44
+ from kubernetes import client # noqa: E402
45
+
46
+ _logger = logging.getLogger(__name__)
47
+
48
+ _DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
49
+
50
+
51
def _wait_for_completion(
    batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
) -> bool:
    """Poll a Kubernetes job in the "wandb" namespace until it finishes.

    The job status is read every five seconds.

    Arguments:
        batch_client (client.BatchV1Api): Client used to read the job status.
        job_name (str): Name of the job to poll.
        deadline_secs (int, optional): Maximum number of seconds to keep
            polling. When None, polling continues until the job reports a
            succeeded or failed pod.

    Returns:
        True as soon as the job reports at least one succeeded pod. False if
        it reports at least one failed pod, or if the deadline elapses first.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    start_time = time.monotonic()
    while True:
        job = batch_client.read_namespaced_job_status(job_name, "wandb")
        if job.status.succeeded is not None and job.status.succeeded >= 1:
            return True
        if job.status.failed is not None and job.status.failed >= 1:
            # Name the job in the message; `status.failed` is only a counter.
            wandb.termerror(
                f"{LOG_PREFIX}Build job {job_name} failed "
                f"(failed count: {job.status.failed}) {job}"
            )
            return False
        wandb.termlog(f"{LOG_PREFIX}Waiting for build job to complete...")
        if (
            deadline_secs is not None
            and time.monotonic() - start_time > deadline_secs
        ):
            # Report the timeout so it can be told apart from a failed pod.
            wandb.termerror(
                f"{LOG_PREFIX}Build job {job_name} did not complete within "
                f"{deadline_secs} seconds"
            )
            return False

        time.sleep(5)
67
+
68
+
69
class KanikoBuilder(AbstractBuilder):
    """Builds a docker image for a project using Kaniko.

    The build context is archived and uploaded through the environment to
    ``build_context_store``; a Kubernetes job in the "wandb" namespace then
    runs the Kaniko executor against that context and pushes the resulting
    image to the registry.
    """

    type = "kaniko"

    build_job_name: str
    build_context_store: str
    secret_name: Optional[str]
    secret_key: Optional[str]

    def __init__(
        self,
        environment: AbstractEnvironment,
        registry: AbstractRegistry,
        build_job_name: str = "wandb-launch-container-build",
        build_context_store: str = "",
        secret_name: str = "",
        secret_key: str = "",
        verify: bool = True,
    ):
        """Initialize a KanikoBuilder.

        Arguments:
            environment (AbstractEnvironment): The environment to use.
            registry (AbstractRegistry): The registry to use.
            build_job_name (str, optional): The name of the build job.
            build_context_store (str, optional): The name of the build context store.
            secret_name (str, optional): The name of the secret to use for the registry.
            secret_key (str, optional): The key of the secret to use for the registry.
            verify (bool, optional): Whether to verify the functionality of the builder.
                Defaults to True.

        Raises:
            LaunchError: If ``build_context_store`` is None.
        """
        # Only an explicit None is rejected here; the empty-string default is
        # accepted and left for `verify` to check against the environment.
        if build_context_store is None:
            raise LaunchError(
                "You are required to specify an external build "
                "context store for Kaniko builds. Please specify a storage url "
                "in the 'build-context-store' field of your builder config."
            )
        self.environment = environment
        self.registry = registry
        self.build_job_name = build_job_name
        # Drop trailing slashes so object paths can be joined with a single "/".
        self.build_context_store = build_context_store.rstrip("/")
        self.secret_name = secret_name
        self.secret_key = secret_key
        if verify:
            self.verify()

    @classmethod
    def from_config(
        cls,
        config: dict,
        environment: AbstractEnvironment,
        registry: AbstractRegistry,
        verify: bool = True,
        login: bool = True,
    ) -> "AbstractBuilder":
        """Create a KanikoBuilder from a config dict.

        Arguments:
            config: A dict containing the builder config. Must contain a "type" key
                with value "kaniko" and a "build-context-store" key. The optional
                keys "build-job-name", "secret-name" and "secret-key" are also read.
            environment: The environment to use for the build.
            registry: The registry to use for the build.
            verify: Whether to verify the builder config.
            login: Accepted but not used by this builder.

        Returns:
            A KanikoBuilder instance.

        Raises:
            LaunchError: If the type is not "kaniko" or no build context store
                is configured.
        """
        if config.get("type") != "kaniko":
            raise LaunchError(
                "Builder config must include 'type':'kaniko' to create a KanikoBuilder."
            )
        build_context_store = config.get("build-context-store")
        if build_context_store is None:
            # The message names the key that is actually read above.
            raise LaunchError(
                "You are required to specify an external build "
                "context store for Kaniko builds. Please specify a "
                "storage url in the 'build-context-store' field of your builder config."
            )
        build_job_name = config.get("build-job-name", "wandb-launch-container-build")
        secret_name = config.get("secret-name", "")
        secret_key = config.get("secret-key", "")
        return cls(
            environment,
            registry,
            build_context_store=build_context_store,
            build_job_name=build_job_name,
            secret_name=secret_name,
            secret_key=secret_key,
            verify=verify,
        )

    def verify(self) -> None:
        """Verify that the builder config is valid.

        Raises:
            LaunchError: If the builder config is invalid.
        """
        if self.environment is None:
            raise LaunchError("No environment specified for Kaniko build.")
        self.environment.verify_storage_uri(self.build_context_store)

    def login(self) -> None:
        """Login to the registry.

        This is a no-op for the Kaniko builder.
        """
        pass

    def _create_docker_ecr_config_map(
        self, job_name: str, corev1_client: client.CoreV1Api, repository: str
    ) -> None:
        """Create a config map holding a docker ``config.json`` for the registry.

        The config map is named ``docker-config-<job_name>`` and is created in
        the "wandb" namespace. Its ``config.json`` contains basic-auth
        credentials obtained from the registry, keyed by the registry repo uri.

        Arguments:
            job_name: Name of the build job; used to derive the config map name.
            corev1_client: Core API client used to create the config map.
            repository: Not used by this method.

        Raises:
            LaunchError: If no registry is configured.
        """
        if self.registry is None:
            raise LaunchError("No registry specified for Kaniko build.")
        username, password = self.registry.get_username_password()
        encoded = base64.b64encode(f"{username}:{password}".encode()).decode("utf-8")
        ecr_config_map = client.V1ConfigMap(
            api_version="v1",
            kind="ConfigMap",
            metadata=client.V1ObjectMeta(
                name=f"docker-config-{job_name}",
                namespace="wandb",
            ),
            data={
                "config.json": json.dumps(
                    {"auths": {f"{self.registry.get_repo_uri()}": {"auth": encoded}}}
                )
            },
            immutable=True,
        )
        corev1_client.create_namespaced_config_map("wandb", ecr_config_map)

    def _delete_docker_ecr_config_map(
        self, job_name: str, client: client.CoreV1Api
    ) -> None:
        """Delete the ``docker-config-<job_name>`` config map, if one was used.

        Nothing is deleted when no secret name is configured.

        Arguments:
            job_name: Name of the build job the config map was created for.
            client: Core API client used for the deletion. Note that this
                parameter shadows the module-level ``client`` inside the body.
        """
        if self.secret_name:
            client.delete_namespaced_config_map(f"docker-config-{job_name}", "wandb")

    def _upload_build_context(self, run_id: str, context_path: str) -> str:
        """Archive the build context and upload it to the build context store.

        Arguments:
            run_id: Run id used to name the uploaded archive.
            context_path: Local directory containing the build context.

        Returns:
            The destination uri of the uploaded archive,
            ``<build_context_store>/<run_id>.tgz``.

        Raises:
            LaunchError: If no environment is configured.
        """
        # Imported locally because this module has no file-level `os` import.
        import os

        # Fail before doing any work if there is nowhere to upload to.
        if self.environment is None:
            raise LaunchError("No environment specified for Kaniko build.")
        destination = f"{self.build_context_store}/{run_id}.tgz"
        # delete=False so the archive can be closed and then read back by path
        # for the upload; it is removed explicitly below.
        context_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
                context_tgz.add(context_path, arcname=".")
            context_file.close()
            self.environment.upload_file(context_file.name, destination)
        finally:
            # Always remove the temporary archive so builds do not leak files.
            # NOTE(review): assumes upload_file has finished with the file when
            # it returns - confirm for every environment implementation.
            context_file.close()
            try:
                os.remove(context_file.name)
            except OSError:
                # Best effort: a leftover temp file must not fail the build.
                pass
        return destination

    def build_image(
        self,
        launch_project: LaunchProject,
        entrypoint: EntryPoint,
    ) -> str:
        """Build the image for a launch project with a Kaniko job.

        If the project does not require a build and the image already exists
        in the registry, the existing image uri is returned without building.

        Arguments:
            launch_project: The project to build an image for.
            entrypoint: The entry point whose command is recorded in the
                metadata file and passed to dockerfile generation.

        Returns:
            The uri of the image, ``<repo uri>:<tag>``.

        Raises:
            LaunchError: If no registry is configured, or if cleaning up the
                Kubernetes resources fails.
            Exception: If the build job fails or does not finish in time, or
                if creating the Kubernetes resources fails.
        """
        # TODO: this should probably throw an error if the registry is a local registry
        if not self.registry:
            raise LaunchError("No registry specified for Kaniko build.")
        # kaniko builder doesn't seem to work with a custom user id, need more investigation
        dockerfile_str = generate_dockerfile(
            launch_project, entrypoint, launch_project.resource, "kaniko"
        )
        image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
        repo_uri = self.registry.get_repo_uri()
        image_uri = repo_uri + ":" + image_tag

        if not launch_project.build_required() and self.registry.check_image_exists(
            image_uri
        ):
            return image_uri

        _logger.info(f"Building image {image_uri}...")

        entry_cmd = " ".join(
            get_entry_point_command(entrypoint, launch_project.override_args)
        )

        create_metadata_file(
            launch_project,
            image_uri,
            sanitize_wandb_api_key(entry_cmd),
            sanitize_wandb_api_key(dockerfile_str),
        )
        context_path = _create_docker_build_ctx(launch_project, dockerfile_str)
        run_id = launch_project.run_id

        _, api_client = get_kube_context_and_api_client(
            kubernetes, launch_project.resource_args
        )
        build_job_name = f"{self.build_job_name}-{run_id}"

        build_context = self._upload_build_context(run_id, context_path)
        build_job = self._create_kaniko_job(
            build_job_name,
            repo_uri,
            image_uri,
            build_context,
        )
        wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")

        # TODO: use same client as kubernetes.py
        batch_v1 = client.BatchV1Api(api_client)
        core_v1 = client.CoreV1Api(api_client)

        try:
            if self.secret_name:
                self._create_docker_ecr_config_map(build_job_name, core_v1, repo_uri)
            batch_v1.create_namespaced_job("wandb", build_job)

            # wait for three times the job deadline since it might take time
            # to schedule
            if not _wait_for_completion(
                batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
            ):
                raise Exception(f"Failed to build image in kaniko for job {run_id}")
            try:
                logs = batch_v1.read_namespaced_job_log(build_job_name, "wandb")
                warn_failed_packages_from_build_logs(logs, image_uri)
            except Exception as e:
                # Log retrieval is best effort; the image has been built.
                wandb.termwarn(
                    f"{LOG_PREFIX}Failed to get logs for kaniko job {build_job_name}: {e}"
                )
        except Exception as e:
            wandb.termerror(
                f"{LOG_PREFIX}Exception when creating Kubernetes resources: {e}\n"
            )
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            wandb.termlog(f"{LOG_PREFIX}Cleaning up resources")
            try:
                # should we clean up the s3 build contexts? can set bucket level policy to auto deletion
                if self.secret_name:
                    self._delete_docker_ecr_config_map(build_job_name, core_v1)
                batch_v1.delete_namespaced_job(build_job_name, "wandb")
            except Exception as e:
                # Chain the cause so the underlying Kubernetes error is kept.
                raise LaunchError(
                    f"Exception during Kubernetes resource clean up {e}"
                ) from e

        return image_uri

    def _create_kaniko_job(
        self,
        job_name: str,
        repository: str,
        image_tag: str,
        build_context_path: str,
    ) -> "client.V1Job":
        """Assemble the Kubernetes job object that runs the Kaniko executor.

        The job is only constructed here, not submitted.

        Arguments:
            job_name: Name of the job; also used to derive the name of the
                docker config map that gets mounted when a secret is configured.
            repository: Repository uri passed to Kaniko as the cache repo.
            image_tag: Passed to Kaniko as ``--destination``.
            build_context_path: Passed to Kaniko as ``--context``.

        Returns:
            The job object, placed in the "wandb" namespace.

        Raises:
            LaunchError: If only one of secret_name / secret_key is set, or if
                a secret is configured for an unsupported registry type.
        """
        env = []
        volume_mounts = []
        volumes = []
        if bool(self.secret_name) != bool(self.secret_key):
            raise LaunchError(
                "Both secret_name and secret_key or neither must be specified "
                "for kaniko build. You provided only one of them."
            )
        if isinstance(self.registry, ElasticContainerRegistry):
            env += [
                client.V1EnvVar(
                    name="AWS_REGION",
                    value=self.registry.environment.region,
                )
            ]
        # NOTE(review): everything down to `args` is treated as nested under
        # this guard because the secret volume is named after `secret_name`;
        # confirm against the upstream file.
        if self.secret_name and self.secret_key:
            volumes += [
                client.V1Volume(
                    name="docker-config",
                    config_map=client.V1ConfigMapVolumeSource(
                        name=f"docker-config-{job_name}",
                    ),
                ),
            ]
            volume_mounts += [
                client.V1VolumeMount(
                    name="docker-config", mount_path="/kaniko/.docker/"
                ),
            ]
            # TODO: I don't like conditioning on the registry type here. As a
            # future change I want the registry and environment classes to provide
            # a list of environment variables and volume mounts that need to be
            # added to the job. The environment class provides credentials for
            # build context access, and the registry class provides credentials
            # for pushing the image. This way we can have separate secrets for
            # each and support build contexts and registries that require
            # different credentials.
            if isinstance(self.registry, ElasticContainerRegistry):
                mount_path = "/root/.aws"
                key = "credentials"
            elif isinstance(self.registry, GoogleArtifactRegistry):
                mount_path = "/kaniko/.config/gcloud"
                key = "config.json"
                env += [
                    client.V1EnvVar(
                        name="GOOGLE_APPLICATION_CREDENTIALS",
                        value="/kaniko/.config/gcloud/config.json",
                    )
                ]
            else:
                raise LaunchError(
                    f"Registry type {type(self.registry)} not supported by kaniko"
                )
            # Mount the configured secret key at the path chosen above.
            volume_mounts += [
                client.V1VolumeMount(
                    name=self.secret_name,
                    mount_path=mount_path,
                    read_only=True,
                )
            ]
            volumes += [
                client.V1Volume(
                    name=self.secret_name,
                    secret=client.V1SecretVolumeSource(
                        secret_name=self.secret_name,
                        items=[client.V1KeyToPath(key=self.secret_key, path=key)],
                    ),
                )
            ]

        args = [
            f"--context={build_context_path}",
            "--dockerfile=Dockerfile.wandb-autogenerated",
            f"--destination={image_tag}",
            "--cache=true",
            f"--cache-repo={repository}",
            "--snapshotMode=redo",
            "--compressed-caching=false",
        ]
        container = client.V1Container(
            name="wandb-container-build",
            image="gcr.io/kaniko-project/executor:v1.8.0",
            args=args,
            volume_mounts=volume_mounts,
            env=env if env else None,
        )
        # Create and configure a spec section
        template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(labels={"wandb": "launch"}),
            spec=client.V1PodSpec(
                restart_policy="Never",
                active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
                containers=[container],
                volumes=volumes,
            ),
        )
        # Create the specification of job
        spec = client.V1JobSpec(template=template, backoff_limit=1)
        job = client.V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=client.V1ObjectMeta(
                name=job_name, namespace="wandb", labels={"wandb": "launch"}
            ),
            spec=spec,
        )
        return job
@@ -1,32 +1,51 @@
1
- from typing import Any, Dict, Optional
1
+ """NoOp builder implementation."""
2
+ from typing import Any, Dict
2
3
 
3
- from wandb.errors import LaunchError
4
4
  from wandb.sdk.launch.builder.abstract import AbstractBuilder
5
+ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
6
+ from wandb.sdk.launch.registry.abstract import AbstractRegistry
7
+ from wandb.sdk.launch.utils import LaunchError
5
8
 
6
9
  from .._project_spec import EntryPoint, LaunchProject
7
10
 
8
11
 
9
12
  class NoOpBuilder(AbstractBuilder):
13
+ """NoOp builder."""
10
14
 
11
15
  type = "noop"
12
16
 
13
- def __init__(self, builder_config: Dict[str, Any]) -> None:
14
- self.builder_config = builder_config
17
+ def __init__(
18
+ self,
19
+ builder_config: Dict[str, Any],
20
+ environment: AbstractEnvironment,
21
+ registry: AbstractRegistry,
22
+ ) -> None:
23
+ """Initialize a NoOpBuilder."""
24
+ pass
25
+
26
+ @classmethod
27
+ def from_config(
28
+ cls,
29
+ config: dict,
30
+ environment: AbstractEnvironment,
31
+ registry: AbstractRegistry,
32
+ verify: bool = True,
33
+ ) -> "AbstractBuilder":
34
+ """Create a noop builder from a config."""
35
+ return cls(config, environment, registry)
36
+
37
+ def verify(self) -> None:
38
+ """Verify the builder."""
39
+ raise LaunchError("Attempted to verify noop builder.")
15
40
 
16
41
  def build_image(
17
42
  self,
18
43
  launch_project: LaunchProject,
19
- registry: Optional[str],
20
44
  entrypoint: EntryPoint,
21
45
  ) -> str:
22
- """Build the image for the given project.
23
-
24
- Args:
25
- launch_project: The project to build.
26
- build_ctx_path: The path to the build context.
46
+ """Build the image.
27
47
 
28
- Returns:
29
- The image name.
48
+ For this we raise a launch error since it can't build.
30
49
  """
31
50
  raise LaunchError(
32
51
  "Attempted build with noop builder. Specify a builder in your launch config at ~/.config/wandb/launch-config.yaml"
@@ -1,10 +1,13 @@
1
1
  import json
2
2
  import multiprocessing
3
3
  import os
4
+ import re
4
5
  import subprocess
5
6
  import sys
6
7
  from typing import List, Optional, Set
7
8
 
9
+ FAILED_PACKAGES_PREFIX = "ERROR: Failed to install: "
10
+ FAILED_PACKAGES_POSTFIX = ". During automated build process."
8
11
  CORES = multiprocessing.cpu_count()
9
12
  ONLY_INCLUDE = {x for x in os.getenv("WANDB_ONLY_INCLUDE", "").split(",") if x != ""}
10
13
  OPTS = []
@@ -21,9 +24,12 @@ else:
21
24
 
22
25
 
23
26
  def install_deps(
24
- deps: List[str], failed: Optional[Set[str]] = None
27
+ deps: List[str],
28
+ failed: Optional[Set[str]] = None,
29
+ extra_index: Optional[str] = None,
30
+ opts: Optional[List[str]] = None,
25
31
  ) -> Optional[Set[str]]:
26
- """Install pip dependencies
32
+ """Install pip dependencies.
27
33
 
28
34
  Arguments:
29
35
  deps {List[str]} -- List of dependencies to install
@@ -35,33 +41,45 @@ def install_deps(
35
41
  try:
36
42
  # Include only uri if @ is present
37
43
  clean_deps = [d.split("@")[-1].strip() if "@" in d else d for d in deps]
38
-
44
+ index_args = ["--extra-index-url", extra_index] if extra_index else []
39
45
  print("installing {}...".format(", ".join(clean_deps)))
46
+ opts = opts or []
47
+ args = ["pip", "install"] + opts + clean_deps + index_args
40
48
  sys.stdout.flush()
41
- subprocess.check_output(
42
- ["pip", "install"] + OPTS + clean_deps, stderr=subprocess.STDOUT
43
- )
44
- if failed is not None and len(failed) > 0:
45
- sys.stderr.write(
46
- "ERROR: Unable to install: {}".format(", ".join(clean_deps))
47
- )
48
- sys.stderr.flush()
49
+ subprocess.check_output(args, stderr=subprocess.STDOUT)
49
50
  return failed
50
51
  except subprocess.CalledProcessError as e:
51
52
  if failed is None:
52
53
  failed = set()
53
54
  num_failed = len(failed)
54
- for line in e.output.decode("utf8"):
55
+ for line in e.output.decode("utf8").splitlines():
55
56
  if line.startswith("ERROR:"):
56
- failed.add(line.split(" ")[-1])
57
- if len(failed) > num_failed:
58
- return install_deps(list(set(clean_deps) - failed), failed)
57
+ clean_dep = find_package_in_error_string(clean_deps, line)
58
+ if clean_dep is not None:
59
+ if clean_dep in deps:
60
+ failed.add(clean_dep)
61
+ else:
62
+ for d in deps:
63
+ if clean_dep in d:
64
+ failed.add(d.replace(" ", ""))
65
+ break
66
+ if len(set(clean_deps) - failed) == 0:
67
+ return failed
68
+ elif len(failed) > num_failed:
69
+ return install_deps(
70
+ list(set(clean_deps) - failed),
71
+ failed,
72
+ extra_index=extra_index,
73
+ opts=opts,
74
+ )
59
75
  else:
60
76
  return failed
61
77
 
62
78
 
63
79
  def main() -> None:
64
- """Install deps in requirements.frozen.txt"""
80
+ """Install deps in requirements.frozen.txt."""
81
+ extra_index = None
82
+ torch_reqs = []
65
83
  if os.path.exists("requirements.frozen.txt"):
66
84
  with open("requirements.frozen.txt") as f:
67
85
  print("Installing frozen dependencies...")
@@ -72,28 +90,60 @@ def main() -> None:
72
90
  # can't pip install wandb==0.*.*.dev1 through pip. Lets just install wandb for now
73
91
  if req.startswith("wandb==") and "dev1" in req:
74
92
  req = "wandb"
75
- reqs.append(req.strip().replace(" ", ""))
93
+ match = re.match(
94
+ r"torch(vision|audio)?==\d+\.\d+\.\d+(\+(?:cu[\d]{2,3})|(?:cpu))?",
95
+ req,
96
+ )
97
+ if match:
98
+ variant = match.group(2)
99
+ if variant:
100
+ extra_index = (
101
+ f"https://download.pytorch.org/whl/{variant[1:]}"
102
+ )
103
+ torch_reqs.append(req.strip().replace(" ", ""))
104
+ else:
105
+ reqs.append(req.strip().replace(" ", ""))
76
106
  else:
77
107
  print(f"Ignoring requirement: {req} from frozen requirements")
78
108
  if len(reqs) >= CORES:
79
- deps_failed = install_deps(reqs)
109
+ deps_failed = install_deps(reqs, opts=OPTS)
80
110
  reqs = []
81
111
  if deps_failed is not None:
82
112
  failed = failed.union(deps_failed)
83
113
  if len(reqs) > 0:
84
- deps_failed = install_deps(reqs)
114
+ deps_failed = install_deps(reqs, opts=OPTS)
85
115
  if deps_failed is not None:
86
116
  failed = failed.union(deps_failed)
87
117
  with open("_wandb_bootstrap_errors.json", "w") as f:
88
118
  f.write(json.dumps({"pip": list(failed)}))
89
119
  if len(failed) > 0:
90
120
  sys.stderr.write(
91
- "ERROR: Failed to install: {}".format(",".join(failed))
121
+ FAILED_PACKAGES_PREFIX + ",".join(failed) + FAILED_PACKAGES_POSTFIX
92
122
  )
93
123
  sys.stderr.flush()
124
+ install_deps(torch_reqs, extra_index=extra_index)
94
125
  else:
95
126
  print("No frozen requirements found")
96
127
 
97
128
 
129
def find_package_in_error_string(deps: List[str], line: str) -> Optional[str]:
    """Best-effort lookup of the requirement named in an error line.

    This is a hacky way to get the name of the requirement that failed. The
    line is split on single spaces. The last word is tried first, since it is
    often the package name; failing that, every word is checked from left to
    right and the first one found in ``deps`` wins.

    TODO: this could report the wrong package if the error string contains a
    reference to another package in the deps before the package that failed
    to install.

    Arguments:
        deps {List[str]} -- Candidate dependency strings
        line {str} -- A single line of error output

    Returns:
        The matching entry from ``deps``, or None when no word matches.
    """
    words = line.split(" ")
    final_word = words[-1]
    if final_word in deps:
        return final_word
    # No luck with the last word: take the earliest word that is a known dep.
    return next((candidate for candidate in words if candidate in deps), None)
146
+
147
+
98
148
  if __name__ == "__main__":
99
149
  main()
@@ -0,0 +1,28 @@
1
+ """Abstract base class for environments."""
2
+ from abc import ABC, abstractmethod
3
+
4
+
5
class AbstractEnvironment(ABC):
    """Interface that every environment implementation has to provide.

    Subclasses must implement all four abstract methods below before they can
    be instantiated. The base implementations only raise NotImplementedError.
    """

    # Declared for type checkers only; no value is assigned here.
    region: str

    @abstractmethod
    def verify(self) -> None:
        """Check that the environment is configured correctly."""
        raise NotImplementedError

    @abstractmethod
    def upload_file(self, source: str, destination: str) -> None:
        """Upload a single local file to storage in the environment.

        Arguments:
            source: Path of the file on the local filesystem.
            destination: Location in the environment's storage to upload to.
        """
        raise NotImplementedError

    @abstractmethod
    def upload_dir(self, source: str, destination: str) -> None:
        """Upload the contents of a local directory to the environment.

        Arguments:
            source: Path of the directory on the local filesystem.
            destination: Location in the environment to upload to.
        """
        raise NotImplementedError

    @abstractmethod
    def verify_storage_uri(self, uri: str) -> None:
        """Check that the given storage URI is configured correctly.

        Arguments:
            uri: The storage URI to check.
        """
        raise NotImplementedError