wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. wandb/__init__.py +2 -3
  2. wandb/apis/__init__.py +1 -3
  3. wandb/apis/importers/__init__.py +4 -0
  4. wandb/apis/importers/base.py +312 -0
  5. wandb/apis/importers/mlflow.py +113 -0
  6. wandb/apis/internal.py +29 -2
  7. wandb/apis/normalize.py +6 -5
  8. wandb/apis/public.py +163 -180
  9. wandb/apis/reports/_templates.py +6 -12
  10. wandb/apis/reports/report.py +1 -1
  11. wandb/apis/reports/runset.py +1 -3
  12. wandb/apis/reports/util.py +12 -10
  13. wandb/beta/workflows.py +57 -34
  14. wandb/catboost/__init__.py +1 -2
  15. wandb/cli/cli.py +215 -133
  16. wandb/data_types.py +63 -56
  17. wandb/docker/__init__.py +78 -16
  18. wandb/docker/auth.py +21 -22
  19. wandb/env.py +0 -1
  20. wandb/errors/__init__.py +8 -116
  21. wandb/errors/term.py +1 -1
  22. wandb/fastai/__init__.py +1 -2
  23. wandb/filesync/dir_watcher.py +8 -5
  24. wandb/filesync/step_prepare.py +76 -75
  25. wandb/filesync/step_upload.py +1 -2
  26. wandb/integration/catboost/__init__.py +1 -3
  27. wandb/integration/catboost/catboost.py +8 -14
  28. wandb/integration/fastai/__init__.py +7 -13
  29. wandb/integration/gym/__init__.py +35 -4
  30. wandb/integration/keras/__init__.py +3 -3
  31. wandb/integration/keras/callbacks/metrics_logger.py +9 -8
  32. wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
  33. wandb/integration/keras/callbacks/tables_builder.py +31 -19
  34. wandb/integration/kfp/kfp_patch.py +20 -17
  35. wandb/integration/kfp/wandb_logging.py +1 -2
  36. wandb/integration/lightgbm/__init__.py +21 -19
  37. wandb/integration/prodigy/prodigy.py +6 -7
  38. wandb/integration/sacred/__init__.py +9 -12
  39. wandb/integration/sagemaker/__init__.py +1 -3
  40. wandb/integration/sagemaker/auth.py +0 -1
  41. wandb/integration/sagemaker/config.py +1 -1
  42. wandb/integration/sagemaker/resources.py +1 -1
  43. wandb/integration/sb3/sb3.py +8 -4
  44. wandb/integration/tensorboard/__init__.py +1 -3
  45. wandb/integration/tensorboard/log.py +8 -8
  46. wandb/integration/tensorboard/monkeypatch.py +11 -9
  47. wandb/integration/tensorflow/__init__.py +1 -3
  48. wandb/integration/xgboost/__init__.py +4 -6
  49. wandb/integration/yolov8/__init__.py +7 -0
  50. wandb/integration/yolov8/yolov8.py +250 -0
  51. wandb/jupyter.py +31 -35
  52. wandb/lightgbm/__init__.py +1 -2
  53. wandb/old/settings.py +2 -2
  54. wandb/plot/bar.py +1 -2
  55. wandb/plot/confusion_matrix.py +1 -3
  56. wandb/plot/histogram.py +1 -2
  57. wandb/plot/line.py +1 -2
  58. wandb/plot/line_series.py +4 -4
  59. wandb/plot/pr_curve.py +17 -20
  60. wandb/plot/roc_curve.py +1 -3
  61. wandb/plot/scatter.py +1 -2
  62. wandb/proto/v3/wandb_server_pb2.py +85 -39
  63. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  64. wandb/proto/v4/wandb_server_pb2.py +51 -39
  65. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  66. wandb/sdk/__init__.py +1 -3
  67. wandb/sdk/backend/backend.py +1 -1
  68. wandb/sdk/data_types/_dtypes.py +38 -30
  69. wandb/sdk/data_types/base_types/json_metadata.py +1 -3
  70. wandb/sdk/data_types/base_types/media.py +17 -17
  71. wandb/sdk/data_types/base_types/wb_value.py +33 -26
  72. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
  73. wandb/sdk/data_types/helper_types/classes.py +1 -1
  74. wandb/sdk/data_types/helper_types/image_mask.py +12 -12
  75. wandb/sdk/data_types/histogram.py +5 -4
  76. wandb/sdk/data_types/html.py +1 -2
  77. wandb/sdk/data_types/image.py +11 -11
  78. wandb/sdk/data_types/molecule.py +3 -6
  79. wandb/sdk/data_types/object_3d.py +1 -2
  80. wandb/sdk/data_types/plotly.py +1 -2
  81. wandb/sdk/data_types/saved_model.py +10 -8
  82. wandb/sdk/data_types/video.py +1 -1
  83. wandb/sdk/integration_utils/data_logging.py +5 -5
  84. wandb/sdk/interface/artifacts.py +288 -266
  85. wandb/sdk/interface/interface.py +2 -3
  86. wandb/sdk/interface/interface_grpc.py +1 -1
  87. wandb/sdk/interface/interface_queue.py +1 -1
  88. wandb/sdk/interface/interface_relay.py +1 -1
  89. wandb/sdk/interface/interface_shared.py +1 -2
  90. wandb/sdk/interface/interface_sock.py +1 -1
  91. wandb/sdk/interface/message_future.py +1 -1
  92. wandb/sdk/interface/message_future_poll.py +1 -1
  93. wandb/sdk/interface/router.py +1 -1
  94. wandb/sdk/interface/router_queue.py +1 -1
  95. wandb/sdk/interface/router_relay.py +1 -1
  96. wandb/sdk/interface/router_sock.py +1 -1
  97. wandb/sdk/interface/summary_record.py +1 -1
  98. wandb/sdk/internal/artifacts.py +1 -1
  99. wandb/sdk/internal/datastore.py +2 -3
  100. wandb/sdk/internal/file_pusher.py +5 -3
  101. wandb/sdk/internal/file_stream.py +22 -19
  102. wandb/sdk/internal/handler.py +5 -4
  103. wandb/sdk/internal/internal.py +1 -1
  104. wandb/sdk/internal/internal_api.py +115 -55
  105. wandb/sdk/internal/job_builder.py +1 -3
  106. wandb/sdk/internal/profiler.py +1 -1
  107. wandb/sdk/internal/progress.py +4 -6
  108. wandb/sdk/internal/sample.py +1 -3
  109. wandb/sdk/internal/sender.py +28 -16
  110. wandb/sdk/internal/settings_static.py +5 -5
  111. wandb/sdk/internal/system/assets/__init__.py +1 -0
  112. wandb/sdk/internal/system/assets/cpu.py +3 -9
  113. wandb/sdk/internal/system/assets/disk.py +2 -4
  114. wandb/sdk/internal/system/assets/gpu.py +6 -18
  115. wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
  116. wandb/sdk/internal/system/assets/interfaces.py +50 -22
  117. wandb/sdk/internal/system/assets/ipu.py +1 -3
  118. wandb/sdk/internal/system/assets/memory.py +7 -13
  119. wandb/sdk/internal/system/assets/network.py +4 -8
  120. wandb/sdk/internal/system/assets/open_metrics.py +283 -0
  121. wandb/sdk/internal/system/assets/tpu.py +1 -4
  122. wandb/sdk/internal/system/assets/trainium.py +26 -14
  123. wandb/sdk/internal/system/system_info.py +2 -3
  124. wandb/sdk/internal/system/system_monitor.py +52 -20
  125. wandb/sdk/internal/tb_watcher.py +12 -13
  126. wandb/sdk/launch/_project_spec.py +54 -65
  127. wandb/sdk/launch/agent/agent.py +374 -90
  128. wandb/sdk/launch/builder/abstract.py +61 -7
  129. wandb/sdk/launch/builder/build.py +81 -110
  130. wandb/sdk/launch/builder/docker_builder.py +181 -0
  131. wandb/sdk/launch/builder/kaniko_builder.py +419 -0
  132. wandb/sdk/launch/builder/noop.py +31 -12
  133. wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
  134. wandb/sdk/launch/environment/abstract.py +28 -0
  135. wandb/sdk/launch/environment/aws_environment.py +276 -0
  136. wandb/sdk/launch/environment/gcp_environment.py +271 -0
  137. wandb/sdk/launch/environment/local_environment.py +65 -0
  138. wandb/sdk/launch/github_reference.py +3 -8
  139. wandb/sdk/launch/launch.py +38 -29
  140. wandb/sdk/launch/launch_add.py +6 -8
  141. wandb/sdk/launch/loader.py +230 -0
  142. wandb/sdk/launch/registry/abstract.py +54 -0
  143. wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
  144. wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
  145. wandb/sdk/launch/registry/local_registry.py +62 -0
  146. wandb/sdk/launch/runner/abstract.py +1 -16
  147. wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
  148. wandb/sdk/launch/runner/local_container.py +46 -22
  149. wandb/sdk/launch/runner/local_process.py +1 -4
  150. wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
  151. wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
  152. wandb/sdk/launch/sweeps/__init__.py +3 -2
  153. wandb/sdk/launch/sweeps/scheduler.py +132 -39
  154. wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
  155. wandb/sdk/launch/utils.py +101 -30
  156. wandb/sdk/launch/wandb_reference.py +2 -7
  157. wandb/sdk/lib/_settings_toposort_generate.py +166 -0
  158. wandb/sdk/lib/_settings_toposort_generated.py +201 -0
  159. wandb/sdk/lib/apikey.py +2 -4
  160. wandb/sdk/lib/config_util.py +4 -1
  161. wandb/sdk/lib/console.py +1 -3
  162. wandb/sdk/lib/deprecate.py +3 -3
  163. wandb/sdk/lib/file_stream_utils.py +7 -5
  164. wandb/sdk/lib/filenames.py +1 -1
  165. wandb/sdk/lib/filesystem.py +61 -5
  166. wandb/sdk/lib/git.py +1 -3
  167. wandb/sdk/lib/import_hooks.py +4 -7
  168. wandb/sdk/lib/ipython.py +8 -5
  169. wandb/sdk/lib/lazyloader.py +1 -3
  170. wandb/sdk/lib/mailbox.py +14 -4
  171. wandb/sdk/lib/proto_util.py +10 -5
  172. wandb/sdk/lib/redirect.py +15 -22
  173. wandb/sdk/lib/reporting.py +1 -3
  174. wandb/sdk/lib/retry.py +4 -5
  175. wandb/sdk/lib/runid.py +1 -3
  176. wandb/sdk/lib/server.py +15 -9
  177. wandb/sdk/lib/sock_client.py +1 -1
  178. wandb/sdk/lib/sparkline.py +1 -1
  179. wandb/sdk/lib/wburls.py +1 -1
  180. wandb/sdk/service/port_file.py +1 -2
  181. wandb/sdk/service/service.py +36 -13
  182. wandb/sdk/service/service_base.py +12 -1
  183. wandb/sdk/verify/verify.py +5 -7
  184. wandb/sdk/wandb_artifacts.py +142 -177
  185. wandb/sdk/wandb_config.py +5 -8
  186. wandb/sdk/wandb_helper.py +1 -1
  187. wandb/sdk/wandb_init.py +24 -13
  188. wandb/sdk/wandb_login.py +9 -9
  189. wandb/sdk/wandb_manager.py +39 -4
  190. wandb/sdk/wandb_metric.py +2 -6
  191. wandb/sdk/wandb_require.py +4 -15
  192. wandb/sdk/wandb_require_helpers.py +1 -9
  193. wandb/sdk/wandb_run.py +95 -141
  194. wandb/sdk/wandb_save.py +1 -3
  195. wandb/sdk/wandb_settings.py +149 -54
  196. wandb/sdk/wandb_setup.py +66 -46
  197. wandb/sdk/wandb_summary.py +13 -10
  198. wandb/sdk/wandb_sweep.py +6 -7
  199. wandb/sdk/wandb_watch.py +1 -1
  200. wandb/sklearn/calculate/confusion_matrix.py +1 -1
  201. wandb/sklearn/calculate/learning_curve.py +1 -1
  202. wandb/sklearn/calculate/summary_metrics.py +1 -3
  203. wandb/sklearn/plot/__init__.py +1 -1
  204. wandb/sklearn/plot/classifier.py +27 -18
  205. wandb/sklearn/plot/clusterer.py +4 -5
  206. wandb/sklearn/plot/regressor.py +4 -4
  207. wandb/sklearn/plot/shared.py +2 -2
  208. wandb/sync/__init__.py +1 -3
  209. wandb/sync/sync.py +4 -5
  210. wandb/testing/relay.py +11 -10
  211. wandb/trigger.py +1 -1
  212. wandb/util.py +106 -81
  213. wandb/viz.py +4 -4
  214. wandb/wandb_agent.py +50 -50
  215. wandb/wandb_controller.py +2 -3
  216. wandb/wandb_run.py +1 -2
  217. wandb/wandb_torch.py +1 -1
  218. wandb/xgboost/__init__.py +1 -2
  219. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
  220. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
  221. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
  222. wandb/sdk/launch/builder/docker.py +0 -80
  223. wandb/sdk/launch/builder/kaniko.py +0 -393
  224. wandb/sdk/launch/builder/loader.py +0 -32
  225. wandb/sdk/launch/runner/loader.py +0 -50
  226. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
  227. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
  228. {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,203 @@
1
+ """Implementation of Google Artifact Registry for wandb launch."""
2
+ import logging
3
+ import re
4
+ from typing import Tuple
5
+
6
+ from wandb.sdk.launch.environment.gcp_environment import GcpEnvironment
7
+ from wandb.sdk.launch.utils import LaunchError
8
+ from wandb.util import get_module
9
+
10
+ from .abstract import AbstractRegistry
11
+
12
+ google = get_module(
13
+ "google",
14
+ required="Google Cloud Platform support requires the google package. Please"
15
+ " install it with `pip install wandb[launch]`.",
16
+ )
17
+
18
+ google.cloud.artifactregistry = get_module(
19
+ "google.cloud.artifactregistry",
20
+ required="Google Cloud Platform support requires the google-cloud-artifact-registry package. "
21
+ "Please install it with `pip install wandb[launch]`.",
22
+ )
23
+
24
+ google.auth.credentials = get_module(
25
+ "google.auth.credentials",
26
+ required="Google Cloud Platform support requires google-auth. "
27
+ "Please install it with `pip install wandb[launch]`.",
28
+ )
29
+
30
+ _logger = logging.getLogger(__name__)
31
+
32
+
33
class GoogleArtifactRegistry(AbstractRegistry):
    """Google Artifact Registry.

    Attributes:
        repository: The repository name.
        image_name: The image name.
        environment: A GcpEnvironment configured for access to this registry.
    """

    repository: str
    image_name: str
    environment: GcpEnvironment

    def __init__(
        self,
        repository: str,
        image_name: str,
        environment: GcpEnvironment,
        verify: bool = True,
    ) -> None:
        """Initialize the Google Artifact Registry.

        Arguments:
            repository: The repository name.
            image_name: The image name.
            environment: A GcpEnvironment configured for access to this registry.
            verify: Whether to verify the credentials, region, and project.

        Raises:
            LaunchError: If the image name is invalid, or if verify is True and
                the container registry or its environment have not been
                properly configured.
        """
        _logger.info(
            f"Initializing Google Artifact Registry with repository {repository} "
            f"and image name {image_name}"
        )
        # fullmatch (instead of match with "$") rejects a trailing newline, and
        # "*" (instead of "+") admits single-character names. The error text
        # lists every character class the pattern accepts.
        if not re.fullmatch(r"\w[\w.-]*", image_name):
            raise LaunchError(
                f"The image name {image_name} is invalid. The image name must "
                "start with an alphanumeric character or underscore and consist "
                "only of alphanumeric characters, underscores, periods, and "
                "hyphens."
            )
        self.repository = repository
        self.image_name = image_name
        self.environment = environment
        if verify:
            self.verify()

    @property
    def uri(self) -> str:
        """The uri of the registry."""
        return (
            f"{self.environment.region}-docker.pkg.dev/"
            f"{self.environment.project}/{self.repository}/{self.image_name}"
        )

    @uri.setter
    def uri(self, uri: str) -> None:
        """Reject assignment; the uri is derived from the other attributes.

        Raises:
            LaunchError: Always.
        """
        raise LaunchError("The uri of the Google Artifact Registry cannot be set.")

    @classmethod
    def from_config(  # type: ignore[override]
        cls,
        config: dict,
        environment: GcpEnvironment,
        verify: bool = True,
    ) -> "GoogleArtifactRegistry":
        """Create a Google Artifact Registry from a config.

        Arguments:
            config: A dictionary containing the following keys:
                repository: The repository name.
                image_name: The image name.
            environment: A GcpEnvironment configured for access to this registry.
            verify: Whether to verify the registry configuration on creation.

        Returns:
            A GoogleArtifactRegistry.

        Raises:
            LaunchError: If the repository or the image name is missing or
                empty in the config.
        """
        repository = config.get("repository")
        if not repository:
            raise LaunchError(
                "The Google Artifact Registry repository must be specified."
            )
        image_name = config.get("image_name")
        if not image_name:
            raise LaunchError("The image name must be specified.")
        return cls(repository, image_name, environment, verify=verify)

    def verify(self) -> None:
        """Verify the registry is properly configured.

        Lists the repositories under the environment's project and region and
        checks that this registry's repository is among them.

        Raises:
            LaunchError: If the credentials are denied access or the repository
                is not found.
        """
        credentials = self.environment.get_credentials()
        parent = (
            f"projects/{self.environment.project}/locations/{self.environment.region}"
        )
        # We need to list the repositories to verify that the repository exists.
        request = google.cloud.artifactregistry.ListRepositoriesRequest(parent=parent)
        client = google.cloud.artifactregistry.ArtifactRegistryClient(
            credentials=credentials
        )
        try:
            response = client.list_repositories(request=request)
        except google.api_core.exceptions.PermissionDenied as e:
            raise LaunchError(
                "The provided credentials do not have permission to access the "
                f"Google Artifact Registry repository {self.repository}."
            ) from e
        # Compare the final path segment of the resource name exactly. A plain
        # suffix test would accept "other-repo" when looking for "repo".
        # NOTE(review): assumes names of the form ".../repositories/<name>".
        found = any(
            repo.name.rsplit("/", 1)[-1] == self.repository for repo in response
        )
        if not found:
            raise LaunchError(
                f"The Google Artifact Registry repository {self.repository} does not exist."
            )

    def get_username_password(self) -> Tuple[str, str]:
        """Get the username and password for the registry.

        Returns:
            A tuple of the fixed username "oauth2accesstoken" and the token
            attribute of the environment's credentials.
        """
        credentials = self.environment.get_credentials()
        return "oauth2accesstoken", credentials.token

    def get_repo_uri(self) -> str:
        """Get the URI of this registry's image repository.

        Returns:
            The repository URI; the same value as the uri property.
        """
        return self.uri

    def check_image_exists(self, image_uri: str) -> bool:
        """Check if the image exists.

        The lookup is not implemented yet, so this always reports the image as
        missing.

        Arguments:
            image_uri: The image URI.

        Returns:
            False, always.
        """
        _logger.info(
            f"Checking if image {image_uri} exists. In Google Artifact Registry {self.uri}."
        )
        # TODO: Test GCP Artifact Registry image exists to get working. The
        # intended approach is to check that image_uri belongs to this
        # repository and then look its tag up through the Artifact Registry
        # client, treating NotFound as "does not exist".
        return False
@@ -0,0 +1,62 @@
1
+ """Local registry implementation."""
2
+ import logging
3
+ from typing import Tuple
4
+
5
+ from wandb.sdk.launch.utils import LaunchError, docker_image_exists
6
+
7
+ from ..environment.abstract import AbstractEnvironment
8
+ from .abstract import AbstractRegistry
9
+
10
+ _logger = logging.getLogger(__name__)
11
+
12
+
13
class LocalRegistry(AbstractRegistry):
    """A local registry.

    Placeholder registry used when no registry has been configured.
    """

    def __init__(self) -> None:
        """Create a local registry; there is no state to set up."""

    @classmethod
    def from_config(
        cls, config: dict, environment: "AbstractEnvironment", verify: bool = True
    ) -> "LocalRegistry":
        """Build a local registry, ignoring every argument.

        Arguments:
            config (dict): The config. This is ignored.
            environment (AbstractEnvironment): The environment. This is ignored.
            verify (bool): This is ignored.

        Returns:
            LocalRegistry: A new local registry.
        """
        return cls()

    def verify(self) -> None:
        """Do nothing; a local registry has nothing to verify."""

    def get_username_password(self) -> Tuple[str, str]:
        """Refuse to provide credentials for the local registry.

        Raises:
            LaunchError: Always.
        """
        raise LaunchError("Attempted to get username and password for LocalRegistry.")

    def get_repo_uri(self) -> str:
        """Get the uri of the local registry.

        Returns:
            str: An empty string.
        """
        return ""

    def check_image_exists(self, image_uri: str) -> bool:
        """Check if an image exists in the local registry.

        Arguments:
            image_uri (str): The uri of the image.

        Returns:
            bool: The result of docker_image_exists for the given uri.
        """
        return docker_image_exists(image_uri)
@@ -10,7 +10,6 @@ from dockerpycreds.utils import find_executable # type: ignore
10
10
  import wandb
11
11
  from wandb import Settings
12
12
  from wandb.apis.internal import Api
13
- from wandb.errors import CommError
14
13
  from wandb.sdk.launch.builder.abstract import AbstractBuilder
15
14
  from wandb.sdk.lib import runid
16
15
 
@@ -60,7 +59,7 @@ class AbstractRun(ABC):
60
59
  def _run_cmd(
61
60
  self, cmd: List[str], output_only: Optional[bool] = False
62
61
  ) -> Optional[Union["subprocess.Popen[bytes]", bytes]]:
63
- """Runs the command and returns a popen object or the stdout of the command.
62
+ """Run the command and return a popen object or the stdout of the command.
64
63
 
65
64
  Arguments:
66
65
  cmd: The command to run
@@ -143,25 +142,11 @@ class AbstractRunner(ABC):
143
142
  sys.exit(1)
144
143
  return True
145
144
 
146
- def ack_run_queue_item(self, launch_project: LaunchProject) -> bool:
147
- if self.backend_config.get("runQueueItemId"):
148
- try:
149
- self._api.ack_run_queue_item(
150
- self.backend_config["runQueueItemId"], launch_project.run_id
151
- )
152
- except CommError:
153
- wandb.termerror(
154
- "Error acking run queue item. Item lease may have ended or another process may have acked it."
155
- )
156
- return False
157
- return True
158
-
159
145
  @abstractmethod
160
146
  def run(
161
147
  self,
162
148
  launch_project: LaunchProject,
163
149
  builder: AbstractBuilder,
164
- registry_config: Dict[str, Any],
165
150
  ) -> Optional[AbstractRun]:
166
151
  """Submit a LaunchProject to be run.
167
152
 
@@ -1,17 +1,15 @@
1
1
  import base64
2
2
  import json
3
+ import logging
3
4
  import time
4
5
  from typing import Any, Dict, List, Optional
5
6
 
6
- from kubernetes import client # type: ignore
7
- from kubernetes.client.api.batch_v1_api import BatchV1Api # type: ignore
8
- from kubernetes.client.api.core_v1_api import CoreV1Api # type: ignore
9
- from kubernetes.client.models.v1_job import V1Job # type: ignore
10
- from kubernetes.client.models.v1_secret import V1Secret # type: ignore
11
-
12
7
  import wandb
13
- from wandb.errors import LaunchError
8
+ from wandb.apis.internal import Api
14
9
  from wandb.sdk.launch.builder.abstract import AbstractBuilder
10
+ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
11
+ from wandb.sdk.launch.registry.abstract import AbstractRegistry
12
+ from wandb.sdk.launch.registry.local_registry import LocalRegistry
15
13
  from wandb.util import get_module, load_json_yaml_dict
16
14
 
17
15
  from .._project_spec import LaunchProject, get_entry_point_command
@@ -19,15 +17,30 @@ from ..builder.build import get_env_vars_dict
19
17
  from ..utils import (
20
18
  LOG_PREFIX,
21
19
  PROJECT_SYNCHRONOUS,
20
+ LaunchError,
22
21
  get_kube_context_and_api_client,
23
22
  make_name_dns_safe,
24
23
  )
25
24
  from .abstract import AbstractRun, AbstractRunner, Status
26
25
 
26
+ get_module(
27
+ "kubernetes",
28
+ required="Kubernetes runner requires the kubernetes package. Please install it with `pip install wandb[launch]`.",
29
+ )
30
+
31
+ from kubernetes import client # type: ignore # noqa: E402
32
+ from kubernetes.client.api.batch_v1_api import BatchV1Api # type: ignore # noqa: E402
33
+ from kubernetes.client.api.core_v1_api import CoreV1Api # type: ignore # noqa: E402
34
+ from kubernetes.client.models.v1_job import V1Job # type: ignore # noqa: E402
35
+ from kubernetes.client.models.v1_secret import V1Secret # type: ignore # noqa: E402
36
+
27
37
  TIMEOUT = 5
28
38
  MAX_KUBERNETES_RETRIES = (
29
39
  60 # default 10 second loop time on the agent, this is 10 minutes
30
40
  )
41
+ FAIL_MESSAGE_INTERVAL = 60
42
+
43
+ _logger = logging.getLogger(__name__)
31
44
 
32
45
 
33
46
  class KubernetesSubmittedRun(AbstractRun):
@@ -76,20 +89,23 @@ class KubernetesSubmittedRun(AbstractRun):
76
89
  name=self.name, namespace=self.namespace
77
90
  )
78
91
  status = job_response.status
79
- try:
80
- self.core_api.read_namespaced_pod_log(
81
- name=self.pod_names[0], namespace=self.namespace
82
- )
83
- except Exception as e:
84
- if self._fail_count == 1:
92
+
93
+ pod = self.core_api.read_namespaced_pod(
94
+ name=self.pod_names[0], namespace=self.namespace
95
+ )
96
+ if pod.status.phase in ["Pending", "Unknown"]:
97
+ now = time.time()
98
+ if self._fail_count == 0:
99
+ self._fail_first_msg_time = now
100
+ self._fail_last_msg_time = 0.0
101
+ self._fail_count += 1
102
+ if now - self._fail_last_msg_time > FAIL_MESSAGE_INTERVAL:
85
103
  wandb.termlog(
86
- f"{LOG_PREFIX}Failed to get pod status for job: {self.name}. Will wait up to 10 minutes for job to start."
104
+ f"{LOG_PREFIX}Pod has not started yet for job: {self.name}. Will wait up to {round(10 - (now - self._fail_first_msg_time)/60)} minutes."
87
105
  )
88
- self._fail_count += 1
106
+ self._fail_last_msg_time = now
89
107
  if self._fail_count > MAX_KUBERNETES_RETRIES:
90
- raise LaunchError(
91
- f"Failed to start job {self.name}, because of error {str(e)}"
92
- )
108
+ raise LaunchError(f"Failed to start job {self.name}")
93
109
  # todo: we only handle the 1 pod case. see https://kubernetes.io/docs/concepts/workloads/controllers/job/#parallel-jobs for multipod handling
94
110
  return_status = None
95
111
  if status.succeeded == 1:
@@ -145,6 +161,12 @@ class KubernetesSubmittedRun(AbstractRun):
145
161
 
146
162
 
147
163
  class KubernetesRunner(AbstractRunner):
164
+ def __init__(
165
+ self, api: Api, backend_config: Dict[str, Any], environment: AbstractEnvironment
166
+ ) -> None:
167
+ super().__init__(api, backend_config)
168
+ self.environment = environment
169
+
148
170
  def populate_job_spec(
149
171
  self, job_spec: Dict[str, Any], resource_args: Dict[str, Any]
150
172
  ) -> None:
@@ -180,7 +202,6 @@ class KubernetesRunner(AbstractRunner):
180
202
  def populate_container_resources(
181
203
  self, containers: List[Dict[str, Any]], resource_args: Dict[str, Any]
182
204
  ) -> None:
183
-
184
205
  if resource_args.get("container_name"):
185
206
  if len(containers) > 1:
186
207
  raise LaunchError(
@@ -247,9 +268,7 @@ class KubernetesRunner(AbstractRunner):
247
268
  )
248
269
  return pod_names
249
270
 
250
- def get_namespace(
251
- self, resource_args: Dict[str, Any]
252
- ) -> Optional[str]: # noqa: C901
271
+ def get_namespace(self, resource_args: Dict[str, Any]) -> Optional[str]:
253
272
  return self.backend_config.get("runner", {}).get(
254
273
  "namespace"
255
274
  ) or resource_args.get("namespace")
@@ -257,18 +276,19 @@ class KubernetesRunner(AbstractRunner):
257
276
  def run(
258
277
  self,
259
278
  launch_project: LaunchProject,
260
- builder: AbstractBuilder,
261
- registry_config: Dict[str, Any],
279
+ builder: Optional[AbstractBuilder],
262
280
  ) -> Optional[AbstractRun]: # noqa: C901
263
281
  kubernetes = get_module( # noqa: F811
264
- "kubernetes", "KubernetesRunner requires kubernetes to be installed"
282
+ "kubernetes",
283
+ required="Kubernetes runner requires the kubernetes package. Please"
284
+ " install it with `pip install wandb[launch]`.",
265
285
  )
266
-
267
286
  resource_args = launch_project.resource_args.get("kubernetes", {})
268
287
  if not resource_args:
269
288
  wandb.termlog(
270
289
  f"{LOG_PREFIX}Note: no resource args specified. Add a Kubernetes yaml spec or other options in a json file with --resource-args <json>."
271
290
  )
291
+ _logger.info(f"Running Kubernetes job with resource args: {resource_args}")
272
292
  context, api_client = get_kube_context_and_api_client(kubernetes, resource_args)
273
293
 
274
294
  batch_api = kubernetes.client.BatchV1Api(api_client)
@@ -332,35 +352,25 @@ class KubernetesRunner(AbstractRunner):
332
352
  "Multiple container configurations should be specified in a yaml file supplied via job_spec."
333
353
  )
334
354
  # dont specify run id if user provided image, could have multiple runs
335
- env_vars.pop("WANDB_RUN_ID")
336
355
  containers[0]["image"] = launch_project.docker_image
337
356
  image_uri = launch_project.docker_image
338
357
  # TODO: handle secret pulling image from registry
339
- elif any(["image" in cont for cont in containers]):
340
- # user specified image configurations via kubernetes yaml, could have multiple images
341
- # dont specify run id if user provided image, could have multiple runs
342
- env_vars.pop("WANDB_RUN_ID")
343
- # TODO: handle secret pulling image from registries?
344
- else:
358
+ elif not any(["image" in cont for cont in containers]):
345
359
  if len(containers) > 1:
346
360
  raise LaunchError(
347
361
  "Launch only builds one container at a time. Multiple container configurations should be pre-built and specified in a yaml file supplied via job_spec."
348
362
  )
349
- given_reg = resource_args.get("registry", "")
350
- repository: Optional[str] = (
351
- given_reg if given_reg != "" else registry_config.get("url")
352
- )
353
- if repository is None:
354
- # allow local registry usage for eg local clusters but throw a warning
355
- wandb.termwarn(
356
- f"{LOG_PREFIX}Warning: No Docker repository specified. Image will be hosted on local registry, which may not be accessible to your training cluster."
357
- )
358
363
  assert entry_point is not None
359
- image_uri = builder.build_image(launch_project, repository, entry_point)
364
+ assert builder is not None
365
+ image_uri = builder.build_image(launch_project, entry_point)
360
366
  # in the non instance case we need to make an imagePullSecret
361
367
  # so the new job can pull the image
368
+ if not builder.registry:
369
+ raise LaunchError(
370
+ "No registry specified. Please specify a registry in your wandb/settings file or pass a registry to the builder."
371
+ )
362
372
  secret = maybe_create_imagepull_secret(
363
- core_api, registry_config, launch_project.run_id, namespace
373
+ core_api, builder.registry, launch_project.run_id, namespace
364
374
  )
365
375
 
366
376
  containers[0]["image"] = image_uri
@@ -370,6 +380,9 @@ class KubernetesRunner(AbstractRunner):
370
380
  kubernetes_style_env_vars = [
371
381
  {"name": k, "value": v} for k, v in env_vars.items()
372
382
  ]
383
+ _logger.info(
384
+ f"Using environment variables: {given_env_vars + kubernetes_style_env_vars}"
385
+ )
373
386
  for cont in containers:
374
387
  cont["env"] = given_env_vars + kubernetes_style_env_vars
375
388
  pod_spec["containers"] = containers
@@ -385,9 +398,7 @@ class KubernetesRunner(AbstractRunner):
385
398
  job_dict["metadata"] = job_metadata
386
399
  job_dict["status"] = job_status
387
400
 
388
- if not self.ack_run_queue_item(launch_project):
389
- return None
390
-
401
+ _logger.info(f"Creating Kubernetes job from: {job_dict}")
391
402
  job_response = kubernetes.utils.create_from_yaml(
392
403
  api_client, yaml_objects=[job_dict], namespace=namespace
393
404
  )[0][
@@ -409,57 +420,34 @@ class KubernetesRunner(AbstractRunner):
409
420
 
410
421
  def maybe_create_imagepull_secret(
411
422
  core_api: "CoreV1Api",
412
- registry_config: Dict[str, Any],
423
+ registry: AbstractRegistry,
413
424
  run_id: str,
414
425
  namespace: str,
415
426
  ) -> Optional["V1Secret"]:
416
427
  secret = None
417
- ecr_provider = registry_config.get("ecr-provider", "").lower()
418
- if (
419
- ecr_provider
420
- and ecr_provider == "aws"
421
- and registry_config.get("url") is not None
422
- and registry_config.get("credentials") is not None
423
- ):
424
- boto3 = get_module(
425
- "boto3", "AWS ECR requires boto3, install with pip install wandb[launch]"
426
- )
427
- ecr_client = boto3.client("ecr")
428
- try:
429
- encoded_token = ecr_client.get_authorization_token()["authorizationData"][
430
- 0
431
- ]["authorizationToken"]
432
- decoded_token = base64.b64decode(encoded_token.encode()).decode()
433
- uname, token = decoded_token.split(":")
434
- except Exception as e:
435
- raise LaunchError(f"Could not get authorization token for ECR, error: {e}")
436
- creds_info = {
437
- "auths": {
438
- registry_config.get("url"): {
439
- "username": uname,
440
- "password": token,
441
- # need an email but the use is deprecated
442
- "email": "deprecated@wandblaunch.com",
443
- "auth": encoded_token,
444
- }
428
+ if isinstance(registry, LocalRegistry):
429
+ # Secret not required
430
+ return None
431
+ uname, token = registry.get_username_password()
432
+ creds_info = {
433
+ "auths": {
434
+ registry.uri: {
435
+ "auth": base64.b64encode(f"{uname}:{token}".encode()).decode(),
436
+ # need an email but the use is deprecated
437
+ "email": "deprecated@wandblaunch.com",
445
438
  }
446
439
  }
447
- secret_data = {
448
- ".dockerconfigjson": base64.b64encode(
449
- json.dumps(creds_info).encode()
450
- ).decode()
451
- }
452
- secret = client.V1Secret(
453
- data=secret_data,
454
- metadata=client.V1ObjectMeta(name=f"regcred-{run_id}", namespace=namespace),
455
- kind="Secret",
456
- type="kubernetes.io/dockerconfigjson",
457
- )
458
- try:
459
- core_api.create_namespaced_secret(namespace, secret)
460
- except Exception as e:
461
- raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")
462
- # TODO: support other ecr providers
463
- elif ecr_provider and ecr_provider != "aws":
464
- raise LaunchError(f"Registry provider not supported: {ecr_provider}")
465
- return secret
440
+ }
441
+ secret_data = {
442
+ ".dockerconfigjson": base64.b64encode(json.dumps(creds_info).encode()).decode()
443
+ }
444
+ secret = client.V1Secret(
445
+ data=secret_data,
446
+ metadata=client.V1ObjectMeta(name=f"regcred-{run_id}", namespace=namespace),
447
+ kind="Secret",
448
+ type="kubernetes.io/dockerconfigjson",
449
+ )
450
+ try:
451
+ return core_api.create_namespaced_secret(namespace, secret)
452
+ except Exception as e:
453
+ raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")