wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -3
- wandb/apis/__init__.py +1 -3
- wandb/apis/importers/__init__.py +4 -0
- wandb/apis/importers/base.py +312 -0
- wandb/apis/importers/mlflow.py +113 -0
- wandb/apis/internal.py +29 -2
- wandb/apis/normalize.py +6 -5
- wandb/apis/public.py +163 -180
- wandb/apis/reports/_templates.py +6 -12
- wandb/apis/reports/report.py +1 -1
- wandb/apis/reports/runset.py +1 -3
- wandb/apis/reports/util.py +12 -10
- wandb/beta/workflows.py +57 -34
- wandb/catboost/__init__.py +1 -2
- wandb/cli/cli.py +215 -133
- wandb/data_types.py +63 -56
- wandb/docker/__init__.py +78 -16
- wandb/docker/auth.py +21 -22
- wandb/env.py +0 -1
- wandb/errors/__init__.py +8 -116
- wandb/errors/term.py +1 -1
- wandb/fastai/__init__.py +1 -2
- wandb/filesync/dir_watcher.py +8 -5
- wandb/filesync/step_prepare.py +76 -75
- wandb/filesync/step_upload.py +1 -2
- wandb/integration/catboost/__init__.py +1 -3
- wandb/integration/catboost/catboost.py +8 -14
- wandb/integration/fastai/__init__.py +7 -13
- wandb/integration/gym/__init__.py +35 -4
- wandb/integration/keras/__init__.py +3 -3
- wandb/integration/keras/callbacks/metrics_logger.py +9 -8
- wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
- wandb/integration/keras/callbacks/tables_builder.py +31 -19
- wandb/integration/kfp/kfp_patch.py +20 -17
- wandb/integration/kfp/wandb_logging.py +1 -2
- wandb/integration/lightgbm/__init__.py +21 -19
- wandb/integration/prodigy/prodigy.py +6 -7
- wandb/integration/sacred/__init__.py +9 -12
- wandb/integration/sagemaker/__init__.py +1 -3
- wandb/integration/sagemaker/auth.py +0 -1
- wandb/integration/sagemaker/config.py +1 -1
- wandb/integration/sagemaker/resources.py +1 -1
- wandb/integration/sb3/sb3.py +8 -4
- wandb/integration/tensorboard/__init__.py +1 -3
- wandb/integration/tensorboard/log.py +8 -8
- wandb/integration/tensorboard/monkeypatch.py +11 -9
- wandb/integration/tensorflow/__init__.py +1 -3
- wandb/integration/xgboost/__init__.py +4 -6
- wandb/integration/yolov8/__init__.py +7 -0
- wandb/integration/yolov8/yolov8.py +250 -0
- wandb/jupyter.py +31 -35
- wandb/lightgbm/__init__.py +1 -2
- wandb/old/settings.py +2 -2
- wandb/plot/bar.py +1 -2
- wandb/plot/confusion_matrix.py +1 -3
- wandb/plot/histogram.py +1 -2
- wandb/plot/line.py +1 -2
- wandb/plot/line_series.py +4 -4
- wandb/plot/pr_curve.py +17 -20
- wandb/plot/roc_curve.py +1 -3
- wandb/plot/scatter.py +1 -2
- wandb/proto/v3/wandb_server_pb2.py +85 -39
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_server_pb2.py +51 -39
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/__init__.py +1 -3
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/data_types/_dtypes.py +38 -30
- wandb/sdk/data_types/base_types/json_metadata.py +1 -3
- wandb/sdk/data_types/base_types/media.py +17 -17
- wandb/sdk/data_types/base_types/wb_value.py +33 -26
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
- wandb/sdk/data_types/helper_types/classes.py +1 -1
- wandb/sdk/data_types/helper_types/image_mask.py +12 -12
- wandb/sdk/data_types/histogram.py +5 -4
- wandb/sdk/data_types/html.py +1 -2
- wandb/sdk/data_types/image.py +11 -11
- wandb/sdk/data_types/molecule.py +3 -6
- wandb/sdk/data_types/object_3d.py +1 -2
- wandb/sdk/data_types/plotly.py +1 -2
- wandb/sdk/data_types/saved_model.py +10 -8
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/data_logging.py +5 -5
- wandb/sdk/interface/artifacts.py +288 -266
- wandb/sdk/interface/interface.py +2 -3
- wandb/sdk/interface/interface_grpc.py +1 -1
- wandb/sdk/interface/interface_queue.py +1 -1
- wandb/sdk/interface/interface_relay.py +1 -1
- wandb/sdk/interface/interface_shared.py +1 -2
- wandb/sdk/interface/interface_sock.py +1 -1
- wandb/sdk/interface/message_future.py +1 -1
- wandb/sdk/interface/message_future_poll.py +1 -1
- wandb/sdk/interface/router.py +1 -1
- wandb/sdk/interface/router_queue.py +1 -1
- wandb/sdk/interface/router_relay.py +1 -1
- wandb/sdk/interface/router_sock.py +1 -1
- wandb/sdk/interface/summary_record.py +1 -1
- wandb/sdk/internal/artifacts.py +1 -1
- wandb/sdk/internal/datastore.py +2 -3
- wandb/sdk/internal/file_pusher.py +5 -3
- wandb/sdk/internal/file_stream.py +22 -19
- wandb/sdk/internal/handler.py +5 -4
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/internal_api.py +115 -55
- wandb/sdk/internal/job_builder.py +1 -3
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/progress.py +4 -6
- wandb/sdk/internal/sample.py +1 -3
- wandb/sdk/internal/sender.py +28 -16
- wandb/sdk/internal/settings_static.py +5 -5
- wandb/sdk/internal/system/assets/__init__.py +1 -0
- wandb/sdk/internal/system/assets/cpu.py +3 -9
- wandb/sdk/internal/system/assets/disk.py +2 -4
- wandb/sdk/internal/system/assets/gpu.py +6 -18
- wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
- wandb/sdk/internal/system/assets/interfaces.py +50 -22
- wandb/sdk/internal/system/assets/ipu.py +1 -3
- wandb/sdk/internal/system/assets/memory.py +7 -13
- wandb/sdk/internal/system/assets/network.py +4 -8
- wandb/sdk/internal/system/assets/open_metrics.py +283 -0
- wandb/sdk/internal/system/assets/tpu.py +1 -4
- wandb/sdk/internal/system/assets/trainium.py +26 -14
- wandb/sdk/internal/system/system_info.py +2 -3
- wandb/sdk/internal/system/system_monitor.py +52 -20
- wandb/sdk/internal/tb_watcher.py +12 -13
- wandb/sdk/launch/_project_spec.py +54 -65
- wandb/sdk/launch/agent/agent.py +374 -90
- wandb/sdk/launch/builder/abstract.py +61 -7
- wandb/sdk/launch/builder/build.py +81 -110
- wandb/sdk/launch/builder/docker_builder.py +181 -0
- wandb/sdk/launch/builder/kaniko_builder.py +419 -0
- wandb/sdk/launch/builder/noop.py +31 -12
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
- wandb/sdk/launch/environment/abstract.py +28 -0
- wandb/sdk/launch/environment/aws_environment.py +276 -0
- wandb/sdk/launch/environment/gcp_environment.py +271 -0
- wandb/sdk/launch/environment/local_environment.py +65 -0
- wandb/sdk/launch/github_reference.py +3 -8
- wandb/sdk/launch/launch.py +38 -29
- wandb/sdk/launch/launch_add.py +6 -8
- wandb/sdk/launch/loader.py +230 -0
- wandb/sdk/launch/registry/abstract.py +54 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
- wandb/sdk/launch/registry/local_registry.py +62 -0
- wandb/sdk/launch/runner/abstract.py +1 -16
- wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
- wandb/sdk/launch/runner/local_container.py +46 -22
- wandb/sdk/launch/runner/local_process.py +1 -4
- wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
- wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
- wandb/sdk/launch/sweeps/__init__.py +3 -2
- wandb/sdk/launch/sweeps/scheduler.py +132 -39
- wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
- wandb/sdk/launch/utils.py +101 -30
- wandb/sdk/launch/wandb_reference.py +2 -7
- wandb/sdk/lib/_settings_toposort_generate.py +166 -0
- wandb/sdk/lib/_settings_toposort_generated.py +201 -0
- wandb/sdk/lib/apikey.py +2 -4
- wandb/sdk/lib/config_util.py +4 -1
- wandb/sdk/lib/console.py +1 -3
- wandb/sdk/lib/deprecate.py +3 -3
- wandb/sdk/lib/file_stream_utils.py +7 -5
- wandb/sdk/lib/filenames.py +1 -1
- wandb/sdk/lib/filesystem.py +61 -5
- wandb/sdk/lib/git.py +1 -3
- wandb/sdk/lib/import_hooks.py +4 -7
- wandb/sdk/lib/ipython.py +8 -5
- wandb/sdk/lib/lazyloader.py +1 -3
- wandb/sdk/lib/mailbox.py +14 -4
- wandb/sdk/lib/proto_util.py +10 -5
- wandb/sdk/lib/redirect.py +15 -22
- wandb/sdk/lib/reporting.py +1 -3
- wandb/sdk/lib/retry.py +4 -5
- wandb/sdk/lib/runid.py +1 -3
- wandb/sdk/lib/server.py +15 -9
- wandb/sdk/lib/sock_client.py +1 -1
- wandb/sdk/lib/sparkline.py +1 -1
- wandb/sdk/lib/wburls.py +1 -1
- wandb/sdk/service/port_file.py +1 -2
- wandb/sdk/service/service.py +36 -13
- wandb/sdk/service/service_base.py +12 -1
- wandb/sdk/verify/verify.py +5 -7
- wandb/sdk/wandb_artifacts.py +142 -177
- wandb/sdk/wandb_config.py +5 -8
- wandb/sdk/wandb_helper.py +1 -1
- wandb/sdk/wandb_init.py +24 -13
- wandb/sdk/wandb_login.py +9 -9
- wandb/sdk/wandb_manager.py +39 -4
- wandb/sdk/wandb_metric.py +2 -6
- wandb/sdk/wandb_require.py +4 -15
- wandb/sdk/wandb_require_helpers.py +1 -9
- wandb/sdk/wandb_run.py +95 -141
- wandb/sdk/wandb_save.py +1 -3
- wandb/sdk/wandb_settings.py +149 -54
- wandb/sdk/wandb_setup.py +66 -46
- wandb/sdk/wandb_summary.py +13 -10
- wandb/sdk/wandb_sweep.py +6 -7
- wandb/sdk/wandb_watch.py +1 -1
- wandb/sklearn/calculate/confusion_matrix.py +1 -1
- wandb/sklearn/calculate/learning_curve.py +1 -1
- wandb/sklearn/calculate/summary_metrics.py +1 -3
- wandb/sklearn/plot/__init__.py +1 -1
- wandb/sklearn/plot/classifier.py +27 -18
- wandb/sklearn/plot/clusterer.py +4 -5
- wandb/sklearn/plot/regressor.py +4 -4
- wandb/sklearn/plot/shared.py +2 -2
- wandb/sync/__init__.py +1 -3
- wandb/sync/sync.py +4 -5
- wandb/testing/relay.py +11 -10
- wandb/trigger.py +1 -1
- wandb/util.py +106 -81
- wandb/viz.py +4 -4
- wandb/wandb_agent.py +50 -50
- wandb/wandb_controller.py +2 -3
- wandb/wandb_run.py +1 -2
- wandb/wandb_torch.py +1 -1
- wandb/xgboost/__init__.py +1 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
- wandb/sdk/launch/builder/docker.py +0 -80
- wandb/sdk/launch/builder/kaniko.py +0 -393
- wandb/sdk/launch/builder/loader.py +0 -32
- wandb/sdk/launch/runner/loader.py +0 -50
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,203 @@
|
|
1
|
+
"""Implementation of Google Artifact Registry for wandb launch."""
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
from typing import Tuple
|
5
|
+
|
6
|
+
from wandb.sdk.launch.environment.gcp_environment import GcpEnvironment
|
7
|
+
from wandb.sdk.launch.utils import LaunchError
|
8
|
+
from wandb.util import get_module
|
9
|
+
|
10
|
+
from .abstract import AbstractRegistry
|
11
|
+
|
12
|
+
google = get_module(
|
13
|
+
"google",
|
14
|
+
required="Google Cloud Platform support requires the google package. Please"
|
15
|
+
" install it with `pip install wandb[launch]`.",
|
16
|
+
)
|
17
|
+
|
18
|
+
google.cloud.artifactregistry = get_module(
|
19
|
+
"google.cloud.artifactregistry",
|
20
|
+
required="Google Cloud Platform support requires the google-cloud-artifact-registry package. "
|
21
|
+
"Please install it with `pip install wandb[launch]`.",
|
22
|
+
)
|
23
|
+
|
24
|
+
google.auth.credentials = get_module(
|
25
|
+
"google.auth.credentials",
|
26
|
+
required="Google Cloud Platform support requires google-auth. "
|
27
|
+
"Please install it with `pip install wandb[launch]`.",
|
28
|
+
)
|
29
|
+
|
30
|
+
_logger = logging.getLogger(__name__)
|
31
|
+
|
32
|
+
|
33
|
+
class GoogleArtifactRegistry(AbstractRegistry):
    """Google Artifact Registry.

    Attributes:
        repository: The repository name.
        image_name: The image name.
        environment: A GcpEnvironment configured for access to this registry.
    """

    repository: str
    image_name: str
    environment: GcpEnvironment

    def __init__(
        self,
        repository: str,
        image_name: str,
        environment: GcpEnvironment,
        verify: bool = True,
    ) -> None:
        """Initialize the Google Artifact Registry.

        Arguments:
            repository: The repository name.
            image_name: The image name.
            environment: A GcpEnvironment configured for access to this registry.
            verify: Whether to verify the credentials, region, and project.

        Raises:
            LaunchError: If the image name is invalid, or if verify is True and
                the container registry or its environment have not been
                properly configured.
        """
        _logger.info(
            f"Initializing Google Artifact Registry with repository {repository} "
            f"and image name {image_name}"
        )
        self.repository = repository
        self.image_name = image_name
        # The pattern requires a leading word character followed by at least
        # one more word character, dot, or hyphen.
        if not re.match(r"^\w[\w.-]+$", image_name):
            raise LaunchError(
                f"The image name {image_name} is invalid. The image name must "
                "be at least two characters long, start with an alphanumeric "
                "character or underscore, and otherwise consist of "
                "alphanumeric characters, underscores, dots, and hyphens."
            )
        self.environment = environment
        if verify:
            self.verify()

    @property
    def uri(self) -> str:
        """The uri of the registry, built from region, project, repository and image name."""
        return f"{self.environment.region}-docker.pkg.dev/{self.environment.project}/{self.repository}/{self.image_name}"

    @uri.setter
    def uri(self, uri: str) -> None:
        """Reject attempts to set the uri; it is always derived from the other fields.

        Raises:
            LaunchError: Always.
        """
        raise LaunchError("The uri of the Google Artifact Registry cannot be set.")

    @classmethod
    def from_config(  # type: ignore[override]
        cls,
        config: dict,
        environment: GcpEnvironment,
        verify: bool = True,
    ) -> "GoogleArtifactRegistry":
        """Create a Google Artifact Registry from a config.

        Arguments:
            config: A dictionary containing the following keys:
                repository: The repository name.
                image_name: The image name.
            environment: A GcpEnvironment configured for access to this registry.
            verify: Whether to verify the registry after constructing it.

        Returns:
            A GoogleArtifactRegistry.

        Raises:
            LaunchError: If the repository or the image name is missing or
                empty in the config.
        """
        repository = config.get("repository")
        if not repository:
            raise LaunchError(
                "The Google Artifact Registry repository must be specified."
            )
        image_name = config.get("image_name")
        if not image_name:
            raise LaunchError("The image name must be specified.")
        return cls(repository, image_name, environment, verify=verify)

    def verify(self) -> None:
        """Verify the registry is properly configured.

        Lists the repositories under the environment's project and region and
        checks that the configured repository is among them.

        Raises:
            LaunchError: If the credentials are denied access or the
                repository is not found in the listing.
        """
        credentials = self.environment.get_credentials()
        parent = (
            f"projects/{self.environment.project}/locations/{self.environment.region}"
        )
        # We need to list the repositories to verify that the repository exists.
        request = google.cloud.artifactregistry.ListRepositoriesRequest(parent=parent)
        client = google.cloud.artifactregistry.ArtifactRegistryClient(
            credentials=credentials
        )
        try:
            response = client.list_repositories(request=request)
        except google.api_core.exceptions.PermissionDenied as e:
            raise LaunchError(
                "The provided credentials do not have permission to access the "
                f"Google Artifact Registry repository {self.repository}."
            ) from e
        # Match on a whole path segment: a bare suffix comparison would let
        # "repo" match ".../repositories/my-repo". An exact match on the full
        # name is still accepted for callers that pass the whole resource name.
        segment_suffix = f"/{self.repository}"
        for repo in response:
            if repo.name == self.repository or repo.name.endswith(segment_suffix):
                break
        # If we didn't find the repository, raise an error.
        else:
            raise LaunchError(
                f"The Google Artifact Registry repository {self.repository} does not exist."
            )

    def get_username_password(self) -> Tuple[str, str]:
        """Get the username and password for the registry.

        Returns:
            A tuple of the fixed username "oauth2accesstoken" and the token
            taken from the environment's credentials.
        """
        credentials = self.environment.get_credentials()
        return "oauth2accesstoken", credentials.token

    def get_repo_uri(self) -> str:
        """Get the URI of this registry's image repository.

        Returns:
            The repository URI; identical to the `uri` property.
        """
        return self.uri

    def check_image_exists(self, image_uri: str) -> bool:
        """Check if the image exists.

        The lookup is not implemented yet, so this currently always reports
        that the image does not exist.

        Arguments:
            image_uri: The image URI.

        Returns:
            False, always (see the TODO below).
        """
        _logger.info(
            f"Checking if image {image_uri} exists. In Google Artifact Registry {self.uri}."
        )

        return False
        # TODO: Test GCP Artifact Registry image exists to get working
        # repo_uri, _ = image_uri.split(":")
        # if repo_uri != self.get_repo_uri():
        #     raise LaunchError(
        #         f"The image {image_uri} does not belong to the Google Artifact "
        #         f"Repository {self.get_repo_uri()}."
        #     )
        # credentials = self.environment.get_credentials()
        # request = google.cloud.artifactregistry.GetTagRequest(parent=image_uri)
        # client = google.cloud.artifactregistry.ArtifactRegistryClient(
        #     credentials=credentials
        # )
        # try:
        #     client.get_tag(request=request)
        #     return True
        # except google.api_core.exceptions.NotFound:
        #     return False
|
@@ -0,0 +1,62 @@
|
|
1
|
+
"""Local registry implementation."""
|
2
|
+
import logging
|
3
|
+
from typing import Tuple
|
4
|
+
|
5
|
+
from wandb.sdk.launch.utils import LaunchError, docker_image_exists
|
6
|
+
|
7
|
+
from ..environment.abstract import AbstractEnvironment
|
8
|
+
from .abstract import AbstractRegistry
|
9
|
+
|
10
|
+
_logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class LocalRegistry(AbstractRegistry):
    """Placeholder registry for images that are not pushed anywhere.

    It stands in when no registry has been configured.
    """

    def __init__(self) -> None:
        """Create the registry; there is no state to set up."""

    @classmethod
    def from_config(
        cls, config: dict, environment: "AbstractEnvironment", verify: bool = True
    ) -> "LocalRegistry":
        """Build a local registry, disregarding every argument.

        Arguments:
            config (dict): Unused.
            environment (AbstractEnvironment): Unused.
            verify (bool): Unused.

        Returns:
            LocalRegistry: A new instance.
        """
        registry = cls()
        return registry

    def verify(self) -> None:
        """Do nothing; a local registry has nothing to verify."""

    def get_username_password(self) -> Tuple[str, str]:
        """Refuse to provide credentials, since a local registry has none.

        Raises:
            LaunchError: Always.
        """
        raise LaunchError("Attempted to get username and password for LocalRegistry.")

    def get_repo_uri(self) -> str:
        """Return the uri of the local registry, which is the empty string."""
        local_uri = ""
        return local_uri

    def check_image_exists(self, image_uri: str) -> bool:
        """Report whether an image is available locally.

        Arguments:
            image_uri (str): The uri of the image.

        Returns:
            bool: The result of `docker_image_exists` for `image_uri`.
        """
        found = docker_image_exists(image_uri)
        return found
|
@@ -10,7 +10,6 @@ from dockerpycreds.utils import find_executable # type: ignore
|
|
10
10
|
import wandb
|
11
11
|
from wandb import Settings
|
12
12
|
from wandb.apis.internal import Api
|
13
|
-
from wandb.errors import CommError
|
14
13
|
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
15
14
|
from wandb.sdk.lib import runid
|
16
15
|
|
@@ -60,7 +59,7 @@ class AbstractRun(ABC):
|
|
60
59
|
def _run_cmd(
|
61
60
|
self, cmd: List[str], output_only: Optional[bool] = False
|
62
61
|
) -> Optional[Union["subprocess.Popen[bytes]", bytes]]:
|
63
|
-
"""
|
62
|
+
"""Run the command and returns a popen object or the stdout of the command.
|
64
63
|
|
65
64
|
Arguments:
|
66
65
|
cmd: The command to run
|
@@ -143,25 +142,11 @@ class AbstractRunner(ABC):
|
|
143
142
|
sys.exit(1)
|
144
143
|
return True
|
145
144
|
|
146
|
-
def ack_run_queue_item(self, launch_project: LaunchProject) -> bool:
|
147
|
-
if self.backend_config.get("runQueueItemId"):
|
148
|
-
try:
|
149
|
-
self._api.ack_run_queue_item(
|
150
|
-
self.backend_config["runQueueItemId"], launch_project.run_id
|
151
|
-
)
|
152
|
-
except CommError:
|
153
|
-
wandb.termerror(
|
154
|
-
"Error acking run queue item. Item lease may have ended or another process may have acked it."
|
155
|
-
)
|
156
|
-
return False
|
157
|
-
return True
|
158
|
-
|
159
145
|
@abstractmethod
|
160
146
|
def run(
|
161
147
|
self,
|
162
148
|
launch_project: LaunchProject,
|
163
149
|
builder: AbstractBuilder,
|
164
|
-
registry_config: Dict[str, Any],
|
165
150
|
) -> Optional[AbstractRun]:
|
166
151
|
"""Submit an LaunchProject to be run.
|
167
152
|
|
@@ -1,17 +1,15 @@
|
|
1
1
|
import base64
|
2
2
|
import json
|
3
|
+
import logging
|
3
4
|
import time
|
4
5
|
from typing import Any, Dict, List, Optional
|
5
6
|
|
6
|
-
from kubernetes import client # type: ignore
|
7
|
-
from kubernetes.client.api.batch_v1_api import BatchV1Api # type: ignore
|
8
|
-
from kubernetes.client.api.core_v1_api import CoreV1Api # type: ignore
|
9
|
-
from kubernetes.client.models.v1_job import V1Job # type: ignore
|
10
|
-
from kubernetes.client.models.v1_secret import V1Secret # type: ignore
|
11
|
-
|
12
7
|
import wandb
|
13
|
-
from wandb.
|
8
|
+
from wandb.apis.internal import Api
|
14
9
|
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
10
|
+
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
11
|
+
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
12
|
+
from wandb.sdk.launch.registry.local_registry import LocalRegistry
|
15
13
|
from wandb.util import get_module, load_json_yaml_dict
|
16
14
|
|
17
15
|
from .._project_spec import LaunchProject, get_entry_point_command
|
@@ -19,15 +17,30 @@ from ..builder.build import get_env_vars_dict
|
|
19
17
|
from ..utils import (
|
20
18
|
LOG_PREFIX,
|
21
19
|
PROJECT_SYNCHRONOUS,
|
20
|
+
LaunchError,
|
22
21
|
get_kube_context_and_api_client,
|
23
22
|
make_name_dns_safe,
|
24
23
|
)
|
25
24
|
from .abstract import AbstractRun, AbstractRunner, Status
|
26
25
|
|
26
|
+
get_module(
|
27
|
+
"kubernetes",
|
28
|
+
required="Kubernetes runner requires the kubernetes package. Please install it with `pip install wandb[launch]`.",
|
29
|
+
)
|
30
|
+
|
31
|
+
from kubernetes import client # type: ignore # noqa: E402
|
32
|
+
from kubernetes.client.api.batch_v1_api import BatchV1Api # type: ignore # noqa: E402
|
33
|
+
from kubernetes.client.api.core_v1_api import CoreV1Api # type: ignore # noqa: E402
|
34
|
+
from kubernetes.client.models.v1_job import V1Job # type: ignore # noqa: E402
|
35
|
+
from kubernetes.client.models.v1_secret import V1Secret # type: ignore # noqa: E402
|
36
|
+
|
27
37
|
TIMEOUT = 5
|
28
38
|
MAX_KUBERNETES_RETRIES = (
|
29
39
|
60 # default 10 second loop time on the agent, this is 10 minutes
|
30
40
|
)
|
41
|
+
FAIL_MESSAGE_INTERVAL = 60
|
42
|
+
|
43
|
+
_logger = logging.getLogger(__name__)
|
31
44
|
|
32
45
|
|
33
46
|
class KubernetesSubmittedRun(AbstractRun):
|
@@ -76,20 +89,23 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
76
89
|
name=self.name, namespace=self.namespace
|
77
90
|
)
|
78
91
|
status = job_response.status
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
92
|
+
|
93
|
+
pod = self.core_api.read_namespaced_pod(
|
94
|
+
name=self.pod_names[0], namespace=self.namespace
|
95
|
+
)
|
96
|
+
if pod.status.phase in ["Pending", "Unknown"]:
|
97
|
+
now = time.time()
|
98
|
+
if self._fail_count == 0:
|
99
|
+
self._fail_first_msg_time = now
|
100
|
+
self._fail_last_msg_time = 0.0
|
101
|
+
self._fail_count += 1
|
102
|
+
if now - self._fail_last_msg_time > FAIL_MESSAGE_INTERVAL:
|
85
103
|
wandb.termlog(
|
86
|
-
f"{LOG_PREFIX}
|
104
|
+
f"{LOG_PREFIX}Pod has not started yet for job: {self.name}. Will wait up to {round(10 - (now - self._fail_first_msg_time)/60)} minutes."
|
87
105
|
)
|
88
|
-
|
106
|
+
self._fail_last_msg_time = now
|
89
107
|
if self._fail_count > MAX_KUBERNETES_RETRIES:
|
90
|
-
raise LaunchError(
|
91
|
-
f"Failed to start job {self.name}, because of error {str(e)}"
|
92
|
-
)
|
108
|
+
raise LaunchError(f"Failed to start job {self.name}")
|
93
109
|
# todo: we only handle the 1 pod case. see https://kubernetes.io/docs/concepts/workloads/controllers/job/#parallel-jobs for multipod handling
|
94
110
|
return_status = None
|
95
111
|
if status.succeeded == 1:
|
@@ -145,6 +161,12 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
145
161
|
|
146
162
|
|
147
163
|
class KubernetesRunner(AbstractRunner):
|
164
|
+
def __init__(
|
165
|
+
self, api: Api, backend_config: Dict[str, Any], environment: AbstractEnvironment
|
166
|
+
) -> None:
|
167
|
+
super().__init__(api, backend_config)
|
168
|
+
self.environment = environment
|
169
|
+
|
148
170
|
def populate_job_spec(
|
149
171
|
self, job_spec: Dict[str, Any], resource_args: Dict[str, Any]
|
150
172
|
) -> None:
|
@@ -180,7 +202,6 @@ class KubernetesRunner(AbstractRunner):
|
|
180
202
|
def populate_container_resources(
|
181
203
|
self, containers: List[Dict[str, Any]], resource_args: Dict[str, Any]
|
182
204
|
) -> None:
|
183
|
-
|
184
205
|
if resource_args.get("container_name"):
|
185
206
|
if len(containers) > 1:
|
186
207
|
raise LaunchError(
|
@@ -247,9 +268,7 @@ class KubernetesRunner(AbstractRunner):
|
|
247
268
|
)
|
248
269
|
return pod_names
|
249
270
|
|
250
|
-
def get_namespace(
|
251
|
-
self, resource_args: Dict[str, Any]
|
252
|
-
) -> Optional[str]: # noqa: C901
|
271
|
+
def get_namespace(self, resource_args: Dict[str, Any]) -> Optional[str]:
|
253
272
|
return self.backend_config.get("runner", {}).get(
|
254
273
|
"namespace"
|
255
274
|
) or resource_args.get("namespace")
|
@@ -257,18 +276,19 @@ class KubernetesRunner(AbstractRunner):
|
|
257
276
|
def run(
|
258
277
|
self,
|
259
278
|
launch_project: LaunchProject,
|
260
|
-
builder: AbstractBuilder,
|
261
|
-
registry_config: Dict[str, Any],
|
279
|
+
builder: Optional[AbstractBuilder],
|
262
280
|
) -> Optional[AbstractRun]: # noqa: C901
|
263
281
|
kubernetes = get_module( # noqa: F811
|
264
|
-
"kubernetes",
|
282
|
+
"kubernetes",
|
283
|
+
required="Kubernetes runner requires the kubernetes package. Please"
|
284
|
+
" install it with `pip install wandb[launch]`.",
|
265
285
|
)
|
266
|
-
|
267
286
|
resource_args = launch_project.resource_args.get("kubernetes", {})
|
268
287
|
if not resource_args:
|
269
288
|
wandb.termlog(
|
270
289
|
f"{LOG_PREFIX}Note: no resource args specified. Add a Kubernetes yaml spec or other options in a json file with --resource-args <json>."
|
271
290
|
)
|
291
|
+
_logger.info(f"Running Kubernetes job with resource args: {resource_args}")
|
272
292
|
context, api_client = get_kube_context_and_api_client(kubernetes, resource_args)
|
273
293
|
|
274
294
|
batch_api = kubernetes.client.BatchV1Api(api_client)
|
@@ -332,35 +352,25 @@ class KubernetesRunner(AbstractRunner):
|
|
332
352
|
"Multiple container configurations should be specified in a yaml file supplied via job_spec."
|
333
353
|
)
|
334
354
|
# dont specify run id if user provided image, could have multiple runs
|
335
|
-
env_vars.pop("WANDB_RUN_ID")
|
336
355
|
containers[0]["image"] = launch_project.docker_image
|
337
356
|
image_uri = launch_project.docker_image
|
338
357
|
# TODO: handle secret pulling image from registry
|
339
|
-
elif any(["image" in cont for cont in containers]):
|
340
|
-
# user specified image configurations via kubernetes yaml, could have multiple images
|
341
|
-
# dont specify run id if user provided image, could have multiple runs
|
342
|
-
env_vars.pop("WANDB_RUN_ID")
|
343
|
-
# TODO: handle secret pulling image from registries?
|
344
|
-
else:
|
358
|
+
elif not any(["image" in cont for cont in containers]):
|
345
359
|
if len(containers) > 1:
|
346
360
|
raise LaunchError(
|
347
361
|
"Launch only builds one container at a time. Multiple container configurations should be pre-built and specified in a yaml file supplied via job_spec."
|
348
362
|
)
|
349
|
-
given_reg = resource_args.get("registry", "")
|
350
|
-
repository: Optional[str] = (
|
351
|
-
given_reg if given_reg != "" else registry_config.get("url")
|
352
|
-
)
|
353
|
-
if repository is None:
|
354
|
-
# allow local registry usage for eg local clusters but throw a warning
|
355
|
-
wandb.termwarn(
|
356
|
-
f"{LOG_PREFIX}Warning: No Docker repository specified. Image will be hosted on local registry, which may not be accessible to your training cluster."
|
357
|
-
)
|
358
363
|
assert entry_point is not None
|
359
|
-
|
364
|
+
assert builder is not None
|
365
|
+
image_uri = builder.build_image(launch_project, entry_point)
|
360
366
|
# in the non instance case we need to make an imagePullSecret
|
361
367
|
# so the new job can pull the image
|
368
|
+
if not builder.registry:
|
369
|
+
raise LaunchError(
|
370
|
+
"No registry specified. Please specify a registry in your wandb/settings file or pass a registry to the builder."
|
371
|
+
)
|
362
372
|
secret = maybe_create_imagepull_secret(
|
363
|
-
core_api,
|
373
|
+
core_api, builder.registry, launch_project.run_id, namespace
|
364
374
|
)
|
365
375
|
|
366
376
|
containers[0]["image"] = image_uri
|
@@ -370,6 +380,9 @@ class KubernetesRunner(AbstractRunner):
|
|
370
380
|
kubernetes_style_env_vars = [
|
371
381
|
{"name": k, "value": v} for k, v in env_vars.items()
|
372
382
|
]
|
383
|
+
_logger.info(
|
384
|
+
f"Using environment variables: {given_env_vars + kubernetes_style_env_vars}"
|
385
|
+
)
|
373
386
|
for cont in containers:
|
374
387
|
cont["env"] = given_env_vars + kubernetes_style_env_vars
|
375
388
|
pod_spec["containers"] = containers
|
@@ -385,9 +398,7 @@ class KubernetesRunner(AbstractRunner):
|
|
385
398
|
job_dict["metadata"] = job_metadata
|
386
399
|
job_dict["status"] = job_status
|
387
400
|
|
388
|
-
|
389
|
-
return None
|
390
|
-
|
401
|
+
_logger.info(f"Creating Kubernetes job from: {job_dict}")
|
391
402
|
job_response = kubernetes.utils.create_from_yaml(
|
392
403
|
api_client, yaml_objects=[job_dict], namespace=namespace
|
393
404
|
)[0][
|
@@ -409,57 +420,34 @@ class KubernetesRunner(AbstractRunner):
|
|
409
420
|
|
410
421
|
def maybe_create_imagepull_secret(
|
411
422
|
core_api: "CoreV1Api",
|
412
|
-
|
423
|
+
registry: AbstractRegistry,
|
413
424
|
run_id: str,
|
414
425
|
namespace: str,
|
415
426
|
) -> Optional["V1Secret"]:
|
416
427
|
secret = None
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
ecr_client = boto3.client("ecr")
|
428
|
-
try:
|
429
|
-
encoded_token = ecr_client.get_authorization_token()["authorizationData"][
|
430
|
-
0
|
431
|
-
]["authorizationToken"]
|
432
|
-
decoded_token = base64.b64decode(encoded_token.encode()).decode()
|
433
|
-
uname, token = decoded_token.split(":")
|
434
|
-
except Exception as e:
|
435
|
-
raise LaunchError(f"Could not get authorization token for ECR, error: {e}")
|
436
|
-
creds_info = {
|
437
|
-
"auths": {
|
438
|
-
registry_config.get("url"): {
|
439
|
-
"username": uname,
|
440
|
-
"password": token,
|
441
|
-
# need an email but the use is deprecated
|
442
|
-
"email": "deprecated@wandblaunch.com",
|
443
|
-
"auth": encoded_token,
|
444
|
-
}
|
428
|
+
if isinstance(registry, LocalRegistry):
|
429
|
+
# Secret not required
|
430
|
+
return None
|
431
|
+
uname, token = registry.get_username_password()
|
432
|
+
creds_info = {
|
433
|
+
"auths": {
|
434
|
+
registry.uri: {
|
435
|
+
"auth": base64.b64encode(f"{uname}:{token}".encode()).decode(),
|
436
|
+
# need an email but the use is deprecated
|
437
|
+
"email": "deprecated@wandblaunch.com",
|
445
438
|
}
|
446
439
|
}
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")
|
462
|
-
# TODO: support other ecr providers
|
463
|
-
elif ecr_provider and ecr_provider != "aws":
|
464
|
-
raise LaunchError(f"Registry provider not supported: {ecr_provider}")
|
465
|
-
return secret
|
440
|
+
}
|
441
|
+
secret_data = {
|
442
|
+
".dockerconfigjson": base64.b64encode(json.dumps(creds_info).encode()).decode()
|
443
|
+
}
|
444
|
+
secret = client.V1Secret(
|
445
|
+
data=secret_data,
|
446
|
+
metadata=client.V1ObjectMeta(name=f"regcred-{run_id}", namespace=namespace),
|
447
|
+
kind="Secret",
|
448
|
+
type="kubernetes.io/dockerconfigjson",
|
449
|
+
)
|
450
|
+
try:
|
451
|
+
return core_api.create_namespaced_secret(namespace, secret)
|
452
|
+
except Exception as e:
|
453
|
+
raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")
|