wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -3
- wandb/apis/__init__.py +1 -3
- wandb/apis/importers/__init__.py +4 -0
- wandb/apis/importers/base.py +312 -0
- wandb/apis/importers/mlflow.py +113 -0
- wandb/apis/internal.py +29 -2
- wandb/apis/normalize.py +6 -5
- wandb/apis/public.py +163 -180
- wandb/apis/reports/_templates.py +6 -12
- wandb/apis/reports/report.py +1 -1
- wandb/apis/reports/runset.py +1 -3
- wandb/apis/reports/util.py +12 -10
- wandb/beta/workflows.py +57 -34
- wandb/catboost/__init__.py +1 -2
- wandb/cli/cli.py +215 -133
- wandb/data_types.py +63 -56
- wandb/docker/__init__.py +78 -16
- wandb/docker/auth.py +21 -22
- wandb/env.py +0 -1
- wandb/errors/__init__.py +8 -116
- wandb/errors/term.py +1 -1
- wandb/fastai/__init__.py +1 -2
- wandb/filesync/dir_watcher.py +8 -5
- wandb/filesync/step_prepare.py +76 -75
- wandb/filesync/step_upload.py +1 -2
- wandb/integration/catboost/__init__.py +1 -3
- wandb/integration/catboost/catboost.py +8 -14
- wandb/integration/fastai/__init__.py +7 -13
- wandb/integration/gym/__init__.py +35 -4
- wandb/integration/keras/__init__.py +3 -3
- wandb/integration/keras/callbacks/metrics_logger.py +9 -8
- wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
- wandb/integration/keras/callbacks/tables_builder.py +31 -19
- wandb/integration/kfp/kfp_patch.py +20 -17
- wandb/integration/kfp/wandb_logging.py +1 -2
- wandb/integration/lightgbm/__init__.py +21 -19
- wandb/integration/prodigy/prodigy.py +6 -7
- wandb/integration/sacred/__init__.py +9 -12
- wandb/integration/sagemaker/__init__.py +1 -3
- wandb/integration/sagemaker/auth.py +0 -1
- wandb/integration/sagemaker/config.py +1 -1
- wandb/integration/sagemaker/resources.py +1 -1
- wandb/integration/sb3/sb3.py +8 -4
- wandb/integration/tensorboard/__init__.py +1 -3
- wandb/integration/tensorboard/log.py +8 -8
- wandb/integration/tensorboard/monkeypatch.py +11 -9
- wandb/integration/tensorflow/__init__.py +1 -3
- wandb/integration/xgboost/__init__.py +4 -6
- wandb/integration/yolov8/__init__.py +7 -0
- wandb/integration/yolov8/yolov8.py +250 -0
- wandb/jupyter.py +31 -35
- wandb/lightgbm/__init__.py +1 -2
- wandb/old/settings.py +2 -2
- wandb/plot/bar.py +1 -2
- wandb/plot/confusion_matrix.py +1 -3
- wandb/plot/histogram.py +1 -2
- wandb/plot/line.py +1 -2
- wandb/plot/line_series.py +4 -4
- wandb/plot/pr_curve.py +17 -20
- wandb/plot/roc_curve.py +1 -3
- wandb/plot/scatter.py +1 -2
- wandb/proto/v3/wandb_server_pb2.py +85 -39
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_server_pb2.py +51 -39
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/__init__.py +1 -3
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/data_types/_dtypes.py +38 -30
- wandb/sdk/data_types/base_types/json_metadata.py +1 -3
- wandb/sdk/data_types/base_types/media.py +17 -17
- wandb/sdk/data_types/base_types/wb_value.py +33 -26
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
- wandb/sdk/data_types/helper_types/classes.py +1 -1
- wandb/sdk/data_types/helper_types/image_mask.py +12 -12
- wandb/sdk/data_types/histogram.py +5 -4
- wandb/sdk/data_types/html.py +1 -2
- wandb/sdk/data_types/image.py +11 -11
- wandb/sdk/data_types/molecule.py +3 -6
- wandb/sdk/data_types/object_3d.py +1 -2
- wandb/sdk/data_types/plotly.py +1 -2
- wandb/sdk/data_types/saved_model.py +10 -8
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/data_logging.py +5 -5
- wandb/sdk/interface/artifacts.py +288 -266
- wandb/sdk/interface/interface.py +2 -3
- wandb/sdk/interface/interface_grpc.py +1 -1
- wandb/sdk/interface/interface_queue.py +1 -1
- wandb/sdk/interface/interface_relay.py +1 -1
- wandb/sdk/interface/interface_shared.py +1 -2
- wandb/sdk/interface/interface_sock.py +1 -1
- wandb/sdk/interface/message_future.py +1 -1
- wandb/sdk/interface/message_future_poll.py +1 -1
- wandb/sdk/interface/router.py +1 -1
- wandb/sdk/interface/router_queue.py +1 -1
- wandb/sdk/interface/router_relay.py +1 -1
- wandb/sdk/interface/router_sock.py +1 -1
- wandb/sdk/interface/summary_record.py +1 -1
- wandb/sdk/internal/artifacts.py +1 -1
- wandb/sdk/internal/datastore.py +2 -3
- wandb/sdk/internal/file_pusher.py +5 -3
- wandb/sdk/internal/file_stream.py +22 -19
- wandb/sdk/internal/handler.py +5 -4
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/internal_api.py +115 -55
- wandb/sdk/internal/job_builder.py +1 -3
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/progress.py +4 -6
- wandb/sdk/internal/sample.py +1 -3
- wandb/sdk/internal/sender.py +28 -16
- wandb/sdk/internal/settings_static.py +5 -5
- wandb/sdk/internal/system/assets/__init__.py +1 -0
- wandb/sdk/internal/system/assets/cpu.py +3 -9
- wandb/sdk/internal/system/assets/disk.py +2 -4
- wandb/sdk/internal/system/assets/gpu.py +6 -18
- wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
- wandb/sdk/internal/system/assets/interfaces.py +50 -22
- wandb/sdk/internal/system/assets/ipu.py +1 -3
- wandb/sdk/internal/system/assets/memory.py +7 -13
- wandb/sdk/internal/system/assets/network.py +4 -8
- wandb/sdk/internal/system/assets/open_metrics.py +283 -0
- wandb/sdk/internal/system/assets/tpu.py +1 -4
- wandb/sdk/internal/system/assets/trainium.py +26 -14
- wandb/sdk/internal/system/system_info.py +2 -3
- wandb/sdk/internal/system/system_monitor.py +52 -20
- wandb/sdk/internal/tb_watcher.py +12 -13
- wandb/sdk/launch/_project_spec.py +54 -65
- wandb/sdk/launch/agent/agent.py +374 -90
- wandb/sdk/launch/builder/abstract.py +61 -7
- wandb/sdk/launch/builder/build.py +81 -110
- wandb/sdk/launch/builder/docker_builder.py +181 -0
- wandb/sdk/launch/builder/kaniko_builder.py +419 -0
- wandb/sdk/launch/builder/noop.py +31 -12
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
- wandb/sdk/launch/environment/abstract.py +28 -0
- wandb/sdk/launch/environment/aws_environment.py +276 -0
- wandb/sdk/launch/environment/gcp_environment.py +271 -0
- wandb/sdk/launch/environment/local_environment.py +65 -0
- wandb/sdk/launch/github_reference.py +3 -8
- wandb/sdk/launch/launch.py +38 -29
- wandb/sdk/launch/launch_add.py +6 -8
- wandb/sdk/launch/loader.py +230 -0
- wandb/sdk/launch/registry/abstract.py +54 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
- wandb/sdk/launch/registry/local_registry.py +62 -0
- wandb/sdk/launch/runner/abstract.py +1 -16
- wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
- wandb/sdk/launch/runner/local_container.py +46 -22
- wandb/sdk/launch/runner/local_process.py +1 -4
- wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
- wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
- wandb/sdk/launch/sweeps/__init__.py +3 -2
- wandb/sdk/launch/sweeps/scheduler.py +132 -39
- wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
- wandb/sdk/launch/utils.py +101 -30
- wandb/sdk/launch/wandb_reference.py +2 -7
- wandb/sdk/lib/_settings_toposort_generate.py +166 -0
- wandb/sdk/lib/_settings_toposort_generated.py +201 -0
- wandb/sdk/lib/apikey.py +2 -4
- wandb/sdk/lib/config_util.py +4 -1
- wandb/sdk/lib/console.py +1 -3
- wandb/sdk/lib/deprecate.py +3 -3
- wandb/sdk/lib/file_stream_utils.py +7 -5
- wandb/sdk/lib/filenames.py +1 -1
- wandb/sdk/lib/filesystem.py +61 -5
- wandb/sdk/lib/git.py +1 -3
- wandb/sdk/lib/import_hooks.py +4 -7
- wandb/sdk/lib/ipython.py +8 -5
- wandb/sdk/lib/lazyloader.py +1 -3
- wandb/sdk/lib/mailbox.py +14 -4
- wandb/sdk/lib/proto_util.py +10 -5
- wandb/sdk/lib/redirect.py +15 -22
- wandb/sdk/lib/reporting.py +1 -3
- wandb/sdk/lib/retry.py +4 -5
- wandb/sdk/lib/runid.py +1 -3
- wandb/sdk/lib/server.py +15 -9
- wandb/sdk/lib/sock_client.py +1 -1
- wandb/sdk/lib/sparkline.py +1 -1
- wandb/sdk/lib/wburls.py +1 -1
- wandb/sdk/service/port_file.py +1 -2
- wandb/sdk/service/service.py +36 -13
- wandb/sdk/service/service_base.py +12 -1
- wandb/sdk/verify/verify.py +5 -7
- wandb/sdk/wandb_artifacts.py +142 -177
- wandb/sdk/wandb_config.py +5 -8
- wandb/sdk/wandb_helper.py +1 -1
- wandb/sdk/wandb_init.py +24 -13
- wandb/sdk/wandb_login.py +9 -9
- wandb/sdk/wandb_manager.py +39 -4
- wandb/sdk/wandb_metric.py +2 -6
- wandb/sdk/wandb_require.py +4 -15
- wandb/sdk/wandb_require_helpers.py +1 -9
- wandb/sdk/wandb_run.py +95 -141
- wandb/sdk/wandb_save.py +1 -3
- wandb/sdk/wandb_settings.py +149 -54
- wandb/sdk/wandb_setup.py +66 -46
- wandb/sdk/wandb_summary.py +13 -10
- wandb/sdk/wandb_sweep.py +6 -7
- wandb/sdk/wandb_watch.py +1 -1
- wandb/sklearn/calculate/confusion_matrix.py +1 -1
- wandb/sklearn/calculate/learning_curve.py +1 -1
- wandb/sklearn/calculate/summary_metrics.py +1 -3
- wandb/sklearn/plot/__init__.py +1 -1
- wandb/sklearn/plot/classifier.py +27 -18
- wandb/sklearn/plot/clusterer.py +4 -5
- wandb/sklearn/plot/regressor.py +4 -4
- wandb/sklearn/plot/shared.py +2 -2
- wandb/sync/__init__.py +1 -3
- wandb/sync/sync.py +4 -5
- wandb/testing/relay.py +11 -10
- wandb/trigger.py +1 -1
- wandb/util.py +106 -81
- wandb/viz.py +4 -4
- wandb/wandb_agent.py +50 -50
- wandb/wandb_controller.py +2 -3
- wandb/wandb_run.py +1 -2
- wandb/wandb_torch.py +1 -1
- wandb/xgboost/__init__.py +1 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
- wandb/sdk/launch/builder/docker.py +0 -80
- wandb/sdk/launch/builder/kaniko.py +0 -393
- wandb/sdk/launch/builder/loader.py +0 -32
- wandb/sdk/launch/runner/loader.py +0 -50
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,419 @@
|
|
1
|
+
import base64
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import tarfile
|
5
|
+
import tempfile
|
6
|
+
import time
|
7
|
+
from typing import Optional
|
8
|
+
|
9
|
+
import wandb
|
10
|
+
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
11
|
+
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
12
|
+
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
13
|
+
from wandb.sdk.launch.registry.elastic_container_registry import (
|
14
|
+
ElasticContainerRegistry,
|
15
|
+
)
|
16
|
+
from wandb.sdk.launch.registry.google_artifact_registry import GoogleArtifactRegistry
|
17
|
+
from wandb.util import get_module
|
18
|
+
|
19
|
+
from .._project_spec import (
|
20
|
+
EntryPoint,
|
21
|
+
LaunchProject,
|
22
|
+
create_metadata_file,
|
23
|
+
get_entry_point_command,
|
24
|
+
)
|
25
|
+
from ..utils import (
|
26
|
+
LOG_PREFIX,
|
27
|
+
LaunchError,
|
28
|
+
get_kube_context_and_api_client,
|
29
|
+
sanitize_wandb_api_key,
|
30
|
+
warn_failed_packages_from_build_logs,
|
31
|
+
)
|
32
|
+
from .build import (
|
33
|
+
_create_docker_build_ctx,
|
34
|
+
generate_dockerfile,
|
35
|
+
image_tag_from_dockerfile_and_source,
|
36
|
+
)
|
37
|
+
|
38
|
+
get_module(
|
39
|
+
"kubernetes",
|
40
|
+
required="Kaniko builder requires the kubernetes package. Please install it with `pip install wandb[launch]`.",
|
41
|
+
)
|
42
|
+
|
43
|
+
import kubernetes # type: ignore # noqa: E402
|
44
|
+
from kubernetes import client # noqa: E402
|
45
|
+
|
46
|
+
_logger = logging.getLogger(__name__)
|
47
|
+
|
48
|
+
_DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
|
49
|
+
|
50
|
+
|
51
|
+
def _wait_for_completion(
|
52
|
+
batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
|
53
|
+
) -> bool:
|
54
|
+
start_time = time.time()
|
55
|
+
while True:
|
56
|
+
job = batch_client.read_namespaced_job_status(job_name, "wandb")
|
57
|
+
if job.status.succeeded is not None and job.status.succeeded >= 1:
|
58
|
+
return True
|
59
|
+
elif job.status.failed is not None and job.status.failed >= 1:
|
60
|
+
wandb.termerror(f"{LOG_PREFIX}Build job {job.status.failed} failed {job}")
|
61
|
+
return False
|
62
|
+
wandb.termlog(f"{LOG_PREFIX}Waiting for build job to complete...")
|
63
|
+
if deadline_secs is not None and time.time() - start_time > deadline_secs:
|
64
|
+
return False
|
65
|
+
|
66
|
+
time.sleep(5)
|
67
|
+
|
68
|
+
|
69
|
+
class KanikoBuilder(AbstractBuilder):
|
70
|
+
"""Builds a docker image for a project using Kaniko."""
|
71
|
+
|
72
|
+
type = "kaniko"
|
73
|
+
|
74
|
+
build_job_name: str
|
75
|
+
build_context_store: str
|
76
|
+
secret_name: Optional[str]
|
77
|
+
secret_key: Optional[str]
|
78
|
+
|
79
|
+
def __init__(
|
80
|
+
self,
|
81
|
+
environment: AbstractEnvironment,
|
82
|
+
registry: AbstractRegistry,
|
83
|
+
build_job_name: str = "wandb-launch-container-build",
|
84
|
+
build_context_store: str = "",
|
85
|
+
secret_name: str = "",
|
86
|
+
secret_key: str = "",
|
87
|
+
verify: bool = True,
|
88
|
+
):
|
89
|
+
"""Initialize a KanikoBuilder.
|
90
|
+
|
91
|
+
Arguments:
|
92
|
+
environment (AbstractEnvironment): The environment to use.
|
93
|
+
registry (AbstractRegistry): The registry to use.
|
94
|
+
build_job_name (str, optional): The name of the build job.
|
95
|
+
build_context_store (str, optional): The name of the build context store.
|
96
|
+
secret_name (str, optional): The name of the secret to use for the registry.
|
97
|
+
secret_key (str, optional): The key of the secret to use for the registry.
|
98
|
+
verify (bool, optional): Whether to verify the functionality of the builder.
|
99
|
+
Defaults to True.
|
100
|
+
"""
|
101
|
+
if build_context_store is None:
|
102
|
+
raise LaunchError(
|
103
|
+
"You are required to specify an external build "
|
104
|
+
"context store for Kaniko builds. Please specify a storage url "
|
105
|
+
"in the 'build-context-store' field of your builder config."
|
106
|
+
)
|
107
|
+
self.environment = environment
|
108
|
+
self.registry = registry
|
109
|
+
self.build_job_name = build_job_name
|
110
|
+
self.build_context_store = build_context_store.rstrip("/")
|
111
|
+
self.secret_name = secret_name
|
112
|
+
self.secret_key = secret_key
|
113
|
+
if verify:
|
114
|
+
self.verify()
|
115
|
+
|
116
|
+
@classmethod
|
117
|
+
def from_config(
|
118
|
+
cls,
|
119
|
+
config: dict,
|
120
|
+
environment: AbstractEnvironment,
|
121
|
+
registry: AbstractRegistry,
|
122
|
+
verify: bool = True,
|
123
|
+
login: bool = True,
|
124
|
+
) -> "AbstractBuilder":
|
125
|
+
"""Create a KanikoBuilder from a config dict.
|
126
|
+
|
127
|
+
Arguments:
|
128
|
+
config: A dict containing the builder config. Must contain a "type" key
|
129
|
+
with value "kaniko".
|
130
|
+
environment: The environment to use for the build.
|
131
|
+
registry: The registry to use for the build.
|
132
|
+
verify: Whether to verify the builder config.
|
133
|
+
|
134
|
+
Returns:
|
135
|
+
A KanikoBuilder instance.
|
136
|
+
"""
|
137
|
+
if config.get("type") != "kaniko":
|
138
|
+
raise LaunchError(
|
139
|
+
"Builder config must include 'type':'kaniko' to create a KanikoBuilder."
|
140
|
+
)
|
141
|
+
build_context_store = config.get("build-context-store")
|
142
|
+
if build_context_store is None:
|
143
|
+
raise LaunchError(
|
144
|
+
"You are required to specify an external build "
|
145
|
+
"context store for Kaniko builds. Please specify a "
|
146
|
+
"storage url in the 'build_context_store' field of your builder config."
|
147
|
+
)
|
148
|
+
build_job_name = config.get("build-job-name", "wandb-launch-container-build")
|
149
|
+
secret_name = config.get("secret-name", "")
|
150
|
+
secret_key = config.get("secret-key", "")
|
151
|
+
return cls(
|
152
|
+
environment,
|
153
|
+
registry,
|
154
|
+
build_context_store=build_context_store,
|
155
|
+
build_job_name=build_job_name,
|
156
|
+
secret_name=secret_name,
|
157
|
+
secret_key=secret_key,
|
158
|
+
verify=verify,
|
159
|
+
)
|
160
|
+
|
161
|
+
def verify(self) -> None:
|
162
|
+
"""Verify that the builder config is valid.
|
163
|
+
|
164
|
+
Raises:
|
165
|
+
LaunchError: If the builder config is invalid.
|
166
|
+
"""
|
167
|
+
if self.environment is None:
|
168
|
+
raise LaunchError("No environment specified for Kaniko build.")
|
169
|
+
self.environment.verify_storage_uri(self.build_context_store)
|
170
|
+
|
171
|
+
def login(self) -> None:
|
172
|
+
"""Login to the registry."""
|
173
|
+
pass
|
174
|
+
|
175
|
+
def _create_docker_ecr_config_map(
|
176
|
+
self, job_name: str, corev1_client: client.CoreV1Api, repository: str
|
177
|
+
) -> None:
|
178
|
+
if self.registry is None:
|
179
|
+
raise LaunchError("No registry specified for Kaniko build.")
|
180
|
+
username, password = self.registry.get_username_password()
|
181
|
+
encoded = base64.b64encode(f"{username}:{password}".encode()).decode("utf-8")
|
182
|
+
ecr_config_map = client.V1ConfigMap(
|
183
|
+
api_version="v1",
|
184
|
+
kind="ConfigMap",
|
185
|
+
metadata=client.V1ObjectMeta(
|
186
|
+
name=f"docker-config-{job_name}",
|
187
|
+
namespace="wandb",
|
188
|
+
),
|
189
|
+
data={
|
190
|
+
"config.json": json.dumps(
|
191
|
+
{"auths": {f"{self.registry.get_repo_uri()}": {"auth": encoded}}}
|
192
|
+
)
|
193
|
+
},
|
194
|
+
immutable=True,
|
195
|
+
)
|
196
|
+
corev1_client.create_namespaced_config_map("wandb", ecr_config_map)
|
197
|
+
|
198
|
+
def _delete_docker_ecr_config_map(
|
199
|
+
self, job_name: str, client: client.CoreV1Api
|
200
|
+
) -> None:
|
201
|
+
if self.secret_name:
|
202
|
+
client.delete_namespaced_config_map(f"docker-config-{job_name}", "wandb")
|
203
|
+
|
204
|
+
def _upload_build_context(self, run_id: str, context_path: str) -> str:
|
205
|
+
# creat a tar archive of the build context and upload it to s3
|
206
|
+
context_file = tempfile.NamedTemporaryFile(delete=False)
|
207
|
+
with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
|
208
|
+
context_tgz.add(context_path, arcname=".")
|
209
|
+
context_file.close()
|
210
|
+
destination = f"{self.build_context_store}/{run_id}.tgz"
|
211
|
+
if self.environment is None:
|
212
|
+
raise LaunchError("No environment specified for Kaniko build.")
|
213
|
+
self.environment.upload_file(context_file.name, destination)
|
214
|
+
return destination
|
215
|
+
|
216
|
+
def build_image(
|
217
|
+
self,
|
218
|
+
launch_project: LaunchProject,
|
219
|
+
entrypoint: EntryPoint,
|
220
|
+
) -> str:
|
221
|
+
# TODO: this should probably throw an error if the registry is a local registry
|
222
|
+
if not self.registry:
|
223
|
+
raise LaunchError("No registry specified for Kaniko build.")
|
224
|
+
# kaniko builder doesn't seem to work with a custom user id, need more investigation
|
225
|
+
dockerfile_str = generate_dockerfile(
|
226
|
+
launch_project, entrypoint, launch_project.resource, "kaniko"
|
227
|
+
)
|
228
|
+
image_tag = image_tag_from_dockerfile_and_source(launch_project, dockerfile_str)
|
229
|
+
repo_uri = self.registry.get_repo_uri()
|
230
|
+
image_uri = repo_uri + ":" + image_tag
|
231
|
+
|
232
|
+
if not launch_project.build_required() and self.registry.check_image_exists(
|
233
|
+
image_uri
|
234
|
+
):
|
235
|
+
return image_uri
|
236
|
+
|
237
|
+
_logger.info(f"Building image {image_uri}...")
|
238
|
+
|
239
|
+
entry_cmd = " ".join(
|
240
|
+
get_entry_point_command(entrypoint, launch_project.override_args)
|
241
|
+
)
|
242
|
+
|
243
|
+
create_metadata_file(
|
244
|
+
launch_project,
|
245
|
+
image_uri,
|
246
|
+
sanitize_wandb_api_key(entry_cmd),
|
247
|
+
sanitize_wandb_api_key(dockerfile_str),
|
248
|
+
)
|
249
|
+
context_path = _create_docker_build_ctx(launch_project, dockerfile_str)
|
250
|
+
run_id = launch_project.run_id
|
251
|
+
|
252
|
+
_, api_client = get_kube_context_and_api_client(
|
253
|
+
kubernetes, launch_project.resource_args
|
254
|
+
)
|
255
|
+
build_job_name = f"{self.build_job_name}-{run_id}"
|
256
|
+
|
257
|
+
build_context = self._upload_build_context(run_id, context_path)
|
258
|
+
build_job = self._create_kaniko_job(
|
259
|
+
build_job_name,
|
260
|
+
repo_uri,
|
261
|
+
image_uri,
|
262
|
+
build_context,
|
263
|
+
)
|
264
|
+
wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
|
265
|
+
|
266
|
+
# TODO: use same client as kuberentes.py
|
267
|
+
batch_v1 = client.BatchV1Api(api_client)
|
268
|
+
core_v1 = client.CoreV1Api(api_client)
|
269
|
+
|
270
|
+
try:
|
271
|
+
# core_v1.create_namespaced_config_map("wandb", dockerfile_config_map)
|
272
|
+
if self.secret_name:
|
273
|
+
self._create_docker_ecr_config_map(build_job_name, core_v1, repo_uri)
|
274
|
+
batch_v1.create_namespaced_job("wandb", build_job)
|
275
|
+
|
276
|
+
# wait for double the job deadline since it might take time to schedule
|
277
|
+
if not _wait_for_completion(
|
278
|
+
batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
|
279
|
+
):
|
280
|
+
raise Exception(f"Failed to build image in kaniko for job {run_id}")
|
281
|
+
try:
|
282
|
+
logs = batch_v1.read_namespaced_job_log(build_job_name, "wandb")
|
283
|
+
warn_failed_packages_from_build_logs(logs, image_uri)
|
284
|
+
except Exception as e:
|
285
|
+
wandb.termwarn(
|
286
|
+
f"{LOG_PREFIX}Failed to get logs for kaniko job {build_job_name}: {e}"
|
287
|
+
)
|
288
|
+
except Exception as e:
|
289
|
+
wandb.termerror(
|
290
|
+
f"{LOG_PREFIX}Exception when creating Kubernetes resources: {e}\n"
|
291
|
+
)
|
292
|
+
raise e
|
293
|
+
finally:
|
294
|
+
wandb.termlog(f"{LOG_PREFIX}Cleaning up resources")
|
295
|
+
try:
|
296
|
+
# should we clean up the s3 build contexts? can set bucket level policy to auto deletion
|
297
|
+
# core_v1.delete_namespaced_config_map(config_map_name, "wandb")
|
298
|
+
if self.secret_name:
|
299
|
+
self._delete_docker_ecr_config_map(build_job_name, core_v1)
|
300
|
+
batch_v1.delete_namespaced_job(build_job_name, "wandb")
|
301
|
+
except Exception as e:
|
302
|
+
raise LaunchError(f"Exception during Kubernetes resource clean up {e}")
|
303
|
+
|
304
|
+
return image_uri
|
305
|
+
|
306
|
+
def _create_kaniko_job(
|
307
|
+
self,
|
308
|
+
job_name: str,
|
309
|
+
repository: str,
|
310
|
+
image_tag: str,
|
311
|
+
build_context_path: str,
|
312
|
+
) -> "client.V1Job":
|
313
|
+
env = []
|
314
|
+
volume_mounts = []
|
315
|
+
volumes = []
|
316
|
+
if bool(self.secret_name) != bool(self.secret_key):
|
317
|
+
raise LaunchError(
|
318
|
+
"Both secret_name and secret_key or neither must be specified "
|
319
|
+
"for kaniko build. You provided only one of them."
|
320
|
+
)
|
321
|
+
if isinstance(self.registry, ElasticContainerRegistry):
|
322
|
+
env += [
|
323
|
+
client.V1EnvVar(
|
324
|
+
name="AWS_REGION",
|
325
|
+
value=self.registry.environment.region,
|
326
|
+
)
|
327
|
+
]
|
328
|
+
if self.secret_name and self.secret_key:
|
329
|
+
volumes += [
|
330
|
+
client.V1Volume(
|
331
|
+
name="docker-config",
|
332
|
+
config_map=client.V1ConfigMapVolumeSource(
|
333
|
+
name=f"docker-config-{job_name}",
|
334
|
+
),
|
335
|
+
),
|
336
|
+
]
|
337
|
+
volume_mounts += [
|
338
|
+
client.V1VolumeMount(
|
339
|
+
name="docker-config", mount_path="/kaniko/.docker/"
|
340
|
+
),
|
341
|
+
]
|
342
|
+
# TODO: I don't like conditioning on the registry type here. As a
|
343
|
+
# future change I want the registry and environment classes to provide
|
344
|
+
# a list of environment variables and volume mounts that need to be
|
345
|
+
# added to the job. The environment class provides credentials for
|
346
|
+
# build context access, and the registry class provides credentials
|
347
|
+
# for pushing the image. This way we can have separate secrets for
|
348
|
+
# each and support build contexts and registries that require
|
349
|
+
# different credentials.
|
350
|
+
if isinstance(self.registry, ElasticContainerRegistry):
|
351
|
+
mount_path = "/root/.aws"
|
352
|
+
key = "credentials"
|
353
|
+
elif isinstance(self.registry, GoogleArtifactRegistry):
|
354
|
+
mount_path = "/kaniko/.config/gcloud"
|
355
|
+
key = "config.json"
|
356
|
+
env += [
|
357
|
+
client.V1EnvVar(
|
358
|
+
name="GOOGLE_APPLICATION_CREDENTIALS",
|
359
|
+
value="/kaniko/.config/gcloud/config.json",
|
360
|
+
)
|
361
|
+
]
|
362
|
+
else:
|
363
|
+
raise LaunchError(
|
364
|
+
f"Registry type {type(self.registry)} not supported by kaniko"
|
365
|
+
)
|
366
|
+
volume_mounts += [
|
367
|
+
client.V1VolumeMount(
|
368
|
+
name=self.secret_name,
|
369
|
+
mount_path=mount_path,
|
370
|
+
read_only=True,
|
371
|
+
)
|
372
|
+
]
|
373
|
+
volumes += [
|
374
|
+
client.V1Volume(
|
375
|
+
name=self.secret_name,
|
376
|
+
secret=client.V1SecretVolumeSource(
|
377
|
+
secret_name=self.secret_name,
|
378
|
+
items=[client.V1KeyToPath(key=self.secret_key, path=key)],
|
379
|
+
),
|
380
|
+
)
|
381
|
+
]
|
382
|
+
|
383
|
+
args = [
|
384
|
+
f"--context={build_context_path}",
|
385
|
+
"--dockerfile=Dockerfile.wandb-autogenerated",
|
386
|
+
f"--destination={image_tag}",
|
387
|
+
"--cache=true",
|
388
|
+
f"--cache-repo={repository}",
|
389
|
+
"--snapshotMode=redo",
|
390
|
+
"--compressed-caching=false",
|
391
|
+
]
|
392
|
+
container = client.V1Container(
|
393
|
+
name="wandb-container-build",
|
394
|
+
image="gcr.io/kaniko-project/executor:v1.8.0",
|
395
|
+
args=args,
|
396
|
+
volume_mounts=volume_mounts,
|
397
|
+
env=env if env else None,
|
398
|
+
)
|
399
|
+
# Create and configure a spec section
|
400
|
+
template = client.V1PodTemplateSpec(
|
401
|
+
metadata=client.V1ObjectMeta(labels={"wandb": "launch"}),
|
402
|
+
spec=client.V1PodSpec(
|
403
|
+
restart_policy="Never",
|
404
|
+
active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
|
405
|
+
containers=[container],
|
406
|
+
volumes=volumes,
|
407
|
+
),
|
408
|
+
)
|
409
|
+
# Create the specification of job
|
410
|
+
spec = client.V1JobSpec(template=template, backoff_limit=1)
|
411
|
+
job = client.V1Job(
|
412
|
+
api_version="batch/v1",
|
413
|
+
kind="Job",
|
414
|
+
metadata=client.V1ObjectMeta(
|
415
|
+
name=job_name, namespace="wandb", labels={"wandb": "launch"}
|
416
|
+
),
|
417
|
+
spec=spec,
|
418
|
+
)
|
419
|
+
return job
|
wandb/sdk/launch/builder/noop.py
CHANGED
@@ -1,32 +1,51 @@
|
|
1
|
-
|
1
|
+
"""NoOp builder implementation."""
|
2
|
+
from typing import Any, Dict
|
2
3
|
|
3
|
-
from wandb.errors import LaunchError
|
4
4
|
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
5
|
+
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
6
|
+
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
7
|
+
from wandb.sdk.launch.utils import LaunchError
|
5
8
|
|
6
9
|
from .._project_spec import EntryPoint, LaunchProject
|
7
10
|
|
8
11
|
|
9
12
|
class NoOpBuilder(AbstractBuilder):
|
13
|
+
"""NoOp builder."""
|
10
14
|
|
11
15
|
type = "noop"
|
12
16
|
|
13
|
-
def __init__(
|
14
|
-
self
|
17
|
+
def __init__(
|
18
|
+
self,
|
19
|
+
builder_config: Dict[str, Any],
|
20
|
+
environment: AbstractEnvironment,
|
21
|
+
registry: AbstractRegistry,
|
22
|
+
) -> None:
|
23
|
+
"""Initialize a NoOpBuilder."""
|
24
|
+
pass
|
25
|
+
|
26
|
+
@classmethod
|
27
|
+
def from_config(
|
28
|
+
cls,
|
29
|
+
config: dict,
|
30
|
+
environment: AbstractEnvironment,
|
31
|
+
registry: AbstractRegistry,
|
32
|
+
verify: bool = True,
|
33
|
+
) -> "AbstractBuilder":
|
34
|
+
"""Create a noop builder from a config."""
|
35
|
+
return cls(config, environment, registry)
|
36
|
+
|
37
|
+
def verify(self) -> None:
|
38
|
+
"""Verify the builder."""
|
39
|
+
raise LaunchError("Attempted to verify noop builder.")
|
15
40
|
|
16
41
|
def build_image(
|
17
42
|
self,
|
18
43
|
launch_project: LaunchProject,
|
19
|
-
registry: Optional[str],
|
20
44
|
entrypoint: EntryPoint,
|
21
45
|
) -> str:
|
22
|
-
"""Build the image
|
23
|
-
|
24
|
-
Args:
|
25
|
-
launch_project: The project to build.
|
26
|
-
build_ctx_path: The path to the build context.
|
46
|
+
"""Build the image.
|
27
47
|
|
28
|
-
|
29
|
-
The image name.
|
48
|
+
For this we raise a launch error since it can't build.
|
30
49
|
"""
|
31
50
|
raise LaunchError(
|
32
51
|
"Attempted build with noop builder. Specify a builder in your launch config at ~/.config/wandb/launch-config.yaml"
|
@@ -1,10 +1,13 @@
|
|
1
1
|
import json
|
2
2
|
import multiprocessing
|
3
3
|
import os
|
4
|
+
import re
|
4
5
|
import subprocess
|
5
6
|
import sys
|
6
7
|
from typing import List, Optional, Set
|
7
8
|
|
9
|
+
FAILED_PACKAGES_PREFIX = "ERROR: Failed to install: "
|
10
|
+
FAILED_PACKAGES_POSTFIX = ". During automated build process."
|
8
11
|
CORES = multiprocessing.cpu_count()
|
9
12
|
ONLY_INCLUDE = {x for x in os.getenv("WANDB_ONLY_INCLUDE", "").split(",") if x != ""}
|
10
13
|
OPTS = []
|
@@ -21,9 +24,12 @@ else:
|
|
21
24
|
|
22
25
|
|
23
26
|
def install_deps(
|
24
|
-
deps: List[str],
|
27
|
+
deps: List[str],
|
28
|
+
failed: Optional[Set[str]] = None,
|
29
|
+
extra_index: Optional[str] = None,
|
30
|
+
opts: Optional[List[str]] = None,
|
25
31
|
) -> Optional[Set[str]]:
|
26
|
-
"""Install pip dependencies
|
32
|
+
"""Install pip dependencies.
|
27
33
|
|
28
34
|
Arguments:
|
29
35
|
deps {List[str]} -- List of dependencies to install
|
@@ -35,33 +41,45 @@ def install_deps(
|
|
35
41
|
try:
|
36
42
|
# Include only uri if @ is present
|
37
43
|
clean_deps = [d.split("@")[-1].strip() if "@" in d else d for d in deps]
|
38
|
-
|
44
|
+
index_args = ["--extra-index-url", extra_index] if extra_index else []
|
39
45
|
print("installing {}...".format(", ".join(clean_deps)))
|
46
|
+
opts = opts or []
|
47
|
+
args = ["pip", "install"] + opts + clean_deps + index_args
|
40
48
|
sys.stdout.flush()
|
41
|
-
subprocess.check_output(
|
42
|
-
["pip", "install"] + OPTS + clean_deps, stderr=subprocess.STDOUT
|
43
|
-
)
|
44
|
-
if failed is not None and len(failed) > 0:
|
45
|
-
sys.stderr.write(
|
46
|
-
"ERROR: Unable to install: {}".format(", ".join(clean_deps))
|
47
|
-
)
|
48
|
-
sys.stderr.flush()
|
49
|
+
subprocess.check_output(args, stderr=subprocess.STDOUT)
|
49
50
|
return failed
|
50
51
|
except subprocess.CalledProcessError as e:
|
51
52
|
if failed is None:
|
52
53
|
failed = set()
|
53
54
|
num_failed = len(failed)
|
54
|
-
for line in e.output.decode("utf8"):
|
55
|
+
for line in e.output.decode("utf8").splitlines():
|
55
56
|
if line.startswith("ERROR:"):
|
56
|
-
|
57
|
-
|
58
|
-
|
57
|
+
clean_dep = find_package_in_error_string(clean_deps, line)
|
58
|
+
if clean_dep is not None:
|
59
|
+
if clean_dep in deps:
|
60
|
+
failed.add(clean_dep)
|
61
|
+
else:
|
62
|
+
for d in deps:
|
63
|
+
if clean_dep in d:
|
64
|
+
failed.add(d.replace(" ", ""))
|
65
|
+
break
|
66
|
+
if len(set(clean_deps) - failed) == 0:
|
67
|
+
return failed
|
68
|
+
elif len(failed) > num_failed:
|
69
|
+
return install_deps(
|
70
|
+
list(set(clean_deps) - failed),
|
71
|
+
failed,
|
72
|
+
extra_index=extra_index,
|
73
|
+
opts=opts,
|
74
|
+
)
|
59
75
|
else:
|
60
76
|
return failed
|
61
77
|
|
62
78
|
|
63
79
|
def main() -> None:
|
64
|
-
"""Install deps in requirements.frozen.txt"""
|
80
|
+
"""Install deps in requirements.frozen.txt."""
|
81
|
+
extra_index = None
|
82
|
+
torch_reqs = []
|
65
83
|
if os.path.exists("requirements.frozen.txt"):
|
66
84
|
with open("requirements.frozen.txt") as f:
|
67
85
|
print("Installing frozen dependencies...")
|
@@ -72,28 +90,60 @@ def main() -> None:
|
|
72
90
|
# can't pip install wandb==0.*.*.dev1 through pip. Lets just install wandb for now
|
73
91
|
if req.startswith("wandb==") and "dev1" in req:
|
74
92
|
req = "wandb"
|
75
|
-
|
93
|
+
match = re.match(
|
94
|
+
r"torch(vision|audio)?==\d+\.\d+\.\d+(\+(?:cu[\d]{2,3})|(?:cpu))?",
|
95
|
+
req,
|
96
|
+
)
|
97
|
+
if match:
|
98
|
+
variant = match.group(2)
|
99
|
+
if variant:
|
100
|
+
extra_index = (
|
101
|
+
f"https://download.pytorch.org/whl/{variant[1:]}"
|
102
|
+
)
|
103
|
+
torch_reqs.append(req.strip().replace(" ", ""))
|
104
|
+
else:
|
105
|
+
reqs.append(req.strip().replace(" ", ""))
|
76
106
|
else:
|
77
107
|
print(f"Ignoring requirement: {req} from frozen requirements")
|
78
108
|
if len(reqs) >= CORES:
|
79
|
-
deps_failed = install_deps(reqs)
|
109
|
+
deps_failed = install_deps(reqs, opts=OPTS)
|
80
110
|
reqs = []
|
81
111
|
if deps_failed is not None:
|
82
112
|
failed = failed.union(deps_failed)
|
83
113
|
if len(reqs) > 0:
|
84
|
-
deps_failed = install_deps(reqs)
|
114
|
+
deps_failed = install_deps(reqs, opts=OPTS)
|
85
115
|
if deps_failed is not None:
|
86
116
|
failed = failed.union(deps_failed)
|
87
117
|
with open("_wandb_bootstrap_errors.json", "w") as f:
|
88
118
|
f.write(json.dumps({"pip": list(failed)}))
|
89
119
|
if len(failed) > 0:
|
90
120
|
sys.stderr.write(
|
91
|
-
|
121
|
+
FAILED_PACKAGES_PREFIX + ",".join(failed) + FAILED_PACKAGES_POSTFIX
|
92
122
|
)
|
93
123
|
sys.stderr.flush()
|
124
|
+
install_deps(torch_reqs, extra_index=extra_index)
|
94
125
|
else:
|
95
126
|
print("No frozen requirements found")
|
96
127
|
|
97
128
|
|
129
|
+
# hacky way to get the name of the requirement that failed
|
130
|
+
# attempt last word which is the name of the package often
|
131
|
+
# fall back to checking all words in the line for the package name
|
132
|
+
def find_package_in_error_string(deps: List[str], line: str) -> Optional[str]:
|
133
|
+
# if the last word in the error string is in the list of deps, return it
|
134
|
+
last_word = line.split(" ")[-1]
|
135
|
+
if last_word in deps:
|
136
|
+
return last_word
|
137
|
+
# if the last word is not in the list of deps, check all words
|
138
|
+
# TODO: this could report the wrong package if the error string
|
139
|
+
# contains a reference to another package in the deps
|
140
|
+
# before the package that failed to install
|
141
|
+
for word in line.split(" "):
|
142
|
+
if word in deps:
|
143
|
+
return word
|
144
|
+
# if we can't find the package, return None
|
145
|
+
return None
|
146
|
+
|
147
|
+
|
98
148
|
if __name__ == "__main__":
|
99
149
|
main()
|
@@ -0,0 +1,28 @@
|
|
1
|
+
"""Abstract base class for environments."""
|
2
|
+
from abc import ABC, abstractmethod
|
3
|
+
|
4
|
+
|
5
|
+
class AbstractEnvironment(ABC):
|
6
|
+
"""Abstract base class for environments."""
|
7
|
+
|
8
|
+
region: str
|
9
|
+
|
10
|
+
@abstractmethod
|
11
|
+
def verify(self) -> None:
|
12
|
+
"""Verify that the environment is configured correctly."""
|
13
|
+
raise NotImplementedError
|
14
|
+
|
15
|
+
@abstractmethod
|
16
|
+
def upload_file(self, source: str, destination: str) -> None:
|
17
|
+
"""Upload a file from the local filesystem to storage in the environment."""
|
18
|
+
raise NotImplementedError
|
19
|
+
|
20
|
+
@abstractmethod
|
21
|
+
def upload_dir(self, source: str, destination: str) -> None:
|
22
|
+
"""Upload the contents of a directory from the local filesystem to the environment."""
|
23
|
+
raise NotImplementedError
|
24
|
+
|
25
|
+
@abstractmethod
|
26
|
+
def verify_storage_uri(self, uri: str) -> None:
|
27
|
+
"""Verify that the storage URI is configured correctly."""
|
28
|
+
raise NotImplementedError
|