wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -3
- wandb/apis/__init__.py +1 -3
- wandb/apis/importers/__init__.py +4 -0
- wandb/apis/importers/base.py +312 -0
- wandb/apis/importers/mlflow.py +113 -0
- wandb/apis/internal.py +29 -2
- wandb/apis/normalize.py +6 -5
- wandb/apis/public.py +163 -180
- wandb/apis/reports/_templates.py +6 -12
- wandb/apis/reports/report.py +1 -1
- wandb/apis/reports/runset.py +1 -3
- wandb/apis/reports/util.py +12 -10
- wandb/beta/workflows.py +57 -34
- wandb/catboost/__init__.py +1 -2
- wandb/cli/cli.py +215 -133
- wandb/data_types.py +63 -56
- wandb/docker/__init__.py +78 -16
- wandb/docker/auth.py +21 -22
- wandb/env.py +0 -1
- wandb/errors/__init__.py +8 -116
- wandb/errors/term.py +1 -1
- wandb/fastai/__init__.py +1 -2
- wandb/filesync/dir_watcher.py +8 -5
- wandb/filesync/step_prepare.py +76 -75
- wandb/filesync/step_upload.py +1 -2
- wandb/integration/catboost/__init__.py +1 -3
- wandb/integration/catboost/catboost.py +8 -14
- wandb/integration/fastai/__init__.py +7 -13
- wandb/integration/gym/__init__.py +35 -4
- wandb/integration/keras/__init__.py +3 -3
- wandb/integration/keras/callbacks/metrics_logger.py +9 -8
- wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
- wandb/integration/keras/callbacks/tables_builder.py +31 -19
- wandb/integration/kfp/kfp_patch.py +20 -17
- wandb/integration/kfp/wandb_logging.py +1 -2
- wandb/integration/lightgbm/__init__.py +21 -19
- wandb/integration/prodigy/prodigy.py +6 -7
- wandb/integration/sacred/__init__.py +9 -12
- wandb/integration/sagemaker/__init__.py +1 -3
- wandb/integration/sagemaker/auth.py +0 -1
- wandb/integration/sagemaker/config.py +1 -1
- wandb/integration/sagemaker/resources.py +1 -1
- wandb/integration/sb3/sb3.py +8 -4
- wandb/integration/tensorboard/__init__.py +1 -3
- wandb/integration/tensorboard/log.py +8 -8
- wandb/integration/tensorboard/monkeypatch.py +11 -9
- wandb/integration/tensorflow/__init__.py +1 -3
- wandb/integration/xgboost/__init__.py +4 -6
- wandb/integration/yolov8/__init__.py +7 -0
- wandb/integration/yolov8/yolov8.py +250 -0
- wandb/jupyter.py +31 -35
- wandb/lightgbm/__init__.py +1 -2
- wandb/old/settings.py +2 -2
- wandb/plot/bar.py +1 -2
- wandb/plot/confusion_matrix.py +1 -3
- wandb/plot/histogram.py +1 -2
- wandb/plot/line.py +1 -2
- wandb/plot/line_series.py +4 -4
- wandb/plot/pr_curve.py +17 -20
- wandb/plot/roc_curve.py +1 -3
- wandb/plot/scatter.py +1 -2
- wandb/proto/v3/wandb_server_pb2.py +85 -39
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_server_pb2.py +51 -39
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/__init__.py +1 -3
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/data_types/_dtypes.py +38 -30
- wandb/sdk/data_types/base_types/json_metadata.py +1 -3
- wandb/sdk/data_types/base_types/media.py +17 -17
- wandb/sdk/data_types/base_types/wb_value.py +33 -26
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
- wandb/sdk/data_types/helper_types/classes.py +1 -1
- wandb/sdk/data_types/helper_types/image_mask.py +12 -12
- wandb/sdk/data_types/histogram.py +5 -4
- wandb/sdk/data_types/html.py +1 -2
- wandb/sdk/data_types/image.py +11 -11
- wandb/sdk/data_types/molecule.py +3 -6
- wandb/sdk/data_types/object_3d.py +1 -2
- wandb/sdk/data_types/plotly.py +1 -2
- wandb/sdk/data_types/saved_model.py +10 -8
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/data_logging.py +5 -5
- wandb/sdk/interface/artifacts.py +288 -266
- wandb/sdk/interface/interface.py +2 -3
- wandb/sdk/interface/interface_grpc.py +1 -1
- wandb/sdk/interface/interface_queue.py +1 -1
- wandb/sdk/interface/interface_relay.py +1 -1
- wandb/sdk/interface/interface_shared.py +1 -2
- wandb/sdk/interface/interface_sock.py +1 -1
- wandb/sdk/interface/message_future.py +1 -1
- wandb/sdk/interface/message_future_poll.py +1 -1
- wandb/sdk/interface/router.py +1 -1
- wandb/sdk/interface/router_queue.py +1 -1
- wandb/sdk/interface/router_relay.py +1 -1
- wandb/sdk/interface/router_sock.py +1 -1
- wandb/sdk/interface/summary_record.py +1 -1
- wandb/sdk/internal/artifacts.py +1 -1
- wandb/sdk/internal/datastore.py +2 -3
- wandb/sdk/internal/file_pusher.py +5 -3
- wandb/sdk/internal/file_stream.py +22 -19
- wandb/sdk/internal/handler.py +5 -4
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/internal_api.py +115 -55
- wandb/sdk/internal/job_builder.py +1 -3
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/progress.py +4 -6
- wandb/sdk/internal/sample.py +1 -3
- wandb/sdk/internal/sender.py +28 -16
- wandb/sdk/internal/settings_static.py +5 -5
- wandb/sdk/internal/system/assets/__init__.py +1 -0
- wandb/sdk/internal/system/assets/cpu.py +3 -9
- wandb/sdk/internal/system/assets/disk.py +2 -4
- wandb/sdk/internal/system/assets/gpu.py +6 -18
- wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
- wandb/sdk/internal/system/assets/interfaces.py +50 -22
- wandb/sdk/internal/system/assets/ipu.py +1 -3
- wandb/sdk/internal/system/assets/memory.py +7 -13
- wandb/sdk/internal/system/assets/network.py +4 -8
- wandb/sdk/internal/system/assets/open_metrics.py +283 -0
- wandb/sdk/internal/system/assets/tpu.py +1 -4
- wandb/sdk/internal/system/assets/trainium.py +26 -14
- wandb/sdk/internal/system/system_info.py +2 -3
- wandb/sdk/internal/system/system_monitor.py +52 -20
- wandb/sdk/internal/tb_watcher.py +12 -13
- wandb/sdk/launch/_project_spec.py +54 -65
- wandb/sdk/launch/agent/agent.py +374 -90
- wandb/sdk/launch/builder/abstract.py +61 -7
- wandb/sdk/launch/builder/build.py +81 -110
- wandb/sdk/launch/builder/docker_builder.py +181 -0
- wandb/sdk/launch/builder/kaniko_builder.py +419 -0
- wandb/sdk/launch/builder/noop.py +31 -12
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
- wandb/sdk/launch/environment/abstract.py +28 -0
- wandb/sdk/launch/environment/aws_environment.py +276 -0
- wandb/sdk/launch/environment/gcp_environment.py +271 -0
- wandb/sdk/launch/environment/local_environment.py +65 -0
- wandb/sdk/launch/github_reference.py +3 -8
- wandb/sdk/launch/launch.py +38 -29
- wandb/sdk/launch/launch_add.py +6 -8
- wandb/sdk/launch/loader.py +230 -0
- wandb/sdk/launch/registry/abstract.py +54 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
- wandb/sdk/launch/registry/local_registry.py +62 -0
- wandb/sdk/launch/runner/abstract.py +1 -16
- wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
- wandb/sdk/launch/runner/local_container.py +46 -22
- wandb/sdk/launch/runner/local_process.py +1 -4
- wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
- wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
- wandb/sdk/launch/sweeps/__init__.py +3 -2
- wandb/sdk/launch/sweeps/scheduler.py +132 -39
- wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
- wandb/sdk/launch/utils.py +101 -30
- wandb/sdk/launch/wandb_reference.py +2 -7
- wandb/sdk/lib/_settings_toposort_generate.py +166 -0
- wandb/sdk/lib/_settings_toposort_generated.py +201 -0
- wandb/sdk/lib/apikey.py +2 -4
- wandb/sdk/lib/config_util.py +4 -1
- wandb/sdk/lib/console.py +1 -3
- wandb/sdk/lib/deprecate.py +3 -3
- wandb/sdk/lib/file_stream_utils.py +7 -5
- wandb/sdk/lib/filenames.py +1 -1
- wandb/sdk/lib/filesystem.py +61 -5
- wandb/sdk/lib/git.py +1 -3
- wandb/sdk/lib/import_hooks.py +4 -7
- wandb/sdk/lib/ipython.py +8 -5
- wandb/sdk/lib/lazyloader.py +1 -3
- wandb/sdk/lib/mailbox.py +14 -4
- wandb/sdk/lib/proto_util.py +10 -5
- wandb/sdk/lib/redirect.py +15 -22
- wandb/sdk/lib/reporting.py +1 -3
- wandb/sdk/lib/retry.py +4 -5
- wandb/sdk/lib/runid.py +1 -3
- wandb/sdk/lib/server.py +15 -9
- wandb/sdk/lib/sock_client.py +1 -1
- wandb/sdk/lib/sparkline.py +1 -1
- wandb/sdk/lib/wburls.py +1 -1
- wandb/sdk/service/port_file.py +1 -2
- wandb/sdk/service/service.py +36 -13
- wandb/sdk/service/service_base.py +12 -1
- wandb/sdk/verify/verify.py +5 -7
- wandb/sdk/wandb_artifacts.py +142 -177
- wandb/sdk/wandb_config.py +5 -8
- wandb/sdk/wandb_helper.py +1 -1
- wandb/sdk/wandb_init.py +24 -13
- wandb/sdk/wandb_login.py +9 -9
- wandb/sdk/wandb_manager.py +39 -4
- wandb/sdk/wandb_metric.py +2 -6
- wandb/sdk/wandb_require.py +4 -15
- wandb/sdk/wandb_require_helpers.py +1 -9
- wandb/sdk/wandb_run.py +95 -141
- wandb/sdk/wandb_save.py +1 -3
- wandb/sdk/wandb_settings.py +149 -54
- wandb/sdk/wandb_setup.py +66 -46
- wandb/sdk/wandb_summary.py +13 -10
- wandb/sdk/wandb_sweep.py +6 -7
- wandb/sdk/wandb_watch.py +1 -1
- wandb/sklearn/calculate/confusion_matrix.py +1 -1
- wandb/sklearn/calculate/learning_curve.py +1 -1
- wandb/sklearn/calculate/summary_metrics.py +1 -3
- wandb/sklearn/plot/__init__.py +1 -1
- wandb/sklearn/plot/classifier.py +27 -18
- wandb/sklearn/plot/clusterer.py +4 -5
- wandb/sklearn/plot/regressor.py +4 -4
- wandb/sklearn/plot/shared.py +2 -2
- wandb/sync/__init__.py +1 -3
- wandb/sync/sync.py +4 -5
- wandb/testing/relay.py +11 -10
- wandb/trigger.py +1 -1
- wandb/util.py +106 -81
- wandb/viz.py +4 -4
- wandb/wandb_agent.py +50 -50
- wandb/wandb_controller.py +2 -3
- wandb/wandb_run.py +1 -2
- wandb/wandb_torch.py +1 -1
- wandb/xgboost/__init__.py +1 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
- wandb/sdk/launch/builder/docker.py +0 -80
- wandb/sdk/launch/builder/kaniko.py +0 -393
- wandb/sdk/launch/builder/loader.py +0 -32
- wandb/sdk/launch/runner/loader.py +0 -50
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -1,80 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
import os
|
3
|
-
from typing import Any, Dict, Optional
|
4
|
-
|
5
|
-
import wandb
|
6
|
-
import wandb.docker as docker
|
7
|
-
from wandb.errors import DockerError, LaunchError
|
8
|
-
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
9
|
-
|
10
|
-
from .._project_spec import (
|
11
|
-
EntryPoint,
|
12
|
-
LaunchProject,
|
13
|
-
create_metadata_file,
|
14
|
-
get_entry_point_command,
|
15
|
-
)
|
16
|
-
from ..utils import LOG_PREFIX, sanitize_wandb_api_key
|
17
|
-
from .build import (
|
18
|
-
_create_docker_build_ctx,
|
19
|
-
generate_dockerfile,
|
20
|
-
validate_docker_installation,
|
21
|
-
)
|
22
|
-
|
23
|
-
_GENERATED_DOCKERFILE_NAME = "Dockerfile.wandb-autogenerated"
|
24
|
-
_logger = logging.getLogger(__name__)
|
25
|
-
|
26
|
-
|
27
|
-
class DockerBuilder(AbstractBuilder):
|
28
|
-
type = "docker"
|
29
|
-
|
30
|
-
def __init__(self, builder_config: Dict[str, Any]):
|
31
|
-
super().__init__(builder_config)
|
32
|
-
validate_docker_installation()
|
33
|
-
|
34
|
-
def build_image(
|
35
|
-
self,
|
36
|
-
launch_project: LaunchProject,
|
37
|
-
repository: Optional[str],
|
38
|
-
entrypoint: EntryPoint,
|
39
|
-
) -> str:
|
40
|
-
|
41
|
-
if repository:
|
42
|
-
image_uri = f"{repository}:{launch_project.image_tag}"
|
43
|
-
else:
|
44
|
-
image_uri = launch_project.image_uri
|
45
|
-
entry_cmd = get_entry_point_command(entrypoint, launch_project.override_args)
|
46
|
-
dockerfile_str = generate_dockerfile(
|
47
|
-
launch_project, entrypoint, launch_project.resource, self.type
|
48
|
-
)
|
49
|
-
create_metadata_file(
|
50
|
-
launch_project,
|
51
|
-
image_uri,
|
52
|
-
sanitize_wandb_api_key(" ".join(entry_cmd)),
|
53
|
-
dockerfile_str,
|
54
|
-
)
|
55
|
-
build_ctx_path = _create_docker_build_ctx(launch_project, dockerfile_str)
|
56
|
-
dockerfile = os.path.join(build_ctx_path, _GENERATED_DOCKERFILE_NAME)
|
57
|
-
try:
|
58
|
-
docker.build(tags=[image_uri], file=dockerfile, context_path=build_ctx_path)
|
59
|
-
except DockerError as e:
|
60
|
-
raise LaunchError(f"Error communicating with docker client: {e}")
|
61
|
-
|
62
|
-
try:
|
63
|
-
os.remove(build_ctx_path)
|
64
|
-
except Exception:
|
65
|
-
_msg = f"{LOG_PREFIX}Temporary docker context file {build_ctx_path} was not deleted."
|
66
|
-
_logger.info(_msg)
|
67
|
-
|
68
|
-
if repository:
|
69
|
-
reg, tag = image_uri.split(":")
|
70
|
-
wandb.termlog(f"{LOG_PREFIX}Pushing image {image_uri}")
|
71
|
-
push_resp = docker.push(reg, tag)
|
72
|
-
if push_resp is None:
|
73
|
-
raise LaunchError("Failed to push image to repository")
|
74
|
-
elif (
|
75
|
-
launch_project.resource == "sagemaker"
|
76
|
-
and f"The push refers to repository [{repository}]" not in push_resp
|
77
|
-
):
|
78
|
-
raise LaunchError(f"Unable to push image to ECR, response: {push_resp}")
|
79
|
-
|
80
|
-
return image_uri
|
@@ -1,393 +0,0 @@
|
|
1
|
-
import base64
|
2
|
-
import json
|
3
|
-
import os
|
4
|
-
import tarfile
|
5
|
-
import tempfile
|
6
|
-
import time
|
7
|
-
from typing import Any, Dict, Optional
|
8
|
-
|
9
|
-
import kubernetes # type: ignore
|
10
|
-
from kubernetes import client
|
11
|
-
|
12
|
-
import wandb
|
13
|
-
from wandb.errors import LaunchError
|
14
|
-
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
15
|
-
from wandb.util import get_module
|
16
|
-
|
17
|
-
from .._project_spec import (
|
18
|
-
EntryPoint,
|
19
|
-
LaunchProject,
|
20
|
-
create_metadata_file,
|
21
|
-
get_entry_point_command,
|
22
|
-
)
|
23
|
-
from ..utils import LOG_PREFIX, get_kube_context_and_api_client, sanitize_wandb_api_key
|
24
|
-
from .build import _create_docker_build_ctx, generate_dockerfile
|
25
|
-
|
26
|
-
_DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
|
27
|
-
|
28
|
-
|
29
|
-
def _create_dockerfile_configmap(
|
30
|
-
config_map_name: str, context_path: str
|
31
|
-
) -> client.V1ConfigMap:
|
32
|
-
with open(os.path.join(context_path, "Dockerfile.wandb-autogenerated"), "rb") as f:
|
33
|
-
docker_file_bytes = f.read()
|
34
|
-
|
35
|
-
build_config_map = client.V1ConfigMap(
|
36
|
-
metadata=client.V1ObjectMeta(
|
37
|
-
name=config_map_name, namespace="wandb", labels={"wandb": "launch"}
|
38
|
-
),
|
39
|
-
binary_data={
|
40
|
-
"Dockerfile": base64.b64encode(docker_file_bytes).decode("UTF-8"),
|
41
|
-
},
|
42
|
-
immutable=True,
|
43
|
-
)
|
44
|
-
return build_config_map
|
45
|
-
|
46
|
-
|
47
|
-
def _wait_for_completion(
|
48
|
-
batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
|
49
|
-
) -> bool:
|
50
|
-
start_time = time.time()
|
51
|
-
while True:
|
52
|
-
job = batch_client.read_namespaced_job_status(job_name, "wandb")
|
53
|
-
if job.status.succeeded is not None and job.status.succeeded >= 1:
|
54
|
-
return True
|
55
|
-
elif job.status.failed is not None and job.status.failed >= 1:
|
56
|
-
return False
|
57
|
-
wandb.termlog(f"{LOG_PREFIX}Waiting for build job to complete...")
|
58
|
-
if deadline_secs is not None and time.time() - start_time > deadline_secs:
|
59
|
-
return False
|
60
|
-
|
61
|
-
time.sleep(5)
|
62
|
-
|
63
|
-
|
64
|
-
class KanikoBuilder(AbstractBuilder):
|
65
|
-
type = "kaniko"
|
66
|
-
|
67
|
-
def __init__(self, builder_config: Dict[str, Any]):
|
68
|
-
super().__init__(builder_config)
|
69
|
-
self.config_map_name = builder_config.get(
|
70
|
-
"config-map-name", "wandb-launch-build-context"
|
71
|
-
)
|
72
|
-
self.build_job_name = builder_config.get(
|
73
|
-
"build-job-name", "wandb-launch-container-build"
|
74
|
-
)
|
75
|
-
cloud_provider = builder_config.get("cloud-provider", None)
|
76
|
-
if cloud_provider is None or not isinstance(cloud_provider, str):
|
77
|
-
raise LaunchError("Kaniko builder requires string cloud-provider")
|
78
|
-
self.cloud_provider: str = cloud_provider.lower()
|
79
|
-
self.instance_mode = False
|
80
|
-
if not builder_config.get("credentials"):
|
81
|
-
self.instance_mode = True
|
82
|
-
# if no cloud provider info given, assume running in instance mode
|
83
|
-
# kaniko pod will have access to build context store and ecr
|
84
|
-
wandb.termlog(f"{LOG_PREFIX}Kaniko builder running in instance mode")
|
85
|
-
|
86
|
-
self.build_context_store = builder_config.get("build-context-store", None)
|
87
|
-
if self.build_context_store is None:
|
88
|
-
raise LaunchError("build-context-store is not set in cloud-provider")
|
89
|
-
credentials_config = builder_config.get("credentials", {})
|
90
|
-
self.credentials_secret_name = credentials_config.get("secret-name")
|
91
|
-
self.credentials_secret_mount_path = credentials_config.get("secret-mount-path")
|
92
|
-
if bool(self.credentials_secret_name) != bool(
|
93
|
-
self.credentials_secret_mount_path
|
94
|
-
):
|
95
|
-
raise LaunchError(
|
96
|
-
"Must provide secret-name and secret-mount-path or neither"
|
97
|
-
)
|
98
|
-
|
99
|
-
def _create_docker_ecr_config_map(
|
100
|
-
self, corev1_client: client.CoreV1Api, repository: str
|
101
|
-
) -> None:
|
102
|
-
if self.cloud_provider.lower() == "aws":
|
103
|
-
if not self.instance_mode:
|
104
|
-
ecr_config_map = client.V1ConfigMap(
|
105
|
-
api_version="v1",
|
106
|
-
kind="ConfigMap",
|
107
|
-
metadata=client.V1ObjectMeta(
|
108
|
-
name="docker-config",
|
109
|
-
namespace="wandb",
|
110
|
-
),
|
111
|
-
data={"config.json": json.dumps({"credsStore": "ecr-login"})},
|
112
|
-
immutable=True,
|
113
|
-
)
|
114
|
-
else:
|
115
|
-
wandb.termlog(
|
116
|
-
f"{LOG_PREFIX}Builder not supplied with credentials, assuming instance mode."
|
117
|
-
)
|
118
|
-
d = {
|
119
|
-
"config.json": json.dumps(
|
120
|
-
{"credHelpers": {repository.split(":")[0]: "ecr-login"}}
|
121
|
-
)
|
122
|
-
}
|
123
|
-
ecr_config_map = client.V1ConfigMap(
|
124
|
-
api_version="v1",
|
125
|
-
kind="ConfigMap",
|
126
|
-
metadata=client.V1ObjectMeta(
|
127
|
-
name="docker-config",
|
128
|
-
namespace="wandb",
|
129
|
-
),
|
130
|
-
data=d,
|
131
|
-
immutable=True,
|
132
|
-
)
|
133
|
-
corev1_client.create_namespaced_config_map("wandb", ecr_config_map)
|
134
|
-
|
135
|
-
def _delete_docker_ecr_config_map(self, client: client.CoreV1Api) -> None:
|
136
|
-
client.delete_namespaced_config_map("docker-config", "wandb")
|
137
|
-
|
138
|
-
def _upload_build_context(self, run_id: str, context_path: str) -> str:
|
139
|
-
# creat a tar archive of the build context and upload it to s3
|
140
|
-
context_file = tempfile.NamedTemporaryFile(delete=False)
|
141
|
-
with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
|
142
|
-
context_tgz.add(context_path, arcname=".")
|
143
|
-
context_file.close()
|
144
|
-
if self.cloud_provider.lower() == "aws":
|
145
|
-
boto3 = get_module(
|
146
|
-
"boto3",
|
147
|
-
"AWS cloud provider requires boto3, install with pip install wandb[launch]",
|
148
|
-
)
|
149
|
-
botocore = get_module(
|
150
|
-
"botocore",
|
151
|
-
"aws cloud-provider requires botocore, install with pip install wandb[launch]",
|
152
|
-
)
|
153
|
-
|
154
|
-
s3_client = boto3.client("s3")
|
155
|
-
|
156
|
-
try:
|
157
|
-
s3_client.upload_file(
|
158
|
-
context_file.name, self.build_context_store, f"{run_id}.tgz"
|
159
|
-
)
|
160
|
-
os.remove(context_file.name)
|
161
|
-
except botocore.exceptions.ClientError as e:
|
162
|
-
os.remove(context_file.name)
|
163
|
-
raise LaunchError(f"Failed to upload build context to S3: {e}")
|
164
|
-
return f"s3://{self.build_context_store}/{run_id}.tgz"
|
165
|
-
# TODO: support gcp and azure cloud providers
|
166
|
-
elif self.cloud_provider.lower() == "gcp":
|
167
|
-
storage = get_module(
|
168
|
-
"google.cloud.storage",
|
169
|
-
"gcp provider requires google-cloud-storage, install with pip install wandb[launch]",
|
170
|
-
)
|
171
|
-
|
172
|
-
storage_client = storage.Client()
|
173
|
-
try:
|
174
|
-
bucket = storage_client.bucket(self.build_context_store)
|
175
|
-
blob = bucket.blob(f"{run_id}.tgz")
|
176
|
-
blob.upload_from_filename(context_file.name)
|
177
|
-
os.remove(context_file.name)
|
178
|
-
except Exception as e:
|
179
|
-
os.remove(context_file.name)
|
180
|
-
raise LaunchError(f"Failed to upload build context to GCP: {e}")
|
181
|
-
return f"gs://{self.build_context_store}/{run_id}.tgz"
|
182
|
-
else:
|
183
|
-
raise LaunchError("Unsupported storage provider")
|
184
|
-
|
185
|
-
def check_build_required(
|
186
|
-
self, repository: str, launch_project: LaunchProject
|
187
|
-
) -> bool:
|
188
|
-
# TODO(kyle): Robustify to remote the trycatch
|
189
|
-
try:
|
190
|
-
ecr_provider = self.cloud_provider.lower()
|
191
|
-
if ecr_provider == "aws" and repository:
|
192
|
-
# TODO: pass in registry config
|
193
|
-
region = repository.split(".")[3]
|
194
|
-
boto3 = get_module(
|
195
|
-
"boto3",
|
196
|
-
"AWS ECR requires boto3, install with pip install wandb[launch]",
|
197
|
-
)
|
198
|
-
ecr_client = boto3.client("ecr", region_name=region)
|
199
|
-
repo_name = repository.split("/")[-1]
|
200
|
-
try:
|
201
|
-
ecr_client.describe_images(
|
202
|
-
repositoryName=repo_name,
|
203
|
-
imageIds=[{"imageTag": launch_project.image_tag}],
|
204
|
-
)
|
205
|
-
return False
|
206
|
-
except ecr_client.exceptions.ImageNotFoundException:
|
207
|
-
return True
|
208
|
-
else:
|
209
|
-
return True
|
210
|
-
except Exception as e:
|
211
|
-
wandb.termlog(
|
212
|
-
f"{LOG_PREFIX}Failed while checking if build is required, defaulting to building: {e}"
|
213
|
-
)
|
214
|
-
return True
|
215
|
-
|
216
|
-
def build_image(
|
217
|
-
self,
|
218
|
-
launch_project: LaunchProject,
|
219
|
-
repository: Optional[str],
|
220
|
-
entrypoint: EntryPoint,
|
221
|
-
) -> str:
|
222
|
-
|
223
|
-
if repository is None:
|
224
|
-
raise LaunchError("repository is required for kaniko builder")
|
225
|
-
|
226
|
-
image_uri = f"{repository}:{launch_project.image_tag}"
|
227
|
-
wandb.termlog(f"{LOG_PREFIX}Checking for image {image_uri}")
|
228
|
-
if not self.check_build_required(repository, launch_project):
|
229
|
-
return image_uri
|
230
|
-
entry_cmd = " ".join(
|
231
|
-
get_entry_point_command(entrypoint, launch_project.override_args)
|
232
|
-
)
|
233
|
-
|
234
|
-
# kaniko builder doesn't seem to work with a custom user id, need more investigation
|
235
|
-
dockerfile_str = generate_dockerfile(
|
236
|
-
launch_project, entrypoint, launch_project.resource, self.type
|
237
|
-
)
|
238
|
-
create_metadata_file(
|
239
|
-
launch_project,
|
240
|
-
image_uri,
|
241
|
-
sanitize_wandb_api_key(entry_cmd),
|
242
|
-
sanitize_wandb_api_key(dockerfile_str),
|
243
|
-
)
|
244
|
-
context_path = _create_docker_build_ctx(launch_project, dockerfile_str)
|
245
|
-
run_id = launch_project.run_id
|
246
|
-
|
247
|
-
_, api_client = get_kube_context_and_api_client(
|
248
|
-
kubernetes, launch_project.resource_args
|
249
|
-
)
|
250
|
-
build_job_name = f"{self.build_job_name}-{run_id}"
|
251
|
-
config_map_name = f"{self.config_map_name}-{run_id}"
|
252
|
-
|
253
|
-
build_context = self._upload_build_context(run_id, context_path)
|
254
|
-
dockerfile_config_map = _create_dockerfile_configmap(
|
255
|
-
config_map_name, context_path
|
256
|
-
)
|
257
|
-
build_job = self._create_kaniko_job(
|
258
|
-
build_job_name,
|
259
|
-
dockerfile_config_map.metadata.name,
|
260
|
-
repository,
|
261
|
-
image_uri,
|
262
|
-
build_context,
|
263
|
-
)
|
264
|
-
wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
|
265
|
-
|
266
|
-
# TODO: use same client as kuberentes.py
|
267
|
-
batch_v1 = client.BatchV1Api(api_client)
|
268
|
-
core_v1 = client.CoreV1Api(api_client)
|
269
|
-
|
270
|
-
try:
|
271
|
-
core_v1.create_namespaced_config_map("wandb", dockerfile_config_map)
|
272
|
-
self._create_docker_ecr_config_map(core_v1, repository)
|
273
|
-
batch_v1.create_namespaced_job("wandb", build_job)
|
274
|
-
|
275
|
-
# wait for double the job deadline since it might take time to schedule
|
276
|
-
if not _wait_for_completion(
|
277
|
-
batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
|
278
|
-
):
|
279
|
-
raise Exception(f"Failed to build image in kaniko for job {run_id}")
|
280
|
-
except Exception as e:
|
281
|
-
wandb.termerror(
|
282
|
-
f"{LOG_PREFIX}Exception when creating Kubernetes resources: {e}\n"
|
283
|
-
)
|
284
|
-
finally:
|
285
|
-
wandb.termlog(f"{LOG_PREFIX}Cleaning up resources")
|
286
|
-
try:
|
287
|
-
# should we clean up the s3 build contexts? can set bucket level policy to auto deletion
|
288
|
-
core_v1.delete_namespaced_config_map(config_map_name, "wandb")
|
289
|
-
self._delete_docker_ecr_config_map(core_v1)
|
290
|
-
batch_v1.delete_namespaced_job(build_job_name, "wandb")
|
291
|
-
except Exception as e:
|
292
|
-
raise LaunchError(f"Exception during Kubernetes resource clean up {e}")
|
293
|
-
|
294
|
-
return image_uri
|
295
|
-
|
296
|
-
def _create_kaniko_job(
|
297
|
-
self,
|
298
|
-
job_name: str,
|
299
|
-
config_map_name: str,
|
300
|
-
repository: str,
|
301
|
-
image_tag: str,
|
302
|
-
build_context_path: str,
|
303
|
-
) -> "client.V1Job":
|
304
|
-
env = None
|
305
|
-
if self.instance_mode and self.cloud_provider.lower() == "aws":
|
306
|
-
region = repository.split(".")[3]
|
307
|
-
env = client.V1EnvVar(name="AWS_REGION", value=region)
|
308
|
-
|
309
|
-
volume_mounts = [
|
310
|
-
client.V1VolumeMount(
|
311
|
-
name="build-context-config-map", mount_path="/etc/config"
|
312
|
-
),
|
313
|
-
client.V1VolumeMount(name="docker-config", mount_path="/kaniko/.docker/"),
|
314
|
-
]
|
315
|
-
volumes = [
|
316
|
-
client.V1Volume(
|
317
|
-
name="build-context-config-map",
|
318
|
-
config_map=client.V1ConfigMapVolumeSource(
|
319
|
-
name=config_map_name,
|
320
|
-
),
|
321
|
-
),
|
322
|
-
client.V1Volume(
|
323
|
-
name="docker-config",
|
324
|
-
config_map=client.V1ConfigMapVolumeSource(
|
325
|
-
name="docker-config",
|
326
|
-
),
|
327
|
-
),
|
328
|
-
]
|
329
|
-
if (
|
330
|
-
self.credentials_secret_name is not None
|
331
|
-
and self.credentials_secret_mount_path is not None
|
332
|
-
):
|
333
|
-
volume_mounts += [
|
334
|
-
client.V1VolumeMount(
|
335
|
-
name=self.credentials_secret_name,
|
336
|
-
mount_path=self.credentials_secret_mount_path,
|
337
|
-
read_only=True,
|
338
|
-
)
|
339
|
-
]
|
340
|
-
volumes += [
|
341
|
-
client.V1Volume(
|
342
|
-
name=self.credentials_secret_name,
|
343
|
-
secret=client.V1SecretVolumeSource(
|
344
|
-
secret_name=self.credentials_secret_name
|
345
|
-
),
|
346
|
-
)
|
347
|
-
]
|
348
|
-
# Configurate Pod template container
|
349
|
-
args = [
|
350
|
-
f"--context={build_context_path}",
|
351
|
-
"--dockerfile=/etc/config/Dockerfile",
|
352
|
-
f"--destination={image_tag}",
|
353
|
-
"--cache=true",
|
354
|
-
f"--cache-repo={repository}",
|
355
|
-
"--snapshotMode=redo",
|
356
|
-
]
|
357
|
-
if env is not None:
|
358
|
-
container = client.V1Container(
|
359
|
-
name="wandb-container-build",
|
360
|
-
image="gcr.io/kaniko-project/executor:v1.8.0",
|
361
|
-
args=args,
|
362
|
-
volume_mounts=volume_mounts,
|
363
|
-
env=[env],
|
364
|
-
)
|
365
|
-
else:
|
366
|
-
container = client.V1Container(
|
367
|
-
name="wandb-container-build",
|
368
|
-
image="gcr.io/kaniko-project/executor:v1.8.0",
|
369
|
-
args=args,
|
370
|
-
volume_mounts=volume_mounts,
|
371
|
-
)
|
372
|
-
# Create and configure a spec section
|
373
|
-
template = client.V1PodTemplateSpec(
|
374
|
-
metadata=client.V1ObjectMeta(labels={"wandb": "launch"}),
|
375
|
-
spec=client.V1PodSpec(
|
376
|
-
restart_policy="Never",
|
377
|
-
active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
|
378
|
-
containers=[container],
|
379
|
-
volumes=volumes,
|
380
|
-
),
|
381
|
-
)
|
382
|
-
# Create the specification of job
|
383
|
-
spec = client.V1JobSpec(template=template, backoff_limit=1)
|
384
|
-
job = client.V1Job(
|
385
|
-
api_version="batch/v1",
|
386
|
-
kind="Job",
|
387
|
-
metadata=client.V1ObjectMeta(
|
388
|
-
name=job_name, namespace="wandb", labels={"wandb": "launch"}
|
389
|
-
),
|
390
|
-
spec=spec,
|
391
|
-
)
|
392
|
-
|
393
|
-
return job
|
@@ -1,32 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
from typing import Any, Dict, List
|
3
|
-
|
4
|
-
from wandb.errors import LaunchError
|
5
|
-
|
6
|
-
from .abstract import AbstractBuilder
|
7
|
-
|
8
|
-
__logger__ = logging.getLogger(__name__)
|
9
|
-
|
10
|
-
|
11
|
-
_WANDB_BUILDERS: List[str] = ["kaniko", "docker", "noop"]
|
12
|
-
|
13
|
-
|
14
|
-
def load_builder(builder_config: Dict[str, Any]) -> AbstractBuilder:
|
15
|
-
builder_name = builder_config.get("type", "docker")
|
16
|
-
if builder_name == "kaniko":
|
17
|
-
from .kaniko import KanikoBuilder
|
18
|
-
|
19
|
-
return KanikoBuilder(builder_config)
|
20
|
-
elif builder_name == "docker":
|
21
|
-
from .docker import DockerBuilder
|
22
|
-
|
23
|
-
return DockerBuilder(builder_config)
|
24
|
-
elif builder_name == "noop":
|
25
|
-
from .noop import NoOpBuilder
|
26
|
-
|
27
|
-
return NoOpBuilder(builder_config)
|
28
|
-
raise LaunchError(
|
29
|
-
"Builder name not among available builders. Available builders: {} ".format(
|
30
|
-
",".join(_WANDB_BUILDERS)
|
31
|
-
)
|
32
|
-
)
|
@@ -1,50 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
from typing import Any, Dict, List
|
3
|
-
|
4
|
-
from wandb.apis.internal import Api
|
5
|
-
from wandb.errors import LaunchError
|
6
|
-
|
7
|
-
from .abstract import AbstractRunner
|
8
|
-
|
9
|
-
__logger__ = logging.getLogger(__name__)
|
10
|
-
|
11
|
-
|
12
|
-
# Statically register backend defined in wandb
|
13
|
-
WANDB_RUNNERS: List[str] = [
|
14
|
-
"local-container",
|
15
|
-
"local-process",
|
16
|
-
"gcp-vertex",
|
17
|
-
"sagemaker",
|
18
|
-
"kubernetes",
|
19
|
-
]
|
20
|
-
|
21
|
-
|
22
|
-
def load_backend(
|
23
|
-
backend_name: str, api: Api, backend_config: Dict[str, Any]
|
24
|
-
) -> AbstractRunner:
|
25
|
-
# Static backends
|
26
|
-
if backend_name in ["local", "local-container"]:
|
27
|
-
from .local_container import LocalContainerRunner
|
28
|
-
|
29
|
-
return LocalContainerRunner(api, backend_config)
|
30
|
-
elif backend_name in ["bare", "local-process"]:
|
31
|
-
from .local_process import LocalProcessRunner
|
32
|
-
|
33
|
-
return LocalProcessRunner(api, backend_config)
|
34
|
-
elif backend_name == "gcp-vertex":
|
35
|
-
from .gcp_vertex import VertexRunner
|
36
|
-
|
37
|
-
return VertexRunner(api, backend_config)
|
38
|
-
elif backend_name == "sagemaker":
|
39
|
-
from .aws import AWSSagemakerRunner
|
40
|
-
|
41
|
-
return AWSSagemakerRunner(api, backend_config)
|
42
|
-
elif backend_name == "kubernetes":
|
43
|
-
from .kubernetes import KubernetesRunner
|
44
|
-
|
45
|
-
return KubernetesRunner(api, backend_config)
|
46
|
-
raise LaunchError(
|
47
|
-
"Resource name not among available resources. Available resources: {} ".format(
|
48
|
-
",".join(WANDB_RUNNERS)
|
49
|
-
)
|
50
|
-
)
|
File without changes
|
File without changes
|
File without changes
|