wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +1 -1
- wandb/analytics/sentry.py +1 -0
- wandb/apis/importers/base.py +20 -5
- wandb/apis/importers/mlflow.py +7 -1
- wandb/apis/internal.py +12 -0
- wandb/apis/public.py +247 -1387
- wandb/apis/reports/_panels.py +58 -35
- wandb/beta/workflows.py +6 -7
- wandb/cli/cli.py +130 -60
- wandb/data_types.py +3 -1
- wandb/filesync/dir_watcher.py +21 -27
- wandb/filesync/step_checksum.py +8 -8
- wandb/filesync/step_prepare.py +23 -10
- wandb/filesync/step_upload.py +13 -13
- wandb/filesync/upload_job.py +4 -8
- wandb/integration/cohere/__init__.py +3 -0
- wandb/integration/cohere/cohere.py +21 -0
- wandb/integration/cohere/resolver.py +347 -0
- wandb/integration/gym/__init__.py +4 -6
- wandb/integration/huggingface/__init__.py +3 -0
- wandb/integration/huggingface/huggingface.py +18 -0
- wandb/integration/huggingface/resolver.py +213 -0
- wandb/integration/langchain/wandb_tracer.py +16 -179
- wandb/integration/openai/__init__.py +1 -3
- wandb/integration/openai/openai.py +11 -143
- wandb/integration/openai/resolver.py +111 -38
- wandb/integration/sagemaker/config.py +2 -2
- wandb/integration/tensorboard/log.py +4 -4
- wandb/old/settings.py +24 -7
- wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
- wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
- wandb/proto/wandb_deprecated.py +3 -1
- wandb/sdk/__init__.py +1 -1
- wandb/sdk/artifacts/__init__.py +0 -0
- wandb/sdk/artifacts/artifact.py +2101 -0
- wandb/sdk/artifacts/artifact_download_logger.py +42 -0
- wandb/sdk/artifacts/artifact_manifest.py +67 -0
- wandb/sdk/artifacts/artifact_manifest_entry.py +159 -0
- wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +91 -0
- wandb/sdk/{internal → artifacts}/artifact_saver.py +6 -5
- wandb/sdk/artifacts/artifact_state.py +10 -0
- wandb/sdk/{interface/artifacts/artifact_cache.py → artifacts/artifacts_cache.py} +22 -12
- wandb/sdk/artifacts/exceptions.py +55 -0
- wandb/sdk/artifacts/storage_handler.py +59 -0
- wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +192 -0
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
- wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +134 -0
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +53 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +301 -0
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +67 -0
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +132 -0
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
- wandb/sdk/artifacts/storage_layout.py +6 -0
- wandb/sdk/artifacts/storage_policies/__init__.py +0 -0
- wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +61 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +386 -0
- wandb/sdk/{interface/artifacts/artifact_storage.py → artifacts/storage_policy.py} +5 -57
- wandb/sdk/data_types/_dtypes.py +7 -12
- wandb/sdk/data_types/base_types/json_metadata.py +3 -2
- wandb/sdk/data_types/base_types/media.py +8 -8
- wandb/sdk/data_types/base_types/wb_value.py +12 -13
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +5 -6
- wandb/sdk/data_types/helper_types/classes.py +6 -8
- wandb/sdk/data_types/helper_types/image_mask.py +5 -6
- wandb/sdk/data_types/histogram.py +4 -3
- wandb/sdk/data_types/html.py +3 -4
- wandb/sdk/data_types/image.py +11 -9
- wandb/sdk/data_types/molecule.py +5 -3
- wandb/sdk/data_types/object_3d.py +7 -5
- wandb/sdk/data_types/plotly.py +3 -2
- wandb/sdk/data_types/saved_model.py +11 -11
- wandb/sdk/data_types/trace_tree.py +5 -4
- wandb/sdk/data_types/utils.py +3 -5
- wandb/sdk/data_types/video.py +5 -4
- wandb/sdk/integration_utils/auto_logging.py +215 -0
- wandb/sdk/interface/interface.py +15 -15
- wandb/sdk/internal/file_pusher.py +8 -16
- wandb/sdk/internal/file_stream.py +5 -11
- wandb/sdk/internal/handler.py +13 -1
- wandb/sdk/internal/internal_api.py +287 -13
- wandb/sdk/internal/job_builder.py +119 -30
- wandb/sdk/internal/sender.py +6 -26
- wandb/sdk/internal/settings_static.py +2 -0
- wandb/sdk/internal/system/assets/__init__.py +2 -0
- wandb/sdk/internal/system/assets/gpu.py +42 -0
- wandb/sdk/internal/system/assets/gpu_amd.py +216 -0
- wandb/sdk/internal/system/env_probe_helpers.py +13 -0
- wandb/sdk/internal/system/system_info.py +3 -3
- wandb/sdk/internal/tb_watcher.py +32 -22
- wandb/sdk/internal/thread_local_settings.py +18 -0
- wandb/sdk/launch/_project_spec.py +57 -11
- wandb/sdk/launch/agent/agent.py +147 -65
- wandb/sdk/launch/agent/job_status_tracker.py +34 -0
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
- wandb/sdk/launch/builder/abstract.py +5 -1
- wandb/sdk/launch/builder/build.py +21 -18
- wandb/sdk/launch/builder/docker_builder.py +10 -4
- wandb/sdk/launch/builder/kaniko_builder.py +113 -23
- wandb/sdk/launch/builder/noop.py +6 -3
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +46 -14
- wandb/sdk/launch/environment/aws_environment.py +3 -2
- wandb/sdk/launch/environment/azure_environment.py +124 -0
- wandb/sdk/launch/environment/gcp_environment.py +2 -4
- wandb/sdk/launch/environment/local_environment.py +1 -1
- wandb/sdk/launch/errors.py +19 -0
- wandb/sdk/launch/github_reference.py +32 -19
- wandb/sdk/launch/launch.py +3 -8
- wandb/sdk/launch/launch_add.py +6 -2
- wandb/sdk/launch/loader.py +21 -2
- wandb/sdk/launch/registry/azure_container_registry.py +132 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +39 -5
- wandb/sdk/launch/registry/google_artifact_registry.py +68 -26
- wandb/sdk/launch/registry/local_registry.py +2 -1
- wandb/sdk/launch/runner/abstract.py +24 -3
- wandb/sdk/launch/runner/kubernetes_runner.py +479 -26
- wandb/sdk/launch/runner/local_container.py +103 -51
- wandb/sdk/launch/runner/local_process.py +1 -1
- wandb/sdk/launch/runner/sagemaker_runner.py +60 -10
- wandb/sdk/launch/runner/vertex_runner.py +10 -5
- wandb/sdk/launch/sweeps/__init__.py +7 -9
- wandb/sdk/launch/sweeps/scheduler.py +307 -77
- wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
- wandb/sdk/launch/sweeps/utils.py +82 -35
- wandb/sdk/launch/utils.py +89 -75
- wandb/sdk/lib/_settings_toposort_generated.py +7 -0
- wandb/sdk/lib/capped_dict.py +26 -0
- wandb/sdk/lib/{git.py → gitlib.py} +76 -59
- wandb/sdk/lib/hashutil.py +12 -4
- wandb/sdk/lib/paths.py +96 -8
- wandb/sdk/lib/sock_client.py +2 -2
- wandb/sdk/lib/timer.py +1 -0
- wandb/sdk/service/server.py +22 -9
- wandb/sdk/service/server_sock.py +1 -1
- wandb/sdk/service/service.py +27 -8
- wandb/sdk/verify/verify.py +4 -7
- wandb/sdk/wandb_config.py +2 -6
- wandb/sdk/wandb_init.py +57 -53
- wandb/sdk/wandb_require.py +7 -0
- wandb/sdk/wandb_run.py +61 -223
- wandb/sdk/wandb_settings.py +28 -4
- wandb/testing/relay.py +15 -2
- wandb/util.py +74 -36
- {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/METADATA +15 -9
- {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/RECORD +151 -116
- {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +1 -0
- wandb/integration/langchain/util.py +0 -191
- wandb/sdk/interface/artifacts/__init__.py +0 -33
- wandb/sdk/interface/artifacts/artifact.py +0 -615
- wandb/sdk/interface/artifacts/artifact_manifest.py +0 -131
- wandb/sdk/wandb_artifacts.py +0 -2226
- {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
- {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
- {wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
"""Implementation of the run queue item file saver class."""
|
2
|
+
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
from typing import List, Optional, Union
|
6
|
+
|
7
|
+
import wandb
|
8
|
+
from wandb.sdk.lib import RunDisabled
|
9
|
+
from wandb.sdk.wandb_run import Run
|
10
|
+
|
11
|
+
if sys.version_info >= (3, 8):
|
12
|
+
from typing import Literal
|
13
|
+
else:
|
14
|
+
from typing_extensions import Literal
|
15
|
+
|
16
|
+
FileSubtypes = Literal["warning", "error"]
|
17
|
+
|
18
|
+
|
19
|
+
class RunQueueItemFileSaver:
|
20
|
+
def __init__(
|
21
|
+
self, agent_run: Optional[Union[Run, RunDisabled]], run_queue_item_id: str
|
22
|
+
):
|
23
|
+
self.run_queue_item_id = run_queue_item_id
|
24
|
+
self.run = agent_run
|
25
|
+
|
26
|
+
def save_contents(
|
27
|
+
self, contents: str, fname: str, file_sub_type: FileSubtypes
|
28
|
+
) -> Optional[List[str]]:
|
29
|
+
if not isinstance(self.run, Run):
|
30
|
+
wandb.termwarn("Not saving file contents because agent has no run")
|
31
|
+
return None
|
32
|
+
root_dir = self.run._settings.files_dir
|
33
|
+
saved_run_path = os.path.join(self.run_queue_item_id, file_sub_type, fname)
|
34
|
+
local_path = os.path.join(root_dir, saved_run_path)
|
35
|
+
os.makedirs(os.path.dirname(local_path), exist_ok=True)
|
36
|
+
with open(local_path, "w") as f:
|
37
|
+
f.write(contents)
|
38
|
+
res = self.run.save(local_path, base_path=root_dir, policy="now")
|
39
|
+
if isinstance(res, list):
|
40
|
+
return [saved_run_path]
|
41
|
+
else:
|
42
|
+
wandb.termwarn(
|
43
|
+
f"Failed to save files for run queue item: {self.run_queue_item_id}"
|
44
|
+
)
|
45
|
+
return None
|
@@ -1,12 +1,15 @@
|
|
1
1
|
"""Abstract plugin class defining the interface needed to build container images for W&B Launch."""
|
2
2
|
from abc import ABC, abstractmethod
|
3
|
-
from typing import Any, Dict
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional
|
4
4
|
|
5
5
|
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
6
6
|
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
7
7
|
|
8
8
|
from .._project_spec import EntryPoint, LaunchProject
|
9
9
|
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
|
12
|
+
|
10
13
|
|
11
14
|
class AbstractBuilder(ABC):
|
12
15
|
"""Abstract plugin class defining the interface needed to build container images for W&B Launch."""
|
@@ -63,6 +66,7 @@ class AbstractBuilder(ABC):
|
|
63
66
|
self,
|
64
67
|
launch_project: LaunchProject,
|
65
68
|
entrypoint: EntryPoint,
|
69
|
+
job_tracker: Optional["JobAndRunStatusTracker"] = None,
|
66
70
|
) -> str:
|
67
71
|
"""Build the image for the given project.
|
68
72
|
|
@@ -28,13 +28,8 @@ from .._project_spec import (
|
|
28
28
|
LaunchProject,
|
29
29
|
fetch_and_validate_project,
|
30
30
|
)
|
31
|
-
from ..
|
32
|
-
|
33
|
-
LOG_PREFIX,
|
34
|
-
ExecutionError,
|
35
|
-
LaunchError,
|
36
|
-
resolve_build_and_registry_config,
|
37
|
-
)
|
31
|
+
from ..errors import ExecutionError, LaunchError
|
32
|
+
from ..utils import LAUNCH_CONFIG_FILE, LOG_PREFIX, resolve_build_and_registry_config
|
38
33
|
from .abstract import AbstractBuilder
|
39
34
|
|
40
35
|
_logger = logging.getLogger(__name__)
|
@@ -43,8 +38,6 @@ _logger = logging.getLogger(__name__)
|
|
43
38
|
_GENERATED_DOCKERFILE_NAME = "Dockerfile.wandb-autogenerated"
|
44
39
|
DEFAULT_ENTRYPOINT = "_wandb_default_entrypoint"
|
45
40
|
|
46
|
-
DEFAULT_CUDA_VERSION = "10.0"
|
47
|
-
|
48
41
|
|
49
42
|
def validate_docker_installation() -> None:
|
50
43
|
"""Verify if Docker is installed on host machine."""
|
@@ -108,8 +101,12 @@ FROM {py_base_image} as base
|
|
108
101
|
"""
|
109
102
|
|
110
103
|
# this goes into base_setup in TEMPLATE
|
111
|
-
|
112
|
-
FROM {
|
104
|
+
ACCELERATOR_SETUP_TEMPLATE = """
|
105
|
+
FROM {accelerator_base_image} as base
|
106
|
+
|
107
|
+
# make non-interactive so build doesn't block on questions
|
108
|
+
ENV DEBIAN_FRONTEND=noninteractive
|
109
|
+
|
113
110
|
# TODO: once NVIDIA their linux repository keys for all docker images
|
114
111
|
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$(cat /etc/os-release | grep ^ID= | cut -d "=" -f2 )$(cat /etc/os-release | grep ^VERSION_ID= | cut -d "=" -f2 | sed -e 's/[\".]//g' )/$(uname -i)/3bf863cc.pub
|
115
112
|
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/$(cat /etc/os-release | grep ^ID= | cut -d "=" -f2 )$(cat /etc/os-release | grep ^VERSION_ID= | cut -d "=" -f2 | sed -e 's/[\".]//g' )/$(uname -i)/7fa2af80.pub
|
@@ -189,12 +186,14 @@ def get_base_setup(
|
|
189
186
|
) -> str:
|
190
187
|
"""Fill in the Dockerfile templates for stage 2 of build.
|
191
188
|
|
192
|
-
CPU version is built on python,
|
189
|
+
CPU version is built on python, Accelerator version is built on the user-provided base image.
|
193
190
|
"""
|
194
191
|
python_base_image = f"python:{py_version}-buster"
|
195
|
-
if launch_project.
|
196
|
-
_logger.info(
|
197
|
-
|
192
|
+
if launch_project.accelerator_base_image:
|
193
|
+
_logger.info(
|
194
|
+
f"Using accelerator base image: {launch_project.accelerator_base_image}"
|
195
|
+
)
|
196
|
+
# accelerator base images don't come with python tooling
|
198
197
|
if py_major == "2":
|
199
198
|
python_packages = [
|
200
199
|
f"python{py_version}",
|
@@ -209,8 +208,8 @@ def get_base_setup(
|
|
209
208
|
"python3-pip",
|
210
209
|
"python3-setuptools",
|
211
210
|
]
|
212
|
-
base_setup =
|
213
|
-
|
211
|
+
base_setup = ACCELERATOR_SETUP_TEMPLATE.format(
|
212
|
+
accelerator_base_image=launch_project.accelerator_base_image,
|
214
213
|
python_packages=" \\\n".join(python_packages),
|
215
214
|
py_version=py_version,
|
216
215
|
)
|
@@ -246,6 +245,10 @@ def get_env_vars_dict(launch_project: LaunchProject, api: Api) -> Dict[str, str]
|
|
246
245
|
env_vars["WANDB_NAME"] = launch_project.name
|
247
246
|
if "author" in launch_project.launch_spec and not override_api_key:
|
248
247
|
env_vars["WANDB_USERNAME"] = launch_project.launch_spec["author"]
|
248
|
+
if launch_project.sweep_id:
|
249
|
+
env_vars["WANDB_SWEEP_ID"] = launch_project.sweep_id
|
250
|
+
if launch_project.launch_spec.get("_resume_count"):
|
251
|
+
env_vars["WANDB_RESUME"] = "must"
|
249
252
|
|
250
253
|
# TODO: handle env vars > 32760 characters
|
251
254
|
env_vars["WANDB_CONFIG"] = json.dumps(launch_project.override_config)
|
@@ -288,7 +291,7 @@ def get_requirements_section(launch_project: LaunchProject, builder_type: str) -
|
|
288
291
|
):
|
289
292
|
requirements_files += ["src/requirements.txt"]
|
290
293
|
pip_install_line = "pip install -r requirements.txt"
|
291
|
-
|
294
|
+
elif launch_project.project_dir is not None and os.path.exists(
|
292
295
|
os.path.join(launch_project.project_dir, "requirements.frozen.txt")
|
293
296
|
):
|
294
297
|
# if we have frozen requirements stored, copy those over and have them take precedence
|
@@ -1,10 +1,11 @@
|
|
1
1
|
"""Implementation of the docker builder."""
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
-
from typing import Any, Dict
|
4
|
+
from typing import Any, Dict, Optional
|
5
5
|
|
6
6
|
import wandb
|
7
7
|
import wandb.docker as docker
|
8
|
+
from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
|
8
9
|
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
9
10
|
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
10
11
|
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
@@ -15,11 +16,10 @@ from .._project_spec import (
|
|
15
16
|
create_metadata_file,
|
16
17
|
get_entry_point_command,
|
17
18
|
)
|
19
|
+
from ..errors import LaunchDockerError, LaunchError
|
18
20
|
from ..registry.local_registry import LocalRegistry
|
19
21
|
from ..utils import (
|
20
22
|
LOG_PREFIX,
|
21
|
-
LaunchDockerError,
|
22
|
-
LaunchError,
|
23
23
|
sanitize_wandb_api_key,
|
24
24
|
warn_failed_packages_from_build_logs,
|
25
25
|
)
|
@@ -112,6 +112,7 @@ class DockerBuilder(AbstractBuilder):
|
|
112
112
|
self,
|
113
113
|
launch_project: LaunchProject,
|
114
114
|
entrypoint: EntryPoint,
|
115
|
+
job_tracker: Optional[JobAndRunStatusTracker] = None,
|
115
116
|
) -> str:
|
116
117
|
"""Build the image for the given project.
|
117
118
|
|
@@ -160,9 +161,14 @@ class DockerBuilder(AbstractBuilder):
|
|
160
161
|
context_path=build_ctx_path,
|
161
162
|
platform=self.config.get("platform"),
|
162
163
|
)
|
163
|
-
|
164
|
+
|
165
|
+
warn_failed_packages_from_build_logs(
|
166
|
+
output, image_uri, launch_project.api, job_tracker
|
167
|
+
)
|
164
168
|
|
165
169
|
except docker.DockerError as e:
|
170
|
+
if job_tracker:
|
171
|
+
job_tracker.set_err_stage("build")
|
166
172
|
raise LaunchDockerError(f"Error communicating with docker client: {e}")
|
167
173
|
|
168
174
|
try:
|
@@ -1,15 +1,19 @@
|
|
1
1
|
import base64
|
2
2
|
import json
|
3
3
|
import logging
|
4
|
+
import os
|
4
5
|
import tarfile
|
5
6
|
import tempfile
|
6
7
|
import time
|
7
8
|
from typing import Optional
|
8
9
|
|
9
10
|
import wandb
|
11
|
+
from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
|
10
12
|
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
11
13
|
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
14
|
+
from wandb.sdk.launch.environment.azure_environment import AzureEnvironment
|
12
15
|
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
16
|
+
from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
|
13
17
|
from wandb.sdk.launch.registry.elastic_container_registry import (
|
14
18
|
ElasticContainerRegistry,
|
15
19
|
)
|
@@ -22,9 +26,9 @@ from .._project_spec import (
|
|
22
26
|
create_metadata_file,
|
23
27
|
get_entry_point_command,
|
24
28
|
)
|
29
|
+
from ..errors import LaunchError
|
25
30
|
from ..utils import (
|
26
31
|
LOG_PREFIX,
|
27
|
-
LaunchError,
|
28
32
|
get_kube_context_and_api_client,
|
29
33
|
sanitize_wandb_api_key,
|
30
34
|
warn_failed_packages_from_build_logs,
|
@@ -47,13 +51,21 @@ _logger = logging.getLogger(__name__)
|
|
47
51
|
|
48
52
|
_DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
|
49
53
|
|
54
|
+
SERVICE_ACCOUNT_NAME = os.environ.get("WANDB_LAUNCH_SERVICE_ACCOUNT_NAME", "default")
|
55
|
+
|
56
|
+
if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
|
57
|
+
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
|
58
|
+
NAMESPACE = f.read().strip()
|
59
|
+
else:
|
60
|
+
NAMESPACE = "wandb"
|
61
|
+
|
50
62
|
|
51
63
|
def _wait_for_completion(
|
52
64
|
batch_client: client.BatchV1Api, job_name: str, deadline_secs: Optional[int] = None
|
53
65
|
) -> bool:
|
54
66
|
start_time = time.time()
|
55
67
|
while True:
|
56
|
-
job = batch_client.read_namespaced_job_status(job_name,
|
68
|
+
job = batch_client.read_namespaced_job_status(job_name, NAMESPACE)
|
57
69
|
if job.status.succeeded is not None and job.status.succeeded >= 1:
|
58
70
|
return True
|
59
71
|
elif job.status.failed is not None and job.status.failed >= 1:
|
@@ -75,6 +87,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
75
87
|
build_context_store: str
|
76
88
|
secret_name: Optional[str]
|
77
89
|
secret_key: Optional[str]
|
90
|
+
image: str
|
78
91
|
|
79
92
|
def __init__(
|
80
93
|
self,
|
@@ -84,6 +97,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
84
97
|
build_context_store: str = "",
|
85
98
|
secret_name: str = "",
|
86
99
|
secret_key: str = "",
|
100
|
+
image: str = "gcr.io/kaniko-project/executor:v1.11.0",
|
87
101
|
verify: bool = True,
|
88
102
|
):
|
89
103
|
"""Initialize a KanikoBuilder.
|
@@ -110,6 +124,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
110
124
|
self.build_context_store = build_context_store.rstrip("/")
|
111
125
|
self.secret_name = secret_name
|
112
126
|
self.secret_key = secret_key
|
127
|
+
self.image = image
|
113
128
|
if verify:
|
114
129
|
self.verify()
|
115
130
|
|
@@ -148,6 +163,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
148
163
|
build_job_name = config.get("build-job-name", "wandb-launch-container-build")
|
149
164
|
secret_name = config.get("secret-name", "")
|
150
165
|
secret_key = config.get("secret-key", "")
|
166
|
+
image = config.get("kaniko-image", "gcr.io/kaniko-project/executor:v1.11.0")
|
151
167
|
return cls(
|
152
168
|
environment,
|
153
169
|
registry,
|
@@ -155,6 +171,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
155
171
|
build_job_name=build_job_name,
|
156
172
|
secret_name=secret_name,
|
157
173
|
secret_key=secret_key,
|
174
|
+
image=image,
|
158
175
|
verify=verify,
|
159
176
|
)
|
160
177
|
|
@@ -184,7 +201,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
184
201
|
kind="ConfigMap",
|
185
202
|
metadata=client.V1ObjectMeta(
|
186
203
|
name=f"docker-config-{job_name}",
|
187
|
-
namespace=
|
204
|
+
namespace=NAMESPACE,
|
188
205
|
),
|
189
206
|
data={
|
190
207
|
"config.json": json.dumps(
|
@@ -193,13 +210,13 @@ class KanikoBuilder(AbstractBuilder):
|
|
193
210
|
},
|
194
211
|
immutable=True,
|
195
212
|
)
|
196
|
-
corev1_client.create_namespaced_config_map(
|
213
|
+
corev1_client.create_namespaced_config_map(NAMESPACE, ecr_config_map)
|
197
214
|
|
198
215
|
def _delete_docker_ecr_config_map(
|
199
216
|
self, job_name: str, client: client.CoreV1Api
|
200
217
|
) -> None:
|
201
218
|
if self.secret_name:
|
202
|
-
client.delete_namespaced_config_map(f"docker-config-{job_name}",
|
219
|
+
client.delete_namespaced_config_map(f"docker-config-{job_name}", NAMESPACE)
|
203
220
|
|
204
221
|
def _upload_build_context(self, run_id: str, context_path: str) -> str:
|
205
222
|
# create a tar archive of the build context and upload it to s3
|
@@ -217,6 +234,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
217
234
|
self,
|
218
235
|
launch_project: LaunchProject,
|
219
236
|
entrypoint: EntryPoint,
|
237
|
+
job_tracker: Optional[JobAndRunStatusTracker] = None,
|
220
238
|
) -> str:
|
221
239
|
# TODO: this should probably throw an error if the registry is a local registry
|
222
240
|
if not self.registry:
|
@@ -252,35 +270,52 @@ class KanikoBuilder(AbstractBuilder):
|
|
252
270
|
_, api_client = get_kube_context_and_api_client(
|
253
271
|
kubernetes, launch_project.resource_args
|
254
272
|
)
|
273
|
+
# TODO: use same client as kubernetes_runner.py
|
274
|
+
batch_v1 = client.BatchV1Api(api_client)
|
275
|
+
core_v1 = client.CoreV1Api(api_client)
|
276
|
+
|
255
277
|
build_job_name = f"{self.build_job_name}-{run_id}"
|
256
278
|
|
257
279
|
build_context = self._upload_build_context(run_id, context_path)
|
258
280
|
build_job = self._create_kaniko_job(
|
259
|
-
build_job_name,
|
260
|
-
repo_uri,
|
261
|
-
image_uri,
|
262
|
-
build_context,
|
281
|
+
build_job_name, repo_uri, image_uri, build_context, core_v1
|
263
282
|
)
|
264
283
|
wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
|
265
284
|
|
266
|
-
# TODO: use same client as kuberentes.py
|
267
|
-
batch_v1 = client.BatchV1Api(api_client)
|
268
|
-
core_v1 = client.CoreV1Api(api_client)
|
269
|
-
|
270
285
|
try:
|
286
|
+
if isinstance(self.registry, AzureContainerRegistry):
|
287
|
+
dockerfile_config_map = client.V1ConfigMap(
|
288
|
+
metadata=client.V1ObjectMeta(
|
289
|
+
name=f"docker-config-{build_job_name}"
|
290
|
+
),
|
291
|
+
data={
|
292
|
+
"config.json": json.dumps(
|
293
|
+
{
|
294
|
+
"credHelpers": {
|
295
|
+
f"{self.registry.registry_name}.azurecr.io": "acr-env"
|
296
|
+
}
|
297
|
+
}
|
298
|
+
)
|
299
|
+
},
|
300
|
+
)
|
301
|
+
core_v1.create_namespaced_config_map("wandb", dockerfile_config_map)
|
271
302
|
# core_v1.create_namespaced_config_map("wandb", dockerfile_config_map)
|
272
303
|
if self.secret_name:
|
273
304
|
self._create_docker_ecr_config_map(build_job_name, core_v1, repo_uri)
|
274
|
-
batch_v1.create_namespaced_job(
|
305
|
+
batch_v1.create_namespaced_job(NAMESPACE, build_job)
|
275
306
|
|
276
307
|
# wait for triple the job deadline since it might take time to schedule
|
277
308
|
if not _wait_for_completion(
|
278
309
|
batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
|
279
310
|
):
|
311
|
+
if job_tracker:
|
312
|
+
job_tracker.set_err_stage("build")
|
280
313
|
raise Exception(f"Failed to build image in kaniko for job {run_id}")
|
281
314
|
try:
|
282
|
-
logs = batch_v1.read_namespaced_job_log(build_job_name,
|
283
|
-
warn_failed_packages_from_build_logs(
|
315
|
+
logs = batch_v1.read_namespaced_job_log(build_job_name, NAMESPACE)
|
316
|
+
warn_failed_packages_from_build_logs(
|
317
|
+
logs, image_uri, launch_project.api, job_tracker
|
318
|
+
)
|
284
319
|
except Exception as e:
|
285
320
|
wandb.termwarn(
|
286
321
|
f"{LOG_PREFIX}Failed to get logs for kaniko job {build_job_name}: {e}"
|
@@ -295,9 +330,13 @@ class KanikoBuilder(AbstractBuilder):
|
|
295
330
|
try:
|
296
331
|
# should we clean up the s3 build contexts? can set bucket level policy to auto deletion
|
297
332
|
# core_v1.delete_namespaced_config_map(config_map_name, "wandb")
|
333
|
+
if isinstance(self.registry, AzureContainerRegistry):
|
334
|
+
core_v1.delete_namespaced_config_map(
|
335
|
+
f"docker-config-{build_job_name}", "wandb"
|
336
|
+
)
|
298
337
|
if self.secret_name:
|
299
338
|
self._delete_docker_ecr_config_map(build_job_name, core_v1)
|
300
|
-
batch_v1.delete_namespaced_job(build_job_name,
|
339
|
+
batch_v1.delete_namespaced_job(build_job_name, NAMESPACE)
|
301
340
|
except Exception as e:
|
302
341
|
raise LaunchError(f"Exception during Kubernetes resource clean up {e}")
|
303
342
|
|
@@ -309,6 +348,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
309
348
|
repository: str,
|
310
349
|
image_tag: str,
|
311
350
|
build_context_path: str,
|
351
|
+
core_client: client.CoreV1Api,
|
312
352
|
) -> "client.V1Job":
|
313
353
|
env = []
|
314
354
|
volume_mounts = []
|
@@ -325,6 +365,33 @@ class KanikoBuilder(AbstractBuilder):
|
|
325
365
|
value=self.registry.environment.region,
|
326
366
|
)
|
327
367
|
]
|
368
|
+
# TODO: Refactor all of this environment/registry
|
369
|
+
# specific stuff into methods of those classes.
|
370
|
+
if isinstance(self.environment, AzureEnvironment):
|
371
|
+
# Use the core api to check if the secret exists
|
372
|
+
try:
|
373
|
+
core_client.read_namespaced_secret(
|
374
|
+
"azure-storage-access-key",
|
375
|
+
"wandb",
|
376
|
+
)
|
377
|
+
except Exception as e:
|
378
|
+
raise LaunchError(
|
379
|
+
"Secret azure-storage-access-key does not exist in "
|
380
|
+
"namespace wandb. Please create it with the key password "
|
381
|
+
"set to your azure storage access key."
|
382
|
+
) from e
|
383
|
+
env += [
|
384
|
+
client.V1EnvVar(
|
385
|
+
name="AZURE_STORAGE_ACCESS_KEY",
|
386
|
+
value_from=client.V1EnvVarSource(
|
387
|
+
secret_key_ref=client.V1SecretKeySelector(
|
388
|
+
name="azure-storage-access-key",
|
389
|
+
key="password",
|
390
|
+
)
|
391
|
+
),
|
392
|
+
)
|
393
|
+
]
|
394
|
+
|
328
395
|
if self.secret_name and self.secret_key:
|
329
396
|
volumes += [
|
330
397
|
client.V1Volume(
|
@@ -379,31 +446,54 @@ class KanikoBuilder(AbstractBuilder):
|
|
379
446
|
),
|
380
447
|
)
|
381
448
|
]
|
382
|
-
|
449
|
+
if isinstance(self.registry, AzureContainerRegistry):
|
450
|
+
# Add the docker config map
|
451
|
+
volume_mounts += [
|
452
|
+
client.V1VolumeMount(
|
453
|
+
name="docker-config", mount_path="/kaniko/.docker/"
|
454
|
+
),
|
455
|
+
]
|
456
|
+
volumes += [
|
457
|
+
client.V1Volume(
|
458
|
+
name="docker-config",
|
459
|
+
config_map=client.V1ConfigMapVolumeSource(
|
460
|
+
name=f"docker-config-{job_name}",
|
461
|
+
),
|
462
|
+
),
|
463
|
+
]
|
464
|
+
# Kaniko doesn't want https:// at the beginning of the image tag.
|
465
|
+
destination = image_tag
|
466
|
+
if destination.startswith("https://"):
|
467
|
+
destination = destination.replace("https://", "")
|
383
468
|
args = [
|
384
469
|
f"--context={build_context_path}",
|
385
470
|
"--dockerfile=Dockerfile.wandb-autogenerated",
|
386
|
-
f"--destination={
|
471
|
+
f"--destination={destination}",
|
387
472
|
"--cache=true",
|
388
|
-
f"--cache-repo={repository}",
|
473
|
+
f"--cache-repo={repository.replace('https://', '')}",
|
389
474
|
"--snapshotMode=redo",
|
390
475
|
"--compressed-caching=false",
|
391
476
|
]
|
392
477
|
container = client.V1Container(
|
393
478
|
name="wandb-container-build",
|
394
|
-
image=
|
479
|
+
image=self.image,
|
395
480
|
args=args,
|
396
481
|
volume_mounts=volume_mounts,
|
397
482
|
env=env if env else None,
|
398
483
|
)
|
399
484
|
# Create and configure a spec section
|
485
|
+
labels = {"wandb": "launch"}
|
486
|
+
# This label is required to enable azure workload identity.
|
487
|
+
if isinstance(self.registry, AzureContainerRegistry):
|
488
|
+
labels["azure.workload.identity/use"] = "true"
|
400
489
|
template = client.V1PodTemplateSpec(
|
401
|
-
metadata=client.V1ObjectMeta(labels=
|
490
|
+
metadata=client.V1ObjectMeta(labels=labels),
|
402
491
|
spec=client.V1PodSpec(
|
403
492
|
restart_policy="Never",
|
404
493
|
active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
|
405
494
|
containers=[container],
|
406
495
|
volumes=volumes,
|
496
|
+
service_account_name=SERVICE_ACCOUNT_NAME,
|
407
497
|
),
|
408
498
|
)
|
409
499
|
# Create the specification of job
|
@@ -412,7 +502,7 @@ class KanikoBuilder(AbstractBuilder):
|
|
412
502
|
api_version="batch/v1",
|
413
503
|
kind="Job",
|
414
504
|
metadata=client.V1ObjectMeta(
|
415
|
-
name=job_name, namespace=
|
505
|
+
name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
|
416
506
|
),
|
417
507
|
spec=spec,
|
418
508
|
)
|
wandb/sdk/launch/builder/noop.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
"""NoOp builder implementation."""
|
2
|
-
from typing import Any, Dict
|
2
|
+
from typing import Any, Dict, Optional
|
3
3
|
|
4
4
|
from wandb.sdk.launch.builder.abstract import AbstractBuilder
|
5
5
|
from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
6
|
+
from wandb.sdk.launch.errors import LaunchError
|
6
7
|
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
7
|
-
from wandb.sdk.launch.utils import LaunchError
|
8
8
|
|
9
9
|
from .._project_spec import EntryPoint, LaunchProject
|
10
|
+
from ..agent.job_status_tracker import JobAndRunStatusTracker
|
10
11
|
|
11
12
|
|
12
13
|
class NoOpBuilder(AbstractBuilder):
|
@@ -21,7 +22,8 @@ class NoOpBuilder(AbstractBuilder):
|
|
21
22
|
registry: AbstractRegistry,
|
22
23
|
) -> None:
|
23
24
|
"""Initialize a NoOpBuilder."""
|
24
|
-
|
25
|
+
self.environment = environment
|
26
|
+
self.registry = registry
|
25
27
|
|
26
28
|
@classmethod
|
27
29
|
def from_config(
|
@@ -42,6 +44,7 @@ class NoOpBuilder(AbstractBuilder):
|
|
42
44
|
self,
|
43
45
|
launch_project: LaunchProject,
|
44
46
|
entrypoint: EntryPoint,
|
47
|
+
job_tracker: Optional[JobAndRunStatusTracker] = None,
|
45
48
|
) -> str:
|
46
49
|
"""Build the image.
|
47
50
|
|
@@ -1,5 +1,4 @@
|
|
1
1
|
import json
|
2
|
-
import multiprocessing
|
3
2
|
import os
|
4
3
|
import re
|
5
4
|
import subprocess
|
@@ -8,7 +7,6 @@ from typing import List, Optional, Set
|
|
8
7
|
|
9
8
|
FAILED_PACKAGES_PREFIX = "ERROR: Failed to install: "
|
10
9
|
FAILED_PACKAGES_POSTFIX = ". During automated build process."
|
11
|
-
CORES = multiprocessing.cpu_count()
|
12
10
|
ONLY_INCLUDE = {x for x in os.getenv("WANDB_ONLY_INCLUDE", "").split(",") if x != ""}
|
13
11
|
OPTS = []
|
14
12
|
# If the builder doesn't support buildx no need to use the cache
|
@@ -52,8 +50,16 @@ def install_deps(
|
|
52
50
|
if failed is None:
|
53
51
|
failed = set()
|
54
52
|
num_failed = len(failed)
|
53
|
+
current_pkg = None
|
55
54
|
for line in e.output.decode("utf8").splitlines():
|
56
|
-
|
55
|
+
# Since the name of the package might not be on the same line as
|
56
|
+
# the error msg, keep track of the currently installing package
|
57
|
+
current_pkg = get_current_package(line, clean_deps, current_pkg)
|
58
|
+
|
59
|
+
if "error: subprocess-exited-with-error" in line:
|
60
|
+
if current_pkg is not None:
|
61
|
+
failed.add(current_pkg)
|
62
|
+
elif line.startswith("ERROR:"):
|
57
63
|
clean_dep = find_package_in_error_string(clean_deps, line)
|
58
64
|
if clean_dep is not None:
|
59
65
|
if clean_dep in deps:
|
@@ -84,7 +90,6 @@ def main() -> None:
|
|
84
90
|
with open("requirements.frozen.txt") as f:
|
85
91
|
print("Installing frozen dependencies...")
|
86
92
|
reqs = []
|
87
|
-
failed: Set[str] = set()
|
88
93
|
for req in f:
|
89
94
|
if (
|
90
95
|
len(ONLY_INCLUDE) == 0
|
@@ -109,15 +114,7 @@ def main() -> None:
|
|
109
114
|
reqs.append(req.strip().replace(" ", ""))
|
110
115
|
else:
|
111
116
|
print(f"Ignoring requirement: {req} from frozen requirements")
|
112
|
-
|
113
|
-
deps_failed = install_deps(reqs, opts=OPTS)
|
114
|
-
reqs = []
|
115
|
-
if deps_failed is not None:
|
116
|
-
failed = failed.union(deps_failed)
|
117
|
-
if len(reqs) > 0:
|
118
|
-
deps_failed = install_deps(reqs, opts=OPTS)
|
119
|
-
if deps_failed is not None:
|
120
|
-
failed = failed.union(deps_failed)
|
117
|
+
failed = install_deps(reqs, opts=OPTS) or set()
|
121
118
|
with open("_wandb_bootstrap_errors.json", "w") as f:
|
122
119
|
f.write(json.dumps({"pip": list(failed)}))
|
123
120
|
if len(failed) > 0:
|
@@ -130,6 +127,41 @@ def main() -> None:
|
|
130
127
|
print("No frozen requirements found")
|
131
128
|
|
132
129
|
|
130
|
+
def add_version_to_package_name(deps: List[str], package: str) -> Optional[str]:
|
131
|
+
"""Add the associated version to a package name.
|
132
|
+
|
133
|
+
For example: `my-package` -> `my-package==1.0.0`
|
134
|
+
"""
|
135
|
+
for dep in deps:
|
136
|
+
if dep.split("==")[0] == package:
|
137
|
+
return dep
|
138
|
+
return None
|
139
|
+
|
140
|
+
|
141
|
+
def get_current_package(
|
142
|
+
line: str, deps: List[str], current_pkg: Optional[str]
|
143
|
+
) -> Optional[str]:
|
144
|
+
"""Tries to pull a package name from the line.
|
145
|
+
|
146
|
+
Used to keep track of what the currently-installing package is,
|
147
|
+
in case an error message isn't on the same line as the package
|
148
|
+
"""
|
149
|
+
# "Collecting my-package==1.0.0"
|
150
|
+
if line.startswith("Collecting"):
|
151
|
+
return line.split(" ")[1]
|
152
|
+
# "Building wheel for my-package (pyproject.toml): finished with status 'error'"
|
153
|
+
elif line.strip().startswith("Building wheel") and line.strip().endswith(
|
154
|
+
"finished with status 'error'"
|
155
|
+
):
|
156
|
+
return add_version_to_package_name(deps, line.strip().split(" ")[3])
|
157
|
+
# "Running setup.py install for my-package: finished with status 'error'"
|
158
|
+
elif line.strip().startswith("Running setup.py install") and line.strip().endswith(
|
159
|
+
"finished with status 'error'"
|
160
|
+
):
|
161
|
+
return add_version_to_package_name(deps, line.strip().split(" ")[4][:-1])
|
162
|
+
return current_pkg
|
163
|
+
|
164
|
+
|
133
165
|
# hacky way to get the name of the requirement that failed
|
134
166
|
# attempt last word which is the name of the package often
|
135
167
|
# fall back to checking all words in the line for the package name
|
@@ -143,7 +175,7 @@ def find_package_in_error_string(deps: List[str], line: str) -> Optional[str]:
|
|
143
175
|
# contains a reference to another package in the deps
|
144
176
|
# before the package that failed to install
|
145
177
|
for word in line.split(" "):
|
146
|
-
if word in deps:
|
178
|
+
if word.strip(",") in deps:
|
147
179
|
return word
|
148
180
|
# if we can't find the package, return None
|
149
181
|
return None
|