PyPI - wandb - Versions diffs - 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl - Mend

wandb 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl

Files changed (102) hide show

wandb/__init__.py +1 -1
wandb/analytics/sentry.py +1 -0
wandb/apis/internal.py +3 -0
wandb/apis/public.py +18 -20
wandb/beta/workflows.py +5 -6
wandb/cli/cli.py +27 -27
wandb/data_types.py +2 -0
wandb/integration/langchain/wandb_tracer.py +16 -179
wandb/integration/sagemaker/config.py +2 -2
wandb/integration/tensorboard/log.py +4 -4
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/proto/wandb_deprecated.py +3 -1
wandb/sdk/__init__.py +1 -4
wandb/sdk/artifacts/__init__.py +0 -14
wandb/sdk/artifacts/artifact.py +1757 -277
wandb/sdk/artifacts/artifact_manifest_entry.py +26 -6
wandb/sdk/artifacts/artifact_state.py +10 -0
wandb/sdk/artifacts/artifacts_cache.py +7 -8
wandb/sdk/artifacts/exceptions.py +4 -4
wandb/sdk/artifacts/storage_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/azure_handler.py +16 -6
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/http_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/multi_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/s3_handler.py +35 -32
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +5 -9
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -2
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +2 -2
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +24 -16
wandb/sdk/artifacts/storage_policy.py +3 -3
wandb/sdk/data_types/_dtypes.py +7 -12
wandb/sdk/data_types/base_types/json_metadata.py +2 -2
wandb/sdk/data_types/base_types/media.py +5 -6
wandb/sdk/data_types/base_types/wb_value.py +12 -13
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +4 -5
wandb/sdk/data_types/helper_types/classes.py +5 -8
wandb/sdk/data_types/helper_types/image_mask.py +4 -5
wandb/sdk/data_types/histogram.py +3 -3
wandb/sdk/data_types/html.py +3 -4
wandb/sdk/data_types/image.py +4 -5
wandb/sdk/data_types/molecule.py +2 -2
wandb/sdk/data_types/object_3d.py +3 -3
wandb/sdk/data_types/plotly.py +2 -2
wandb/sdk/data_types/saved_model.py +7 -8
wandb/sdk/data_types/trace_tree.py +4 -4
wandb/sdk/data_types/video.py +4 -4
wandb/sdk/interface/interface.py +8 -10
wandb/sdk/internal/file_stream.py +2 -3
wandb/sdk/internal/internal_api.py +99 -4
wandb/sdk/internal/job_builder.py +15 -7
wandb/sdk/internal/sender.py +4 -0
wandb/sdk/internal/settings_static.py +1 -0
wandb/sdk/launch/_project_spec.py +9 -7
wandb/sdk/launch/agent/agent.py +115 -58
wandb/sdk/launch/agent/job_status_tracker.py +34 -0
wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
wandb/sdk/launch/builder/abstract.py +5 -1
wandb/sdk/launch/builder/build.py +16 -10
wandb/sdk/launch/builder/docker_builder.py +9 -2
wandb/sdk/launch/builder/kaniko_builder.py +108 -22
wandb/sdk/launch/builder/noop.py +3 -1
wandb/sdk/launch/environment/aws_environment.py +2 -1
wandb/sdk/launch/environment/azure_environment.py +124 -0
wandb/sdk/launch/github_reference.py +30 -18
wandb/sdk/launch/launch.py +1 -1
wandb/sdk/launch/loader.py +15 -0
wandb/sdk/launch/registry/azure_container_registry.py +132 -0
wandb/sdk/launch/registry/elastic_container_registry.py +38 -4
wandb/sdk/launch/registry/google_artifact_registry.py +46 -7
wandb/sdk/launch/runner/abstract.py +19 -3
wandb/sdk/launch/runner/kubernetes_runner.py +111 -47
wandb/sdk/launch/runner/local_container.py +101 -48
wandb/sdk/launch/runner/sagemaker_runner.py +59 -9
wandb/sdk/launch/runner/vertex_runner.py +8 -4
wandb/sdk/launch/sweeps/scheduler.py +102 -27
wandb/sdk/launch/sweeps/utils.py +21 -0
wandb/sdk/launch/utils.py +19 -7
wandb/sdk/lib/_settings_toposort_generated.py +3 -0
wandb/sdk/service/server.py +22 -9
wandb/sdk/service/service.py +27 -8
wandb/sdk/verify/verify.py +6 -9
wandb/sdk/wandb_config.py +2 -4
wandb/sdk/wandb_init.py +2 -0
wandb/sdk/wandb_require.py +7 -0
wandb/sdk/wandb_run.py +32 -35
wandb/sdk/wandb_settings.py +10 -3
wandb/testing/relay.py +15 -2
wandb/util.py +55 -23
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/METADATA +11 -8
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/RECORD +97 -97
wandb/integration/langchain/util.py +0 -191
wandb/sdk/artifacts/invalid_artifact.py +0 -23
wandb/sdk/artifacts/lazy_artifact.py +0 -162
wandb/sdk/artifacts/local_artifact.py +0 -719
wandb/sdk/artifacts/public_artifact.py +0 -1188
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +0 -0
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/agent/agent.py CHANGED Viewed

@@ -5,7 +5,6 @@ import pprint
 import threading
 import time
 import traceback
-from dataclasses import dataclass
 from multiprocessing import Event
 from multiprocessing.pool import ThreadPool
 from typing import Any, Dict, List, Optional, Union
@@ -13,7 +12,7 @@ from typing import Any, Dict, List, Optional, Union
 import wandb
 from wandb.apis.internal import Api
 from wandb.errors import CommError
-from wandb.sdk.launch._project_spec import LaunchProject
+from wandb.sdk.launch.launch_add import launch_add
 from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
 from wandb.sdk.launch.sweeps.scheduler import Scheduler
 from wandb.sdk.lib import runid
@@ -22,8 +21,9 @@ from .. import loader
 from .._project_spec import create_project_from_spec, fetch_and_validate_project
 from ..builder.build import construct_builder_args
 from ..errors import LaunchDockerError, LaunchError
-from ..runner.abstract import AbstractRun
 from ..utils import LAUNCH_DEFAULT_PROJECT, LOG_PREFIX, PROJECT_SYNCHRONOUS
+from .job_status_tracker import JobAndRunStatusTracker
+from .run_queue_item_file_saver import RunQueueItemFileSaver
 AGENT_POLLING_INTERVAL = 10
 ACTIVE_SWEEP_POLLING_INTERVAL = 1  # more frequent when we know we have jobs
@@ -36,28 +36,9 @@ HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
 MAX_THREADS = 64
-_logger = logging.getLogger(__name__)
-@dataclass
-class JobAndRunStatus:
-    run_queue_item_id: str
-    run_id: Optional[str] = None
-    project: Optional[str] = None
-    entity: Optional[str] = None
-    run: Optional[AbstractRun] = None
-    failed_to_start: bool = False
-    completed_status: Optional[str] = None
-    is_scheduler: bool = False
-    @property
-    def job_completed(self) -> bool:
-        return self.failed_to_start or self.completed_status is not None
+MAX_RESUME_COUNT = 5
-    def update_run_info(self, launch_project: LaunchProject) -> None:
-        self.run_id = launch_project.run_id
-        self.project = launch_project.target_project
-        self.entity = launch_project.target_entity
+_logger = logging.getLogger(__name__)
 def _convert_access(access: str) -> str:
@@ -139,7 +120,7 @@ class LaunchAgent:
         self._api = api
         self._base_url = self._api.settings().get("base_url")
         self._ticks = 0
-        self._jobs: Dict[int, JobAndRunStatus] = {}
+        self._jobs: Dict[int, JobAndRunStatusTracker] = {}
         self._jobs_lock = threading.Lock()
         self._jobs_event = Event()
         self._jobs_event.set()
@@ -180,22 +161,31 @@ class LaunchAgent:
             self._id, self.gorilla_supports_agents
         )
         self._name = agent_response["name"]
-        if self.gorilla_supports_agents:
-            self._init_agent_run()
+        self._init_agent_run()
-    def fail_run_queue_item(self, run_queue_item_id: str) -> None:
+    def fail_run_queue_item(
+        self,
+        run_queue_item_id: str,
+        message: str,
+        phase: str,
+        files: Optional[List[str]] = None,
+    ) -> None:
         if self._gorilla_supports_fail_run_queue_items:
-            self._api.fail_run_queue_item(run_queue_item_id)
+            self._api.fail_run_queue_item(run_queue_item_id, message, phase, files)
     def _init_agent_run(self) -> None:
-        settings = wandb.Settings(silent=True, disable_git=True)
-        wandb.init(
-            project=self._project,
-            entity=self._entity,
-            settings=settings,
-            id=self._name,
-            job_type=HIDDEN_AGENT_RUN_TYPE,
-        )
+        # TODO: has it been long enough that all backends support agents?
+        if self.gorilla_supports_agents:
+            settings = wandb.Settings(silent=True, disable_git=True)
+            self._wandb_run = wandb.init(
+                project=self._project,
+                entity=self._entity,
+                settings=settings,
+                id=self._name,
+                job_type=HIDDEN_AGENT_RUN_TYPE,
+            )
+        else:
+            self._wandb_run = None
     @property
     def thread_ids(self) -> List[int]:
@@ -279,24 +269,35 @@ class LaunchAgent:
         """Removes the job from our list for now."""
         job_and_run_status = self._jobs[thread_id]
         if (
-            not job_and_run_status.run_id
-            or not job_and_run_status.project
-            or exception is not None
+            job_and_run_status.entity is not None
+            and job_and_run_status.entity != self._entity
         ):
-            self.fail_run_queue_item(job_and_run_status.run_queue_item_id)
-        elif job_and_run_status.entity != self._entity:
             _logger.info(
                 "Skipping check for completed run status because run is on a different entity than agent"
             )
+        elif exception is not None:
+            tb_str = traceback.format_exception(
+                type(exception), value=exception, tb=exception.__traceback__
+            )
+            fnames = job_and_run_status.saver.save_contents(
+                "".join(tb_str), "error.log", "error"
+            )
+            self.fail_run_queue_item(
+                job_and_run_status.run_queue_item_id,
+                str(exception),
+                job_and_run_status.err_stage,
+                fnames,
+            )
         elif job_and_run_status.completed_status not in ["stopped", "failed"]:
             _logger.info(
                 "Skipping check for completed run status because run was successful"
             )
-        else:
+        elif job_and_run_status.run is not None:
             run_info = None
             # sweep runs exist but have no info before they are started
             # so run_info returned will be None
             # normal runs just throw a comm error
+            # TODO: make more clear
             try:
                 run_info = self._api.get_run_info(
                     self._entity, job_and_run_status.project, job_and_run_status.run_id
@@ -305,7 +306,22 @@ class LaunchAgent:
             except CommError:
                 pass
             if run_info is None:
-                self.fail_run_queue_item(job_and_run_status.run_queue_item_id)
+                _msg = "The submitted run was not successfully started"
+                fnames = None
+                logs = job_and_run_status.run.get_logs()
+                if logs:
+                    fnames = job_and_run_status.saver.save_contents(
+                        logs, "error.log", "error"
+                    )
+                self.fail_run_queue_item(
+                    job_and_run_status.run_queue_item_id, _msg, "run", fnames
+                )
+        else:
+            _logger.info("Finish thread id had no exception, ror run")
+            wandb._sentry.exception(
+                "launch agent called finish thread id on thread without run or exception"
+            )
         # TODO:  keep logs or something for the finished jobs
         with self._jobs_lock:
@@ -322,7 +338,9 @@ class LaunchAgent:
         if job.job_completed:
             self.finish_thread_id(thread_id)
-    def run_job(self, job: Dict[str, Any]) -> None:
+    def run_job(
+        self, job: Dict[str, Any], queue: str, file_saver: RunQueueItemFileSaver
+    ) -> None:
         """Set up project and run the job.
         Arguments:
@@ -348,6 +366,8 @@ class LaunchAgent:
                 job,
                 self.default_config,
                 self._api,
+                queue,
+                file_saver,
             ),
         )
@@ -401,6 +421,9 @@ class LaunchAgent:
                     for queue in self._queues:
                         job = self.pop_from_queue(queue)
                         if job:
+                            file_saver = RunQueueItemFileSaver(
+                                self._wandb_run, job["runQueueItemId"]
+                            )
                             if _is_scheduler_job(job.get("runSpec")):
                                 # If job is a scheduler, and we are already at the cap, ignore,
                                 #    don't ack, and it will be pushed back onto the queue in 1 min
@@ -413,13 +436,25 @@ class LaunchAgent:
                                     continue
                             try:
-                                self.run_job(job)
+                                self.run_job(job, queue, file_saver)
                             except Exception as e:
                                 wandb.termerror(
                                     f"{LOG_PREFIX}Error running job: {traceback.format_exc()}"
                                 )
                                 wandb._sentry.exception(e)
-                                self.fail_run_queue_item(job["runQueueItemId"])
+                                # always the first phase, because we only enter phase 2 within the thread
+                                files = file_saver.save_contents(
+                                    contents=traceback.format_exc(),
+                                    fname="error.log",
+                                    file_sub_type="error",
+                                )
+                                self.fail_run_queue_item(
+                                    run_queue_item_id=job["runQueueItemId"],
+                                    message=str(e),
+                                    phase="agent",
+                                    files=files,
+                                )
                 for thread_id in self.thread_ids:
                     self._update_finished(thread_id)
@@ -454,11 +489,18 @@ class LaunchAgent:
         job: Dict[str, Any],
         default_config: Dict[str, Any],
         api: Api,
+        queue: str,
+        file_saver: RunQueueItemFileSaver,
     ) -> None:
         thread_id = threading.current_thread().ident
         assert thread_id is not None
+        job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)
+        with self._jobs_lock:
+            self._jobs[thread_id] = job_tracker
         try:
-            self._thread_run_job(launch_spec, job, default_config, api, thread_id)
+            self._thread_run_job(
+                launch_spec, job, default_config, api, queue, thread_id, job_tracker
+            )
         except LaunchDockerError as e:
             wandb.termerror(
                 f"{LOG_PREFIX}agent {self._name} encountered an issue while starting Docker, see above output for details."
@@ -476,11 +518,10 @@ class LaunchAgent:
         job: Dict[str, Any],
         default_config: Dict[str, Any],
         api: Api,
+        queue: str,
         thread_id: int,
+        job_tracker: JobAndRunStatusTracker,
     ) -> None:
-        job_tracker = JobAndRunStatus(job["runQueueItemId"])
-        with self._jobs_lock:
-            self._jobs[thread_id] = job_tracker
         project = create_project_from_spec(launch_spec, api)
         job_tracker.update_run_info(project)
         _logger.info("Fetching and validating project...")
@@ -505,8 +546,7 @@ class LaunchAgent:
         backend = loader.runner_from_config(resource, api, backend_config, environment)
         _logger.info("Backend loaded...")
         api.ack_run_queue_item(job["runQueueItemId"], project.run_id)
-        run = backend.run(project, builder)
+        run = backend.run(project, builder, job_tracker)
         if _is_scheduler_job(launch_spec):
             with self._jobs_lock:
                 self._jobs[thread_id].is_scheduler = True
@@ -522,15 +562,17 @@ class LaunchAgent:
         with self._jobs_lock:
             job_tracker.run = run
         while self._jobs_event.is_set():
-            if self._check_run_finished(job_tracker):
+            if self._check_run_finished(job_tracker, launch_spec):
                 return
             time.sleep(AGENT_POLLING_INTERVAL)
         # temp: for local, kill all jobs. we don't yet have good handling for different
         # types of runners in general
-        if isinstance(run, LocalSubmittedRun):
-            run.command_proc.kill()
+        if isinstance(run, LocalSubmittedRun) and run._command_proc is not None:
+            run._command_proc.kill()
-    def _check_run_finished(self, job_tracker: JobAndRunStatus) -> bool:
+    def _check_run_finished(
+        self, job_tracker: JobAndRunStatusTracker, launch_spec: Dict[str, Any]
+    ) -> bool:
         if job_tracker.completed_status:
             return True
@@ -547,13 +589,28 @@ class LaunchAgent:
         try:
             run = job_tracker.run
             status = run.get_status().state
-            if status in ["stopped", "failed", "finished"]:
+            if status in ["stopped", "failed", "finished", "preempted"]:
                 if job_tracker.is_scheduler:
                     wandb.termlog(f"{LOG_PREFIX}Scheduler finished with ID: {run.id}")
                 else:
                     wandb.termlog(f"{LOG_PREFIX}Job finished with ID: {run.id}")
                 with self._jobs_lock:
                     job_tracker.completed_status = status
+                if status == "preempted":
+                    config = launch_spec.copy()
+                    config["run_id"] = job_tracker.run_id
+                    config["_resume_count"] = config.get("_resume_count", 0) + 1
+                    if config["_resume_count"] > MAX_RESUME_COUNT:
+                        wandb.termlog(
+                            f"{LOG_PREFIX}Run {job_tracker.run_id} has already resumed {MAX_RESUME_COUNT} times."
+                        )
+                        return True
+                    wandb.termlog(f"{LOG_PREFIX}Requeueing run {job_tracker.run_id}.")
+                    launch_add(
+                        config=config,
+                        project_queue=self._project,
+                        queue_name=job_tracker.queue,
+                    )
                 return True
             return False
         except LaunchError as e:

wandb/sdk/launch/agent/job_status_tracker.py ADDED Viewed

@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import Optional
+from wandb.sdk.launch._project_spec import LaunchProject
+from ..runner.abstract import AbstractRun
+from .run_queue_item_file_saver import RunQueueItemFileSaver
+@dataclass
+class JobAndRunStatusTracker:
+    run_queue_item_id: str
+    queue: str
+    saver: RunQueueItemFileSaver
+    run_id: Optional[str] = None
+    project: Optional[str] = None
+    entity: Optional[str] = None
+    run: Optional[AbstractRun] = None
+    failed_to_start: bool = False
+    completed_status: Optional[str] = None
+    is_scheduler: bool = False
+    err_stage: str = "agent"
+    @property
+    def job_completed(self) -> bool:
+        return self.failed_to_start or self.completed_status is not None
+    def update_run_info(self, launch_project: LaunchProject) -> None:
+        self.run_id = launch_project.run_id
+        self.project = launch_project.target_project
+        self.entity = launch_project.target_entity
+    def set_err_stage(self, stage: str) -> None:
+        self.err_stage = stage

wandb/sdk/launch/agent/run_queue_item_file_saver.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Implementation of the run queue item file saver class."""
+import os
+import sys
+from typing import List, Optional, Union
+import wandb
+from wandb.sdk.lib import RunDisabled
+from wandb.sdk.wandb_run import Run
+if sys.version_info >= (3, 8):
+    from typing import Literal
+else:
+    from typing_extensions import Literal
+FileSubtypes = Literal["warning", "error"]
+class RunQueueItemFileSaver:
+    def __init__(
+        self, agent_run: Optional[Union[Run, RunDisabled]], run_queue_item_id: str
+    ):
+        self.run_queue_item_id = run_queue_item_id
+        self.run = agent_run
+    def save_contents(
+        self, contents: str, fname: str, file_sub_type: FileSubtypes
+    ) -> Optional[List[str]]:
+        if not isinstance(self.run, Run):
+            wandb.termwarn("Not saving file contents because agent has no run")
+            return None
+        root_dir = self.run._settings.files_dir
+        saved_run_path = os.path.join(self.run_queue_item_id, file_sub_type, fname)
+        local_path = os.path.join(root_dir, saved_run_path)
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        with open(local_path, "w") as f:
+            f.write(contents)
+        res = self.run.save(local_path, base_path=root_dir, policy="now")
+        if isinstance(res, list):
+            return [saved_run_path]
+        else:
+            wandb.termwarn(
+                f"Failed to save files for run queue item: {self.run_queue_item_id}"
+            )
+            return None

wandb/sdk/launch/builder/abstract.py CHANGED Viewed

@@ -1,12 +1,15 @@
 """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
 from abc import ABC, abstractmethod
-from typing import Any, Dict
+from typing import TYPE_CHECKING, Any, Dict, Optional
 from wandb.sdk.launch.environment.abstract import AbstractEnvironment
 from wandb.sdk.launch.registry.abstract import AbstractRegistry
 from .._project_spec import EntryPoint, LaunchProject
+if TYPE_CHECKING:
+    from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
 class AbstractBuilder(ABC):
     """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
@@ -63,6 +66,7 @@ class AbstractBuilder(ABC):
         self,
         launch_project: LaunchProject,
         entrypoint: EntryPoint,
+        job_tracker: Optional["JobAndRunStatusTracker"] = None,
     ) -> str:
         """Build the image for the given project.

wandb/sdk/launch/builder/build.py CHANGED Viewed

@@ -38,8 +38,6 @@ _logger = logging.getLogger(__name__)
 _GENERATED_DOCKERFILE_NAME = "Dockerfile.wandb-autogenerated"
 DEFAULT_ENTRYPOINT = "_wandb_default_entrypoint"
-DEFAULT_CUDA_VERSION = "10.0"
 def validate_docker_installation() -> None:
     """Verify if Docker is installed on host machine."""
@@ -103,8 +101,12 @@ FROM {py_base_image} as base
 """
 # this goes into base_setup in TEMPLATE
-CUDA_SETUP_TEMPLATE = """
-FROM {cuda_base_image} as base
+ACCELERATOR_SETUP_TEMPLATE = """
+FROM {accelerator_base_image} as base
+# make non-interactive so build doesn't block on questions
+ENV DEBIAN_FRONTEND=noninteractive
 # TODO: once NVIDIA their linux repository keys for all docker images
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$(cat /etc/os-release | grep ^ID= |  cut -d "=" -f2 )$(cat /etc/os-release | grep ^VERSION_ID= |  cut -d "=" -f2 | sed -e 's/[\".]//g' )/$(uname -i)/3bf863cc.pub
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/$(cat /etc/os-release | grep ^ID= |  cut -d "=" -f2 )$(cat /etc/os-release | grep ^VERSION_ID= |  cut -d "=" -f2 | sed -e 's/[\".]//g' )/$(uname -i)/7fa2af80.pub
@@ -184,12 +186,14 @@ def get_base_setup(
 ) -> str:
     """Fill in the Dockerfile templates for stage 2 of build.
-    CPU version is built on python, GPU version is built on nvidia:cuda.
+    CPU version is built on python, Accelerator version is built on user provided.
     """
     python_base_image = f"python:{py_version}-buster"
-    if launch_project.cuda_base_image:
-        _logger.info(f"Using cuda base image: {launch_project.cuda_base_image}")
-        # cuda image doesn't come with python tooling
+    if launch_project.accelerator_base_image:
+        _logger.info(
+            f"Using accelerator base image: {launch_project.accelerator_base_image}"
+        )
+        # accelerator base images doesn't come with python tooling
         if py_major == "2":
             python_packages = [
                 f"python{py_version}",
@@ -204,8 +208,8 @@ def get_base_setup(
                 "python3-pip",
                 "python3-setuptools",
             ]
-        base_setup = CUDA_SETUP_TEMPLATE.format(
-            cuda_base_image=launch_project.cuda_base_image,
+        base_setup = ACCELERATOR_SETUP_TEMPLATE.format(
+            accelerator_base_image=launch_project.accelerator_base_image,
             python_packages=" \\\n".join(python_packages),
             py_version=py_version,
         )
@@ -243,6 +247,8 @@ def get_env_vars_dict(launch_project: LaunchProject, api: Api) -> Dict[str, str]
         env_vars["WANDB_USERNAME"] = launch_project.launch_spec["author"]
     if launch_project.sweep_id:
         env_vars["WANDB_SWEEP_ID"] = launch_project.sweep_id
+    if launch_project.launch_spec.get("_resume_count"):
+        env_vars["WANDB_RESUME"] = "must"
     # TODO: handle env vars > 32760 characters
     env_vars["WANDB_CONFIG"] = json.dumps(launch_project.override_config)

wandb/sdk/launch/builder/docker_builder.py CHANGED Viewed

@@ -1,10 +1,11 @@
 """Implementation of the docker builder."""
 import logging
 import os
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 import wandb
 import wandb.docker as docker
+from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
 from wandb.sdk.launch.builder.abstract import AbstractBuilder
 from wandb.sdk.launch.environment.abstract import AbstractEnvironment
 from wandb.sdk.launch.registry.abstract import AbstractRegistry
@@ -111,6 +112,7 @@ class DockerBuilder(AbstractBuilder):
         self,
         launch_project: LaunchProject,
         entrypoint: EntryPoint,
+        job_tracker: Optional[JobAndRunStatusTracker] = None,
     ) -> str:
         """Build the image for the given project.
@@ -159,9 +161,14 @@ class DockerBuilder(AbstractBuilder):
                 context_path=build_ctx_path,
                 platform=self.config.get("platform"),
             )
-            warn_failed_packages_from_build_logs(output, image_uri)
+            warn_failed_packages_from_build_logs(
+                output, image_uri, launch_project.api, job_tracker
+            )
         except docker.DockerError as e:
+            if job_tracker:
+                job_tracker.set_err_stage("build")
             raise LaunchDockerError(f"Error communicating with docker client: {e}")
         try:

wandb 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl

wandb 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl