PyPI - wandb - Versions diffs - 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl - Mend

wandb 0.15.9-py3-none-any.whl → 0.15.11-py3-none-any.whl

Files changed (114)

wandb/__init__.py +5 -1
wandb/apis/public.py +137 -17
wandb/apis/reports/_panels.py +1 -1
wandb/apis/reports/blocks.py +1 -0
wandb/apis/reports/report.py +27 -5
wandb/cli/cli.py +52 -41
wandb/docker/__init__.py +17 -0
wandb/docker/auth.py +1 -1
wandb/env.py +24 -4
wandb/filesync/step_checksum.py +3 -3
wandb/integration/openai/openai.py +3 -0
wandb/integration/ultralytics/__init__.py +9 -0
wandb/integration/ultralytics/bbox_utils.py +196 -0
wandb/integration/ultralytics/callback.py +458 -0
wandb/integration/ultralytics/classification_utils.py +66 -0
wandb/integration/ultralytics/mask_utils.py +141 -0
wandb/integration/ultralytics/pose_utils.py +92 -0
wandb/integration/xgboost/xgboost.py +3 -3
wandb/integration/yolov8/__init__.py +0 -7
wandb/integration/yolov8/yolov8.py +22 -3
wandb/old/settings.py +7 -0
wandb/plot/line_series.py +0 -1
wandb/proto/v3/wandb_internal_pb2.py +353 -300
wandb/proto/v3/wandb_server_pb2.py +37 -41
wandb/proto/v3/wandb_settings_pb2.py +2 -2
wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
wandb/proto/v4/wandb_internal_pb2.py +272 -260
wandb/proto/v4/wandb_server_pb2.py +37 -40
wandb/proto/v4/wandb_settings_pb2.py +2 -2
wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
wandb/proto/wandb_internal_codegen.py +7 -31
wandb/sdk/artifacts/artifact.py +321 -189
wandb/sdk/artifacts/artifact_cache.py +14 -0
wandb/sdk/artifacts/artifact_manifest.py +5 -4
wandb/sdk/artifacts/artifact_manifest_entry.py +37 -9
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -9
wandb/sdk/artifacts/artifact_saver.py +13 -50
wandb/sdk/artifacts/artifact_ttl.py +6 -0
wandb/sdk/artifacts/artifacts_cache.py +119 -93
wandb/sdk/artifacts/staging.py +25 -0
wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -3
wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
wandb/sdk/artifacts/storage_policies/register.py +1 -0
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +4 -3
wandb/sdk/artifacts/storage_policy.py +4 -2
wandb/sdk/backend/backend.py +0 -16
wandb/sdk/data_types/image.py +3 -1
wandb/sdk/integration_utils/auto_logging.py +38 -13
wandb/sdk/interface/interface.py +16 -135
wandb/sdk/interface/interface_shared.py +9 -147
wandb/sdk/interface/interface_sock.py +0 -26
wandb/sdk/internal/file_pusher.py +20 -3
wandb/sdk/internal/file_stream.py +3 -1
wandb/sdk/internal/handler.py +53 -70
wandb/sdk/internal/internal_api.py +220 -130
wandb/sdk/internal/job_builder.py +41 -37
wandb/sdk/internal/sender.py +7 -25
wandb/sdk/internal/system/assets/disk.py +144 -11
wandb/sdk/internal/system/system_info.py +6 -2
wandb/sdk/launch/__init__.py +5 -0
wandb/sdk/launch/{launch.py → _launch.py} +53 -54
wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
wandb/sdk/launch/_project_spec.py +13 -2
wandb/sdk/launch/agent/agent.py +103 -59
wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
wandb/sdk/launch/builder/build.py +19 -1
wandb/sdk/launch/builder/docker_builder.py +5 -1
wandb/sdk/launch/builder/kaniko_builder.py +5 -1
wandb/sdk/launch/create_job.py +20 -5
wandb/sdk/launch/loader.py +14 -5
wandb/sdk/launch/runner/abstract.py +0 -2
wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
wandb/sdk/launch/runner/kubernetes_runner.py +66 -209
wandb/sdk/launch/runner/local_container.py +5 -2
wandb/sdk/launch/runner/local_process.py +4 -1
wandb/sdk/launch/sweeps/scheduler.py +43 -25
wandb/sdk/launch/sweeps/utils.py +5 -3
wandb/sdk/launch/utils.py +3 -1
wandb/sdk/lib/_settings_toposort_generate.py +3 -9
wandb/sdk/lib/_settings_toposort_generated.py +27 -3
wandb/sdk/lib/_wburls_generated.py +1 -0
wandb/sdk/lib/filenames.py +27 -6
wandb/sdk/lib/filesystem.py +181 -7
wandb/sdk/lib/fsm.py +5 -3
wandb/sdk/lib/gql_request.py +3 -0
wandb/sdk/lib/ipython.py +7 -0
wandb/sdk/lib/wburls.py +1 -0
wandb/sdk/service/port_file.py +2 -15
wandb/sdk/service/server.py +7 -55
wandb/sdk/service/service.py +56 -26
wandb/sdk/service/service_base.py +1 -1
wandb/sdk/service/streams.py +11 -5
wandb/sdk/verify/verify.py +2 -2
wandb/sdk/wandb_init.py +8 -2
wandb/sdk/wandb_manager.py +4 -14
wandb/sdk/wandb_run.py +143 -53
wandb/sdk/wandb_settings.py +148 -35
wandb/testing/relay.py +85 -38
wandb/util.py +87 -4
wandb/wandb_torch.py +24 -38
{wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/METADATA +48 -23
{wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/RECORD +107 -103
{wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/WHEEL +1 -1
wandb/proto/v3/wandb_server_pb2_grpc.py +0 -1422
wandb/proto/v4/wandb_server_pb2_grpc.py +0 -1422
wandb/proto/wandb_server_pb2_grpc.py +0 -8
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +0 -61
wandb/sdk/interface/interface_grpc.py +0 -460
wandb/sdk/service/server_grpc.py +0 -444
wandb/sdk/service/service_grpc.py +0 -73
{wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
{wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
{wandb-0.15.9.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/runner/kubernetes_runner.py CHANGED Viewed

@@ -6,13 +6,15 @@ import logging
 import time
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+import yaml
 import wandb
 from wandb.apis.internal import Api
 from wandb.sdk.launch.environment.abstract import AbstractEnvironment
 from wandb.sdk.launch.registry.abstract import AbstractRegistry
 from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
 from wandb.sdk.launch.registry.local_registry import LocalRegistry
-from wandb.sdk.launch.runner.abstract import State, Status
+from wandb.sdk.launch.runner.abstract import Status
 from wandb.util import get_module
 from .._project_spec import EntryPoint, LaunchProject
@@ -26,6 +28,7 @@ from ..utils import (
     make_name_dns_safe,
 )
 from .abstract import AbstractRun, AbstractRunner
+from .kubernetes_monitor import KubernetesRunMonitor
 get_module(
     "kubernetes",
@@ -43,32 +46,16 @@ from kubernetes.client.models.v1_secret import V1Secret  # type: ignore # noqa:
 from kubernetes.client.rest import ApiException  # type: ignore # noqa: E402
 TIMEOUT = 5
-MAX_KUBERNETES_RETRIES = (
-    60  # default 10 second loop time on the agent, this is 10 minutes
-)
-FAIL_MESSAGE_INTERVAL = 60
 _logger = logging.getLogger(__name__)
-# Dict for mapping possible states of custom objects to the states we want to report
-# to the agent.
-CRD_STATE_DICT: Dict[str, State] = {
-    "pending": "starting",
-    "running": "running",
-    "completed": "finished",
-    "failed": "failed",
-    "aborted": "failed",
-    "terminating": "stopping",
-    "terminated": "stopped",
-}
 class KubernetesSubmittedRun(AbstractRun):
     """Wrapper for a launched run on Kubernetes."""
     def __init__(
         self,
+        monitor: KubernetesRunMonitor,
         batch_api: "BatchV1Api",
         core_api: "CoreV1Api",
         name: str,
@@ -78,6 +65,14 @@ class KubernetesSubmittedRun(AbstractRun):
     ) -> None:
         """Initialize a KubernetesSubmittedRun.
+        Other implementations of the AbstractRun interface poll on the run
+        when `get_status` is called, but KubernetesSubmittedRun uses
+        Kubernetes watch streams to update the run status. One thread handles
+        events from the job object and another thread handles events from the
+        rank 0 pod. These threads update the `_status` attribute of the
+        KubernetesSubmittedRun object. When `get_status` is called, the
+        `_status` attribute is returned.
         Arguments:
             batch_api: Kubernetes BatchV1Api object.
             core_api: Kubernetes CoreV1Api object.
@@ -89,13 +84,11 @@ class KubernetesSubmittedRun(AbstractRun):
         Returns:
             None.
         """
+        self.monitor = monitor
         self.batch_api = batch_api
         self.core_api = core_api
         self.name = name
         self.namespace = namespace
-        self.job = self.batch_api.read_namespaced_job(
-            name=self.name, namespace=self.namespace
-        )
         self._fail_count = 0
         self.pod_names = pod_names
         self.secret = secret
@@ -136,7 +129,7 @@ class KubernetesSubmittedRun(AbstractRun):
         while True:
             status = self.get_status()
             wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status}")
-            if status.state != "running":
+            if status.state in ["finished", "failed", "preempted"]:
                 break
             time.sleep(5)
         return (
@@ -156,98 +149,20 @@ class KubernetesSubmittedRun(AbstractRun):
                 )
     def get_status(self) -> Status:
-        """Return the run status."""
-        try:
-            job_response = self.batch_api.read_namespaced_job_status(
-                name=self.name, namespace=self.namespace
-            )
-        except ApiException as e:
-            if e.status == 404:
-                wandb.termerror(
-                    f"Could not reach job {self.name} in namespace {self.namespace}"
-                )
-                self._delete_secret_if_completed("failed")
-                return Status("failed")
-        status = job_response.status
+        return self.monitor.get_status()
+    def cancel(self) -> None:
+        """Cancel the run."""
+        self.monitor.stop()
         try:
-            pod = self.core_api.read_namespaced_pod(
-                name=self.pod_names[0], namespace=self.namespace
+            self.batch_api.delete_namespaced_job(
+                namespace=self.namespace,
+                name=self.name,
             )
         except ApiException as e:
-            if e.status == 404:
-                wandb.termerror(
-                    f"Could not reach pod {self.pod_names[0]} in namespace {self.namespace}"
-                )
-                self._delete_secret_if_completed("failed")
-                return Status("failed")
-        if hasattr(pod.status, "conditions") and pod.status.conditions is not None:
-            for condition in pod.status.conditions:
-                if condition.type == "DisruptionTarget" and condition.reason in [
-                    "EvictionByEvictionAPI",
-                    "PreemptionByScheduler",
-                    "TerminationByKubelet",
-                ]:
-                    return Status("preempted")
-        if pod.status.phase in ["Pending", "Unknown"]:
-            now = time.time()
-            if self._fail_count == 0:
-                self._fail_first_msg_time = now
-                self._fail_last_msg_time = 0.0
-            self._fail_count += 1
-            if now - self._fail_last_msg_time > FAIL_MESSAGE_INTERVAL:
-                wandb.termlog(
-                    f"{LOG_PREFIX}Pod has not started yet for job: {self.name}. Will wait up to {round(10 - (now - self._fail_first_msg_time)/60)} minutes."
-                )
-                self._fail_last_msg_time = now
-            if self._fail_count > MAX_KUBERNETES_RETRIES:
-                raise LaunchError(f"Failed to start job {self.name}")
-        # todo: we only handle the 1 pod case. see https://kubernetes.io/docs/concepts/workloads/controllers/job/#parallel-jobs for multipod handling
-        return_status = None
-        if status.succeeded == 1:
-            return_status = Status("finished")
-        elif status.failed is not None and status.failed >= 1:
-            return_status = Status("failed")
-        elif status.active == 1:
-            return Status("running")
-        elif status.conditions is not None and status.conditions[0].type == "Suspended":
-            return_status = Status("stopped")
-        else:
-            return_status = Status("unknown")
-        self._delete_secret_if_completed(return_status.state)
-        return return_status
-    def suspend(self) -> None:
-        """Suspend the run."""
-        self.job.spec.suspend = True
-        self.batch_api.patch_namespaced_job(
-            name=self.name, namespace=self.namespace, body=self.job
-        )
-        timeout = TIMEOUT
-        job_response = self.batch_api.read_namespaced_job_status(
-            name=self.name, namespace=self.namespace
-        )
-        while job_response.status.conditions is None and timeout > 0:
-            time.sleep(1)
-            timeout -= 1
-            job_response = self.batch_api.read_namespaced_job_status(
-                name=self.name, namespace=self.namespace
-            )
-        if timeout == 0 or job_response.status.conditions[0].type != "Suspended":
             raise LaunchError(
-                "Failed to suspend job {}. Check Kubernetes dashboard for more info.".format(
-                    self.name
-                )
-            )
-    def cancel(self) -> None:
-        """Cancel the run."""
-        self.suspend()
-        self.batch_api.delete_namespaced_job(name=self.name, namespace=self.namespace)
+                f"Failed to delete Kubernetes Job {self.name} in namespace {self.namespace}: {str(e)}"
+            ) from e
 class CrdSubmittedRun(AbstractRun):
@@ -262,7 +177,7 @@ class CrdSubmittedRun(AbstractRun):
         namespace: str,
         core_api: CoreV1Api,
         custom_api: CustomObjectsApi,
-        pod_names: List[str],
+        monitor: KubernetesRunMonitor,
     ) -> None:
         """Create a run object for tracking the progress of a CRD.
@@ -274,7 +189,7 @@ class CrdSubmittedRun(AbstractRun):
             namespace: The namespace of the CRD instance.
             core_api: The Kubernetes core API client.
             custom_api: The Kubernetes custom object API client.
-            pod_names: The names of the pods associated with the CRD instance.
+            monitor: The run monitor.
         Raises:
             LaunchError: If the CRD instance does not exist.
@@ -286,20 +201,8 @@ class CrdSubmittedRun(AbstractRun):
         self.namespace = namespace
         self.core_api = core_api
         self.custom_api = custom_api
-        self.pod_names = pod_names
         self._fail_count = 0
-        try:
-            self.job = self.custom_api.get_namespaced_custom_object(
-                group=self.group,
-                version=self.version,
-                namespace=self.namespace,
-                plural=self.plural,
-                name=self.name,
-            )
-        except ApiException as e:
-            raise LaunchError(
-                f"Failed to get CRD {self.name} in namespace {self.namespace}: {str(e)}"
-            ) from e
+        self.monitor = monitor
     @property
     def id(self) -> str:
@@ -311,7 +214,11 @@ class CrdSubmittedRun(AbstractRun):
         # TODO: test more carefully once we release multi-node support
         logs: Dict[str, Optional[str]] = {}
         try:
-            for pod_name in self.pod_names:
+            pods = self.core_api.list_namespaced_pod(
+                label_selector=f"wandb/run-id={self.name}", namespace=self.namespace
+            )
+            pod_names = [pi.metadata.name for pi in pods.items]
+            for pod_name in pod_names:
                 logs[pod_name] = self.core_api.read_namespaced_pod_log(
                     name=pod_name, namespace=self.namespace
                 )
@@ -325,30 +232,7 @@ class CrdSubmittedRun(AbstractRun):
     def get_status(self) -> Status:
         """Get status of custom object."""
-        try:
-            job_response = self.custom_api.get_namespaced_custom_object_status(
-                group=self.group,
-                version=self.version,
-                namespace=self.namespace,
-                plural=self.plural,
-                name=self.name,
-            )
-        except ApiException as e:
-            raise LaunchError(
-                f"Failed to get CRD {self.name} in namespace {self.namespace}: {str(e)}"
-            ) from e
-        # Custom objects can technically define whater states and format the
-        # response to the status request however they want. This checks for
-        # the most common cases.
-        status = job_response["status"]
-        state = status.get("state")
-        if isinstance(state, dict):
-            state = state.get("phase")
-        if state is None:
-            raise LaunchError(
-                f"Failed to get CRD {self.name} in namespace {self.namespace}: no state found"
-            )
-        return Status(CRD_STATE_DICT.get(state.lower(), "unknown"))
+        return self.monitor.get_status()
     def cancel(self) -> None:
         """Cancel the custom object."""
@@ -370,10 +254,9 @@ class CrdSubmittedRun(AbstractRun):
         while True:
             status = self.get_status()
             wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status}")
-            if status.state != "running":
-                break
             time.sleep(5)
-        return status.state == "finished"
+            if status.state in ["finished", "failed", "preempted"]:
+                return status.state == "finished"
 class KubernetesRunner(AbstractRunner):
@@ -400,48 +283,6 @@ class KubernetesRunner(AbstractRunner):
         self.environment = environment
         self.registry = registry
-    def wait_job_launch(
-        self,
-        job_name: str,
-        namespace: str,
-        core_api: "CoreV1Api",
-        label: str = "job-name",
-    ) -> List[str]:
-        """Wait for a job to be launched and return the pod names.
-        Arguments:
-            job_name: The name of the job.
-            namespace: The namespace of the job.
-            core_api: The Kubernetes core API client.
-            label: The label key to match against job_name.
-        Returns:
-            The names of the pods associated with the job.
-        """
-        pods = core_api.list_namespaced_pod(
-            label_selector=f"{label}={job_name}", namespace=namespace
-        )
-        timeout = TIMEOUT
-        while len(pods.items) == 0 and timeout > 0:
-            time.sleep(1)
-            timeout -= 1
-            pods = core_api.list_namespaced_pod(
-                label_selector=f"{label}={job_name}", namespace=namespace
-            )
-        if timeout == 0:
-            raise LaunchError(
-                "No pods found for job {}. Check dashboard to see if job was launched successfully.".format(
-                    job_name
-                )
-            )
-        pod_names = [pi.metadata.name for pi in pods.items]
-        wandb.termlog(
-            f"{LOG_PREFIX}Job {job_name} created on pod(s) {', '.join(pod_names)}. See logs with e.g. `kubectl logs {pod_names[0]} -n {namespace}`."
-        )
-        return pod_names
     def get_namespace(
         self, resource_args: Dict[str, Any], context: Dict[str, Any]
     ) -> str:
@@ -522,18 +363,10 @@ class KubernetesRunner(AbstractRunner):
             or launch_project.get_single_entry_point()
         )
         if launch_project.docker_image:
-            if len(containers) > 1:
-                raise LaunchError(
-                    "Invalid specification of multiple containers. See https://docs.wandb.ai/guides/launch for guidance on submitting jobs."
-                )
             # dont specify run id if user provided image, could have multiple runs
             containers[0]["image"] = image_uri
             # TODO: handle secret pulling image from registry
         elif not any(["image" in cont for cont in containers]):
-            if len(containers) > 1:
-                raise LaunchError(
-                    "Launch only builds one container at a time. See https://docs.wandb.ai/guides/launch for guidance on submitting jobs."
-                )
             assert entry_point is not None
             # in the non instance case we need to make an imagePullSecret
             # so the new job can pull the image
@@ -638,16 +471,27 @@ class KubernetesRunner(AbstractRunner):
                     body=resource_args,
                 )
             except ApiException as e:
+                body = json.loads(e.body)
+                body_yaml = yaml.dump(body)
                 raise LaunchError(
-                    f"Error creating CRD of kind {kind}: {e.status} {e.reason}"
+                    f"Error creating CRD of kind {kind}: {e.status} {e.reason}\n{body_yaml}"
                 ) from e
             name = response.get("metadata", {}).get("name")
             _logger.info(f"Created {kind} {response['metadata']['name']}")
             core = client.CoreV1Api(api_client)
-            pod_names = self.wait_job_launch(
-                launch_project.run_id, namespace, core, label="wandb/run-id"
+            run_monitor = KubernetesRunMonitor(
+                job_field_selector=f"metadata.name={name}",
+                pod_label_selector=f"wandb/run-id={launch_project.run_id}",
+                namespace=namespace,
+                batch_api=None,
+                core_api=core,
+                custom_api=api,
+                group=group,
+                version=version,
+                plural=plural,
             )
-            return CrdSubmittedRun(
+            run_monitor.start()
+            submitted_run = CrdSubmittedRun(
                 name=name,
                 group=group,
                 version=version,
@@ -655,8 +499,11 @@ class KubernetesRunner(AbstractRunner):
                 plural=plural,
                 core_api=client.CoreV1Api(api_client),
                 custom_api=api,
-                pod_names=pod_names,
+                monitor=run_monitor,
             )
+            if self.backend_config[PROJECT_SYNCHRONOUS]:
+                submitted_run.wait()
+            return submitted_run
         batch_api = kubernetes.client.BatchV1Api(api_client)
         core_api = kubernetes.client.CoreV1Api(api_client)
@@ -674,12 +521,22 @@ class KubernetesRunner(AbstractRunner):
             0
         ]  # create_from_yaml returns a nested list of k8s objects
         job_name = job_response.metadata.name
-        pod_names = self.wait_job_launch(job_name, namespace, core_api)
+        # Event stream monitor to ensure pod creation and job completion.
+        monitor = KubernetesRunMonitor(
+            job_field_selector=f"metadata.name={job_name}",
+            pod_label_selector=f"job-name={job_name}",
+            namespace=namespace,
+            batch_api=batch_api,
+            core_api=core_api,
+        )
+        monitor.start()
         submitted_job = KubernetesSubmittedRun(
-            batch_api, core_api, job_name, pod_names, namespace, secret
+            monitor, batch_api, core_api, job_name, [], namespace, secret
         )
         if self.backend_config[PROJECT_SYNCHRONOUS]:
             submitted_job.wait()
         return submitted_job

wandb/sdk/launch/runner/local_container.py CHANGED Viewed

@@ -5,7 +5,7 @@ import subprocess
 import sys
 import threading
 import time
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 import wandb
 from wandb.sdk.launch.environment.abstract import AbstractEnvironment
@@ -26,6 +26,9 @@ from ..utils import (
 )
 from .abstract import AbstractRun, AbstractRunner, Status
+if TYPE_CHECKING:
+    from wandb.apis.internal import Api
 _logger = logging.getLogger(__name__)
@@ -95,7 +98,7 @@ class LocalContainerRunner(AbstractRunner):
     def __init__(
         self,
-        api: wandb.apis.internal.Api,
+        api: "Api",
         backend_config: Dict[str, Any],
         environment: AbstractEnvironment,
         registry: AbstractRegistry,

wandb/sdk/launch/runner/local_process.py CHANGED Viewed

@@ -46,7 +46,10 @@ class LocalProcessRunner(AbstractRunner):
             _logger.warning(_msg)
         synchronous: bool = self.backend_config[PROJECT_SYNCHRONOUS]
-        entry_point = launch_project.get_single_entry_point()
+        entry_point = (
+            launch_project.override_entrypoint
+            or launch_project.get_single_entry_point()
+        )
         cmd: List[Any] = []

wandb/sdk/launch/sweeps/scheduler.py CHANGED Viewed

@@ -9,26 +9,28 @@ import traceback
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union
 import click
 import yaml
 import wandb
-import wandb.apis.public as public
-from wandb.apis.internal import Api
-from wandb.apis.public import Api as PublicApi
-from wandb.apis.public import QueuedRun, Run
 from wandb.errors import CommError
+from wandb.sdk.launch._launch_add import launch_add
 from wandb.sdk.launch.errors import LaunchError
-from wandb.sdk.launch.launch_add import launch_add
 from wandb.sdk.launch.sweeps import SchedulerError
 from wandb.sdk.launch.sweeps.utils import (
     create_sweep_command_args,
     make_launch_sweep_entrypoint,
 )
 from wandb.sdk.lib.runid import generate_id
-from wandb.sdk.wandb_run import Run as SdkRun
+if TYPE_CHECKING:
+    import wandb.apis.public as public
+    from wandb.apis.internal import Api
+    from wandb.apis.public import QueuedRun, Run
+    from wandb.sdk.wandb_run import Run as SdkRun
 _logger = logging.getLogger(__name__)
 LOG_PREFIX = f"{click.style('sched:', fg='cyan')} "
@@ -84,7 +86,7 @@ class SweepRun:
     id: str
     worker_id: int
     state: RunState = RunState.RUNNING
-    queued_run: Optional[public.QueuedRun] = None
+    queued_run: Optional["public.QueuedRun"] = None
     args: Optional[Dict[str, Any]] = None
     logs: Optional[List[str]] = None
@@ -98,7 +100,7 @@ class Scheduler(ABC):
     def __init__(
         self,
-        api: Api,
+        api: "Api",
         *args: Optional[Any],
         polling_sleep: Optional[float] = None,
         sweep_id: Optional[str] = None,
@@ -108,6 +110,8 @@ class Scheduler(ABC):
         num_workers: Optional[Union[int, str]] = None,
         **kwargs: Optional[Any],
     ):
+        from wandb.apis.public import Api as PublicApi
         self._api = api
         self._public_api = PublicApi()
         self._entity = (
@@ -244,7 +248,7 @@ class Scheduler(ABC):
             _id: w for _id, w in self._workers.items() if _id not in self.busy_workers
         }
-    def _init_wandb_run(self) -> SdkRun:
+    def _init_wandb_run(self) -> "SdkRun":
         """Controls resume or init logic for a scheduler wandb run."""
         _type = self._kwargs.get("sweep_type", "sweep")
         run: SdkRun = wandb.init(
@@ -346,9 +350,8 @@ class Scheduler(ABC):
             self.exit()
             raise e
         else:
-            wandb.termlog(f"{LOG_PREFIX}Scheduler completed successfully")
-            # don't overwrite special states (e.g. STOPPED, FAILED)
-            if self.state in [SchedulerState.RUNNING, SchedulerState.FLUSH_RUNS]:
+            # scheduler succeeds if at runcap
+            if self.state == SchedulerState.FLUSH_RUNS and self.at_runcap:
                 self.state = SchedulerState.COMPLETED
             self.exit()
@@ -362,16 +365,24 @@ class Scheduler(ABC):
                 f"{LOG_PREFIX}Failed to save state: {traceback.format_exc()}"
             )
-        if self.state not in [
-            SchedulerState.COMPLETED,
-            SchedulerState.STOPPED,
-        ]:
+        status = ""
+        if self.state == SchedulerState.FLUSH_RUNS:
+            self._set_sweep_state("PAUSED")
+            status = "paused"
+        elif self.state == SchedulerState.COMPLETED:
+            self._set_sweep_state("FINISHED")
+            status = "completed"
+        elif self.state in [SchedulerState.CANCELLED, SchedulerState.STOPPED]:
+            self._set_sweep_state("CANCELED")  # one L
+            status = "cancelled"
+            self._stop_runs()
+        else:
             self.state = SchedulerState.FAILED
             self._set_sweep_state("CRASHED")
-        else:
-            self._set_sweep_state("FINISHED")
+            status = "crashed"
+            self._stop_runs()
-        self._stop_runs()
+        wandb.termlog(f"{LOG_PREFIX}Scheduler {status}")
         self._wandb_run.finish()
     def _get_num_runs_launched(self, runs: List[Dict[str, Any]]) -> int:
@@ -494,6 +505,7 @@ class Scheduler(ABC):
         """Update the scheduler state from state of scheduler run and sweep state."""
         state: RunState = self._get_run_state(self._wandb_run.id)
+        # map scheduler run-state to scheduler-state
         if state == RunState.KILLED:
             self.state = SchedulerState.STOPPED
         elif state in [RunState.FAILED, RunState.CRASHED]:
@@ -501,17 +513,20 @@ class Scheduler(ABC):
         elif state == RunState.FINISHED:
             self.state = SchedulerState.COMPLETED
+        # check sweep state for completed states, overwrite scheduler state
         try:
             sweep_state = self._api.get_sweep_state(
                 self._sweep_id, self._entity, self._project
             )
         except Exception as e:
-            _logger.debug(f"sweep state error: {sweep_state} e: {e}")
+            _logger.debug(f"sweep state error: {e}")
             return
-        if sweep_state in ["FINISHED", "CANCELLED"]:
+        if sweep_state == "FINISHED":
             self.state = SchedulerState.COMPLETED
-        elif sweep_state in ["PAUSED", "STOPPED"]:
+        elif sweep_state in ["CANCELLED", "STOPPED"]:
+            self.state = SchedulerState.CANCELLED
+        elif sweep_state == "PAUSED":
             self.state = SchedulerState.FLUSH_RUNS
     def _update_run_states(self) -> None:
@@ -674,6 +689,9 @@ class Scheduler(ABC):
                 f' {"job" if _job else "image_uri"} entrypoint'
             )
+        # override resource and args of job
+        _job_launch_config = self._wandb_run.config.get("launch") or {}
         run_id = run.id or generate_id()
         queued_run = launch_add(
             run_id=run_id,
@@ -685,8 +703,8 @@ class Scheduler(ABC):
             entity=self._entity,
             queue_name=self._kwargs.get("queue"),
             project_queue=self._project_queue,
-            resource=self._kwargs.get("resource", None),
-            resource_args=self._kwargs.get("resource_args", None),
+            resource=_job_launch_config.get("resource"),
+            resource_args=_job_launch_config.get("resource_args"),
             author=self._kwargs.get("author"),
             sweep_id=self._sweep_id,
         )

wandb/sdk/launch/sweeps/utils.py CHANGED Viewed

@@ -1,15 +1,17 @@
 import json
 import os
 import re
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 import yaml
 import wandb
 from wandb import util
-from wandb.apis.public import Api as PublicApi
 from wandb.sdk.launch.errors import LaunchError
+if TYPE_CHECKING:
+    from wandb.apis.public import Api as PublicApi
 DEFAULT_SWEEP_COMMAND: List[str] = [
     "${env}",
     "${interpreter}",
@@ -276,7 +278,7 @@ def make_launch_sweep_entrypoint(
     return entry_point, macro_args
-def check_job_exists(public_api: PublicApi, job: Optional[str]) -> bool:
+def check_job_exists(public_api: "PublicApi", job: Optional[str]) -> bool:
     """Check if the job exists using the public api.
     Returns: True if no job is passed, or if the job exists.

wandb/sdk/launch/utils.py CHANGED Viewed

@@ -127,7 +127,9 @@ def set_project_entity_defaults(
     prefix = ""
     if platform.system() != "Windows" and sys.stdout.encoding == "UTF-8":
         prefix = "🚀 "
-    wandb.termlog(f"{LOG_PREFIX}{prefix}Launching run into {entity}/{project}")
+    wandb.termlog(
+        f"{LOG_PREFIX}{prefix}Launching run into {entity}{'/' + project if project else ''}"
+    )
     return project, entity

wandb 0.15.9__py3-none-any.whl → 0.15.11__py3-none-any.whl

wandb 0.15.9-py3-none-any.whl → 0.15.11-py3-none-any.whl