PyPI - wandb - Versions diffs - 0.15.10__py3-none-any.whl → 0.15.11__py3-none-any.whl - Mend

wandb 0.15.10__py3-none-any.whl → 0.15.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

wandb/__init__.py +2 -1
wandb/apis/public.py +51 -9
wandb/apis/reports/blocks.py +1 -0
wandb/cli/cli.py +14 -9
wandb/env.py +11 -1
wandb/integration/xgboost/xgboost.py +3 -3
wandb/proto/v3/wandb_internal_pb2.py +300 -267
wandb/proto/v3/wandb_settings_pb2.py +2 -2
wandb/proto/v3/wandb_telemetry_pb2.py +16 -16
wandb/proto/v4/wandb_internal_pb2.py +260 -252
wandb/proto/v4/wandb_settings_pb2.py +2 -2
wandb/proto/v4/wandb_telemetry_pb2.py +16 -16
wandb/sdk/artifacts/artifact.py +9 -6
wandb/sdk/artifacts/storage_handlers/s3_handler.py +12 -7
wandb/sdk/data_types/image.py +1 -1
wandb/sdk/internal/file_stream.py +2 -1
wandb/sdk/internal/handler.py +24 -20
wandb/sdk/internal/internal_api.py +9 -1
wandb/sdk/internal/sender.py +4 -1
wandb/sdk/internal/system/system_info.py +2 -2
wandb/sdk/launch/__init__.py +5 -0
wandb/sdk/launch/{launch.py → _launch.py} +53 -54
wandb/sdk/launch/{launch_add.py → _launch_add.py} +34 -31
wandb/sdk/launch/agent/agent.py +36 -18
wandb/sdk/launch/agent/run_queue_item_file_saver.py +6 -4
wandb/sdk/launch/runner/abstract.py +0 -2
wandb/sdk/launch/runner/kubernetes_monitor.py +329 -0
wandb/sdk/launch/runner/kubernetes_runner.py +44 -301
wandb/sdk/launch/runner/local_container.py +5 -2
wandb/sdk/launch/sweeps/scheduler.py +14 -10
wandb/sdk/launch/sweeps/utils.py +5 -3
wandb/sdk/launch/utils.py +3 -1
wandb/sdk/lib/_settings_toposort_generated.py +5 -0
wandb/sdk/lib/gql_request.py +3 -0
wandb/sdk/lib/ipython.py +4 -0
wandb/sdk/service/service.py +19 -6
wandb/sdk/wandb_init.py +7 -2
wandb/sdk/wandb_run.py +2 -5
wandb/sdk/wandb_settings.py +48 -2
wandb/util.py +1 -1
{wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/METADATA +4 -1
{wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/RECORD +46 -45
{wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/LICENSE +0 -0
{wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/WHEEL +0 -0
{wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/entry_points.txt +0 -0
{wandb-0.15.10.dist-info → wandb-0.15.11.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/{launch_add.py → _launch_add.py} RENAMED Viewed

@@ -2,7 +2,7 @@ import pprint
 from typing import Any, Dict, List, Optional
 import wandb
-import wandb.apis.public as public
+from wandb.apis import public
 from wandb.apis.internal import Api
 from wandb.sdk.launch._project_spec import create_project_from_spec
 from wandb.sdk.launch.builder.build import build_image_from_project
@@ -49,39 +49,42 @@ def launch_add(
     """Enqueue a W&B launch experiment. With either a source uri, job or docker_image.
     Arguments:
-    uri: URI of experiment to run. A wandb run uri or a Git repository URI.
-    job: string reference to a wandb.Job eg: wandb/test/my-job:latest
-    config: A dictionary containing the configuration for the run. May also contain
-        resource specific arguments under the key "resource_args"
-    project: Target project to send launched run to
-    entity: Target entity to send launched run to
-    queue: the name of the queue to enqueue the run to
-    resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
-    entry_point: Entry point to run within the project. Defaults to using the entry point used
-        in the original run for wandb URIs, or main.py for git repository URIs.
-    name: Name run under which to launch the run.
-    version: For Git-based projects, either a commit hash or a branch name.
-    docker_image: The name of the docker image to use for the run.
-    resource_args: Resource related arguments for launching runs onto a remote backend.
-        Will be stored on the constructed launch config under ``resource_args``.
-    run_id: optional string indicating the id of the launched run
-    build: optional flag defaulting to false, requires queue to be set
-        if build, an image is created, creates a job artifact, pushes a reference
-            to that job artifact to queue
-    repository: optional string to control the name of the remote repository, used when
-        pushing images to a registry
-    project_queue: optional string to control the name of the project for the queue. Primarily used
-        for back compatibility with project scoped queues
+        uri: URI of experiment to run. A wandb run uri or a Git repository URI.
+        job: string reference to a wandb.Job eg: wandb/test/my-job:latest
+        config: A dictionary containing the configuration for the run. May also contain
+            resource specific arguments under the key "resource_args"
+        project: Target project to send launched run to
+        entity: Target entity to send launched run to
+        queue: the name of the queue to enqueue the run to
+        resource: Execution backend for the run: W&B provides built-in support for "local-container" backend
+        entry_point: Entry point to run within the project. Defaults to using the entry point used
+            in the original run for wandb URIs, or main.py for git repository URIs.
+        name: Name run under which to launch the run.
+        version: For Git-based projects, either a commit hash or a branch name.
+        docker_image: The name of the docker image to use for the run.
+        resource_args: Resource related arguments for launching runs onto a remote backend.
+            Will be stored on the constructed launch config under ``resource_args``.
+        run_id: optional string indicating the id of the launched run
+        build: optional flag defaulting to false, requires queue to be set
+            if build, an image is created, creates a job artifact, pushes a reference
+                to that job artifact to queue
+        repository: optional string to control the name of the remote repository, used when
+            pushing images to a registry
+        project_queue: optional string to control the name of the project for the queue. Primarily used
+            for back compatibility with project scoped queues
     Example:
-        import wandb
-        project_uri = "https://github.com/wandb/examples"
-        params = {"alpha": 0.5, "l1_ratio": 0.01}
-        # Run W&B project and create a reproducible docker environment
-        # on a local host
-        api = wandb.apis.internal.Api()
-        wandb.launch_add(uri=project_uri, parameters=params)
+    ```python
+    from wandb.sdk.launch import launch_add
+    project_uri = "https://github.com/wandb/examples"
+    params = {"alpha": 0.5, "l1_ratio": 0.01}
+    # Run W&B project and create a reproducible docker environment
+    # on a local host
+    api = wandb.apis.internal.Api()
+    launch_add(uri=project_uri, parameters=params)
+    ```
     Returns:

wandb/sdk/launch/agent/agent.py CHANGED Viewed

@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Union
 import wandb
 from wandb.apis.internal import Api
 from wandb.errors import CommError
-from wandb.sdk.launch.launch_add import launch_add
+from wandb.sdk.launch._launch_add import launch_add
 from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
 from wandb.sdk.launch.runner.local_process import LocalProcessRunner
 from wandb.sdk.launch.sweeps.scheduler import Scheduler
@@ -36,6 +36,8 @@ HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
 MAX_RESUME_COUNT = 5
+RUN_INFO_GRACE_PERIOD = 60
 _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
 if _env_timeout:
     try:
@@ -301,27 +303,43 @@ class LaunchAgent:
                 job_and_run_status.err_stage,
                 fnames,
             )
-        elif job_and_run_status.completed_status not in ["stopped", "failed"]:
-            _logger.info(
-                "Skipping check for completed run status because run was successful"
-            )
         elif job_and_run_status.run is not None:
             run_info = None
-            # sweep runs exist but have no info before they are started
-            # so run_info returned will be None
-            # normal runs just throw a comm error
-            # TODO: make more clear
-            try:
-                run_info = self._api.get_run_info(
-                    self._entity, job_and_run_status.project, job_and_run_status.run_id
-                )
+            # We do some weird stuff here getting run info to check for a
+            # created in run in W&B.
+            #
+            # We retry for 60 seconds with an exponential backoff in case
+            # upsert run is taking a while.
+            #
+            # Sweep runs exist but have no info before they are started
+            # so run_info returned will be None, while normal runs just throw a
+            # comm error.
+            start_time = time.time()
+            interval = 1
+            while True:
+                try:
+                    run_info = self._api.get_run_info(
+                        self._entity,
+                        job_and_run_status.project,
+                        job_and_run_status.run_id,
+                    )
+                except CommError:
+                    pass
+                if (
+                    run_info is not None
+                    or time.time() - start_time > RUN_INFO_GRACE_PERIOD
+                ):
+                    break
+                if run_info is None:
+                    time.sleep(interval)
+                    interval *= 2
-            except CommError:
-                pass
             if run_info is None:
-                _msg = "The submitted run was not successfully started"
                 fnames = None
+                if job_and_run_status.completed_status == "finished":
+                    _msg = "The submitted job exited successfully but failed to call wandb.init"
+                else:
+                    _msg = "The submitted run was not successfully started"
                 logs = job_and_run_status.run.get_logs()
                 if logs:
                     fnames = job_and_run_status.saver.save_contents(
@@ -331,7 +349,7 @@ class LaunchAgent:
                     job_and_run_status.run_queue_item_id, _msg, "run", fnames
                 )
         else:
-            _logger.info("Finish thread id had no exception, ror run")
+            _logger.info(f"Finish thread id {thread_id} had no exception and no run")
             wandb._sentry.exception(
                 "launch agent called finish thread id on thread without run or exception"
             )

wandb/sdk/launch/agent/run_queue_item_file_saver.py CHANGED Viewed

@@ -5,8 +5,6 @@ import sys
 from typing import List, Optional, Union
 import wandb
-from wandb.sdk.lib import RunDisabled
-from wandb.sdk.wandb_run import Run
 if sys.version_info >= (3, 8):
     from typing import Literal
@@ -18,7 +16,11 @@ FileSubtypes = Literal["warning", "error"]
 class RunQueueItemFileSaver:
     def __init__(
-        self, agent_run: Optional[Union[Run, RunDisabled]], run_queue_item_id: str
+        self,
+        agent_run: Optional[
+            Union["wandb.sdk.wandb_run.Run", "wandb.sdk.lib.RunDisabled"]
+        ],
+        run_queue_item_id: str,
     ):
         self.run_queue_item_id = run_queue_item_id
         self.run = agent_run
@@ -26,7 +28,7 @@ class RunQueueItemFileSaver:
     def save_contents(
         self, contents: str, fname: str, file_sub_type: FileSubtypes
     ) -> Optional[List[str]]:
-        if not isinstance(self.run, Run):
+        if not isinstance(self.run, wandb.sdk.wandb_run.Run):
             wandb.termwarn("Not saving file contents because agent has no run")
             return None
         root_dir = self.run._settings.files_dir

wandb/sdk/launch/runner/abstract.py CHANGED Viewed

@@ -13,7 +13,6 @@ from typing import Any, Dict, List, Optional, Union
 from dockerpycreds.utils import find_executable  # type: ignore
 import wandb
-from wandb import Settings
 from wandb.apis.internal import Api
 from wandb.sdk.lib import runid
@@ -136,7 +135,6 @@ class AbstractRunner(ABC):
         api: Api,
         backend_config: Dict[str, Any],
     ) -> None:
-        self._settings = Settings()
         self._api = api
         self.backend_config = backend_config
         self._cwd = os.getcwd()

wandb/sdk/launch/runner/kubernetes_monitor.py ADDED Viewed

@@ -0,0 +1,329 @@
+import logging
+from threading import Lock, Thread
+from typing import Any, Dict, List, Optional
+import urllib3
+from kubernetes import watch  # type: ignore # noqa: F401
+from kubernetes.client import (  # type: ignore # noqa: F401
+    ApiException,
+    BatchV1Api,
+    CoreV1Api,
+    CustomObjectsApi,
+    V1PodStatus,
+)
+import wandb
+from .abstract import State, Status
+# Dict for mapping possible states of custom objects to the states we want to report
+# to the agent.
+CRD_STATE_DICT: Dict[str, State] = {
+    # Starting states.
+    "created": "starting",
+    "pending": "starting",
+    # Running states.
+    "running": "running",
+    "completing": "running",
+    # Finished states.
+    "succeeded": "finished",
+    "completed": "finished",
+    # Failed states.
+    "failed": "failed",
+    "aborted": "failed",
+    "timeout": "failed",
+    "terminated": "failed",
+    # Stopping states.
+    "terminating": "stopping",
+}
+_logger = logging.getLogger(__name__)
+class SafeWatch:
+    """Wrapper for the kubernetes watch class that can recover in more situations."""
+    def __init__(self, watcher: "watch.Watch") -> None:
+        """Initialize the SafeWatch."""
+        self._watcher = watcher
+        self._last_seen_resource_version: Optional[str] = None
+        self._stopped = False
+    def stream(self, func: Any, *args: Any, **kwargs: Any) -> Any:
+        """Stream the watcher."""
+        while True:
+            try:
+                for event in self._watcher.stream(
+                    func, *args, **kwargs, timeout_seconds=15
+                ):
+                    if self._stopped:
+                        break
+                    # Save the resource version so that we can resume the stream
+                    # if it breaks.
+                    object = event.get("object")
+                    if isinstance(object, dict):
+                        self._last_seen_resource_version = object.get(
+                            "metadata", dict()
+                        ).get("resourceVersion")
+                    else:
+                        self._last_seen_resource_version = (
+                            object.metadata.resource_version
+                        )
+                    kwargs["resource_version"] = self._last_seen_resource_version
+                    yield event
+                # If stream ends after stop just break
+                if self._stopped:
+                    break
+            except urllib3.exceptions.ProtocolError as e:
+                wandb.termwarn(f"Broken event stream: {e}")
+            except ApiException as e:
+                if e.status == 410:
+                    # If resource version is too old we need to start over.
+                    del kwargs["resource_version"]
+                    self._last_seen_resource_version = None
+            except Exception as E:
+                wandb.termerror(f"Unknown exception in event stream: {E}")
+    def stop(self) -> None:
+        """Stop the watcher."""
+        self._watcher.stop()
+        self._stopped = True
+def _is_preempted(status: "V1PodStatus") -> bool:
+    """Check if this pod has been preempted."""
+    if hasattr(status, "conditions") and status.conditions is not None:
+        for condition in status.conditions:
+            if condition.type == "DisruptionTarget" and condition.reason in [
+                "EvictionByEvictionAPI",
+                "PreemptionByScheduler",
+                "TerminationByKubelet",
+            ]:
+                return True
+    return False
+def _is_container_creating(status: "V1PodStatus") -> bool:
+    """Check if this pod has started creating containers."""
+    for container_status in status.container_statuses or []:
+        if (
+            container_status.state
+            and container_status.state.waiting
+            and container_status.state.waiting.reason == "ContainerCreating"
+        ):
+            return True
+    return False
+def _state_from_conditions(conditions: List[Dict[str, Any]]) -> Optional[str]:
+    """Get the status from the pod conditions."""
+    true_conditions = [
+        c.get("type", "").lower() for c in conditions if c.get("status") == "True"
+    ]
+    detected_states = {
+        CRD_STATE_DICT[c] for c in true_conditions if c in CRD_STATE_DICT
+    }
+    for state in ["finished", "failed", "stopping", "running", "starting"]:
+        if state in detected_states:
+            return state
+    return None
+class KubernetesRunMonitor:
+    def __init__(
+        self,
+        job_field_selector: str,
+        pod_label_selector: str,
+        namespace: str,
+        batch_api: "BatchV1Api",
+        core_api: "CoreV1Api",
+        custom_api: "CustomObjectsApi" = None,
+        group: Optional[str] = None,
+        version: Optional[str] = None,
+        plural: Optional[str] = None,
+    ) -> None:
+        """Initialize KubernetesRunMonitor.
+        If a custom api is provided, the group, version, and plural arguments must also
+        be provided. These are used to query the custom api for a launched custom
+        object (CRD). Group, version, and plural in this context refer to the
+        Kubernetes API group, version, and plural for the CRD. For more information
+        see: https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/
+        The run monitor starts two threads to watch for pods and jobs/crds matching the
+        provided selectors. The status is set to "starting" when the run monitor is
+        initialized. The status is set to "running" when a pod matching the pod selector
+        is found with a status of "Running" or has a container with a status of
+        "ContainerCreating". The status is set to "finished" when a job matching the job
+        selector is found with a status of "Succeeded". The status is set to "failed"
+        when a job matching the job selector is found with a status of "Failed" or a pod
+        matching the pod selector is found with a status of "Failed". The status is set
+        to "preempted" when a pod matching the pod selector is found with a condition
+        type of "DisruptionTarget" and a reason of "EvictionByEvictionAPI",
+        "PreemptionByScheduler", or "TerminationByKubelet".
+        The logic for the CRD is similar to the logic for the job, but we inspect
+        both the phase of the CRD and the conditions since some CRDs do not have a
+        phase field.
+        Arguments:
+            job_field_selector: The field selector for the job or crd.
+            pod_label_selector: The label selector for the pods.
+            namespace: The namespace to monitor.
+            batch_api: The batch api client.
+            core_api: The core api client.
+            custom_api: The custom api client.
+            group: The group of the CRD.
+            version: The version of the CRD.
+            plural: The plural of the CRD.
+        Returns:
+            None.
+        """
+        self.pod_label_selector = pod_label_selector
+        self.job_field_selector = job_field_selector
+        self.namespace = namespace
+        self.batch_api = batch_api
+        self.core_api = core_api
+        self.custom_api = custom_api
+        self.group = group
+        self.version = version
+        self.plural = plural
+        self._status_lock = Lock()
+        self._status = Status("starting")
+        # Only one of the job or crd watchers will be used.
+        self._watch_job_thread = Thread(target=self._watch_job, daemon=True)
+        self._watch_crd_thread = Thread(target=self._watch_crd, daemon=True)
+        self._watch_pods_thread = Thread(target=self._watch_pods, daemon=True)
+        self._job_watcher = SafeWatch(watch.Watch())
+        self._pod_watcher = SafeWatch(watch.Watch())
+    def start(self) -> None:
+        """Start the run monitor."""
+        if self.custom_api is None:
+            self._watch_job_thread.start()
+        else:
+            self._watch_crd_thread.start()
+        self._watch_pods_thread.start()
+    def stop(self) -> None:
+        """Stop the run monitor."""
+        self._job_watcher.stop()
+        self._pod_watcher.stop()
+    def _set_status(self, status: Status) -> None:
+        """Set the run status."""
+        with self._status_lock:
+            self._status = status
+    def get_status(self) -> Status:
+        """Get the run status."""
+        with self._status_lock:
+            # Each time this is called we verify that our watchers are active.
+            if self._status.state in ["running", "starting"]:
+                if self.custom_api is None:
+                    if not self._watch_job_thread.is_alive():
+                        wandb.termwarn(
+                            f"Job watcher thread is dead for {self.job_field_selector}"
+                        )
+                        self._watch_job_thread = Thread(
+                            target=self._watch_job, daemon=True
+                        )
+                        self._watch_job_thread.start()
+                else:
+                    if not self._watch_crd_thread.is_alive():
+                        wandb.termwarn(
+                            f"CRD watcher thread is dead for {self.job_field_selector}"
+                        )
+                        self._watch_crd_thread = Thread(
+                            target=self._watch_crd, daemon=True
+                        )
+                        self._watch_crd_thread.start()
+                if not self._watch_pods_thread.is_alive():
+                    wandb.termwarn(
+                        f"Pod watcher thread is dead for {self.pod_label_selector}"
+                    )
+                    self._watch_pods_thread = Thread(
+                        target=self._watch_pods, daemon=True
+                    )
+                    self._watch_pods_thread.start()
+            return self._status
+    def _watch_pods(self) -> None:
+        """Watch for pods created matching the jobname."""
+        # Stream with no timeout polling for pod status updates
+        for event in self._pod_watcher.stream(
+            self.core_api.list_namespaced_pod,
+            namespace=self.namespace,
+            label_selector=self.pod_label_selector,
+        ):
+            object = event.get("object")
+            # Sometimes ADDED events will be missing field.
+            if not hasattr(object, "status"):
+                continue
+            if object.status.phase == "Running":
+                self._set_status(Status("running"))
+            if _is_preempted(object.status):
+                self._set_status(Status("preempted"))
+                self.stop()
+                break
+            if _is_container_creating(object.status):
+                self._set_status(Status("running"))
+    def _watch_job(self) -> None:
+        """Watch for job matching the jobname."""
+        for event in self._job_watcher.stream(
+            self.batch_api.list_namespaced_job,
+            namespace=self.namespace,
+            field_selector=self.job_field_selector,
+        ):
+            object = event.get("object")
+            if object.status.succeeded == 1:
+                self._set_status(Status("finished"))
+                self.stop()
+                break
+            elif object.status.failed is not None and object.status.failed >= 1:
+                self._set_status(Status("failed"))
+                self.stop()
+                break
+    def _watch_crd(self) -> None:
+        """Watch for CRD matching the jobname."""
+        for event in self._job_watcher.stream(
+            self.custom_api.list_namespaced_custom_object,
+            namespace=self.namespace,
+            field_selector=self.job_field_selector,
+            group=self.group,
+            version=self.version,
+            plural=self.plural,
+        ):
+            object = event.get("object")
+            status = object.get("status")
+            if status is None:
+                continue
+            state = status.get("state")
+            if isinstance(state, dict):
+                raw_state = state.get("phase", "")
+                state = CRD_STATE_DICT.get(raw_state)
+            else:
+                conditions = status.get("conditions")
+                if isinstance(conditions, list):
+                    state = _state_from_conditions(conditions)
+                else:
+                    # This should never happen.
+                    _logger.warning(
+                        f"Unexpected conditions type {type(conditions)} "
+                        f"for CRD {self.job_field_selector}: {conditions}"
+                    )
+            if state is None:
+                continue
+            status = Status(state)
+            self._set_status(status)
+            if status.state in ["finished", "failed", "preempted"]:
+                self.stop()
+                break

wandb 0.15.10__py3-none-any.whl → 0.15.11__py3-none-any.whl

wandb 0.15.10__py3-none-any.whl → 0.15.11__py3-none-any.whl