PyPI - wandb - Versions diffs - 0.20.2rc20250616__py3-none-win_amd64.whl → 0.21.1__py3-none-win_amd64.whl - Mend

wandb 0.20.2rc20250616__py3-none-win_amd64.whl → 0.21.1__py3-none-win_amd64.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (140) hide show

wandb/__init__.py +16 -14
wandb/__init__.pyi +450 -472
wandb/agents/pyagent.py +41 -12
wandb/analytics/sentry.py +7 -2
wandb/apis/importers/mlflow.py +1 -1
wandb/apis/internal.py +3 -0
wandb/apis/paginator.py +17 -4
wandb/apis/public/__init__.py +1 -1
wandb/apis/public/api.py +606 -359
wandb/apis/public/artifacts.py +214 -16
wandb/apis/public/automations.py +19 -3
wandb/apis/public/files.py +177 -38
wandb/apis/public/history.py +67 -15
wandb/apis/public/integrations.py +25 -2
wandb/apis/public/jobs.py +90 -2
wandb/apis/public/projects.py +161 -69
wandb/apis/public/query_generator.py +11 -1
wandb/apis/public/registries/registries_search.py +7 -15
wandb/apis/public/reports.py +147 -13
wandb/apis/public/runs.py +315 -128
wandb/apis/public/sweeps.py +222 -22
wandb/apis/public/teams.py +41 -4
wandb/apis/public/users.py +45 -4
wandb/automations/__init__.py +10 -10
wandb/automations/_filters/run_metrics.py +0 -2
wandb/automations/_utils.py +0 -2
wandb/automations/actions.py +0 -2
wandb/automations/automations.py +0 -2
wandb/automations/events.py +0 -2
wandb/beta/workflows.py +66 -30
wandb/bin/gpu_stats.exe +0 -0
wandb/bin/wandb-core +0 -0
wandb/cli/cli.py +80 -1
wandb/env.py +8 -0
wandb/errors/errors.py +4 -1
wandb/integration/catboost/catboost.py +6 -2
wandb/integration/kfp/kfp_patch.py +3 -1
wandb/integration/lightning/fabric/logger.py +3 -4
wandb/integration/metaflow/__init__.py +6 -0
wandb/integration/metaflow/data_pandas.py +74 -0
wandb/integration/metaflow/errors.py +13 -0
wandb/integration/metaflow/metaflow.py +205 -190
wandb/integration/openai/fine_tuning.py +1 -2
wandb/integration/sb3/sb3.py +3 -3
wandb/integration/ultralytics/callback.py +6 -2
wandb/jupyter.py +5 -5
wandb/plot/__init__.py +2 -0
wandb/plot/bar.py +30 -29
wandb/plot/confusion_matrix.py +75 -71
wandb/plot/custom_chart.py +30 -7
wandb/plot/histogram.py +26 -25
wandb/plot/line.py +33 -32
wandb/plot/line_series.py +100 -103
wandb/plot/pr_curve.py +33 -32
wandb/plot/roc_curve.py +38 -38
wandb/plot/scatter.py +27 -27
wandb/proto/v3/wandb_internal_pb2.py +366 -385
wandb/proto/v3/wandb_settings_pb2.py +2 -2
wandb/proto/v3/wandb_telemetry_pb2.py +4 -4
wandb/proto/v4/wandb_internal_pb2.py +352 -356
wandb/proto/v4/wandb_settings_pb2.py +2 -2
wandb/proto/v4/wandb_telemetry_pb2.py +4 -4
wandb/proto/v5/wandb_internal_pb2.py +352 -356
wandb/proto/v5/wandb_settings_pb2.py +2 -2
wandb/proto/v5/wandb_telemetry_pb2.py +4 -4
wandb/proto/v6/wandb_internal_pb2.py +352 -356
wandb/proto/v6/wandb_settings_pb2.py +2 -2
wandb/proto/v6/wandb_telemetry_pb2.py +4 -4
wandb/proto/wandb_deprecated.py +6 -0
wandb/sdk/artifacts/_generated/__init__.py +12 -1
wandb/sdk/artifacts/_generated/input_types.py +20 -2
wandb/sdk/artifacts/_generated/link_artifact.py +21 -0
wandb/sdk/artifacts/_generated/operations.py +9 -0
wandb/sdk/artifacts/_internal_artifact.py +19 -8
wandb/sdk/artifacts/_validators.py +48 -2
wandb/sdk/artifacts/artifact.py +269 -96
wandb/sdk/data_types/audio.py +38 -10
wandb/sdk/data_types/base_types/media.py +15 -63
wandb/sdk/data_types/base_types/wb_value.py +6 -6
wandb/sdk/data_types/graph.py +48 -14
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +1 -3
wandb/sdk/data_types/helper_types/image_mask.py +1 -3
wandb/sdk/data_types/histogram.py +34 -21
wandb/sdk/data_types/html.py +35 -12
wandb/sdk/data_types/image.py +104 -68
wandb/sdk/data_types/molecule.py +32 -19
wandb/sdk/data_types/object_3d.py +36 -17
wandb/sdk/data_types/plotly.py +18 -5
wandb/sdk/data_types/saved_model.py +7 -9
wandb/sdk/data_types/table.py +99 -70
wandb/sdk/data_types/trace_tree.py +12 -12
wandb/sdk/data_types/video.py +53 -26
wandb/sdk/integration_utils/auto_logging.py +2 -2
wandb/sdk/interface/interface.py +8 -19
wandb/sdk/interface/interface_shared.py +7 -16
wandb/sdk/internal/datastore.py +18 -18
wandb/sdk/internal/handler.py +3 -5
wandb/sdk/internal/internal_api.py +60 -0
wandb/sdk/internal/job_builder.py +6 -0
wandb/sdk/internal/sender.py +23 -3
wandb/sdk/internal/sender_config.py +9 -0
wandb/sdk/launch/_project_spec.py +3 -3
wandb/sdk/launch/agent/agent.py +11 -4
wandb/sdk/launch/agent/job_status_tracker.py +3 -1
wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -2
wandb/sdk/launch/create_job.py +3 -1
wandb/sdk/launch/inputs/internal.py +3 -4
wandb/sdk/launch/inputs/schema.py +1 -0
wandb/sdk/launch/runner/kubernetes_monitor.py +1 -0
wandb/sdk/launch/runner/kubernetes_runner.py +328 -1
wandb/sdk/launch/sweeps/scheduler.py +2 -3
wandb/sdk/launch/utils.py +3 -3
wandb/sdk/lib/asyncio_compat.py +3 -0
wandb/sdk/lib/console_capture.py +66 -19
wandb/sdk/lib/deprecate.py +1 -7
wandb/sdk/lib/disabled.py +1 -1
wandb/sdk/lib/hashutil.py +14 -1
wandb/sdk/lib/module.py +7 -13
wandb/sdk/lib/progress.py +0 -19
wandb/sdk/lib/sock_client.py +0 -4
wandb/sdk/wandb_init.py +67 -93
wandb/sdk/wandb_login.py +18 -14
wandb/sdk/wandb_metric.py +2 -0
wandb/sdk/wandb_require.py +0 -1
wandb/sdk/wandb_run.py +429 -527
wandb/sdk/wandb_settings.py +364 -74
wandb/sdk/wandb_setup.py +28 -28
wandb/sdk/wandb_sweep.py +14 -13
wandb/sdk/wandb_watch.py +4 -6
wandb/sync/sync.py +10 -0
wandb/util.py +57 -0
wandb/wandb_run.py +1 -2
{wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/METADATA +1 -1
{wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/RECORD +137 -137
wandb/sdk/wandb_metadata.py +0 -623
wandb/vendor/pynvml/__init__.py +0 -0
wandb/vendor/pynvml/pynvml.py +0 -4779
{wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/WHEEL +0 -0
{wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/entry_points.txt +0 -0
{wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/licenses/LICENSE +0 -0

wandb/sdk/launch/runner/kubernetes_runner.py CHANGED Viewed

@@ -6,6 +6,8 @@ import datetime
 import json
 import logging
 import os
+import time
+import uuid
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import yaml
@@ -20,6 +22,7 @@ from wandb.sdk.launch.registry.local_registry import LocalRegistry
 from wandb.sdk.launch.runner.abstract import Status
 from wandb.sdk.launch.runner.kubernetes_monitor import (
     WANDB_K8S_LABEL_AGENT,
+    WANDB_K8S_LABEL_AUXILIARY_RESOURCE,
     WANDB_K8S_LABEL_MONITOR,
     WANDB_K8S_RUN_ID,
     CustomResource,
@@ -47,6 +50,9 @@ get_module(
 import kubernetes_asyncio  # type: ignore # noqa: E402
 from kubernetes_asyncio import client  # noqa: E402
+from kubernetes_asyncio.client.api.apps_v1_api import (  # type: ignore # noqa: E402
+    AppsV1Api,
+)
 from kubernetes_asyncio.client.api.batch_v1_api import (  # type: ignore # noqa: E402
     BatchV1Api,
 )
@@ -78,9 +84,11 @@ class KubernetesSubmittedRun(AbstractRun):
         self,
         batch_api: "BatchV1Api",
         core_api: "CoreV1Api",
+        apps_api: "AppsV1Api",
         name: str,
         namespace: Optional[str] = "default",
         secret: Optional["V1Secret"] = None,
+        auxiliary_resource_label_key: Optional[str] = None,
     ) -> None:
         """Initialize a KubernetesSubmittedRun.
@@ -104,10 +112,12 @@ class KubernetesSubmittedRun(AbstractRun):
         """
         self.batch_api = batch_api
         self.core_api = core_api
+        self.apps_api = apps_api
         self.name = name
         self.namespace = namespace
         self._fail_count = 0
         self.secret = secret
+        self.auxiliary_resource_label_key = auxiliary_resource_label_key
     @property
     def id(self) -> str:
@@ -149,6 +159,7 @@ class KubernetesSubmittedRun(AbstractRun):
             await asyncio.sleep(5)
         await self._delete_secret()
+        await self._delete_auxiliary_resources_by_label()
         return (
             status.state == "finished"
         )  # todo: not sure if this (copied from aws runner) is the right approach? should we return false on failure
@@ -157,6 +168,7 @@ class KubernetesSubmittedRun(AbstractRun):
         status = LaunchKubernetesMonitor.get_status(self.name)
         if status in ["stopped", "failed", "finished", "preempted"]:
             await self._delete_secret()
+            await self._delete_auxiliary_resources_by_label()
         return status
     async def cancel(self) -> None:
@@ -167,6 +179,7 @@ class KubernetesSubmittedRun(AbstractRun):
                 name=self.name,
             )
             await self._delete_secret()
+            await self._delete_auxiliary_resources_by_label()
         except ApiException as e:
             raise LaunchError(
                 f"Failed to delete Kubernetes Job {self.name} in namespace {self.namespace}: {str(e)}"
@@ -181,6 +194,52 @@ class KubernetesSubmittedRun(AbstractRun):
             )
             self.secret = None
+    async def _delete_auxiliary_resources_by_label(self) -> None:
+        if self.auxiliary_resource_label_key is None:
+            return
+        label_selector = (
+            f"{WANDB_K8S_LABEL_AUXILIARY_RESOURCE}={self.auxiliary_resource_label_key}"
+        )
+        try:
+            resource_cleanups = [
+                (self.core_api, "service"),
+                (self.batch_api, "job"),
+                (self.core_api, "pod"),
+                (self.core_api, "config_map"),
+                (self.core_api, "secret"),
+                (self.apps_api, "deployment"),
+                (self.apps_api, "replica_set"),
+                (self.apps_api, "daemon_set"),
+            ]
+            for api_client, resource_type in resource_cleanups:
+                try:
+                    list_method = getattr(
+                        api_client, f"list_namespaced_{resource_type}"
+                    )
+                    delete_method = getattr(
+                        api_client, f"delete_namespaced_{resource_type}"
+                    )
+                    # List resources with our label
+                    resources = await list_method(
+                        namespace=self.namespace, label_selector=label_selector
+                    )
+                    # Delete each resource
+                    for resource in resources.items:
+                        await delete_method(
+                            name=resource.metadata.name, namespace=self.namespace
+                        )
+                except (AttributeError, ApiException) as e:
+                    wandb.termwarn(f"Could not clean up {resource_type}: {e}")
+        except Exception as e:
+            wandb.termwarn(f"Failed to clean up some auxiliary resources: {e}")
 class CrdSubmittedRun(AbstractRun):
     """Run submitted to a CRD backend, e.g. Volcano."""
@@ -366,6 +425,7 @@ class KubernetesRunner(AbstractRunner):
             job_metadata["generateName"] = make_name_dns_safe(
                 f"launch-{launch_project.target_entity}-{launch_project.target_project}-"
             )
+        job_metadata["namespace"] = namespace
         for i, cont in enumerate(containers):
             if "name" not in cont:
@@ -489,6 +549,235 @@ class KubernetesRunner(AbstractRunner):
         return job, api_key_secret
+    async def _wait_for_resource_ready(
+        self,
+        api_client: kubernetes_asyncio.client.ApiClient,
+        config: Dict[str, Any],
+        namespace: str,
+        timeout_seconds: int = 300,
+    ) -> None:
+        """Wait for a Kubernetes resource to be ready.
+        Arguments:
+            api_client: The Kubernetes API client.
+            config: The resource configuration.
+            namespace: The namespace where the resource was created.
+            timeout_seconds: Maximum time to wait for readiness.
+        """
+        resource_kind = config.get("kind")
+        resource_name = config.get("metadata", {}).get("name")
+        if not resource_kind or not resource_name:
+            wandb.termerror(
+                f"{LOG_PREFIX}Cannot wait for resource without kind or name"
+            )
+            return
+        wandb.termlog(
+            f"{LOG_PREFIX}Waiting for {resource_kind} '{resource_name}' to be ready..."
+        )
+        start_time = time.time()
+        if resource_kind == "Deployment":
+            await self._wait_for_deployment_ready(
+                api_client, resource_name, namespace, timeout_seconds
+            )
+        elif resource_kind == "Service":
+            await self._wait_for_service_ready(
+                api_client, resource_name, namespace, timeout_seconds
+            )
+        elif resource_kind == "Pod":
+            await self._wait_for_pod_ready(
+                api_client, resource_name, namespace, timeout_seconds
+            )
+        else:
+            wandb.termlog(
+                f"{LOG_PREFIX}No specific readiness check for {resource_kind}, waiting 5 seconds..."
+            )
+            await asyncio.sleep(5)
+        elapsed = time.time() - start_time
+        wandb.termlog(
+            f"{LOG_PREFIX}{resource_kind} '{resource_name}' is ready after {elapsed:.1f}s"
+        )
+    async def _wait_for_deployment_ready(
+        self,
+        api_client: kubernetes_asyncio.client.ApiClient,
+        name: str,
+        namespace: str,
+        timeout_seconds: int,
+    ) -> None:
+        """Wait for a Deployment to be ready."""
+        apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
+        async def check_deployment_ready():
+            deployment = await apps_api.read_namespaced_deployment(
+                name=name, namespace=namespace
+            )
+            status = deployment.status
+            if status.ready_replicas and status.replicas:
+                return status.ready_replicas >= status.replicas
+            return False
+        await self._wait_with_timeout(check_deployment_ready, timeout_seconds, name)
+    async def _wait_for_service_ready(
+        self,
+        api_client: kubernetes_asyncio.client.ApiClient,
+        name: str,
+        namespace: str,
+        timeout_seconds: int,
+    ) -> None:
+        """Wait for a Service to have endpoints."""
+        core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
+        async def check_service_ready():
+            endpoints = await core_api.read_namespaced_endpoints(
+                name=name, namespace=namespace
+            )
+            if endpoints.subsets:
+                for subset in endpoints.subsets:
+                    if subset.addresses:  # These are ready pod addresses
+                        return True
+            return False
+        await self._wait_with_timeout(check_service_ready, timeout_seconds, name)
+    async def _wait_for_pod_ready(
+        self,
+        api_client: kubernetes_asyncio.client.ApiClient,
+        name: str,
+        namespace: str,
+        timeout_seconds: int,
+    ) -> None:
+        """Wait for a Pod to be ready."""
+        core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
+        async def check_pod_ready():
+            pod = await core_api.read_namespaced_pod(name=name, namespace=namespace)
+            if pod.status.phase == "Running":
+                if pod.status.container_statuses:
+                    return all(status.ready for status in pod.status.container_statuses)
+                return True
+            return False
+        await self._wait_with_timeout(check_pod_ready, timeout_seconds, name)
+    async def _wait_with_timeout(
+        self, check_func, timeout_seconds: int, name: str
+    ) -> None:
+        """Generic timeout wrapper for readiness checks."""
+        start_time = time.time()
+        while time.time() - start_time < timeout_seconds:
+            try:
+                if await check_func():
+                    return
+            except kubernetes_asyncio.client.ApiException as e:
+                if e.status == 404:
+                    pass
+                else:
+                    wandb.termerror(
+                        f"{LOG_PREFIX}Error waiting for resource '{name}': {e}"
+                    )
+                    raise
+            except Exception as e:
+                wandb.termerror(f"{LOG_PREFIX}Error waiting for resource '{name}': {e}")
+                raise
+            await asyncio.sleep(2)
+        raise LaunchError(
+            f"Resource '{name}' not ready within {timeout_seconds} seconds"
+        )
+    async def _prepare_resource(
+        self,
+        api_client: kubernetes_asyncio.client.ApiClient,
+        config: Dict[str, Any],
+        namespace: str,
+        run_id: str,
+        auxiliary_resource_label_key: str,
+        launch_project: LaunchProject,
+        api_key_secret: Optional["V1Secret"] = None,
+        wait_for_ready: bool = True,
+        wait_timeout: int = 300,
+    ) -> None:
+        """Prepare a service for launch.
+        Arguments:
+            api_client: The Kubernetes API client.
+            config: The resource configuration to prepare.
+            namespace: The namespace to create the resource in.
+            run_id: The run ID to label the resource with.
+            auxiliary_resource_label_key: The key of the auxiliary resource label.
+            launch_project: The launch project to get environment variables from.
+            api_key_secret: The API key secret to inject.
+            wait_for_ready: Whether to wait for the resource to be ready after creation.
+            wait_timeout: Maximum time in seconds to wait for resource readiness.
+        """
+        config.setdefault("metadata", {})
+        config["metadata"].setdefault("labels", {})
+        config["metadata"]["labels"][WANDB_K8S_RUN_ID] = run_id
+        config["metadata"]["labels"][WANDB_K8S_LABEL_AUXILIARY_RESOURCE] = (
+            auxiliary_resource_label_key
+        )
+        config["metadata"]["labels"]["wandb.ai/created-by"] = "launch-agent"
+        if config.get("kind") == "Service" or config.get("kind") == "Deployment":
+            config.setdefault("metadata", {})
+            original_name = config["metadata"].get("name", config.get("kind"))
+            safe_name = make_name_dns_safe(original_name)
+            safe_entity = make_name_dns_safe(launch_project.target_entity or "")
+            safe_project = make_name_dns_safe(launch_project.target_project or "")
+            safe_run_id = make_name_dns_safe(run_id or "")
+            new_name = f"{safe_name}-{safe_entity}-{safe_project}-{safe_run_id}"
+            config["metadata"]["name"] = new_name
+            wandb.termlog(
+                f"{LOG_PREFIX}Modified {config.get('kind')} name from '{original_name}' to '{new_name}'"
+            )
+        env_vars = launch_project.get_env_vars_dict(
+            self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
+        )
+        wandb_config_env = {
+            "WANDB_CONFIG": env_vars.get("WANDB_CONFIG", "{}"),
+        }
+        add_wandb_env(config, wandb_config_env)
+        if api_key_secret:
+            for cont in yield_containers(config):
+                env = cont.setdefault("env", [])
+                env.append(
+                    {
+                        "name": "WANDB_API_KEY",
+                        "valueFrom": {
+                            "secretKeyRef": {
+                                "name": api_key_secret.metadata.name,
+                                "key": "password",
+                            }
+                        },
+                    }
+                )
+                cont["env"] = env
+        try:
+            await kubernetes_asyncio.utils.create_from_dict(
+                api_client, config, namespace=namespace
+            )
+            if wait_for_ready:
+                await self._wait_for_resource_ready(
+                    api_client, config, namespace, wait_timeout
+                )
+        except Exception as e:
+            wandb.termerror(f"{LOG_PREFIX}Failed to create Kubernetes resource: {e}")
+            raise LaunchError(f"Failed to create Kubernetes resource: {e}")
     async def run(
         self, launch_project: LaunchProject, image_uri: str
     ) -> Optional[AbstractRun]:
@@ -630,10 +919,42 @@ class KubernetesRunner(AbstractRunner):
         batch_api = kubernetes_asyncio.client.BatchV1Api(api_client)
         core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
+        apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
         namespace = self.get_namespace(resource_args, context)
         job, secret = await self._inject_defaults(
             resource_args, launch_project, image_uri, namespace, core_api
         )
+        additional_services = launch_project.launch_spec.get("additional_services", [])
+        auxiliary_resource_label_key = None
+        if additional_services:
+            wandb.termlog(
+                f"{LOG_PREFIX}Creating additional services: {additional_services}"
+            )
+            auxiliary_resource_label_key = f"aux-{uuid.uuid4()}"
+            wait_for_ready = resource_args.get("wait_for_ready", True)
+            wait_timeout = resource_args.get("wait_timeout", 300)
+            await asyncio.gather(
+                *[
+                    self._prepare_resource(
+                        api_client,
+                        resource.get("config"),
+                        namespace,
+                        launch_project.run_id,
+                        auxiliary_resource_label_key,
+                        launch_project,
+                        secret,
+                        wait_for_ready,
+                        wait_timeout,
+                    )
+                    for resource in additional_services
+                    if resource.get("config", {})
+                ]
+            )
         msg = "Creating Kubernetes job"
         if "name" in resource_args:
             msg += f": {resource_args['name']}"
@@ -658,7 +979,13 @@ class KubernetesRunner(AbstractRunner):
         job_name = job_response.metadata.name
         LaunchKubernetesMonitor.monitor_namespace(namespace)
         submitted_job = KubernetesSubmittedRun(
-            batch_api, core_api, job_name, namespace, secret
+            batch_api,
+            core_api,
+            apps_api,
+            job_name,
+            namespace,
+            secret,
+            auxiliary_resource_label_key,
         )
         if self.backend_config[PROJECT_SYNCHRONOUS]:
             await submitted_job.wait()

wandb/sdk/launch/sweeps/scheduler.py CHANGED Viewed

@@ -36,7 +36,6 @@ if TYPE_CHECKING:
     import wandb.apis.public as public
     from wandb.apis.internal import Api
     from wandb.apis.public import QueuedRun, Run
-    from wandb.sdk.wandb_run import Run as SdkRun
 _logger = logging.getLogger(__name__)
@@ -255,10 +254,10 @@ class Scheduler(ABC):
             _id: w for _id, w in self._workers.items() if _id not in self.busy_workers
         }
-    def _init_wandb_run(self) -> "SdkRun":
+    def _init_wandb_run(self) -> "wandb.Run":
         """Controls resume or init logic for a scheduler wandb run."""
         settings = wandb.Settings(disable_job_creation=True)
-        run: SdkRun = wandb.init(  # type: ignore
+        run: wandb.Run = wandb.init(  # type: ignore
             name=f"Scheduler.{self._sweep_id}",
             resume="allow",
             config=self._kwargs,  # when run as a job, this sets config

wandb/sdk/launch/utils.py CHANGED Viewed

@@ -380,9 +380,9 @@ def diff_pip_requirements(req_1: List[str], req_2: List[str]) -> Dict[str, str]:
             else:
                 raise ValueError(f"Unable to parse pip requirements file line: {line}")
             if _name is not None:
-                assert re.match(
-                    _VALID_PIP_PACKAGE_REGEX, _name
-                ), f"Invalid pip package name {_name}"
+                assert re.match(_VALID_PIP_PACKAGE_REGEX, _name), (
+                    f"Invalid pip package name {_name}"
+                )
                 d[_name] = _version
         return d

wandb/sdk/lib/asyncio_compat.py CHANGED Viewed

@@ -100,6 +100,9 @@ class _Runner:
                 raise _RunnerCancelledError()
         finally:
+            # NOTE: asyncio.run() cancels all tasks after the main task exits,
+            #   but this is not documented, so we cancel them explicitly here
+            #   as well. It also blocks until canceled tasks complete.
             cancellation_task.cancel()
             fn_task.cancel()

wandb/sdk/lib/console_capture.py CHANGED Viewed

@@ -25,17 +25,38 @@ In particular, it does not work with some combinations of pytest's
 from __future__ import annotations
+import logging
 import sys
 import threading
 from typing import IO, AnyStr, Callable, Protocol
+from . import wb_logging
+_logger = logging.getLogger(__name__)
 class CannotCaptureConsoleError(Exception):
     """The module failed to patch stdout or stderr."""
 class _WriteCallback(Protocol):
-    """A callback that receives intercepted bytes or string data."""
+    """A callback that receives intercepted bytes or string data.
+    This may be called from any thread, but is only called from one thread
+    at a time.
+    Note on errors: Any error raised during the callback will clear all
+    callbacks. This means that if a user presses Ctrl-C at an unlucky time
+    during a run, we will stop uploading console output---but it's not
+    likely to be a problem unless something catches the KeyboardInterrupt.
+    Regular Exceptions are caught and logged instead of bubbling up to the
+    user's print() statements; other exceptions like KeyboardInterrupt are
+    re-raised.
+    Callbacks should handle all exceptions---a callback that raises any
+    Exception is considered buggy.
+    """
     def __call__(
         self,
@@ -45,6 +66,8 @@ class _WriteCallback(Protocol):
     ) -> None:
         """Intercept data passed to `write()`.
+        See the protocol docstring for information about exceptions.
         Args:
             data: The object passed to stderr's or stdout's `write()`.
             written: The number of bytes or characters written.
@@ -52,7 +75,9 @@ class _WriteCallback(Protocol):
         """
-_module_lock = threading.Lock()
+# A reentrant lock is used to catch callbacks that write to stderr/stdout.
+_module_rlock = threading.RLock()
+_is_writing = False
 _patch_exception: CannotCaptureConsoleError | None = None
@@ -67,9 +92,6 @@ def capture_stdout(callback: _WriteCallback) -> Callable[[], None]:
     Args:
         callback: A callback to invoke after running `sys.stdout.write`.
-            This may be called from any thread, so it must be thread-safe.
-            Exceptions are propagated to the caller of `write`.
-            See `_WriteCallback` for the exact protocol.
     Returns:
         A function to uninstall the callback.
@@ -77,7 +99,7 @@ def capture_stdout(callback: _WriteCallback) -> Callable[[], None]:
     Raises:
         CannotCaptureConsoleError: If patching failed on import.
     """
-    with _module_lock:
+    with _module_rlock:
         if _patch_exception:
             raise _patch_exception
@@ -92,9 +114,6 @@ def capture_stderr(callback: _WriteCallback) -> Callable[[], None]:
     Args:
         callback: A callback to invoke after running `sys.stderr.write`.
-            This may be called from any thread, so it must be thread-safe.
-            Exceptions are propagated to the caller of `write`.
-            See `_WriteCallback` for the exact protocol.
     Returns:
         A function to uninstall the callback.
@@ -102,7 +121,7 @@ def capture_stderr(callback: _WriteCallback) -> Callable[[], None]:
     Raises:
         CannotCaptureConsoleError: If patching failed on import.
     """
-    with _module_lock:
+    with _module_rlock:
         if _patch_exception:
             raise _patch_exception
@@ -125,11 +144,11 @@ def _insert_disposably(
     def dispose() -> None:
         nonlocal disposed
-        with _module_lock:
+        with _module_rlock:
             if disposed:
                 return
-            del callback_dict[id]
+            callback_dict.pop(id, None)
             disposed = True
@@ -143,16 +162,44 @@ def _patch(
 ) -> None:
     orig_write: Callable[[AnyStr], int]
+    @wb_logging.log_to_all_runs()
     def write_with_callbacks(s: AnyStr, /) -> int:
+        global _is_writing
         n = orig_write(s)
-        # We make a copy here because callbacks could, in theory, modify
-        # the list of callbacks.
-        with _module_lock:
-            callbacks_copy = list(callbacks.values())
-        for cb in callbacks_copy:
-            cb(s, n)
+        # NOTE: Since _module_rlock is reentrant, this is safe. It will not
+        # deadlock if a callback invokes write() again.
+        with _module_rlock:
+            if _is_writing:
+                return n
+            _is_writing = True
+            try:
+                for cb in callbacks.values():
+                    cb(s, n)
+            except BaseException as e:
+                # Clear all callbacks on any exception to avoid infinite loops:
+                #
+                # * If we re-raise, an exception handler is likely to print
+                #   the exception to the console and trigger callbacks again
+                # * If we log, we can't guarantee that this doesn't print
+                #   to console.
+                #
+                # This is especially important for KeyboardInterrupt.
+                _stderr_callbacks.clear()
+                _stdout_callbacks.clear()
+                if isinstance(e, Exception):
+                    # We suppress Exceptions so that bugs in W&B code don't
+                    # cause the user's print() statements to raise errors.
+                    _logger.exception("Error in console callback, clearing all!")
+                else:
+                    # Re-raise errors like KeyboardInterrupt.
+                    raise
+            finally:
+                _is_writing = False
         return n

wandb/sdk/lib/deprecate.py CHANGED Viewed

@@ -1,20 +1,14 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 import wandb
 from wandb.proto.wandb_deprecated import DEPRECATED_FEATURES
 from wandb.sdk.lib import telemetry
-# Necessary to break import cycle.
-if TYPE_CHECKING:
-    from wandb import wandb_run
 def deprecate(
     field_name: DEPRECATED_FEATURES,
     warning_message: str,
-    run: wandb_run.Run | None = None,
+    run: wandb.Run | None = None,
 ) -> None:
     """Warn the user that a feature has been deprecated.

wandb/sdk/lib/disabled.py CHANGED Viewed

@@ -26,5 +26,5 @@ class RunDisabled:
         deprecate.deprecate(
             field_name=Deprecated.run_disabled,
             warning_message="RunDisabled is deprecated and is a no-op. "
-            '`wandb.init(mode="disabled")` now returns and instance of `wandb.sdk.wandb_run.Run`.',
+            '`wandb.init(mode="disabled")` now returns an instance of `wandb.Run`.',
         )