PyPI - wandb - Versions diffs - 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl - Mend

wandb 0.15.3py3-none-any.whl → 0.15.5py3-none-any.whl

Files changed (156) hide show

wandb/__init__.py +1 -1
wandb/analytics/sentry.py +1 -0
wandb/apis/importers/base.py +20 -5
wandb/apis/importers/mlflow.py +7 -1
wandb/apis/internal.py +12 -0
wandb/apis/public.py +247 -1387
wandb/apis/reports/_panels.py +58 -35
wandb/beta/workflows.py +6 -7
wandb/cli/cli.py +130 -60
wandb/data_types.py +3 -1
wandb/filesync/dir_watcher.py +21 -27
wandb/filesync/step_checksum.py +8 -8
wandb/filesync/step_prepare.py +23 -10
wandb/filesync/step_upload.py +13 -13
wandb/filesync/upload_job.py +4 -8
wandb/integration/cohere/__init__.py +3 -0
wandb/integration/cohere/cohere.py +21 -0
wandb/integration/cohere/resolver.py +347 -0
wandb/integration/gym/__init__.py +4 -6
wandb/integration/huggingface/__init__.py +3 -0
wandb/integration/huggingface/huggingface.py +18 -0
wandb/integration/huggingface/resolver.py +213 -0
wandb/integration/langchain/wandb_tracer.py +16 -179
wandb/integration/openai/__init__.py +1 -3
wandb/integration/openai/openai.py +11 -143
wandb/integration/openai/resolver.py +111 -38
wandb/integration/sagemaker/config.py +2 -2
wandb/integration/tensorboard/log.py +4 -4
wandb/old/settings.py +24 -7
wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
wandb/proto/wandb_deprecated.py +3 -1
wandb/sdk/__init__.py +1 -1
wandb/sdk/artifacts/__init__.py +0 -0
wandb/sdk/artifacts/artifact.py +2101 -0
wandb/sdk/artifacts/artifact_download_logger.py +42 -0
wandb/sdk/artifacts/artifact_manifest.py +67 -0
wandb/sdk/artifacts/artifact_manifest_entry.py +159 -0
wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +91 -0
wandb/sdk/{internal → artifacts}/artifact_saver.py +6 -5
wandb/sdk/artifacts/artifact_state.py +10 -0
wandb/sdk/{interface/artifacts/artifact_cache.py → artifacts/artifacts_cache.py} +22 -12
wandb/sdk/artifacts/exceptions.py +55 -0
wandb/sdk/artifacts/storage_handler.py +59 -0
wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
wandb/sdk/artifacts/storage_handlers/azure_handler.py +192 -0
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +134 -0
wandb/sdk/artifacts/storage_handlers/multi_handler.py +53 -0
wandb/sdk/artifacts/storage_handlers/s3_handler.py +301 -0
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +67 -0
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +132 -0
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
wandb/sdk/artifacts/storage_layout.py +6 -0
wandb/sdk/artifacts/storage_policies/__init__.py +0 -0
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +61 -0
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +386 -0
wandb/sdk/{interface/artifacts/artifact_storage.py → artifacts/storage_policy.py} +5 -57
wandb/sdk/data_types/_dtypes.py +7 -12
wandb/sdk/data_types/base_types/json_metadata.py +3 -2
wandb/sdk/data_types/base_types/media.py +8 -8
wandb/sdk/data_types/base_types/wb_value.py +12 -13
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +5 -6
wandb/sdk/data_types/helper_types/classes.py +6 -8
wandb/sdk/data_types/helper_types/image_mask.py +5 -6
wandb/sdk/data_types/histogram.py +4 -3
wandb/sdk/data_types/html.py +3 -4
wandb/sdk/data_types/image.py +11 -9
wandb/sdk/data_types/molecule.py +5 -3
wandb/sdk/data_types/object_3d.py +7 -5
wandb/sdk/data_types/plotly.py +3 -2
wandb/sdk/data_types/saved_model.py +11 -11
wandb/sdk/data_types/trace_tree.py +5 -4
wandb/sdk/data_types/utils.py +3 -5
wandb/sdk/data_types/video.py +5 -4
wandb/sdk/integration_utils/auto_logging.py +215 -0
wandb/sdk/interface/interface.py +15 -15
wandb/sdk/internal/file_pusher.py +8 -16
wandb/sdk/internal/file_stream.py +5 -11
wandb/sdk/internal/handler.py +13 -1
wandb/sdk/internal/internal_api.py +287 -13
wandb/sdk/internal/job_builder.py +119 -30
wandb/sdk/internal/sender.py +6 -26
wandb/sdk/internal/settings_static.py +2 -0
wandb/sdk/internal/system/assets/__init__.py +2 -0
wandb/sdk/internal/system/assets/gpu.py +42 -0
wandb/sdk/internal/system/assets/gpu_amd.py +216 -0
wandb/sdk/internal/system/env_probe_helpers.py +13 -0
wandb/sdk/internal/system/system_info.py +3 -3
wandb/sdk/internal/tb_watcher.py +32 -22
wandb/sdk/internal/thread_local_settings.py +18 -0
wandb/sdk/launch/_project_spec.py +57 -11
wandb/sdk/launch/agent/agent.py +147 -65
wandb/sdk/launch/agent/job_status_tracker.py +34 -0
wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
wandb/sdk/launch/builder/abstract.py +5 -1
wandb/sdk/launch/builder/build.py +21 -18
wandb/sdk/launch/builder/docker_builder.py +10 -4
wandb/sdk/launch/builder/kaniko_builder.py +113 -23
wandb/sdk/launch/builder/noop.py +6 -3
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +46 -14
wandb/sdk/launch/environment/aws_environment.py +3 -2
wandb/sdk/launch/environment/azure_environment.py +124 -0
wandb/sdk/launch/environment/gcp_environment.py +2 -4
wandb/sdk/launch/environment/local_environment.py +1 -1
wandb/sdk/launch/errors.py +19 -0
wandb/sdk/launch/github_reference.py +32 -19
wandb/sdk/launch/launch.py +3 -8
wandb/sdk/launch/launch_add.py +6 -2
wandb/sdk/launch/loader.py +21 -2
wandb/sdk/launch/registry/azure_container_registry.py +132 -0
wandb/sdk/launch/registry/elastic_container_registry.py +39 -5
wandb/sdk/launch/registry/google_artifact_registry.py +68 -26
wandb/sdk/launch/registry/local_registry.py +2 -1
wandb/sdk/launch/runner/abstract.py +24 -3
wandb/sdk/launch/runner/kubernetes_runner.py +479 -26
wandb/sdk/launch/runner/local_container.py +103 -51
wandb/sdk/launch/runner/local_process.py +1 -1
wandb/sdk/launch/runner/sagemaker_runner.py +60 -10
wandb/sdk/launch/runner/vertex_runner.py +10 -5
wandb/sdk/launch/sweeps/__init__.py +7 -9
wandb/sdk/launch/sweeps/scheduler.py +307 -77
wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
wandb/sdk/launch/sweeps/utils.py +82 -35
wandb/sdk/launch/utils.py +89 -75
wandb/sdk/lib/_settings_toposort_generated.py +7 -0
wandb/sdk/lib/capped_dict.py +26 -0
wandb/sdk/lib/{git.py → gitlib.py} +76 -59
wandb/sdk/lib/hashutil.py +12 -4
wandb/sdk/lib/paths.py +96 -8
wandb/sdk/lib/sock_client.py +2 -2
wandb/sdk/lib/timer.py +1 -0
wandb/sdk/service/server.py +22 -9
wandb/sdk/service/server_sock.py +1 -1
wandb/sdk/service/service.py +27 -8
wandb/sdk/verify/verify.py +4 -7
wandb/sdk/wandb_config.py +2 -6
wandb/sdk/wandb_init.py +57 -53
wandb/sdk/wandb_require.py +7 -0
wandb/sdk/wandb_run.py +61 -223
wandb/sdk/wandb_settings.py +28 -4
wandb/testing/relay.py +15 -2
wandb/util.py +74 -36
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/METADATA +15 -9
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/RECORD +151 -116
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +1 -0
wandb/integration/langchain/util.py +0 -191
wandb/sdk/interface/artifacts/__init__.py +0 -33
wandb/sdk/interface/artifacts/artifact.py +0 -615
wandb/sdk/interface/artifacts/artifact_manifest.py +0 -131
wandb/sdk/wandb_artifacts.py +0 -2226
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/runner/kubernetes_runner.py CHANGED Viewed

@@ -1,27 +1,32 @@
+"""Implementation of KubernetesRunner class for wandb launch."""
 import base64
 import json
 import logging
 import time
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import wandb
 from wandb.apis.internal import Api
+from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
 from wandb.sdk.launch.builder.abstract import AbstractBuilder
 from wandb.sdk.launch.environment.abstract import AbstractEnvironment
 from wandb.sdk.launch.registry.abstract import AbstractRegistry
+from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
 from wandb.sdk.launch.registry.local_registry import LocalRegistry
+from wandb.sdk.launch.runner.abstract import State, Status
 from wandb.util import get_module
 from .._project_spec import EntryPoint, LaunchProject
 from ..builder.build import get_env_vars_dict
+from ..errors import LaunchError
 from ..utils import (
     LOG_PREFIX,
     PROJECT_SYNCHRONOUS,
-    LaunchError,
     get_kube_context_and_api_client,
     make_name_dns_safe,
 )
-from .abstract import AbstractRun, AbstractRunner, Status
+from .abstract import AbstractRun, AbstractRunner
 get_module(
     "kubernetes",
@@ -31,8 +36,12 @@ get_module(
 from kubernetes import client  # type: ignore # noqa: E402
 from kubernetes.client.api.batch_v1_api import BatchV1Api  # type: ignore # noqa: E402
 from kubernetes.client.api.core_v1_api import CoreV1Api  # type: ignore # noqa: E402
+from kubernetes.client.api.custom_objects_api import (  # type: ignore # noqa: E402
+    CustomObjectsApi,
+)
 from kubernetes.client.models.v1_job import V1Job  # type: ignore # noqa: E402
 from kubernetes.client.models.v1_secret import V1Secret  # type: ignore # noqa: E402
+from kubernetes.client.rest import ApiException  # type: ignore # noqa: E402
 TIMEOUT = 5
 MAX_KUBERNETES_RETRIES = (
@@ -43,7 +52,22 @@ FAIL_MESSAGE_INTERVAL = 60
 _logger = logging.getLogger(__name__)
+# Dict for mapping possible states of custom objects to the states we want to report
+# to the agent.
+CRD_STATE_DICT: Dict[str, State] = {
+    "pending": "starting",
+    "running": "running",
+    "completed": "finished",
+    "failed": "failed",
+    "aborted": "failed",
+    "terminating": "stopping",
+    "terminated": "stopped",
+}
 class KubernetesSubmittedRun(AbstractRun):
+    """Wrapper for a launched run on Kubernetes."""
     def __init__(
         self,
         batch_api: "BatchV1Api",
@@ -53,6 +77,19 @@ class KubernetesSubmittedRun(AbstractRun):
         namespace: Optional[str] = "default",
         secret: Optional["V1Secret"] = None,
     ) -> None:
+        """Initialize a KubernetesSubmittedRun.
+        Arguments:
+            batch_api: Kubernetes BatchV1Api object.
+            core_api: Kubernetes CoreV1Api object.
+            name: Name of the job.
+            pod_names: List of pod names.
+            namespace: Kubernetes namespace.
+            secret: Kubernetes secret.
+        Returns:
+            None.
+        """
         self.batch_api = batch_api
         self.core_api = core_api
         self.name = name
@@ -66,14 +103,37 @@ class KubernetesSubmittedRun(AbstractRun):
     @property
     def id(self) -> str:
+        """Return the run id."""
         return self.name
+    def get_logs(self) -> Optional[str]:
+        try:
+            logs = self.core_api.read_namespaced_pod_log(
+                name=self.pod_names[0], namespace=self.namespace
+            )
+            if logs:
+                return str(logs)
+            else:
+                wandb.termwarn(
+                    f"Retrieved no logs for kubernetes pod(s): {self.pod_names}"
+                )
+            return None
+        except Exception as e:
+            wandb.termerror(f"{LOG_PREFIX}Failed to get pod logs: {e}")
+            return None
     def get_job(self) -> "V1Job":
+        """Return the job object."""
         return self.batch_api.read_namespaced_job(
             name=self.name, namespace=self.namespace
         )
     def wait(self) -> bool:
+        """Wait for the run to finish.
+        Returns:
+            True if the run finished successfully, False otherwise.
+        """
         while True:
             status = self.get_status()
             wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status}")
@@ -85,14 +145,23 @@ class KubernetesSubmittedRun(AbstractRun):
         )  # todo: not sure if this (copied from aws runner) is the right approach? should we return false on failure
     def get_status(self) -> Status:
-        job_response = self.batch_api.read_namespaced_job_status(
-            name=self.name, namespace=self.namespace
-        )
-        status = job_response.status
+        """Return the run status."""
+        try:
+            job_response = self.batch_api.read_namespaced_job_status(
+                name=self.name, namespace=self.namespace
+            )
+            status = job_response.status
+            pod = self.core_api.read_namespaced_pod(
+                name=self.pod_names[0], namespace=self.namespace
+            )
+        except ApiException as e:
+            if "(404)" not in str(e):
+                raise
+            # 404 = Pod/job not reachable
+            wandb.termlog(f"{LOG_PREFIX}Job or pod disconnected for job: {self.name}")
+            return Status("preempted")
-        pod = self.core_api.read_namespaced_pod(
-            name=self.pod_names[0], namespace=self.namespace
-        )
         if pod.status.phase in ["Pending", "Unknown"]:
             now = time.time()
             if self._fail_count == 0:
@@ -111,7 +180,13 @@ class KubernetesSubmittedRun(AbstractRun):
         if status.succeeded == 1:
             return_status = Status("finished")
         elif status.failed is not None and status.failed >= 1:
-            return_status = Status("failed")
+            if status.conditions[0].reason == "BackoffLimitExceeded":
+                wandb.termlog(
+                    f"{LOG_PREFIX}Job or pod disconnected for job: {self.name}"
+                )
+                return_status = Status("preempted")
+            else:
+                return_status = Status("failed")
         elif status.active == 1:
             return Status("running")
         elif status.conditions is not None and status.conditions[0].type == "Suspended":
@@ -133,6 +208,7 @@ class KubernetesSubmittedRun(AbstractRun):
         return return_status
     def suspend(self) -> None:
+        """Suspend the run."""
         self.job.spec.suspend = True
         self.batch_api.patch_namespaced_job(
             name=self.name, namespace=self.namespace, body=self.job
@@ -156,29 +232,183 @@ class KubernetesSubmittedRun(AbstractRun):
             )
     def cancel(self) -> None:
+        """Cancel the run."""
         self.suspend()
         self.batch_api.delete_namespaced_job(name=self.name, namespace=self.namespace)
class CrdSubmittedRun(AbstractRun):
    """Run submitted to a CRD backend, e.g. Volcano.

    The run is a single namespaced custom object, addressed through the
    Kubernetes custom objects API; the logs of its pods are read through the
    core API.
    """

    def __init__(
        self,
        group: str,
        version: str,
        plural: str,
        name: str,
        namespace: str,
        core_api: CoreV1Api,
        custom_api: CustomObjectsApi,
        pod_names: List[str],
    ) -> None:
        """Create a run object for tracking the progress of a CRD.

        Arguments:
            group: The API group of the CRD.
            version: The API version of the CRD.
            plural: The plural name of the CRD.
            name: The name of the CRD instance.
            namespace: The namespace of the CRD instance.
            core_api: The Kubernetes core API client.
            custom_api: The Kubernetes custom object API client.
            pod_names: The names of the pods associated with the CRD instance.

        Raises:
            LaunchError: If the CRD instance does not exist.
        """
        self.group = group
        self.version = version
        self.plural = plural
        self.name = name
        self.namespace = namespace
        self.core_api = core_api
        self.custom_api = custom_api
        self.pod_names = pod_names
        self._fail_count = 0
        try:
            # Fetch the object once up front so that a missing or unreadable
            # object fails at construction time.
            self.job = self.custom_api.get_namespaced_custom_object(
                **self._object_ref()
            )
        except ApiException as e:
            raise LaunchError(
                f"Failed to get CRD {self.name} in namespace {self.namespace}: {str(e)}"
            ) from e

    def _object_ref(self) -> Dict[str, str]:
        """Return the keyword arguments that address this custom object."""
        return {
            "group": self.group,
            "version": self.version,
            "namespace": self.namespace,
            "plural": self.plural,
            "name": self.name,
        }

    @property
    def id(self) -> str:
        """Get the name of the custom object."""
        return self.name

    def get_logs(self) -> Optional[str]:
        """Get logs for custom object.

        Returns:
            The logs of every pod in ``pod_names``, each prefixed with a
            ``Pod <name>:`` header and joined by newlines. None if there are
            no pods or if reading any pod log raises an ApiException.
        """
        # TODO: test more carefully once we release multi-node support
        logs: Dict[str, Optional[str]] = {}
        try:
            for pod_name in self.pod_names:
                logs[pod_name] = self.core_api.read_namespaced_pod_log(
                    name=pod_name, namespace=self.namespace
                )
        except ApiException as e:
            wandb.termwarn(f"Failed to get logs for {self.name}: {str(e)}")
            return None
        if not logs:
            return None
        logs_as_array = [f"Pod {pod_name}:\n{log}" for pod_name, log in logs.items()]
        return "\n".join(logs_as_array)

    def get_status(self) -> Status:
        """Get status of custom object.

        Returns:
            The launch status mapped from the object's reported state through
            CRD_STATE_DICT; states that are not in the mapping (or are not
            strings) are reported as "unknown".

        Raises:
            LaunchError: If the status request fails or the response carries
                no state.
        """
        try:
            job_response = self.custom_api.get_namespaced_custom_object_status(
                **self._object_ref()
            )
        except ApiException as e:
            raise LaunchError(
                f"Failed to get CRD {self.name} in namespace {self.namespace}: {str(e)}"
            ) from e
        # Custom objects can technically define whatever states and format the
        # response to the status request however they want. This checks for
        # the most common cases.
        # A response without a "status" entry is treated like one without a
        # state, so it surfaces as a LaunchError instead of a bare KeyError.
        status = job_response.get("status") or {}
        state = status.get("state")
        if isinstance(state, dict):
            state = state.get("phase")
        if state is None:
            raise LaunchError(
                f"Failed to get CRD {self.name} in namespace {self.namespace}: no state found"
            )
        if not isinstance(state, str):
            # Nothing in the mapping can match a non-string state.
            return Status("unknown")
        return Status(CRD_STATE_DICT.get(state.lower(), "unknown"))

    def cancel(self) -> None:
        """Cancel the custom object.

        Raises:
            LaunchError: If the delete request fails.
        """
        try:
            self.custom_api.delete_namespaced_custom_object(**self._object_ref())
        except ApiException as e:
            raise LaunchError(
                f"Failed to delete CRD {self.name} in namespace {self.namespace}: {str(e)}"
            ) from e

    def wait(self) -> bool:
        """Wait for this custom object to finish running.

        Polls the status every 5 seconds for as long as the state is
        "running"; any other state ends the wait.

        Returns:
            True if the last observed state is "finished", False otherwise.
        """
        while True:
            status = self.get_status()
            wandb.termlog(f"{LOG_PREFIX}Job {self.name} status: {status}")
            if status.state != "running":
                break
            time.sleep(5)
        return status.state == "finished"
 class KubernetesRunner(AbstractRunner):
+    """Launches runs onto kubernetes."""
     def __init__(
         self, api: Api, backend_config: Dict[str, Any], environment: AbstractEnvironment
     ) -> None:
+        """Create a Kubernetes runner.
+        Arguments:
+            api: The API client object.
+            backend_config: The backend configuration.
+            environment: The environment to launch runs into.
+        Raises:
+            LaunchError: If the Kubernetes configuration is invalid.
+        """
         super().__init__(api, backend_config)
         self.environment = environment
     def wait_job_launch(
-        self, job_name: str, namespace: str, core_api: "CoreV1Api"
+        self,
+        job_name: str,
+        namespace: str,
+        core_api: "CoreV1Api",
+        label: str = "job-name",
     ) -> List[str]:
+        """Wait for a job to be launched and return the pod names.
+        Arguments:
+            job_name: The name of the job.
+            namespace: The namespace of the job.
+            core_api: The Kubernetes core API client.
+            label: The label key to match against job_name.
+        Returns:
+            The names of the pods associated with the job.
+        """
         pods = core_api.list_namespaced_pod(
-            label_selector=f"job-name={job_name}", namespace=namespace
+            label_selector=f"{label}={job_name}", namespace=namespace
         )
         timeout = TIMEOUT
         while len(pods.items) == 0 and timeout > 0:
             time.sleep(1)
             timeout -= 1
             pods = core_api.list_namespaced_pod(
-                label_selector=f"job-name={job_name}", namespace=namespace
+                label_selector=f"{label}={job_name}", namespace=namespace
             )
         if timeout == 0:
@@ -197,6 +427,15 @@ class KubernetesRunner(AbstractRunner):
     def get_namespace(
         self, resource_args: Dict[str, Any], context: Dict[str, Any]
     ) -> str:
+        """Get the namespace to launch into.
+        Arguments:
+            resource_args: The resource args to launch.
+            context: The k8s config context.
+        Returns:
+            The namespace to launch into.
+        """
         default_namespace = (
             context["context"].get("namespace", "default") if context else "default"
         )
@@ -213,8 +452,20 @@ class KubernetesRunner(AbstractRunner):
         builder: Optional[AbstractBuilder],
         namespace: str,
         core_api: "CoreV1Api",
+        job_tracker: Optional[JobAndRunStatusTracker],
     ) -> Tuple[Dict[str, Any], Optional["V1Secret"]]:
-        """Apply our default values, return job dict and secret."""
+        """Apply our default values, return job dict and secret.
+        Arguments:
+            resource_args (Dict[str, Any]): The resource args to launch.
+            launch_project (LaunchProject): The launch project.
+            builder (Optional[AbstractBuilder]): The builder.
+            namespace (str): The namespace.
+            core_api (CoreV1Api): The core api.
+        Returns:
+            Tuple[Dict[str, Any], Optional["V1Secret"]]: The resource args and secret.
+        """
         job: Dict[str, Any] = {
             "apiVersion": "batch/v1",
             "kind": "Job",
@@ -253,7 +504,9 @@ class KubernetesRunner(AbstractRunner):
                     "Invalid specification of multiple containers. See https://docs.wandb.ai/guides/launch for guidance on submitting jobs."
                 )
             # dont specify run id if user provided image, could have multiple runs
-            containers[0]["image"] = launch_project.docker_image
+            image_uri = launch_project.docker_image
+            containers[0]["image"] = image_uri
+            launch_project.fill_macros(image_uri)
             # TODO: handle secret pulling image from registry
         elif not any(["image" in cont for cont in containers]):
             if len(containers) > 1:
@@ -262,7 +515,9 @@ class KubernetesRunner(AbstractRunner):
                 )
             assert entry_point is not None
             assert builder is not None
-            image_uri = builder.build_image(launch_project, entry_point)
+            image_uri = builder.build_image(launch_project, entry_point, job_tracker)
+            image_uri = image_uri.replace("https://", "")
+            launch_project.fill_macros(image_uri)
             # in the non instance case we need to make an imagePullSecret
             # so the new job can pull the image
             if not builder.registry:
@@ -276,8 +531,8 @@ class KubernetesRunner(AbstractRunner):
                 pod_spec["imagePullSecrets"] = [
                     {"name": f"regcred-{launch_project.run_id}"}
                 ]
             containers[0]["image"] = image_uri
+            launch_project.fill_macros(image_uri)
         inject_entrypoint_and_args(
             containers,
@@ -306,8 +561,18 @@ class KubernetesRunner(AbstractRunner):
     def run(
         self,
         launch_project: LaunchProject,
-        builder: Optional[AbstractBuilder],
+        builder: AbstractBuilder,
+        job_tracker: Optional[JobAndRunStatusTracker] = None,
     ) -> Optional[AbstractRun]:  # noqa: C901
+        """Execute a launch project on Kubernetes.
+        Arguments:
+            launch_project: The launch project to execute.
+            builder: The builder to use to build the image.
+        Returns:
+            The run object if the run was successful, otherwise None.
+        """
         kubernetes = get_module(  # noqa: F811
             "kubernetes",
             required="Kubernetes runner requires the kubernetes package. Please"
@@ -316,23 +581,86 @@ class KubernetesRunner(AbstractRunner):
         resource_args = launch_project.resource_args.get("kubernetes", {})
         if not resource_args:
             wandb.termlog(
-                f"{LOG_PREFIX}Note: no resource args specified. Add a Kubernetes yaml spec or other options in a json file with --resource-args <json>."
+                f"{LOG_PREFIX}Note: no resource args specified. Add a "
+                "Kubernetes yaml spec or other options in a json file "
+                "with --resource-args <json>."
             )
         _logger.info(f"Running Kubernetes job with resource args: {resource_args}")
         context, api_client = get_kube_context_and_api_client(kubernetes, resource_args)
+        # If the user specified an alternate api, we will execute this
+        # run by creating a custom object.
+        api_version = resource_args.get("apiVersion", "batch/v1")
+        if api_version not in ["batch/v1", "batch/v1beta1"]:
+            entrypoint = launch_project.get_single_entry_point()
+            if launch_project.docker_image:
+                image_uri = launch_project.docker_image
+            else:
+                assert entrypoint is not None
+                image_uri = builder.build_image(launch_project, entrypoint, job_tracker)
+            launch_project.fill_macros(image_uri)
+            env_vars = get_env_vars_dict(launch_project, self._api)
+            # Crawl the resource args and add our env vars to the containers.
+            add_wandb_env(launch_project.resource_args, env_vars)
+            # Crawl the resource args and add our labels to the pods. This is
+            # necessary for the agent to find the pods later on.
+            add_label_to_pods(
+                launch_project.resource_args, "wandb/run-id", launch_project.run_id
+            )
+            overrides = {}
+            if launch_project.override_args:
+                overrides["args"] = launch_project.override_args
+            if launch_project.override_entrypoint:
+                overrides["command"] = launch_project.override_entrypoint.command
+            add_entrypoint_args_overrides(
+                launch_project.resource_args,
+                overrides,
+            )
+            api = client.CustomObjectsApi(api_client)
+            # Infer the attributes of a custom object from the apiVersion and/or
+            # a kind: attribute in the resource args.
+            namespace = self.get_namespace(resource_args, context)
+            group = resource_args.get("group", api_version.split("/")[0])
+            version = api_version.split("/")[1]
+            kind = resource_args.get("kind", version)
+            plural = f"{kind.lower()}s"
+            try:
+                response = api.create_namespaced_custom_object(
+                    group=group,
+                    version=version,
+                    namespace=namespace,
+                    plural=plural,
+                    body=launch_project.resource_args.get("kubernetes"),
+                )
+            except ApiException as e:
+                raise LaunchError(
+                    f"Error creating CRD of kind {kind}: {e.status} {e.reason}"
+                ) from e
+            name = response.get("metadata", {}).get("name")
+            _logger.info(f"Created {kind} {response['metadata']['name']}")
+            core = client.CoreV1Api(api_client)
+            pod_names = self.wait_job_launch(
+                launch_project.run_id, namespace, core, label="wandb/run-id"
+            )
+            return CrdSubmittedRun(
+                name=name,
+                group=group,
+                version=version,
+                namespace=namespace,
+                plural=plural,
+                core_api=client.CoreV1Api(api_client),
+                custom_api=api,
+                pod_names=pod_names,
+            )
         batch_api = kubernetes.client.BatchV1Api(api_client)
         core_api = kubernetes.client.CoreV1Api(api_client)
         namespace = self.get_namespace(resource_args, context)
         job, secret = self._inject_defaults(
-            resource_args,
-            launch_project,
-            builder,
-            namespace,
-            core_api,
+            resource_args, launch_project, builder, namespace, core_api, job_tracker
         )
         msg = "Creating Kubernetes job"
@@ -364,6 +692,17 @@ def inject_entrypoint_and_args(
     override_args: List[str],
     should_override_entrypoint: bool,
 ) -> None:
+    """Inject the entrypoint and args into the containers.
+    Arguments:
+        containers: The containers to inject the entrypoint and args into.
+        entry_point: The entrypoint to inject.
+        override_args: The args to inject.
+        should_override_entrypoint: Whether to override the entrypoint.
+    Returns:
+        None
+    """
     for i in range(len(containers)):
         if override_args:
             containers[i]["args"] = override_args
@@ -379,8 +718,21 @@ def maybe_create_imagepull_secret(
     run_id: str,
     namespace: str,
 ) -> Optional["V1Secret"]:
+    """Create a secret for pulling images from a private registry.
+    Arguments:
+        core_api: The Kubernetes CoreV1Api object.
+        registry: The registry to pull from.
+        run_id: The run id.
+        namespace: The namespace to create the secret in.
+    Returns:
+        A secret if one was created, otherwise None.
+    """
     secret = None
-    if isinstance(registry, LocalRegistry):
+    if isinstance(registry, LocalRegistry) or isinstance(
+        registry, AzureContainerRegistry
+    ):
         # Secret not required
         return None
     uname, token = registry.get_username_password()
@@ -406,3 +758,104 @@ def maybe_create_imagepull_secret(
         return core_api.create_namespaced_secret(namespace, secret)
     except Exception as e:
         raise LaunchError(f"Exception when creating Kubernetes secret: {str(e)}\n")
def add_wandb_env(root: Union[dict, list], env_vars: Dict[str, str]) -> None:
    """Injects wandb environment variables into specs.

    Recursively walks the spec and injects the environment variables into
    every container spec. Containers are identified by the "containers" key;
    the contents of a container are not searched for nested containers.

    Entries already present in a container's "env" list whose name matches an
    injected variable are replaced, so the injected value is the only one for
    that name. Other existing entries are kept, ahead of the injected ones.

    WANDB_RUN_ID is treated specially: if it is provided in env_vars, it is
    only set in the first container modified by this function.

    Arguments:
        root: The spec to modify (in place).
        env_vars: The environment variables to inject. Not modified.

    Returns: None.
    """

    def yield_containers(node: Any) -> Iterator[dict]:
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "containers":
                    if isinstance(value, list):
                        # Skip malformed (non-mapping) container entries.
                        yield from (c for c in value if isinstance(c, dict))
                elif isinstance(value, (dict, list)):
                    yield from yield_containers(value)
        elif isinstance(node, list):
            for item in node:
                yield from yield_containers(item)

    # Work on a copy so that dropping WANDB_RUN_ID after the first container
    # does not leak into the caller's dictionary.
    pending = dict(env_vars)
    for cont in yield_containers(root):
        # "env" may be absent or explicitly null in the spec.
        existing = cont.get("env") or []
        merged = [
            entry
            for entry in existing
            if not (isinstance(entry, dict) and entry.get("name") in pending)
        ]
        merged.extend({"name": key, "value": value} for key, value in pending.items())
        cont["env"] = merged
        # After we have set WANDB_RUN_ID once, we don't want to set it again
        pending.pop("WANDB_RUN_ID", None)
def add_label_to_pods(
    manifest: Union[dict, list], label_key: str, label_value: str
) -> None:
    """Add a label to all pod specs in a manifest.

    Recursively traverses the manifest and adds the label to all pod specs.
    Pod specs are identified by the presence of a "spec" key whose value is a
    mapping with a "containers" key. The label is written to the
    "metadata" -> "labels" mapping next to that "spec" key; both levels are
    created when they are missing or null.

    Arguments:
        manifest: The manifest to modify (in place).
        label_key: The label key to add.
        label_value: The label value to add.

    Returns: None.
    """

    def yield_pods(node: Any) -> Iterator[dict]:
        if isinstance(node, list):
            for item in node:
                yield from yield_pods(item)
        elif isinstance(node, dict):
            spec = node.get("spec")
            # Require a mapping: a null spec would raise on the membership
            # test and a string spec would turn it into a substring match.
            if isinstance(spec, dict) and "containers" in spec:
                yield node
            for value in node.values():
                if isinstance(value, (dict, list)):
                    yield from yield_pods(value)

    for pod in yield_pods(manifest):
        # "metadata:" / "labels:" with no value load from YAML as None, which
        # setdefault would hand back unchanged.
        metadata = pod.get("metadata")
        if metadata is None:
            metadata = pod["metadata"] = {}
        labels = metadata.get("labels")
        if labels is None:
            labels = metadata["labels"] = {}
        labels[label_key] = label_value
def add_entrypoint_args_overrides(manifest: Union[dict, list], overrides: dict) -> None:
    """Add entrypoint and args overrides to all containers in a manifest.

    Recursively traverses the manifest and adds the entrypoint and args overrides
    to all containers. Containers are identified by the presence of a "spec" key
    whose value is a mapping with a "containers" list. Null or otherwise
    malformed "spec" / "containers" values and non-mapping container entries
    are left untouched.

    Arguments:
        manifest: The manifest to modify (in place).
        overrides: Dictionary with optional "command" and "args" keys. When
            present, "command" replaces each container's "command" and "args"
            replaces each container's "args".

    Returns: None.
    """
    if isinstance(manifest, list):
        for item in manifest:
            add_entrypoint_args_overrides(item, overrides)
    elif isinstance(manifest, dict):
        spec = manifest.get("spec")
        containers = spec.get("containers") if isinstance(spec, dict) else None
        if isinstance(containers, list):
            for container in containers:
                if not isinstance(container, dict):
                    continue
                if "command" in overrides:
                    container["command"] = overrides["command"]
                if "args" in overrides:
                    container["args"] = overrides["args"]
        for value in manifest.values():
            # Only mappings and sequences can contain further pod specs.
            if isinstance(value, (dict, list)):
                add_entrypoint_args_overrides(value, overrides)

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl

wandb 0.15.3py3-none-any.whl → 0.15.5py3-none-any.whl