wandb-0.16.4-py3-none-any.whl → wandb-0.16.6-py3-none-any.whl
- wandb/__init__.py +2 -2
- wandb/agents/pyagent.py +1 -1
- wandb/apis/public/api.py +6 -6
- wandb/apis/reports/v2/interface.py +4 -8
- wandb/apis/reports/v2/internal.py +12 -45
- wandb/cli/cli.py +29 -5
- wandb/integration/openai/fine_tuning.py +74 -37
- wandb/integration/ultralytics/callback.py +0 -1
- wandb/proto/v3/wandb_internal_pb2.py +332 -312
- wandb/proto/v3/wandb_settings_pb2.py +13 -3
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +316 -312
- wandb/proto/v4/wandb_settings_pb2.py +5 -3
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/artifact.py +92 -26
- wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +16 -36
- wandb/sdk/artifacts/storage_handler.py +2 -1
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +13 -5
- wandb/sdk/interface/interface.py +60 -15
- wandb/sdk/interface/interface_shared.py +13 -7
- wandb/sdk/internal/file_stream.py +19 -0
- wandb/sdk/internal/handler.py +1 -4
- wandb/sdk/internal/internal_api.py +2 -0
- wandb/sdk/internal/job_builder.py +45 -17
- wandb/sdk/internal/sender.py +53 -28
- wandb/sdk/internal/settings_static.py +9 -0
- wandb/sdk/internal/system/system_info.py +4 -1
- wandb/sdk/launch/_launch.py +5 -0
- wandb/sdk/launch/_project_spec.py +5 -20
- wandb/sdk/launch/agent/agent.py +80 -37
- wandb/sdk/launch/agent/config.py +8 -0
- wandb/sdk/launch/builder/kaniko_builder.py +149 -134
- wandb/sdk/launch/create_job.py +44 -48
- wandb/sdk/launch/runner/kubernetes_monitor.py +3 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
- wandb/sdk/launch/sweeps/scheduler.py +3 -1
- wandb/sdk/launch/utils.py +23 -5
- wandb/sdk/lib/__init__.py +2 -5
- wandb/sdk/lib/_settings_toposort_generated.py +2 -0
- wandb/sdk/lib/filesystem.py +11 -1
- wandb/sdk/lib/run_moment.py +78 -0
- wandb/sdk/service/streams.py +1 -6
- wandb/sdk/wandb_init.py +12 -7
- wandb/sdk/wandb_login.py +43 -26
- wandb/sdk/wandb_run.py +179 -94
- wandb/sdk/wandb_settings.py +55 -16
- wandb/testing/relay.py +5 -6
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/METADATA +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/RECORD +55 -54
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/WHEEL +1 -1
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/LICENSE +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.4.dist-info → wandb-0.16.6.dist-info}/top_level.txt +0 -0
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -45,7 +45,10 @@ MAX_RESUME_COUNT = 5
 
 RUN_INFO_GRACE_PERIOD = 60
 
-
+DEFAULT_STOPPED_RUN_TIMEOUT = 60
+
+DEFAULT_PRINT_INTERVAL = 5 * 60
+VERBOSE_PRINT_INTERVAL = 20
 
 _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
 if _env_timeout:
@@ -105,30 +108,29 @@ def _max_from_config(
     return max_from_config
 
 
-
-
-
-        _logger.debug("Recieved runSpec in _is_scheduler_job that was empty")
+class InternalAgentLogger:
+    def __init__(self, verbosity=0):
+        self._print_to_terminal = verbosity >= 2
 
-
-
+    def error(self, message: str):
+        if self._print_to_terminal:
+            wandb.termerror(f"{LOG_PREFIX}{message}")
+        _logger.error(f"{LOG_PREFIX}{message}")
 
-
-
-
-
-            return True
+    def warn(self, message: str):
+        if self._print_to_terminal:
+            wandb.termwarn(f"{LOG_PREFIX}{message}")
+        _logger.warn(f"{LOG_PREFIX}{message}")
 
-
-
-
-
-            return False
+    def info(self, message: str):
+        if self._print_to_terminal:
+            wandb.termlog(f"{LOG_PREFIX}{message}")
+        _logger.info(f"{LOG_PREFIX}{message}")
 
-
-
-
-
+    def debug(self, message: str):
+        if self._print_to_terminal:
+            wandb.termlog(f"{LOG_PREFIX}{message}")
+        _logger.debug(f"{LOG_PREFIX}{message}")
 
 
 class LaunchAgent:
@@ -184,7 +186,13 @@ class LaunchAgent:
         self._max_jobs = _max_from_config(config, "max_jobs")
         self._max_schedulers = _max_from_config(config, "max_schedulers")
         self._secure_mode = config.get("secure_mode", False)
+        self._verbosity = config.get("verbosity", 0)
+        self._internal_logger = InternalAgentLogger(verbosity=self._verbosity)
+        self._last_status_print_time = 0.0
         self.default_config: Dict[str, Any] = config
+        self._stopped_run_timeout = config.get(
+            "stopped_run_timeout", DEFAULT_STOPPED_RUN_TIMEOUT
+        )
 
         # Get agent version from env var if present, otherwise wandb version
         self.version: str = "wandb@" + wandb.__version__
@@ -228,6 +236,33 @@ class LaunchAgent:
         self._name = agent_response["name"]
         self._init_agent_run()
 
+    def _is_scheduler_job(self, run_spec: Dict[str, Any]) -> bool:
+        """Determine whether a job/runSpec is a sweep scheduler."""
+        if not run_spec:
+            self._internal_logger.debug(
+                "Recieved runSpec in _is_scheduler_job that was empty"
+            )
+
+        if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
+            return False
+
+        if run_spec.get("resource") == "local-process":
+            # Any job pushed to a run queue that has a scheduler uri is
+            # allowed to use local-process
+            if run_spec.get("job"):
+                return True
+
+            # If a scheduler is local-process and run through CLI, also
+            # confirm command is in format: [wandb scheduler <sweep>]
+            cmd = run_spec.get("overrides", {}).get("entry_point", [])
+            if len(cmd) < 3:
+                return False
+
+            if cmd[:2] != ["wandb", "scheduler"]:
+                return False
+
+        return True
+
     async def fail_run_queue_item(
         self,
         run_queue_item_id: str,
@@ -298,6 +333,7 @@ class LaunchAgent:
 
     def print_status(self) -> None:
         """Prints the current status of the agent."""
+        self._last_status_print_time = time.time()
         output_str = "agent "
         if self._name:
             output_str += f"{self._name} "
@@ -344,8 +380,8 @@ class LaunchAgent:
             if run_state.lower() != "pending":
                 return True
         except CommError:
-
-                f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run"
+            self._internal_logger.info(
+                f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run",
             )
             return False
 
@@ -361,8 +397,8 @@ class LaunchAgent:
             job_and_run_status.entity is not None
             and job_and_run_status.entity != self._entity
         ):
-
-                "Skipping check for completed run status because run is on a different entity than agent"
+            self._internal_logger.info(
+                "Skipping check for completed run status because run is on a different entity than agent",
            )
         elif exception is not None:
             tb_str = traceback.format_exception(
@@ -378,8 +414,8 @@ class LaunchAgent:
                 fnames,
             )
         elif job_and_run_status.project is None or job_and_run_status.run_id is None:
-
-                f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}"
+            self._internal_logger.info(
+                f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}",
            )
             wandb.termerror(
                 "Missing project or run id on thread called finish thread id"
@@ -430,7 +466,9 @@ class LaunchAgent:
                 job_and_run_status.run_queue_item_id, _msg, "run", fnames
             )
         else:
-
+            self._internal_logger.info(
+                f"Finish thread id {thread_id} had no exception and no run"
+            )
             wandb._sentry.exception(
                 "launch agent called finish thread id on thread without run or exception"
             )
@@ -458,7 +496,7 @@ class LaunchAgent:
         await self.update_status(AGENT_RUNNING)
 
         # parse job
-
+        self._internal_logger.info("Parsing launch spec")
         launch_spec = job["runSpec"]
 
         # Abort if this job attempts to override secure mode
@@ -511,6 +549,10 @@ class LaunchAgent:
             KeyboardInterrupt: if the agent is requested to stop.
         """
         self.print_status()
+        if self._verbosity == 0:
+            print_interval = DEFAULT_PRINT_INTERVAL
+        else:
+            print_interval = VERBOSE_PRINT_INTERVAL
         try:
             while True:
                 job = None
@@ -532,7 +574,7 @@ class LaunchAgent:
                     file_saver = RunQueueItemFileSaver(
                         self._wandb_run, job["runQueueItemId"]
                     )
-                    if _is_scheduler_job(job.get("runSpec", {})):
+                    if self._is_scheduler_job(job.get("runSpec", {})):
                         # If job is a scheduler, and we are already at the cap, ignore,
                         # don't ack, and it will be pushed back onto the queue in 1 min
                         if self.num_running_schedulers >= self._max_schedulers:
@@ -567,6 +609,7 @@ class LaunchAgent:
                     await self.update_status(AGENT_POLLING)
                 else:
                     await self.update_status(AGENT_RUNNING)
+                if time.time() - self._last_status_print_time > print_interval:
                     self.print_status()
 
                 if self.num_running_jobs == self._max_jobs or job is None:
@@ -634,14 +677,14 @@ class LaunchAgent:
             await self.check_sweep_state(launch_spec, api)
 
         job_tracker.update_run_info(project)
-
+        self._internal_logger.info("Fetching and validating project...")
        project.fetch_and_validate_project()
-
+        self._internal_logger.info("Fetching resource...")
        resource = launch_spec.get("resource") or "local-container"
        backend_config: Dict[str, Any] = {
            PROJECT_SYNCHRONOUS: False,  # agent always runs async
        }
-
+        self._internal_logger.info("Loading backend")
        override_build_config = launch_spec.get("builder")
 
        _, build_config, registry_config = construct_agent_configs(
@@ -661,13 +704,13 @@ class LaunchAgent:
            assert entrypoint is not None
            image_uri = await builder.build_image(project, entrypoint, job_tracker)
 
-
+        self._internal_logger.info("Backend loaded...")
        if isinstance(backend, LocalProcessRunner):
            run = await backend.run(project, image_uri)
        else:
            assert image_uri
            run = await backend.run(project, image_uri)
-        if _is_scheduler_job(launch_spec):
+        if self._is_scheduler_job(launch_spec):
            with self._jobs_lock:
                self._jobs[thread_id].is_scheduler = True
                wandb.termlog(
@@ -700,7 +743,7 @@ class LaunchAgent:
                if stopped_time is None:
                    stopped_time = time.time()
                else:
-                    if time.time() - stopped_time >
+                    if time.time() - stopped_time > self._stopped_run_timeout:
                        await run.cancel()
            await asyncio.sleep(AGENT_POLLING_INTERVAL)
 
@@ -720,7 +763,7 @@ class LaunchAgent:
                    project=launch_spec["project"],
                )
            except Exception as e:
-
+                self._internal_logger.debug(f"Fetch sweep state error: {e}")
                state = None
 
        if state != "RUNNING" and state != "PAUSED":
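The hunks above add an InternalAgentLogger that mirrors messages to the terminal only when the agent runs at verbosity 2 or higher, and throttle repeated status prints to DEFAULT_PRINT_INTERVAL (or VERBOSE_PRINT_INTERVAL when verbose). A minimal standalone sketch of that gating, not an import of wandb internals and with illustrative message text:

# Standalone sketch of the verbosity gating added in this release.
# Assumptions: print() stands in for wandb.termlog/termerror; the class and
# message below are illustrative, only the verbosity >= 2 rule comes from the diff.
import logging

LOG_PREFIX = "Launch agent: "
_logger = logging.getLogger(__name__)

DEFAULT_PRINT_INTERVAL = 5 * 60  # seconds between status prints at verbosity 0
VERBOSE_PRINT_INTERVAL = 20      # seconds between status prints when verbose


class InternalAgentLoggerSketch:
    """Echo to the terminal only when verbosity >= 2; always write to the debug log."""

    def __init__(self, verbosity: int = 0):
        self._print_to_terminal = verbosity >= 2

    def info(self, message: str) -> None:
        if self._print_to_terminal:
            print(f"{LOG_PREFIX}{message}")  # wandb.termlog in the real agent
        _logger.info(f"{LOG_PREFIX}{message}")


if __name__ == "__main__":
    quiet = InternalAgentLoggerSketch(verbosity=0)
    chatty = InternalAgentLoggerSketch(verbosity=2)
    quiet.info("Parsing launch spec")   # goes only to the debug log
    chatty.info("Parsing launch spec")  # also echoed to the terminal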
wandb/sdk/launch/agent/config.py
CHANGED
@@ -225,6 +225,14 @@ class AgentConfig(BaseModel):
         None,
         description="The builder to use.",
     )
+    verbosity: Optional[int] = Field(
+        0,
+        description="How verbose to print, 0 = default, 1 = verbose, 2 = very verbose",
+    )
+    stopped_run_timeout: Optional[int] = Field(
+        60,
+        description="How many seconds to wait after receiving the stop command before forcibly cancelling a run.",
+    )
 
     class Config:
         extra = "forbid"
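For reference, a hedged example of an agent config exercising the two new AgentConfig fields. Only "verbosity" and "stopped_run_timeout" come from the diff above; the surrounding keys and values are assumptions for illustration.

# Illustrative launch agent config dict; pass the equivalent YAML/JSON to the agent.
agent_config = {
    "entity": "my-team",         # assumed key/value
    "max_jobs": 4,               # assumed key/value
    "queues": ["default"],       # assumed key/value
    "verbosity": 2,              # new: 0 = default, 1 = verbose, 2 = very verbose
    "stopped_run_timeout": 120,  # new: seconds to wait after a stop command before cancelling
}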
wandb/sdk/launch/builder/kaniko_builder.py
CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+import copy
 import json
 import logging
 import os
@@ -8,7 +9,7 @@ import tarfile
 import tempfile
 import time
 import traceback
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import wandb
 from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
@@ -105,6 +106,7 @@ class KanikoBuilder(AbstractBuilder):
         secret_name: str = "",
         secret_key: str = "",
         image: str = "gcr.io/kaniko-project/executor:v1.11.0",
+        config: Optional[dict] = None,
     ):
         """Initialize a KanikoBuilder.
 
@@ -125,6 +127,7 @@ class KanikoBuilder(AbstractBuilder):
         self.secret_name = secret_name
         self.secret_key = secret_key
         self.image = image
+        self.kaniko_config = config or {}
 
     @classmethod
     def from_config(
@@ -170,6 +173,7 @@ class KanikoBuilder(AbstractBuilder):
         image_uri = config.get("destination")
         if image_uri is not None:
             registry = registry_from_uri(image_uri)
+        kaniko_config = config.get("kaniko-config", {})
 
         return cls(
             environment,
@@ -179,6 +183,7 @@ class KanikoBuilder(AbstractBuilder):
             secret_name=secret_name,
             secret_key=secret_key,
             image=kaniko_image,
+            config=kaniko_config,
         )
 
     async def verify(self) -> None:
@@ -289,7 +294,7 @@ class KanikoBuilder(AbstractBuilder):
 
         build_context = await self._upload_build_context(run_id, context_path)
         build_job = await self._create_kaniko_job(
-            build_job_name, repo_uri, image_uri, build_context, core_v1
+            build_job_name, repo_uri, image_uri, build_context, core_v1, api_client
         )
         wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
 
@@ -324,7 +329,9 @@ class KanikoBuilder(AbstractBuilder):
             ):
                 if job_tracker:
                     job_tracker.set_err_stage("build")
-                raise Exception(
+                raise Exception(
+                    f"Failed to build image in kaniko for job {run_id}. View logs with `kubectl logs -n {NAMESPACE} {build_job_name}`."
+                )
             try:
                 pods_from_job = await core_v1.list_namespaced_pod(
                     namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
@@ -371,23 +378,32 @@ class KanikoBuilder(AbstractBuilder):
         image_tag: str,
         build_context_path: str,
         core_client: client.CoreV1Api,
-
-
-
-
+        api_client,
+    ) -> Dict[str, Any]:
+        job = copy.deepcopy(self.kaniko_config)
+        job_metadata = job.get("metadata", {})
+        job_labels = job_metadata.get("labels", {})
+        job_spec = job.get("spec", {})
+        pod_template = job_spec.get("template", {})
+        pod_metadata = pod_template.get("metadata", {})
+        pod_labels = pod_metadata.get("labels", {})
+        pod_spec = pod_template.get("spec", {})
+        volumes = pod_spec.get("volumes", [])
+        containers = pod_spec.get("containers") or [{}]
+        if len(containers) > 1:
+            raise LaunchError(
+                "Multiple container configs not supported for kaniko builder."
+            )
+        container = containers[0]
+        volume_mounts = container.get("volumeMounts", [])
+        env = container.get("env", [])
+        custom_args = container.get("args", [])
 
         if PVC_MOUNT_PATH:
             volumes.append(
-
-                    name="kaniko-pvc",
-                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                        claim_name=PVC_NAME
-                    ),
-                )
-            )
-            volume_mounts.append(
-                client.V1VolumeMount(name="kaniko-pvc", mount_path="/context")
+                {"name": "kaniko-pvc", "persistentVolumeClaim": {"claimName": PVC_NAME}}
             )
+            volume_mounts.append({"name": "kaniko-pvc", "mountPath": "/context"})
 
         if bool(self.secret_name) != bool(self.secret_key):
             raise LaunchError(
@@ -395,13 +411,13 @@ class KanikoBuilder(AbstractBuilder):
                 "for kaniko build. You provided only one of them."
             )
         if isinstance(self.registry, ElasticContainerRegistry):
-            env
-
-                    name
-                    value
-
-
-        # TODO: Refactor all of this environment/registry
+            env.append(
+                {
+                    "name": "AWS_REGION",
+                    "value": self.registry.region,
+                }
+            )
+        # TODO(ben): Refactor all of this environment/registry
         # specific stuff into methods of those classes.
         if isinstance(self.environment, AzureEnvironment):
             # Use the core api to check if the secret exists
@@ -416,52 +432,46 @@ class KanikoBuilder(AbstractBuilder):
                     "namespace wandb. Please create it with the key password "
                     "set to your azure storage access key."
                 ) from e
-            env
-
-                    name
-
-
-                            name
-                            key
-
-
-
-
+            env.append(
+                {
+                    "name": "AZURE_STORAGE_ACCESS_KEY",
+                    "valueFrom": {
+                        "secretKeyRef": {
+                            "name": "azure-storage-access-key",
+                            "key": "password",
+                        }
+                    },
+                }
+            )
         if DOCKER_CONFIG_SECRET:
             volumes.append(
-
-                    name
-                    secret
-
-                        items
-
-                                key
-
+                {
+                    "name": "kaniko-docker-config",
+                    "secret": {
+                        "secretName": DOCKER_CONFIG_SECRET,
+                        "items": [
+                            {
+                                "key": ".dockerconfigjson",
+                                "path": "config.json",
+                            }
                         ],
-
-
+                    },
+                }
             )
             volume_mounts.append(
-
-                    name="kaniko-docker-config",
-                    mount_path="/kaniko/.docker",
-                )
+                {"name": "kaniko-docker-config", "mountPath": "/kaniko/.docker"}
             )
         elif self.secret_name and self.secret_key:
-            volumes
-
-                    name
-
-
-
-
-
-
-
-                    name="docker-config", mount_path="/kaniko/.docker/"
-                ),
-            ]
-        # TODO: I don't like conditioning on the registry type here. As a
+            volumes.append(
+                {
+                    "name": "docker-config",
+                    "configMap": {"name": f"docker-config-{job_name}"},
+                }
+            )
+            volume_mounts.append(
+                {"name": "docker-config", "mountPath": "/kaniko/.docker"}
+            )
+        # TODO(ben): I don't like conditioning on the registry type here. As a
         # future change I want the registry and environment classes to provide
         # a list of environment variables and volume mounts that need to be
         # added to the job. The environment class provides credentials for
@@ -475,90 +485,95 @@ class KanikoBuilder(AbstractBuilder):
         elif isinstance(self.registry, GoogleArtifactRegistry):
             mount_path = "/kaniko/.config/gcloud"
             key = "config.json"
-            env
-
-                    name
-                    value
-
-
+            env.append(
+                {
+                    "name": "GOOGLE_APPLICATION_CREDENTIALS",
+                    "value": "/kaniko/.config/gcloud/config.json",
+                }
+            )
         else:
             raise LaunchError(
                 f"Registry type {type(self.registry)} not supported by kaniko"
             )
-
-
-            name
-
-
-
-
-
-
-
-
-
-
-
-
-
+        volumes.append(
+            {
+                "name": self.secret_name,
+                "secret": {
+                    "secretName": self.secret_name,
+                    "items": [{"key": self.secret_key, "path": key}],
+                },
+            }
+        )
+        volume_mounts.append(
+            {
+                "name": self.secret_name,
+                "mountPath": mount_path,
+                "readOnly": True,
+            }
+        )
         if isinstance(self.registry, AzureContainerRegistry):
-            #
-
-
-            name
-
-
-
-
-
-
-                name=f"docker-config-{job_name}",
-            ),
-            ),
-            ]
+            # Add the docker config map
+            volumes.append(
+                {
+                    "name": "docker-config",
+                    "configMap": {"name": f"docker-config-{job_name}"},
+                }
+            )
+            volume_mounts.append(
+                {"name": "docker-config", "mountPath": "/kaniko/.docker/"}
+            )
         # Kaniko doesn't want https:// at the begining of the image tag.
         destination = image_tag
         if destination.startswith("https://"):
             destination = destination.replace("https://", "")
-        args =
-
-
-
-            "--cache
-
-            "--
-            "--compressed-caching
+        args = {
+            "--context": build_context_path,
+            "--dockerfile": _WANDB_DOCKERFILE_NAME,
+            "--destination": destination,
+            "--cache": "true",
+            "--cache-repo": repository.replace("https://", ""),
+            "--snapshot-mode": "redo",
+            "--compressed-caching": "false",
+        }
+        for custom_arg in custom_args:
+            arg_name, arg_value = custom_arg.split("=", 1)
+            args[arg_name] = arg_value
+        parsed_args = [
+            f"{arg_name}={arg_value}" for arg_name, arg_value in args.items()
         ]
-        container =
-
-
-
-            volume_mounts=volume_mounts,
-            env=env if env else None,
-        )
-        # Create and configure a spec section
-        labels = {"wandb": "launch"}
+        container["args"] = parsed_args
+
+        # Apply the rest of our defaults
+        pod_labels["wandb"] = "launch"
         # This annotation is required to enable azure workload identity.
         if isinstance(self.registry, AzureContainerRegistry):
-
-
-
-
-            restart_policy="Never",
-            active_deadline_seconds=_DEFAULT_BUILD_TIMEOUT_SECS,
-            containers=[container],
-            volumes=volumes,
-            service_account_name=SERVICE_ACCOUNT_NAME,
-            ),
+            pod_labels["azure.workload.identity/use"] = "true"
+        pod_spec["restartPolicy"] = pod_spec.get("restartPolicy", "Never")
+        pod_spec["activeDeadlineSeconds"] = pod_spec.get(
+            "activeDeadlineSeconds", _DEFAULT_BUILD_TIMEOUT_SECS
         )
-
-
-        job = client.V1Job(
-            api_version="batch/v1",
-            kind="Job",
-            metadata=client.V1ObjectMeta(
-                name=job_name, namespace=NAMESPACE, labels={"wandb": "launch"}
-            ),
-            spec=spec,
+        pod_spec["serviceAccountName"] = pod_spec.get(
+            "serviceAccountName", SERVICE_ACCOUNT_NAME
         )
+        job_spec["backoffLimit"] = job_spec.get("backoffLimit", 0)
+        job_labels["wandb"] = "launch"
+        job_metadata["namespace"] = job_metadata.get("namespace", NAMESPACE)
+        job_metadata["name"] = job_metadata.get("name", job_name)
+        job["apiVersion"] = "batch/v1"
+        job["kind"] = "Job"
+
+        # Apply all nested configs from the bottom up
+        pod_metadata["labels"] = pod_labels
+        pod_template["metadata"] = pod_metadata
+        container["name"] = container.get("name", "wandb-container-build")
+        container["image"] = container.get("image", self.image)
+        container["volumeMounts"] = volume_mounts
+        container["env"] = env
+        pod_spec["containers"] = [container]
+        pod_spec["volumes"] = volumes
+        pod_template["spec"] = pod_spec
+        job_spec["template"] = pod_template
+        job_metadata["labels"] = job_labels
+        job["metadata"] = job_metadata
+        job["spec"] = job_spec
         return job
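Per the hunks above, KanikoBuilder.from_config now reads a "kaniko-config" key, deep-copies it as the base Kubernetes Job manifest, fills in wandb's defaults only for fields the user left unset, and appends any user-supplied container args to the generated kaniko args. A hedged example of a builder config exercising that passthrough; only the "kaniko-config" key and the merge behavior come from the diff, the destination value and resource requests are assumptions.

# Illustrative builder config dict for a launch agent using the kaniko builder.
builder_config = {
    "type": "kaniko",                                      # assumed key/value
    "destination": "registry.example.com/launch-images",   # assumed value
    "kaniko-config": {
        "spec": {
            "template": {
                "spec": {
                    "containers": [
                        {
                            # Extra args are merged into the generated kaniko args
                            # (each entry is split on the first "=").
                            "args": ["--cache-ttl=24h"],
                            # Fields wandb does not set, like resources, survive the merge.
                            "resources": {"requests": {"memory": "2Gi"}},
                        }
                    ]
                }
            }
        }
    },
}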