PyPI - wandb - Versions diffs - 0.17.0rc2__py3-none-win_amd64.whl → 0.17.1__py3-none-win_amd64.whl - Mend

wandb 0.17.0rc2__py3-none-win_amd64.whl → 0.17.1__py3-none-win_amd64.whl

Files changed (159) hide show

wandb/__init__.py +1 -2
wandb/apis/importers/internals/internal.py +0 -1
wandb/apis/importers/wandb.py +12 -7
wandb/apis/internal.py +0 -3
wandb/apis/public/api.py +213 -79
wandb/apis/public/artifacts.py +335 -100
wandb/apis/public/files.py +9 -9
wandb/apis/public/jobs.py +16 -4
wandb/apis/public/projects.py +26 -28
wandb/apis/public/query_generator.py +1 -1
wandb/apis/public/runs.py +163 -65
wandb/apis/public/sweeps.py +2 -2
wandb/apis/reports/__init__.py +1 -7
wandb/apis/reports/v1/__init__.py +5 -27
wandb/apis/reports/v2/__init__.py +7 -19
wandb/apis/workspaces/__init__.py +8 -0
wandb/beta/workflows.py +8 -3
wandb/bin/wandb-core +0 -0
wandb/cli/cli.py +131 -59
wandb/docker/__init__.py +1 -1
wandb/errors/term.py +10 -2
wandb/filesync/step_checksum.py +1 -4
wandb/filesync/step_prepare.py +4 -24
wandb/filesync/step_upload.py +5 -107
wandb/filesync/upload_job.py +0 -76
wandb/integration/gym/__init__.py +35 -15
wandb/integration/openai/fine_tuning.py +21 -3
wandb/integration/prodigy/prodigy.py +1 -1
wandb/jupyter.py +16 -17
wandb/plot/pr_curve.py +2 -1
wandb/plot/roc_curve.py +2 -1
wandb/{plots → plot}/utils.py +13 -25
wandb/proto/v3/wandb_internal_pb2.py +54 -54
wandb/proto/v3/wandb_settings_pb2.py +2 -2
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_internal_pb2.py +54 -54
wandb/proto/v4/wandb_settings_pb2.py +2 -2
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/proto/v5/wandb_base_pb2.py +30 -0
wandb/proto/v5/wandb_internal_pb2.py +355 -0
wandb/proto/v5/wandb_server_pb2.py +63 -0
wandb/proto/v5/wandb_settings_pb2.py +45 -0
wandb/proto/v5/wandb_telemetry_pb2.py +41 -0
wandb/proto/wandb_base_pb2.py +2 -0
wandb/proto/wandb_deprecated.py +9 -1
wandb/proto/wandb_generate_deprecated.py +34 -0
wandb/proto/{wandb_internal_codegen.py → wandb_generate_proto.py} +1 -35
wandb/proto/wandb_internal_pb2.py +2 -0
wandb/proto/wandb_server_pb2.py +2 -0
wandb/proto/wandb_settings_pb2.py +2 -0
wandb/proto/wandb_telemetry_pb2.py +2 -0
wandb/sdk/artifacts/artifact.py +68 -22
wandb/sdk/artifacts/artifact_manifest.py +1 -1
wandb/sdk/artifacts/artifact_manifest_entry.py +6 -3
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -1
wandb/sdk/artifacts/artifact_saver.py +1 -10
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +6 -2
wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -1
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +6 -4
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +2 -42
wandb/sdk/artifacts/storage_policy.py +1 -12
wandb/sdk/data_types/image.py +1 -1
wandb/sdk/data_types/video.py +4 -2
wandb/sdk/interface/interface.py +13 -0
wandb/sdk/interface/interface_shared.py +1 -1
wandb/sdk/internal/file_pusher.py +2 -5
wandb/sdk/internal/file_stream.py +6 -19
wandb/sdk/internal/internal_api.py +148 -136
wandb/sdk/internal/job_builder.py +207 -135
wandb/sdk/internal/progress.py +0 -28
wandb/sdk/internal/sender.py +102 -39
wandb/sdk/internal/settings_static.py +8 -1
wandb/sdk/internal/system/assets/trainium.py +3 -3
wandb/sdk/internal/system/system_info.py +4 -2
wandb/sdk/internal/update.py +1 -1
wandb/sdk/launch/__init__.py +9 -1
wandb/sdk/launch/_launch.py +4 -24
wandb/sdk/launch/_launch_add.py +1 -3
wandb/sdk/launch/_project_spec.py +184 -224
wandb/sdk/launch/agent/agent.py +58 -18
wandb/sdk/launch/agent/config.py +0 -3
wandb/sdk/launch/builder/abstract.py +67 -0
wandb/sdk/launch/builder/build.py +165 -576
wandb/sdk/launch/builder/context_manager.py +235 -0
wandb/sdk/launch/builder/docker_builder.py +7 -23
wandb/sdk/launch/builder/kaniko_builder.py +10 -23
wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
wandb/sdk/launch/create_job.py +51 -45
wandb/sdk/launch/environment/aws_environment.py +26 -1
wandb/sdk/launch/inputs/files.py +148 -0
wandb/sdk/launch/inputs/internal.py +224 -0
wandb/sdk/launch/inputs/manage.py +95 -0
wandb/sdk/launch/runner/abstract.py +2 -2
wandb/sdk/launch/runner/kubernetes_monitor.py +45 -12
wandb/sdk/launch/runner/kubernetes_runner.py +6 -8
wandb/sdk/launch/runner/local_container.py +2 -3
wandb/sdk/launch/runner/local_process.py +8 -29
wandb/sdk/launch/runner/sagemaker_runner.py +20 -14
wandb/sdk/launch/runner/vertex_runner.py +8 -7
wandb/sdk/launch/sweeps/scheduler.py +2 -0
wandb/sdk/launch/sweeps/utils.py +2 -2
wandb/sdk/launch/utils.py +16 -138
wandb/sdk/lib/_settings_toposort_generated.py +2 -5
wandb/sdk/lib/apikey.py +4 -2
wandb/sdk/lib/config_util.py +3 -3
wandb/sdk/lib/proto_util.py +22 -1
wandb/sdk/lib/redirect.py +1 -1
wandb/sdk/service/service.py +2 -1
wandb/sdk/service/streams.py +5 -5
wandb/sdk/wandb_init.py +25 -59
wandb/sdk/wandb_login.py +28 -25
wandb/sdk/wandb_run.py +112 -45
wandb/sdk/wandb_settings.py +33 -64
wandb/sdk/wandb_watch.py +1 -1
wandb/sklearn/plot/classifier.py +4 -6
wandb/sync/sync.py +2 -2
wandb/testing/relay.py +32 -17
wandb/util.py +36 -37
wandb/wandb_agent.py +3 -3
wandb/wandb_controller.py +3 -2
{wandb-0.17.0rc2.dist-info → wandb-0.17.1.dist-info}/METADATA +7 -9
{wandb-0.17.0rc2.dist-info → wandb-0.17.1.dist-info}/RECORD +125 -147
wandb/apis/reports/v1/_blocks.py +0 -1406
wandb/apis/reports/v1/_helpers.py +0 -70
wandb/apis/reports/v1/_panels.py +0 -1282
wandb/apis/reports/v1/_templates.py +0 -478
wandb/apis/reports/v1/blocks.py +0 -27
wandb/apis/reports/v1/helpers.py +0 -2
wandb/apis/reports/v1/mutations.py +0 -66
wandb/apis/reports/v1/panels.py +0 -17
wandb/apis/reports/v1/report.py +0 -268
wandb/apis/reports/v1/runset.py +0 -144
wandb/apis/reports/v1/templates.py +0 -7
wandb/apis/reports/v1/util.py +0 -406
wandb/apis/reports/v1/validators.py +0 -131
wandb/apis/reports/v2/blocks.py +0 -25
wandb/apis/reports/v2/expr_parsing.py +0 -257
wandb/apis/reports/v2/gql.py +0 -68
wandb/apis/reports/v2/interface.py +0 -1911
wandb/apis/reports/v2/internal.py +0 -867
wandb/apis/reports/v2/metrics.py +0 -6
wandb/apis/reports/v2/panels.py +0 -15
wandb/catboost/__init__.py +0 -9
wandb/fastai/__init__.py +0 -9
wandb/keras/__init__.py +0 -19
wandb/lightgbm/__init__.py +0 -9
wandb/plots/__init__.py +0 -6
wandb/plots/explain_text.py +0 -36
wandb/plots/heatmap.py +0 -81
wandb/plots/named_entity.py +0 -43
wandb/plots/part_of_speech.py +0 -50
wandb/plots/plot_definitions.py +0 -768
wandb/plots/precision_recall.py +0 -121
wandb/plots/roc.py +0 -103
wandb/sacred/__init__.py +0 -3
wandb/xgboost/__init__.py +0 -9
{wandb-0.17.0rc2.dist-info → wandb-0.17.1.dist-info}/WHEEL +0 -0
{wandb-0.17.0rc2.dist-info → wandb-0.17.1.dist-info}/entry_points.txt +0 -0
{wandb-0.17.0rc2.dist-info → wandb-0.17.1.dist-info}/licenses/LICENSE +0 -0

wandb/sdk/launch/environment/aws_environment.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Dict, Optional
 from wandb.sdk.launch.errors import LaunchError
 from wandb.util import get_module
-from ..utils import S3_URI_RE, event_loop_thread_exec
+from ..utils import ARN_PARTITION_RE, S3_URI_RE, event_loop_thread_exec
 from .abstract import AbstractEnvironment
 boto3 = get_module(
@@ -49,6 +49,7 @@ class AwsEnvironment(AbstractEnvironment):
         self._secret_key = secret_key
         self._session_token = session_token
         self._account = None
+        self._partition = None
     @classmethod
     def from_default(cls, region: Optional[str] = None) -> "AwsEnvironment":
@@ -122,6 +123,30 @@ class AwsEnvironment(AbstractEnvironment):
     def region(self, region: str) -> None:
         self._region = region
+    async def get_partition(self) -> str:
+        """Look up the AWS partition from the caller identity's ARN.
+        Calls STS `get_caller_identity` through this environment's session and
+        matches the returned ARN against `ARN_PARTITION_RE`. The partition is
+        returned to the caller; this method does not store it on the instance.
+        Returns:
+            str: The first capture group of `ARN_PARTITION_RE` for the ARN.
+        Raises:
+            LaunchError: If the identity has no ARN, the ARN does not match
+                `ARN_PARTITION_RE`, or botocore raises a `ClientError`.
+        """
+        try:
+            aws_session = await self.get_session()
+            sts = await event_loop_thread_exec(aws_session.client)("sts")
+            caller = await event_loop_thread_exec(sts.get_caller_identity)()
+            arn = caller.get("Arn")
+            if not arn:
+                raise LaunchError(
+                    "Could not set partition for AWS environment. ARN not found."
+                )
+            arn_match = ARN_PARTITION_RE.match(arn)
+            if arn_match is None:
+                raise LaunchError(
+                    f"Could not set partition for AWS environment. ARN {arn} is not valid."
+                )
+            return arn_match.group(1)
+        except botocore.exceptions.ClientError as e:
+            raise LaunchError(
+                f"Could not set partition for AWS environment. {e}"
+            ) from e
     async def verify(self) -> None:
         """Verify that the AWS environment is configured correctly.

wandb/sdk/launch/inputs/files.py ADDED Viewed

@@ -0,0 +1,148 @@
+import json
+import os
+from typing import Any, Dict
+import yaml
+from ..errors import LaunchError
+FILE_OVERRIDE_ENV_VAR = "WANDB_LAUNCH_FILE_OVERRIDES"
+class FileOverrides:
+    """Singleton that reads file overrides JSON from environment variables.
+    The JSON text is taken from the `FILE_OVERRIDE_ENV_VAR` variable when it is
+    set. Otherwise the values of `<FILE_OVERRIDE_ENV_VAR>_0`, `_1`, ... are
+    concatenated in index order, stopping at the first missing index.
+    """
+    _instance = None
+    def __new__(cls):
+        if cls._instance is None:
+            # Publish the singleton only after a successful load, so a failed
+            # load is not silently replaced by empty overrides on later calls.
+            instance = object.__new__(cls)
+            instance.overrides = {}
+            instance.load()
+            cls._instance = instance
+        return cls._instance
+    def load(self) -> None:
+        """Load overrides from the environment into `self.overrides`.
+        Raises:
+            LaunchError: If the text is not valid JSON or does not decode to a
+                dict.
+        """
+        overrides = os.environ.get(FILE_OVERRIDE_ENV_VAR)
+        if overrides is None:
+            # Reassemble a value that was split across numbered variables.
+            chunks = []
+            idx = 0
+            while f"{FILE_OVERRIDE_ENV_VAR}_{idx}" in os.environ:
+                chunks.append(os.environ[f"{FILE_OVERRIDE_ENV_VAR}_{idx}"])
+                idx += 1
+            overrides = "".join(chunks)
+        if not overrides:
+            return
+        try:
+            contents = json.loads(overrides)
+        except json.JSONDecodeError as e:
+            raise LaunchError(f"Invalid JSON in {FILE_OVERRIDE_ENV_VAR}") from e
+        if not isinstance(contents, dict):
+            raise LaunchError(f"Invalid JSON in {FILE_OVERRIDE_ENV_VAR}")
+        self.overrides = contents
+def config_path_is_valid(path: str) -> None:
+    """Validate a config file path.
+    A valid path meets all of the following criteria:
+    - It is a relative path.
+    - None of its components is `..`, so paths with upward traversal such as
+      `../config.json` are rejected.
+    - The file it names exists.
+    - It has a supported extension (`.json`, `.yaml`, or `.yml`).
+    Args:
+        path (str): The path to validate.
+    Raises:
+        LaunchError: If the path is not valid.
+    """
+    if os.path.isabs(path):
+        raise LaunchError(
+            f"Invalid config path: {path}. Please provide a relative path."
+        )
+    # Compare whole components rather than substrings so that file names that
+    # merely contain two dots (e.g. `my..config.json`) are accepted. Both `/`
+    # and `\` are treated as separators to keep the check strict everywhere.
+    if ".." in path.replace("\\", "/").split("/"):
+        raise LaunchError(
+            f"Invalid config path: {path}. Please provide a relative path "
+            "without any upward path traversal, e.g. `../config.json`."
+        )
+    path = os.path.normpath(path)
+    if not os.path.exists(path):
+        raise LaunchError(f"Invalid config path: {path}. File does not exist.")
+    if not path.endswith((".json", ".yaml", ".yml")):
+        raise LaunchError(
+            f"Invalid config path: {path}. Only JSON and YAML files are supported."
+        )
+def override_file(path: str) -> None:
+    """Apply the override registered for `path`, if any, to the file on disk.
+    Looks `path` up in the `FileOverrides` singleton. When a non-None entry is
+    found, the config file is read, updated in place with that entry via
+    `_update_dict`, and written back to the same path.
+    """
+    pending = FileOverrides().overrides.get(path)
+    if pending is None:
+        return
+    merged = _read_config_file(path)
+    _update_dict(merged, pending)
+    _write_config_file(path, merged)
+def _write_config_file(path: str, config: Any) -> None:
+    """Serialize `config` to `path` in the format implied by the extension.
+    `.json` files are written with `json.dump` (2-space indent); `.yaml` and
+    `.yml` files are written with `yaml.safe_dump`.
+    Args:
+        path (str): The path to the config file.
+        config (Any): The contents of the config file as a Python object.
+    Raises:
+        LaunchError: If the file extension is not supported.
+    """
+    ext = os.path.splitext(path)[1]
+    if ext not in (".json", ".yaml", ".yml"):
+        # Reject before opening so an unsupported path is never truncated.
+        raise LaunchError(f"Unsupported file extension: {ext}")
+    with open(path, "w") as f:
+        if ext == ".json":
+            json.dump(config, f, indent=2)
+        else:
+            yaml.safe_dump(config, f)
+def _read_config_file(path: str) -> Any:
+    """Parse the config file at `path` according to its extension.
+    `.json` files are parsed with `json.load`; `.yaml` and `.yml` files are
+    parsed with `yaml.safe_load`.
+    Args:
+        path (str): The path to the config file.
+    Returns:
+        Any: The contents of the config file as a Python object.
+    Raises:
+        LaunchError: If the file extension is not supported.
+    """
+    ext = os.path.splitext(path)[1]
+    if ext == ".json":
+        parse = json.load
+    elif ext in (".yaml", ".yml"):
+        parse = yaml.safe_load
+    else:
+        raise LaunchError(f"Unsupported file extension: {ext}")
+    with open(path) as f:
+        return parse(f)
+def _update_dict(target: Dict, source: Dict) -> None:
+    """Recursively merge `source` into `target` in place.
+    Dict values in `source` are merged key by key; every other value replaces
+    the corresponding entry in `target`.
+    Args:
+        target (Dict): The dictionary to update.
+        source (Dict): The dictionary to update from.
+    """
+    for key, value in source.items():
+        if isinstance(value, dict):
+            # A nested update needs a dict to merge into. Start from an empty
+            # one when the key is missing or currently holds a non-dict value
+            # (recursing into e.g. an int would raise TypeError).
+            if not isinstance(target.get(key), dict):
+                target[key] = {}
+            _update_dict(target[key], value)
+        else:
+            target[key] = value

wandb/sdk/launch/inputs/internal.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""The layer between launch sdk user code and the wandb internal process.
+If there is an active run this communication is done through the wandb run's
+backend interface.
+If there is no active run, the messages are staged on the StagedLaunchInputs
+singleton and sent when a run is created.
+"""
+import os
+import pathlib
+import shutil
+import tempfile
+from typing import List, Optional
+import wandb
+import wandb.data_types
+from wandb.sdk.launch.errors import LaunchError
+from wandb.sdk.wandb_run import Run
+from .files import config_path_is_valid, override_file
+PERIOD = "."
+BACKSLASH = "\\"
+LAUNCH_MANAGED_CONFIGS_DIR = "_wandb_configs"
+class ConfigTmpDir:
+    """Singleton owning the temporary directory for launch config files.
+    Configuration files designated as inputs to a launch job are copied into
+    a `LAUNCH_MANAGED_CONFIGS_DIR` subdirectory of a directory created with
+    `tempfile.mkdtemp`. This class creates those directories once and exposes
+    their paths.
+    """
+    _instance = None
+    def __new__(cls):
+        existing = cls._instance
+        if existing is not None:
+            return existing
+        cls._instance = object.__new__(cls)
+        return cls._instance
+    def __init__(self):
+        # __init__ runs on every ConfigTmpDir() call, so only create the
+        # directories the first time through.
+        if hasattr(self, "_tmp_dir"):
+            return
+        self._tmp_dir = tempfile.mkdtemp()
+        self._configs_dir = os.path.join(self._tmp_dir, LAUNCH_MANAGED_CONFIGS_DIR)
+        os.mkdir(self._configs_dir)
+    @property
+    def tmp_dir(self):
+        """The temporary root directory as a `pathlib.Path`."""
+        return pathlib.Path(self._tmp_dir)
+    @property
+    def configs_dir(self):
+        """The managed configs subdirectory as a `pathlib.Path`."""
+        return pathlib.Path(self._configs_dir)
+class JobInputArguments:
+    """Arguments for the publish_job_input of Interface.
+    Attributes:
+        include: Paths to include, or None.
+        exclude: Paths to exclude, or None.
+        file_path: Path of the config file this input refers to, or None.
+        run_config: Flag stored as given, or None.
+    """
+    def __init__(
+        self,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+        file_path: Optional[str] = None,
+        run_config: Optional[bool] = None,
+    ):
+        self.include, self.exclude = include, exclude
+        self.file_path, self.run_config = file_path, run_config
+class StagedLaunchInputs:
+    """Singleton that holds job inputs until they are applied to a run."""
+    _instance = None
+    def __new__(cls):
+        existing = cls._instance
+        if existing is not None:
+            return existing
+        cls._instance = object.__new__(cls)
+        return cls._instance
+    def __init__(self) -> None:
+        # __init__ runs on every StagedLaunchInputs() call; keep the list that
+        # already exists instead of resetting it.
+        if hasattr(self, "_staged_inputs"):
+            return
+        self._staged_inputs: List[JobInputArguments] = []
+    def add_staged_input(
+        self,
+        input_arguments: JobInputArguments,
+    ):
+        """Append a job input to the staged list."""
+        self._staged_inputs.append(input_arguments)
+    def apply(self, run: Run):
+        """Apply the staged inputs to the given run."""
+        for staged in self._staged_inputs:
+            _publish_job_input(staged, run)
+def _publish_job_input(
+    input: JobInputArguments,
+    run: Run,
+) -> None:
+    """Publish a job input to the backend interface of the given run.
+    When the input names a config file, the copy under the `ConfigTmpDir`
+    configs directory is saved to the run first. The include and exclude
+    paths are split on unescaped dots before being published.
+    Arguments:
+        input (JobInputArguments): The arguments for the job input.
+        run (Run): The run to publish the job input to.
+    """
+    assert run._backend is not None
+    assert run._backend.interface is not None
+    assert input.run_config is not None
+    backend_interface = run._backend.interface
+    if input.file_path:
+        tmp = ConfigTmpDir()
+        staged_copy = os.path.join(tmp.configs_dir, input.file_path)
+        run.save(staged_copy, base_path=tmp.tmp_dir)
+    # None and empty lists both publish as an empty list of paths.
+    include_paths = [_split_on_unesc_dot(p) for p in input.include or []]
+    exclude_paths = [_split_on_unesc_dot(p) for p in input.exclude or []]
+    backend_interface.publish_job_input(
+        include_paths=include_paths,
+        exclude_paths=exclude_paths,
+        run_config=input.run_config,
+        file_path=input.file_path or "",
+    )
+def handle_config_file_input(
+    path: str,
+    include: Optional[List[str]] = None,
+    exclude: Optional[List[str]] = None,
+):
+    """Declare an overridable configuration file for a launch job.
+    The path is validated, any override found in the environment is applied
+    to the file on disk, and the file is copied into the `ConfigTmpDir`
+    configs directory under the same relative path.
+    If there is an active run, the input is published to its backend
+    interface. Otherwise it is staged on `StagedLaunchInputs`.
+    Arguments:
+        path (str): Relative path to the configuration file.
+        include (List[str]): Dot separated paths to include, or None.
+        exclude (List[str]): Dot separated paths to exclude, or None.
+    Raises:
+        LaunchError: If `config_path_is_valid` rejects the path.
+    """
+    config_path_is_valid(path)
+    override_file(path)
+    tmp_dir = ConfigTmpDir()
+    dest = os.path.join(tmp_dir.configs_dir, path)
+    # exist_ok avoids the race between checking for the directory and
+    # creating it.
+    os.makedirs(os.path.dirname(dest), exist_ok=True)
+    shutil.copy(
+        path,
+        dest,
+    )
+    arguments = JobInputArguments(
+        include=include,
+        exclude=exclude,
+        file_path=path,
+        run_config=False,
+    )
+    if wandb.run is not None:
+        _publish_job_input(arguments, wandb.run)
+    else:
+        staged_inputs = StagedLaunchInputs()
+        staged_inputs.add_staged_input(arguments)
+def handle_run_config_input(
+    include: Optional[List[str]] = None, exclude: Optional[List[str]] = None
+):
+    """Declare wandb.config as an overridable configuration for a launch job.
+    Builds a `JobInputArguments` with `run_config=True` and no file path. If
+    there is an active run, it is published to that run's backend interface.
+    Otherwise it is staged on `StagedLaunchInputs`.
+    """
+    job_input = JobInputArguments(
+        include=include,
+        exclude=exclude,
+        file_path=None,
+        run_config=True,
+    )
+    active_run = wandb.run
+    if active_run is None:
+        StagedLaunchInputs().add_staged_input(job_input)
+        return
+    _publish_job_input(job_input, active_run)
+def _split_on_unesc_dot(path: str) -> List[str]:
+    r"""Split a string on unescaped dots.
+    A backslash followed by a dot adds a literal dot to the current part. A
+    backslash followed by any other character is kept as a literal backslash.
+    An empty final part (e.g. after a trailing dot) is dropped.
+    Arguments:
+        path (str): The string to split.
+    Raises:
+        LaunchError: If the path has a trailing escape character.
+    Returns:
+        List[str]: The split string.
+    """
+    parts = []
+    part = ""
+    i = 0
+    while i < len(path):
+        char = path[i]
+        if char == BACKSLASH:
+            if i == len(path) - 1:
+                raise LaunchError(
+                    f"Invalid config path {path}: trailing {BACKSLASH}.",
+                )
+            if path[i + 1] == PERIOD:
+                part += PERIOD
+                i += 2
+            else:
+                # Not an escape sequence: keep the backslash and move on.
+                # Without advancing here the loop would never terminate.
+                part += char
+                i += 1
+        elif char == PERIOD:
+            parts.append(part)
+            part = ""
+            i += 1
+        else:
+            part += char
+            i += 1
+    if part:
+        parts.append(part)
+    return parts

wandb/sdk/launch/inputs/manage.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""Functions for declaring overridable configuration for launch jobs."""
+from typing import List, Optional
+def manage_config_file(
+    path: str,
+    include: Optional[List[str]] = None,
+    exclude: Optional[List[str]] = None,
+):
+    r"""Declare an overridable configuration file for a launch job.
+    If a new job version is created from the active run, the configuration file
+    will be added to the job's inputs. If the job is launched and overrides
+    have been provided for the configuration file, this function will detect
+    the overrides from the environment and update the configuration file on disk.
+    Note that these overrides will only be applied in ephemeral containers.
+    `include` and `exclude` are lists of dot separated paths within the config.
+    The paths are used to filter subtrees of the configuration file out of the
+    job's inputs.
+    For example, given the following configuration file:
+        ```yaml
+        model:
+            name: resnet
+            layers: 18
+        training:
+            epochs: 10
+            batch_size: 32
+        ```
+    Passing `include=['model']` will only include the `model` subtree in the
+    job's inputs. Passing `exclude=['model.layers']` will exclude the `layers`
+    key from the `model` subtree. Note that `exclude` takes precedence over
+    `include`.
+    `.` is used as a separator for nested keys. If a key contains a `.`, it
+    should be escaped with a backslash, e.g. `include=[r'model\.layers']`. Note
+    the use of `r` to denote a raw string when using escape chars.
+    Args:
+        path (str): The path to the configuration file. This path must be
+            relative and must not contain backwards traversal, i.e. `..`.
+        include (List[str]): A list of keys to include in the configuration file.
+        exclude (List[str]): A list of keys to exclude from the configuration file.
+    Raises:
+        LaunchError: If the path is not valid. When there is no active run the
+            input is staged rather than rejected.
+    """
+    # Imported at call time rather than at module import.
+    # NOTE(review): presumably to avoid an import cycle — confirm.
+    from .internal import handle_config_file_input
+    return handle_config_file_input(path, include, exclude)
+def manage_wandb_config(
+    include: Optional[List[str]] = None,
+    exclude: Optional[List[str]] = None,
+):
+    r"""Declare wandb.config as an overridable configuration for a launch job.
+    If a new job version is created from the active run, the run config
+    (wandb.config) will become an overridable input of the job. If the job is
+    launched and overrides have been provided for the run config, the overrides
+    will be applied to the run config when `wandb.init` is called.
+    `include` and `exclude` are lists of dot separated paths within the config.
+    The paths are used to filter subtrees of the run config out of the
+    job's inputs.
+    For example, given the following run config contents:
+        ```yaml
+        model:
+            name: resnet
+            layers: 18
+        training:
+            epochs: 10
+            batch_size: 32
+        ```
+    Passing `include=['model']` will only include the `model` subtree in the
+    job's inputs. Passing `exclude=['model.layers']` will exclude the `layers`
+    key from the `model` subtree. Note that `exclude` takes precedence over
+    `include`.
+    `.` is used as a separator for nested keys. If a key contains a `.`, it
+    should be escaped with a backslash, e.g. `include=[r'model\.layers']`. Note
+    the use of `r` to denote a raw string when using escape chars.
+    Args:
+        include (List[str]): A list of subtrees to include in the configuration.
+        exclude (List[str]): A list of subtrees to exclude from the configuration.
+    Raises:
+        LaunchError: If there is an active run and an include or exclude path
+            ends with a trailing backslash. When there is no active run the
+            input is staged rather than rejected.
+    """
+    # Imported at call time rather than at module import.
+    # NOTE(review): presumably to avoid an import cycle — confirm.
+    from .internal import handle_run_config_input
+    handle_run_config_input(include, exclude)

wandb/sdk/launch/runner/abstract.py CHANGED Viewed

@@ -40,9 +40,9 @@ State = Literal[
 class Status:
-    def __init__(self, state: "State" = "unknown", data=None):  # type: ignore
+    def __init__(self, state: "State" = "unknown", messages: List[str] = None):  # type: ignore
         self.state = state
-        self.data = data or {}
+        self.messages = messages or []
     def __repr__(self) -> "State":
         return self.state

wandb/sdk/launch/runner/kubernetes_monitor.py CHANGED Viewed

@@ -14,6 +14,7 @@ from kubernetes_asyncio.client import (  # type: ignore  # noqa: F401
     BatchV1Api,
     CoreV1Api,
     CustomObjectsApi,
+    V1Pod,
     V1PodStatus,
 )
@@ -118,6 +119,27 @@ def _is_container_creating(status: "V1PodStatus") -> bool:
     return False
+def _is_pod_unschedulable(status: "V1PodStatus") -> Tuple[bool, str]:
+    """Report whether the pod status carries an unschedulable condition.
+    Scans `status.conditions` for a `PodScheduled` condition whose status is
+    the string "False" and whose reason is "Unschedulable".
+    Returns:
+        Tuple[bool, str]: `(True, message)` for the first such condition,
+        otherwise `(False, "")`.
+    """
+    for cond in status.conditions or ():
+        if cond.type != "PodScheduled":
+            continue
+        if cond.status == "False" and cond.reason == "Unschedulable":
+            return True, cond.message
+    return False, ""
+def _get_crd_job_name(object: "V1Pod") -> Optional[str]:
+    """Return the name of the pod's first owner reference, or None if it has none."""
+    owners = object.metadata.owner_references
+    return owners[0].name if owners else None
 def _state_from_conditions(conditions: List[Dict[str, Any]]) -> Optional[State]:
     """Get the status from the pod conditions."""
     true_conditions = [
@@ -298,10 +320,18 @@ class LaunchKubernetesMonitor:
                 counts[state] += 1
         return counts
-    def _set_status(self, job_name: str, status: Status) -> None:
+    def _set_status_state(self, job_name: str, state: State) -> None:
         """Set the status of the run."""
-        if self._job_states.get(job_name) != status:
-            self._job_states[job_name] = status
+        if job_name not in self._job_states:
+            self._job_states[job_name] = Status(state)
+        elif self._job_states[job_name].state != state:
+            self._job_states[job_name].state = state
+    def _add_status_message(self, job_name: str, message: str) -> None:
+        """Warn on the terminal and record `message` on the job's status.
+        A `Status("unknown")` entry is created first when the job is not yet
+        tracked in `self._job_states`.
+        """
+        tracked = self._job_states
+        if job_name not in tracked:
+            tracked[job_name] = Status("unknown")
+        wandb.termwarn(f"Warning from Kubernetes for job {job_name}: {message}")
+        tracked[job_name].messages.append(message)
     async def _monitor_pods(self, namespace: str) -> None:
         """Monitor a namespace for changes."""
@@ -312,15 +342,19 @@ class LaunchKubernetesMonitor:
             label_selector=self._label_selector,
         ):
             obj = event.get("object")
-            job_name = obj.metadata.labels.get("job-name")
+            job_name = obj.metadata.labels.get("job-name") or _get_crd_job_name(obj)
             if job_name is None or not hasattr(obj, "status"):
                 continue
             if self.__get_status(job_name) in ["finished", "failed"]:
                 continue
+            is_unschedulable, reason = _is_pod_unschedulable(obj.status)
+            if is_unschedulable:
+                self._add_status_message(job_name, reason)
             if obj.status.phase == "Running" or _is_container_creating(obj.status):
-                self._set_status(job_name, Status("running"))
+                self._set_status_state(job_name, "running")
             elif _is_preempted(obj.status):
-                self._set_status(job_name, Status("preempted"))
+                self._set_status_state(job_name, "preempted")
     async def _monitor_jobs(self, namespace: str) -> None:
         """Monitor a namespace for changes."""
@@ -334,15 +368,15 @@ class LaunchKubernetesMonitor:
             job_name = obj.metadata.name
             if obj.status.succeeded == 1:
-                self._set_status(job_name, Status("finished"))
+                self._set_status_state(job_name, "finished")
             elif obj.status.failed is not None and obj.status.failed >= 1:
-                self._set_status(job_name, Status("failed"))
+                self._set_status_state(job_name, "failed")
             # If the job is deleted and we haven't seen a terminal state
             # then we will consider the job failed.
             if event.get("type") == "DELETED":
                 if self._job_states.get(job_name) != Status("finished"):
-                    self._set_status(job_name, Status("failed"))
+                    self._set_status_state(job_name, "failed")
     async def _monitor_crd(
         self, namespace: str, custom_resource: CustomResource
@@ -355,7 +389,7 @@ class LaunchKubernetesMonitor:
             plural=custom_resource.plural,
             group=custom_resource.group,
             version=custom_resource.version,
-            label_selector=self._label_selector,  # TODO: Label selector doesn't work for CRDs.
+            label_selector=self._label_selector,
         ):
             object = event.get("object")
             name = object.get("metadata", dict()).get("name")
@@ -383,8 +417,7 @@ class LaunchKubernetesMonitor:
                     )
             if state is None:
                 continue
-            status = Status(state)
-            self._set_status(name, status)
+            self._set_status_state(name, state)
 class SafeWatch:

wandb/sdk/launch/runner/kubernetes_runner.py CHANGED Viewed

@@ -29,7 +29,6 @@ from wandb.sdk.lib.retry import ExponentialBackoff, retry_async
 from wandb.util import get_module
 from .._project_spec import EntryPoint, LaunchProject
-from ..builder.build import get_env_vars_dict
 from ..errors import LaunchError
 from ..utils import (
     LOG_PREFIX,
@@ -374,8 +373,7 @@ class KubernetesRunner(AbstractRunner):
                 }
         entry_point = (
-            launch_project.override_entrypoint
-            or launch_project.get_single_entry_point()
+            launch_project.override_entrypoint or launch_project.get_job_entry_point()
         )
         if launch_project.docker_image:
             # dont specify run id if user provided image, could have multiple runs
@@ -401,8 +399,8 @@ class KubernetesRunner(AbstractRunner):
             launch_project.override_entrypoint is not None,
         )
-        env_vars = get_env_vars_dict(
-            launch_project, self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
+        env_vars = launch_project.get_env_vars_dict(
+            self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
         )
         api_key_secret = None
         for cont in containers:
@@ -511,8 +509,8 @@ class KubernetesRunner(AbstractRunner):
         api_version = resource_args.get("apiVersion", "batch/v1")
         if api_version not in ["batch/v1", "batch/v1beta1"]:
-            env_vars = get_env_vars_dict(
-                launch_project, self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
+            env_vars = launch_project.get_env_vars_dict(
+                self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
             )
             # Crawl the resource args and add our env vars to the containers.
             add_wandb_env(resource_args, env_vars)
@@ -537,7 +535,7 @@ class KubernetesRunner(AbstractRunner):
             if LaunchAgent.initialized():
                 add_label_to_pods(
                     resource_args,
-                    WANDB_K8S_LABEL_MONITOR,
+                    WANDB_K8S_LABEL_AGENT,
                     LaunchAgent.name(),
                 )
                 resource_args["metadata"]["labels"][WANDB_K8S_LABEL_AGENT] = (