PyPI - wandb - Versions diffs - 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl - Mend

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Files changed (228) hide show

wandb/__init__.py +2 -3
wandb/apis/__init__.py +1 -3
wandb/apis/importers/__init__.py +4 -0
wandb/apis/importers/base.py +312 -0
wandb/apis/importers/mlflow.py +113 -0
wandb/apis/internal.py +29 -2
wandb/apis/normalize.py +6 -5
wandb/apis/public.py +163 -180
wandb/apis/reports/_templates.py +6 -12
wandb/apis/reports/report.py +1 -1
wandb/apis/reports/runset.py +1 -3
wandb/apis/reports/util.py +12 -10
wandb/beta/workflows.py +57 -34
wandb/catboost/__init__.py +1 -2
wandb/cli/cli.py +215 -133
wandb/data_types.py +63 -56
wandb/docker/__init__.py +78 -16
wandb/docker/auth.py +21 -22
wandb/env.py +0 -1
wandb/errors/__init__.py +8 -116
wandb/errors/term.py +1 -1
wandb/fastai/__init__.py +1 -2
wandb/filesync/dir_watcher.py +8 -5
wandb/filesync/step_prepare.py +76 -75
wandb/filesync/step_upload.py +1 -2
wandb/integration/catboost/__init__.py +1 -3
wandb/integration/catboost/catboost.py +8 -14
wandb/integration/fastai/__init__.py +7 -13
wandb/integration/gym/__init__.py +35 -4
wandb/integration/keras/__init__.py +3 -3
wandb/integration/keras/callbacks/metrics_logger.py +9 -8
wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
wandb/integration/keras/callbacks/tables_builder.py +31 -19
wandb/integration/kfp/kfp_patch.py +20 -17
wandb/integration/kfp/wandb_logging.py +1 -2
wandb/integration/lightgbm/__init__.py +21 -19
wandb/integration/prodigy/prodigy.py +6 -7
wandb/integration/sacred/__init__.py +9 -12
wandb/integration/sagemaker/__init__.py +1 -3
wandb/integration/sagemaker/auth.py +0 -1
wandb/integration/sagemaker/config.py +1 -1
wandb/integration/sagemaker/resources.py +1 -1
wandb/integration/sb3/sb3.py +8 -4
wandb/integration/tensorboard/__init__.py +1 -3
wandb/integration/tensorboard/log.py +8 -8
wandb/integration/tensorboard/monkeypatch.py +11 -9
wandb/integration/tensorflow/__init__.py +1 -3
wandb/integration/xgboost/__init__.py +4 -6
wandb/integration/yolov8/__init__.py +7 -0
wandb/integration/yolov8/yolov8.py +250 -0
wandb/jupyter.py +31 -35
wandb/lightgbm/__init__.py +1 -2
wandb/old/settings.py +2 -2
wandb/plot/bar.py +1 -2
wandb/plot/confusion_matrix.py +1 -3
wandb/plot/histogram.py +1 -2
wandb/plot/line.py +1 -2
wandb/plot/line_series.py +4 -4
wandb/plot/pr_curve.py +17 -20
wandb/plot/roc_curve.py +1 -3
wandb/plot/scatter.py +1 -2
wandb/proto/v3/wandb_server_pb2.py +85 -39
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_server_pb2.py +51 -39
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/sdk/__init__.py +1 -3
wandb/sdk/backend/backend.py +1 -1
wandb/sdk/data_types/_dtypes.py +38 -30
wandb/sdk/data_types/base_types/json_metadata.py +1 -3
wandb/sdk/data_types/base_types/media.py +17 -17
wandb/sdk/data_types/base_types/wb_value.py +33 -26
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
wandb/sdk/data_types/helper_types/classes.py +1 -1
wandb/sdk/data_types/helper_types/image_mask.py +12 -12
wandb/sdk/data_types/histogram.py +5 -4
wandb/sdk/data_types/html.py +1 -2
wandb/sdk/data_types/image.py +11 -11
wandb/sdk/data_types/molecule.py +3 -6
wandb/sdk/data_types/object_3d.py +1 -2
wandb/sdk/data_types/plotly.py +1 -2
wandb/sdk/data_types/saved_model.py +10 -8
wandb/sdk/data_types/video.py +1 -1
wandb/sdk/integration_utils/data_logging.py +5 -5
wandb/sdk/interface/artifacts.py +288 -266
wandb/sdk/interface/interface.py +2 -3
wandb/sdk/interface/interface_grpc.py +1 -1
wandb/sdk/interface/interface_queue.py +1 -1
wandb/sdk/interface/interface_relay.py +1 -1
wandb/sdk/interface/interface_shared.py +1 -2
wandb/sdk/interface/interface_sock.py +1 -1
wandb/sdk/interface/message_future.py +1 -1
wandb/sdk/interface/message_future_poll.py +1 -1
wandb/sdk/interface/router.py +1 -1
wandb/sdk/interface/router_queue.py +1 -1
wandb/sdk/interface/router_relay.py +1 -1
wandb/sdk/interface/router_sock.py +1 -1
wandb/sdk/interface/summary_record.py +1 -1
wandb/sdk/internal/artifacts.py +1 -1
wandb/sdk/internal/datastore.py +2 -3
wandb/sdk/internal/file_pusher.py +5 -3
wandb/sdk/internal/file_stream.py +22 -19
wandb/sdk/internal/handler.py +5 -4
wandb/sdk/internal/internal.py +1 -1
wandb/sdk/internal/internal_api.py +115 -55
wandb/sdk/internal/job_builder.py +1 -3
wandb/sdk/internal/profiler.py +1 -1
wandb/sdk/internal/progress.py +4 -6
wandb/sdk/internal/sample.py +1 -3
wandb/sdk/internal/sender.py +28 -16
wandb/sdk/internal/settings_static.py +5 -5
wandb/sdk/internal/system/assets/__init__.py +1 -0
wandb/sdk/internal/system/assets/cpu.py +3 -9
wandb/sdk/internal/system/assets/disk.py +2 -4
wandb/sdk/internal/system/assets/gpu.py +6 -18
wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
wandb/sdk/internal/system/assets/interfaces.py +50 -22
wandb/sdk/internal/system/assets/ipu.py +1 -3
wandb/sdk/internal/system/assets/memory.py +7 -13
wandb/sdk/internal/system/assets/network.py +4 -8
wandb/sdk/internal/system/assets/open_metrics.py +283 -0
wandb/sdk/internal/system/assets/tpu.py +1 -4
wandb/sdk/internal/system/assets/trainium.py +26 -14
wandb/sdk/internal/system/system_info.py +2 -3
wandb/sdk/internal/system/system_monitor.py +52 -20
wandb/sdk/internal/tb_watcher.py +12 -13
wandb/sdk/launch/_project_spec.py +54 -65
wandb/sdk/launch/agent/agent.py +374 -90
wandb/sdk/launch/builder/abstract.py +61 -7
wandb/sdk/launch/builder/build.py +81 -110
wandb/sdk/launch/builder/docker_builder.py +181 -0
wandb/sdk/launch/builder/kaniko_builder.py +419 -0
wandb/sdk/launch/builder/noop.py +31 -12
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
wandb/sdk/launch/environment/abstract.py +28 -0
wandb/sdk/launch/environment/aws_environment.py +276 -0
wandb/sdk/launch/environment/gcp_environment.py +271 -0
wandb/sdk/launch/environment/local_environment.py +65 -0
wandb/sdk/launch/github_reference.py +3 -8
wandb/sdk/launch/launch.py +38 -29
wandb/sdk/launch/launch_add.py +6 -8
wandb/sdk/launch/loader.py +230 -0
wandb/sdk/launch/registry/abstract.py +54 -0
wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
wandb/sdk/launch/registry/local_registry.py +62 -0
wandb/sdk/launch/runner/abstract.py +1 -16
wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
wandb/sdk/launch/runner/local_container.py +46 -22
wandb/sdk/launch/runner/local_process.py +1 -4
wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
wandb/sdk/launch/sweeps/__init__.py +3 -2
wandb/sdk/launch/sweeps/scheduler.py +132 -39
wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
wandb/sdk/launch/utils.py +101 -30
wandb/sdk/launch/wandb_reference.py +2 -7
wandb/sdk/lib/_settings_toposort_generate.py +166 -0
wandb/sdk/lib/_settings_toposort_generated.py +201 -0
wandb/sdk/lib/apikey.py +2 -4
wandb/sdk/lib/config_util.py +4 -1
wandb/sdk/lib/console.py +1 -3
wandb/sdk/lib/deprecate.py +3 -3
wandb/sdk/lib/file_stream_utils.py +7 -5
wandb/sdk/lib/filenames.py +1 -1
wandb/sdk/lib/filesystem.py +61 -5
wandb/sdk/lib/git.py +1 -3
wandb/sdk/lib/import_hooks.py +4 -7
wandb/sdk/lib/ipython.py +8 -5
wandb/sdk/lib/lazyloader.py +1 -3
wandb/sdk/lib/mailbox.py +14 -4
wandb/sdk/lib/proto_util.py +10 -5
wandb/sdk/lib/redirect.py +15 -22
wandb/sdk/lib/reporting.py +1 -3
wandb/sdk/lib/retry.py +4 -5
wandb/sdk/lib/runid.py +1 -3
wandb/sdk/lib/server.py +15 -9
wandb/sdk/lib/sock_client.py +1 -1
wandb/sdk/lib/sparkline.py +1 -1
wandb/sdk/lib/wburls.py +1 -1
wandb/sdk/service/port_file.py +1 -2
wandb/sdk/service/service.py +36 -13
wandb/sdk/service/service_base.py +12 -1
wandb/sdk/verify/verify.py +5 -7
wandb/sdk/wandb_artifacts.py +142 -177
wandb/sdk/wandb_config.py +5 -8
wandb/sdk/wandb_helper.py +1 -1
wandb/sdk/wandb_init.py +24 -13
wandb/sdk/wandb_login.py +9 -9
wandb/sdk/wandb_manager.py +39 -4
wandb/sdk/wandb_metric.py +2 -6
wandb/sdk/wandb_require.py +4 -15
wandb/sdk/wandb_require_helpers.py +1 -9
wandb/sdk/wandb_run.py +95 -141
wandb/sdk/wandb_save.py +1 -3
wandb/sdk/wandb_settings.py +149 -54
wandb/sdk/wandb_setup.py +66 -46
wandb/sdk/wandb_summary.py +13 -10
wandb/sdk/wandb_sweep.py +6 -7
wandb/sdk/wandb_watch.py +1 -1
wandb/sklearn/calculate/confusion_matrix.py +1 -1
wandb/sklearn/calculate/learning_curve.py +1 -1
wandb/sklearn/calculate/summary_metrics.py +1 -3
wandb/sklearn/plot/__init__.py +1 -1
wandb/sklearn/plot/classifier.py +27 -18
wandb/sklearn/plot/clusterer.py +4 -5
wandb/sklearn/plot/regressor.py +4 -4
wandb/sklearn/plot/shared.py +2 -2
wandb/sync/__init__.py +1 -3
wandb/sync/sync.py +4 -5
wandb/testing/relay.py +11 -10
wandb/trigger.py +1 -1
wandb/util.py +106 -81
wandb/viz.py +4 -4
wandb/wandb_agent.py +50 -50
wandb/wandb_controller.py +2 -3
wandb/wandb_run.py +1 -2
wandb/wandb_torch.py +1 -1
wandb/xgboost/__init__.py +1 -2
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
wandb/sdk/launch/builder/docker.py +0 -80
wandb/sdk/launch/builder/kaniko.py +0 -393
wandb/sdk/launch/builder/loader.py +0 -32
wandb/sdk/launch/runner/loader.py +0 -50
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} RENAMED Viewed

@@ -1,5 +1,5 @@
 import datetime
-import os
+import logging
 import shlex
 import time
 from typing import Any, Dict, Optional
@@ -10,17 +10,20 @@ if False:
 import yaml
 import wandb
-from wandb.errors import LaunchError
+from wandb.apis.internal import Api
 from wandb.util import get_module
 from .._project_spec import LaunchProject, get_entry_point_command
 from ..builder.abstract import AbstractBuilder
-from ..builder.build import construct_gcp_registry_uri, get_env_vars_dict
-from ..utils import LOG_PREFIX, PROJECT_SYNCHRONOUS, run_shell
+from ..builder.build import get_env_vars_dict
+from ..environment.gcp_environment import GcpEnvironment
+from ..utils import LOG_PREFIX, PROJECT_SYNCHRONOUS, LaunchError, run_shell
 from .abstract import AbstractRun, AbstractRunner, Status
 GCP_CONSOLE_URI = "https://console.cloud.google.com"
+_logger = logging.getLogger(__name__)
 class VertexSubmittedRun(AbstractRun):
     def __init__(self, job: Any) -> None:
@@ -57,12 +60,14 @@ class VertexSubmittedRun(AbstractRun):
     def get_status(self) -> Status:
         job_state = str(self._job.state)  # extract from type PipelineState
-        if job_state == "PipelineState.PIPELINE_STATE_SUCCEEDED":
+        if job_state == "JobState.JOB_STATE_SUCCEEDED":
             return Status("finished")
-        if job_state == "PipelineState.PIPELINE_STATE_FAILED":
+        if job_state == "JobState.JOB_STATE_FAILED":
             return Status("failed")
-        if job_state == "PipelineState.PIPELINE_STATE_RUNNING":
+        if job_state == "JobState.JOB_STATE_RUNNING":
             return Status("running")
+        if job_state == "JobState.JOB_STATE_PENDING":
+            return Status("starting")
         return Status("unknown")
     def cancel(self) -> None:
@@ -70,47 +75,37 @@ class VertexSubmittedRun(AbstractRun):
 class VertexRunner(AbstractRunner):
-    """Runner class, uses a project to create a VertexSubmittedRun"""
+    """Runner class, uses a project to create a VertexSubmittedRun."""
+    def __init__(
+        self, api: Api, backend_config: Dict[str, Any], environment: GcpEnvironment
+    ) -> None:
+        """Initialize a VertexRunner instance."""
+        super().__init__(api, backend_config)
+        self.environment = environment
     def run(
         self,
         launch_project: LaunchProject,
-        builder: AbstractBuilder,
-        registry_config: Dict[str, Any],
+        builder: Optional[AbstractBuilder],
     ) -> Optional[AbstractRun]:
+        """Run a Vertex job."""
         aiplatform = get_module(  # noqa: F811
             "google.cloud.aiplatform",
             "VertexRunner requires google.cloud.aiplatform to be installed",
         )
-        resource_args = launch_project.resource_args.get("gcp_vertex")
+        resource_args = launch_project.resource_args.get("vertex")
+        if not resource_args:
+            resource_args = launch_project.resource_args.get("gcp-vertex")
         if not resource_args:
             raise LaunchError(
                 "No Vertex resource args specified. Specify args via --resource-args with a JSON file or string under top-level key gcp_vertex"
             )
-        gcp_config = get_gcp_config(resource_args.get("gcp_config") or "default")
-        gcp_project = (
-            resource_args.get("gcp_project")
-            or gcp_config["properties"]["core"]["project"]
-        )
-        gcp_zone = resource_args.get("gcp_region") or gcp_config["properties"].get(
-            "compute", {}
-        ).get("zone")
-        gcp_region = "-".join(gcp_zone.split("-")[:2])
         gcp_staging_bucket = resource_args.get("staging_bucket")
         if not gcp_staging_bucket:
             raise LaunchError(
                 "Vertex requires a staging bucket for training and dependency packages in the same region as compute. Specify a bucket under key staging_bucket."
             )
-        gcp_artifact_repo = resource_args.get("artifact_repo")
-        if not gcp_artifact_repo:
-            raise LaunchError(
-                "Vertex requires an Artifact Registry repository for the Docker image. Specify a repo under key artifact_repo."
-            )
-        gcp_docker_host = (
-            resource_args.get("docker_host") or f"{gcp_region}-docker.pkg.dev"
-        )
         gcp_machine_type = resource_args.get("machine_type") or "n1-standard-4"
         gcp_accelerator_type = (
             resource_args.get("accelerator_type") or "ACCELERATOR_TYPE_UNSPECIFIED"
@@ -124,9 +119,10 @@ class VertexRunner(AbstractRunner):
         )
         service_account = resource_args.get("service_account")
         tensorboard = resource_args.get("tensorboard")
         aiplatform.init(
-            project=gcp_project, location=gcp_region, staging_bucket=gcp_staging_bucket
+            project=self.environment.project,
+            location=self.environment.region,
+            staging_bucket=gcp_staging_bucket,
         )
         synchronous: bool = self.backend_config[PROJECT_SYNCHRONOUS]
@@ -135,21 +131,13 @@ class VertexRunner(AbstractRunner):
         if launch_project.docker_image:
             image_uri = launch_project.docker_image
         else:
-            repository = construct_gcp_registry_uri(
-                gcp_artifact_repo,
-                gcp_project,
-                gcp_docker_host,
-            )
             assert entry_point is not None
+            assert builder is not None
             image_uri = builder.build_image(
                 launch_project,
-                repository,
                 entry_point,
             )
-        if not self.ack_run_queue_item(launch_project):
-            return None
         # TODO: how to handle this?
         entry_cmd = get_entry_point_command(entry_point, launch_project.override_args)
@@ -176,18 +164,19 @@ class VertexRunner(AbstractRunner):
             display_name=gcp_training_job_name, worker_pool_specs=worker_pool_specs
         )
-        submitted_run = VertexSubmittedRun(job)
-        # todo: support gcp dataset?
         wandb.termlog(
             f"{LOG_PREFIX}Running training job {gcp_training_job_name} on {gcp_machine_type}."
         )
-        # when sync is True, vertex blocks the main thread on job completion. when False, vertex returns a Future
-        # on this thread but continues to block the process on another thread. always set sync=False so we can get
-        # the job info (dependent on job._gca_resource)
-        job.run(service_account=service_account, tensorboard=tensorboard, sync=False)
+        if synchronous:
+            job.run(service_account=service_account, tensorboard=tensorboard, sync=True)
+        else:
+            job.submit(
+                service_account=service_account,
+                tensorboard=tensorboard,
+            )
+        submitted_run = VertexSubmittedRun(job)
         while not getattr(job._gca_resource, "name", None):
             # give time for the gcp job object to be created and named, this should only loop a couple times max
@@ -196,12 +185,6 @@ class VertexRunner(AbstractRunner):
         wandb.termlog(
             f"{LOG_PREFIX}View your job status and logs at {submitted_run.get_page_link()}."
         )
-        # hacky: if user doesn't want blocking behavior, kill both main thread and the background thread. job continues
-        # to run remotely. this obviously doesn't work if we need to do some sort of postprocessing after this run fn
-        if not synchronous:
-            os._exit(0)
         return submitted_run

wandb/sdk/launch/sweeps/__init__.py CHANGED Viewed

@@ -3,9 +3,11 @@ from typing import Any, Callable, Dict
 log = logging.getLogger(__name__)
+SCHEDULER_URI = "placeholder-uri-scheduler"
 class SchedulerError(Exception):
-    """Raised when a known error occurs with wandb sweep scheduler"""
+    """Raised when a known error occurs with wandb sweep scheduler."""
     pass
@@ -22,7 +24,6 @@ _WANDB_SCHEDULERS: Dict[str, Callable] = {
 def load_scheduler(scheduler_name: str) -> Any:
     scheduler_name = scheduler_name.lower()
     if scheduler_name not in _WANDB_SCHEDULERS:
         raise SchedulerError(

wandb/sdk/launch/sweeps/scheduler.py CHANGED Viewed

@@ -2,12 +2,14 @@
 import logging
 import os
 import threading
+import traceback
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 import click
+import yaml
 import wandb
 import wandb.apis.public as public
@@ -16,21 +18,30 @@ from wandb.errors import CommError
 from wandb.sdk.launch.launch_add import launch_add
 from wandb.sdk.launch.sweeps import SchedulerError
 from wandb.sdk.lib.runid import generate_id
+from wandb.wandb_agent import Agent
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 LOG_PREFIX = f"{click.style('sched:', fg='cyan')} "
+@dataclass
+class _Worker:
+    agent_config: Dict[str, Any]
+    agent_id: str
 class SchedulerState(Enum):
     PENDING = 0
     STARTING = 1
     RUNNING = 2
-    COMPLETED = 3
-    FAILED = 4
-    STOPPED = 5
+    FLUSH_RUNS = 3
+    COMPLETED = 4
+    FAILED = 5
+    STOPPED = 6
+    CANCELLED = 7
-class SimpleRunState(Enum):
+class RunState(Enum):
     ALIVE = 0
     DEAD = 1
     UNKNOWN = 2
@@ -39,19 +50,16 @@ class SimpleRunState(Enum):
 @dataclass
 class SweepRun:
     id: str
-    state: SimpleRunState = SimpleRunState.ALIVE
+    state: RunState = RunState.ALIVE
     queued_run: Optional[public.QueuedRun] = None
     args: Optional[Dict[str, Any]] = None
     logs: Optional[List[str]] = None
-    program: Optional[str] = None
     # Threading can be used to run multiple workers in parallel
     worker_id: Optional[int] = None
 class Scheduler(ABC):
-    """The Scheduler is a controller/agent that will populate a Launch RunQueue
-    with jobs from a hyperparameter sweep.
-    """
+    """A controller/agent that populates a Launch RunQueue from a hyperparameter sweep."""
     def __init__(
         self,
@@ -73,18 +81,31 @@ class Scheduler(ABC):
         self._project = (
             project or os.environ.get("WANDB_PROJECT") or api.settings("project")
         )
+        self._sweep_id: str = sweep_id or "empty-sweep-id"
+        self._state: SchedulerState = SchedulerState.PENDING
         # Make sure the provided sweep_id corresponds to a valid sweep
         try:
-            self._api.sweep(sweep_id, "{}", entity=self._entity, project=self._project)
+            resp = self._api.sweep(
+                sweep_id, "{}", entity=self._entity, project=self._project
+            )
+            if resp.get("state") == SchedulerState.CANCELLED.name:
+                self._state = SchedulerState.CANCELLED
+            self._sweep_config = yaml.safe_load(resp["config"])
         except Exception as e:
             raise SchedulerError(f"{LOG_PREFIX}Exception when finding sweep: {e}")
-        self._sweep_id: str = sweep_id or "empty-sweep-id"
-        self._state: SchedulerState = SchedulerState.PENDING
         # Dictionary of the runs being managed by the scheduler
         self._runs: Dict[str, SweepRun] = {}
         # Threading lock to ensure thread-safe access to the runs dictionary
         self._threading_lock: threading.Lock = threading.Lock()
-        self._project_queue = project_queue or self._project
+        self._project_queue = project_queue
+        # Optionally run multiple workers in (pseudo-)parallel. Workers do not
+        # actually run training workloads, they simply send heartbeat messages
+        # (emulating a real agent) and add new runs to the launch queue. The
+        # launch agent is the one that actually runs the training workloads.
+        self._workers: Dict[int, _Worker] = {}
         # Scheduler may receive additional kwargs which will be piped into the launch command
         self._kwargs: Dict[str, Any] = kwargs
@@ -102,12 +123,12 @@ class Scheduler(ABC):
     @property
     def state(self) -> SchedulerState:
-        logger.debug(f"{LOG_PREFIX}Scheduler state is {self._state.name}")
+        _logger.debug(f"{LOG_PREFIX}Scheduler state is {self._state.name}")
         return self._state
     @state.setter
     def state(self, value: SchedulerState) -> None:
-        logger.debug(f"{LOG_PREFIX}Scheduler was {self.state.name} is {value.name}")
+        _logger.debug(f"{LOG_PREFIX}Scheduler was {self.state.name} is {value.name}")
         self._state = value
     def is_alive(self) -> bool:
@@ -115,17 +136,33 @@ class Scheduler(ABC):
             SchedulerState.COMPLETED,
             SchedulerState.FAILED,
             SchedulerState.STOPPED,
+            SchedulerState.CANCELLED,
         ]:
             return False
         return True
     def start(self) -> None:
+        """Start a scheduler, confirms prerequisites, begins execution loop."""
         wandb.termlog(f"{LOG_PREFIX}Scheduler starting.")
+        if not self.is_alive():
+            wandb.termerror(
+                f"{LOG_PREFIX}Sweep already {self.state.name.lower()}! Exiting..."
+            )
+            self.exit()
+            return
         self._state = SchedulerState.STARTING
+        if not self._try_load_executable():
+            wandb.termerror(
+                f"{LOG_PREFIX}No 'job' or 'image_uri' loaded from sweep config."
+            )
+            self.exit()
+            return
         self._start()
         self.run()
     def run(self) -> None:
+        """Main run function for all external schedulers."""
         wandb.termlog(f"{LOG_PREFIX}Scheduler Running.")
         self.state = SchedulerState.RUNNING
         try:
@@ -134,6 +171,11 @@ class Scheduler(ABC):
                     break
                 self._update_run_states()
                 self._run()
+                # if we hit the run_cap, now set to stopped after launching runs
+                if self.state == SchedulerState.FLUSH_RUNS:
+                    if len(self._runs.keys()) == 0:
+                        wandb.termlog(f"{LOG_PREFIX}Done polling on runs, exiting.")
+                        self.state = SchedulerState.STOPPED
         except KeyboardInterrupt:
             wandb.termlog(f"{LOG_PREFIX}Scheduler received KeyboardInterrupt. Exiting.")
             self.state = SchedulerState.STOPPED
@@ -157,6 +199,28 @@ class Scheduler(ABC):
             self.state = SchedulerState.FAILED
         self._stop_runs()
+    def _try_load_executable(self) -> bool:
+        """Check existance of valid executable for a run.
+        logs and returns False when job is unreachable
+        """
+        if self._kwargs.get("job"):
+            _public_api = public.Api()
+            try:
+                _job_artifact = _public_api.artifact(self._kwargs["job"], type="job")
+                wandb.termlog(
+                    f"{LOG_PREFIX}Successfully loaded job: {_job_artifact.name} in scheduler"
+                )
+            except Exception:
+                wandb.termerror(f"{LOG_PREFIX}{traceback.format_exc()}")
+                return False
+            return True
+        elif self._kwargs.get("image_uri"):
+            # TODO(gst): check docker existance? Use registry in launch config?
+            return True
+        else:
+            return False
     def _yield_runs(self) -> Iterator[Tuple[str, SweepRun]]:
         """Thread-safe way to iterate over the runs."""
         with self._threading_lock:
@@ -168,25 +232,38 @@ class Scheduler(ABC):
             self._stop_run(run_id)
     def _stop_run(self, run_id: str) -> None:
-        """Stops a run and removes it from the scheduler"""
+        """Stop a run and removes it from the scheduler."""
         if run_id in self._runs:
             run: SweepRun = self._runs[run_id]
-            run.state = SimpleRunState.DEAD
+            run.state = RunState.DEAD
             # TODO(hupo): Send command to backend to stop run
             wandb.termlog(f"{LOG_PREFIX} Stopped run {run_id}.")
     def _update_run_states(self) -> None:
+        """Iterate through runs.
+        Get state from backend and deletes runs if not in running state. Threadsafe.
+        """
         _runs_to_remove: List[str] = []
         for run_id, run in self._yield_runs():
             try:
                 _state = self._api.get_run_state(self._entity, self._project, run_id)
-                if _state is None or _state in [
-                    "crashed",
-                    "failed",
-                    "killed",
-                    "finished",
-                ]:
-                    run.state = SimpleRunState.DEAD
+                _rqi_state = run.queued_run.state if run.queued_run else None
+                if (
+                    not _state
+                    or _state
+                    in [
+                        "crashed",
+                        "failed",
+                        "killed",
+                        "finished",
+                    ]
+                    or _rqi_state == "failed"
+                ):
+                    _logger.debug(
+                        f"({run_id}) run-state:{_state}, rqi-state:{_rqi_state}"
+                    )
+                    run.state = RunState.DEAD
                     _runs_to_remove.append(run_id)
                 elif _state in [
                     "running",
@@ -194,12 +271,12 @@ class Scheduler(ABC):
                     "preempted",
                     "preempting",
                 ]:
-                    run.state = SimpleRunState.ALIVE
+                    run.state = RunState.ALIVE
             except CommError as e:
                 wandb.termlog(
                     f"{LOG_PREFIX}Issue when getting RunState for Run {run_id}: {e}"
                 )
-                run.state = SimpleRunState.UNKNOWN
+                run.state = RunState.UNKNOWN
                 continue
         # Remove any runs that are dead
         with self._threading_lock:
@@ -213,31 +290,47 @@ class Scheduler(ABC):
         entry_point: Optional[List[str]] = None,
         config: Optional[Dict[str, Any]] = None,
     ) -> "public.QueuedRun":
-        """Add a launch job to the Launch RunQueue."""
+        """Add a launch job to the Launch RunQueue.
+        run_id: supplied by gorilla from agentHeartbeat
+        entry_point: sweep entrypoint overrides image_uri/job entrypoint
+        config: launch config
+        """
+        # job and image first from CLI args, then from sweep config
+        _job = self._kwargs.get("job") or self._sweep_config.get("job")
+        _sweep_config_uri = self._sweep_config.get("image_uri")
+        _image_uri = self._kwargs.get("image_uri") or _sweep_config_uri
+        if _job is None and _image_uri is None:
+            raise SchedulerError(
+                f"{LOG_PREFIX}No 'job' nor 'image_uri' (run: {run_id})"
+            )
+        elif _job is not None and _image_uri is not None:
+            raise SchedulerError(f"{LOG_PREFIX}Sweep has both 'job' and 'image_uri'")
+        if self._sweep_config.get("command"):
+            entry_point = Agent._create_sweep_command(self._sweep_config["command"])
+            wandb.termwarn(
+                f"{LOG_PREFIX}Sweep command {entry_point} will override"
+                f' {"job" if _job else "image_uri"} entrypoint'
+            )
         run_id = run_id or generate_id()
-        # One of Job and URI is required
-        _job = self._kwargs.get("job", None)
-        _uri = self._kwargs.get("uri", None)
-        if _job is None and _uri is None:
-            # If no Job is specified, use a placeholder URI to prevent Launch failure
-            _uri = "placeholder-uri-queuedrun-from-scheduler"
-        # Queue is required
-        _queue = self._kwargs.get("queue", "default")
         queued_run = launch_add(
             run_id=run_id,
             entry_point=entry_point,
             config=config,
-            uri=_uri,
+            docker_image=_image_uri,  # TODO(gst): make agnostic (github? run uri?)
             job=_job,
             project=self._project,
             entity=self._entity,
-            queue_name=_queue,
+            queue_name=self._kwargs.get("queue"),
             project_queue=self._project_queue,
             resource=self._kwargs.get("resource", None),
             resource_args=self._kwargs.get("resource_args", None),
         )
         self._runs[run_id].queued_run = queued_run
         wandb.termlog(
-            f"{LOG_PREFIX}Added run to Launch RunQueue: {_queue} RunID:{run_id}."
+            f"{LOG_PREFIX}Added run to Launch queue: {self._kwargs.get('queue')} RunID:{run_id}."
         )
         return queued_run

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl