PyPI - wandb - Versions diffs - 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl - Mend

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Files changed (228) hide show

wandb/__init__.py +2 -3
wandb/apis/__init__.py +1 -3
wandb/apis/importers/__init__.py +4 -0
wandb/apis/importers/base.py +312 -0
wandb/apis/importers/mlflow.py +113 -0
wandb/apis/internal.py +29 -2
wandb/apis/normalize.py +6 -5
wandb/apis/public.py +163 -180
wandb/apis/reports/_templates.py +6 -12
wandb/apis/reports/report.py +1 -1
wandb/apis/reports/runset.py +1 -3
wandb/apis/reports/util.py +12 -10
wandb/beta/workflows.py +57 -34
wandb/catboost/__init__.py +1 -2
wandb/cli/cli.py +215 -133
wandb/data_types.py +63 -56
wandb/docker/__init__.py +78 -16
wandb/docker/auth.py +21 -22
wandb/env.py +0 -1
wandb/errors/__init__.py +8 -116
wandb/errors/term.py +1 -1
wandb/fastai/__init__.py +1 -2
wandb/filesync/dir_watcher.py +8 -5
wandb/filesync/step_prepare.py +76 -75
wandb/filesync/step_upload.py +1 -2
wandb/integration/catboost/__init__.py +1 -3
wandb/integration/catboost/catboost.py +8 -14
wandb/integration/fastai/__init__.py +7 -13
wandb/integration/gym/__init__.py +35 -4
wandb/integration/keras/__init__.py +3 -3
wandb/integration/keras/callbacks/metrics_logger.py +9 -8
wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
wandb/integration/keras/callbacks/tables_builder.py +31 -19
wandb/integration/kfp/kfp_patch.py +20 -17
wandb/integration/kfp/wandb_logging.py +1 -2
wandb/integration/lightgbm/__init__.py +21 -19
wandb/integration/prodigy/prodigy.py +6 -7
wandb/integration/sacred/__init__.py +9 -12
wandb/integration/sagemaker/__init__.py +1 -3
wandb/integration/sagemaker/auth.py +0 -1
wandb/integration/sagemaker/config.py +1 -1
wandb/integration/sagemaker/resources.py +1 -1
wandb/integration/sb3/sb3.py +8 -4
wandb/integration/tensorboard/__init__.py +1 -3
wandb/integration/tensorboard/log.py +8 -8
wandb/integration/tensorboard/monkeypatch.py +11 -9
wandb/integration/tensorflow/__init__.py +1 -3
wandb/integration/xgboost/__init__.py +4 -6
wandb/integration/yolov8/__init__.py +7 -0
wandb/integration/yolov8/yolov8.py +250 -0
wandb/jupyter.py +31 -35
wandb/lightgbm/__init__.py +1 -2
wandb/old/settings.py +2 -2
wandb/plot/bar.py +1 -2
wandb/plot/confusion_matrix.py +1 -3
wandb/plot/histogram.py +1 -2
wandb/plot/line.py +1 -2
wandb/plot/line_series.py +4 -4
wandb/plot/pr_curve.py +17 -20
wandb/plot/roc_curve.py +1 -3
wandb/plot/scatter.py +1 -2
wandb/proto/v3/wandb_server_pb2.py +85 -39
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_server_pb2.py +51 -39
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/sdk/__init__.py +1 -3
wandb/sdk/backend/backend.py +1 -1
wandb/sdk/data_types/_dtypes.py +38 -30
wandb/sdk/data_types/base_types/json_metadata.py +1 -3
wandb/sdk/data_types/base_types/media.py +17 -17
wandb/sdk/data_types/base_types/wb_value.py +33 -26
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
wandb/sdk/data_types/helper_types/classes.py +1 -1
wandb/sdk/data_types/helper_types/image_mask.py +12 -12
wandb/sdk/data_types/histogram.py +5 -4
wandb/sdk/data_types/html.py +1 -2
wandb/sdk/data_types/image.py +11 -11
wandb/sdk/data_types/molecule.py +3 -6
wandb/sdk/data_types/object_3d.py +1 -2
wandb/sdk/data_types/plotly.py +1 -2
wandb/sdk/data_types/saved_model.py +10 -8
wandb/sdk/data_types/video.py +1 -1
wandb/sdk/integration_utils/data_logging.py +5 -5
wandb/sdk/interface/artifacts.py +288 -266
wandb/sdk/interface/interface.py +2 -3
wandb/sdk/interface/interface_grpc.py +1 -1
wandb/sdk/interface/interface_queue.py +1 -1
wandb/sdk/interface/interface_relay.py +1 -1
wandb/sdk/interface/interface_shared.py +1 -2
wandb/sdk/interface/interface_sock.py +1 -1
wandb/sdk/interface/message_future.py +1 -1
wandb/sdk/interface/message_future_poll.py +1 -1
wandb/sdk/interface/router.py +1 -1
wandb/sdk/interface/router_queue.py +1 -1
wandb/sdk/interface/router_relay.py +1 -1
wandb/sdk/interface/router_sock.py +1 -1
wandb/sdk/interface/summary_record.py +1 -1
wandb/sdk/internal/artifacts.py +1 -1
wandb/sdk/internal/datastore.py +2 -3
wandb/sdk/internal/file_pusher.py +5 -3
wandb/sdk/internal/file_stream.py +22 -19
wandb/sdk/internal/handler.py +5 -4
wandb/sdk/internal/internal.py +1 -1
wandb/sdk/internal/internal_api.py +115 -55
wandb/sdk/internal/job_builder.py +1 -3
wandb/sdk/internal/profiler.py +1 -1
wandb/sdk/internal/progress.py +4 -6
wandb/sdk/internal/sample.py +1 -3
wandb/sdk/internal/sender.py +28 -16
wandb/sdk/internal/settings_static.py +5 -5
wandb/sdk/internal/system/assets/__init__.py +1 -0
wandb/sdk/internal/system/assets/cpu.py +3 -9
wandb/sdk/internal/system/assets/disk.py +2 -4
wandb/sdk/internal/system/assets/gpu.py +6 -18
wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
wandb/sdk/internal/system/assets/interfaces.py +50 -22
wandb/sdk/internal/system/assets/ipu.py +1 -3
wandb/sdk/internal/system/assets/memory.py +7 -13
wandb/sdk/internal/system/assets/network.py +4 -8
wandb/sdk/internal/system/assets/open_metrics.py +283 -0
wandb/sdk/internal/system/assets/tpu.py +1 -4
wandb/sdk/internal/system/assets/trainium.py +26 -14
wandb/sdk/internal/system/system_info.py +2 -3
wandb/sdk/internal/system/system_monitor.py +52 -20
wandb/sdk/internal/tb_watcher.py +12 -13
wandb/sdk/launch/_project_spec.py +54 -65
wandb/sdk/launch/agent/agent.py +374 -90
wandb/sdk/launch/builder/abstract.py +61 -7
wandb/sdk/launch/builder/build.py +81 -110
wandb/sdk/launch/builder/docker_builder.py +181 -0
wandb/sdk/launch/builder/kaniko_builder.py +419 -0
wandb/sdk/launch/builder/noop.py +31 -12
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
wandb/sdk/launch/environment/abstract.py +28 -0
wandb/sdk/launch/environment/aws_environment.py +276 -0
wandb/sdk/launch/environment/gcp_environment.py +271 -0
wandb/sdk/launch/environment/local_environment.py +65 -0
wandb/sdk/launch/github_reference.py +3 -8
wandb/sdk/launch/launch.py +38 -29
wandb/sdk/launch/launch_add.py +6 -8
wandb/sdk/launch/loader.py +230 -0
wandb/sdk/launch/registry/abstract.py +54 -0
wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
wandb/sdk/launch/registry/local_registry.py +62 -0
wandb/sdk/launch/runner/abstract.py +1 -16
wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
wandb/sdk/launch/runner/local_container.py +46 -22
wandb/sdk/launch/runner/local_process.py +1 -4
wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
wandb/sdk/launch/sweeps/__init__.py +3 -2
wandb/sdk/launch/sweeps/scheduler.py +132 -39
wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
wandb/sdk/launch/utils.py +101 -30
wandb/sdk/launch/wandb_reference.py +2 -7
wandb/sdk/lib/_settings_toposort_generate.py +166 -0
wandb/sdk/lib/_settings_toposort_generated.py +201 -0
wandb/sdk/lib/apikey.py +2 -4
wandb/sdk/lib/config_util.py +4 -1
wandb/sdk/lib/console.py +1 -3
wandb/sdk/lib/deprecate.py +3 -3
wandb/sdk/lib/file_stream_utils.py +7 -5
wandb/sdk/lib/filenames.py +1 -1
wandb/sdk/lib/filesystem.py +61 -5
wandb/sdk/lib/git.py +1 -3
wandb/sdk/lib/import_hooks.py +4 -7
wandb/sdk/lib/ipython.py +8 -5
wandb/sdk/lib/lazyloader.py +1 -3
wandb/sdk/lib/mailbox.py +14 -4
wandb/sdk/lib/proto_util.py +10 -5
wandb/sdk/lib/redirect.py +15 -22
wandb/sdk/lib/reporting.py +1 -3
wandb/sdk/lib/retry.py +4 -5
wandb/sdk/lib/runid.py +1 -3
wandb/sdk/lib/server.py +15 -9
wandb/sdk/lib/sock_client.py +1 -1
wandb/sdk/lib/sparkline.py +1 -1
wandb/sdk/lib/wburls.py +1 -1
wandb/sdk/service/port_file.py +1 -2
wandb/sdk/service/service.py +36 -13
wandb/sdk/service/service_base.py +12 -1
wandb/sdk/verify/verify.py +5 -7
wandb/sdk/wandb_artifacts.py +142 -177
wandb/sdk/wandb_config.py +5 -8
wandb/sdk/wandb_helper.py +1 -1
wandb/sdk/wandb_init.py +24 -13
wandb/sdk/wandb_login.py +9 -9
wandb/sdk/wandb_manager.py +39 -4
wandb/sdk/wandb_metric.py +2 -6
wandb/sdk/wandb_require.py +4 -15
wandb/sdk/wandb_require_helpers.py +1 -9
wandb/sdk/wandb_run.py +95 -141
wandb/sdk/wandb_save.py +1 -3
wandb/sdk/wandb_settings.py +149 -54
wandb/sdk/wandb_setup.py +66 -46
wandb/sdk/wandb_summary.py +13 -10
wandb/sdk/wandb_sweep.py +6 -7
wandb/sdk/wandb_watch.py +1 -1
wandb/sklearn/calculate/confusion_matrix.py +1 -1
wandb/sklearn/calculate/learning_curve.py +1 -1
wandb/sklearn/calculate/summary_metrics.py +1 -3
wandb/sklearn/plot/__init__.py +1 -1
wandb/sklearn/plot/classifier.py +27 -18
wandb/sklearn/plot/clusterer.py +4 -5
wandb/sklearn/plot/regressor.py +4 -4
wandb/sklearn/plot/shared.py +2 -2
wandb/sync/__init__.py +1 -3
wandb/sync/sync.py +4 -5
wandb/testing/relay.py +11 -10
wandb/trigger.py +1 -1
wandb/util.py +106 -81
wandb/viz.py +4 -4
wandb/wandb_agent.py +50 -50
wandb/wandb_controller.py +2 -3
wandb/wandb_run.py +1 -2
wandb/wandb_torch.py +1 -1
wandb/xgboost/__init__.py +1 -2
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
wandb/sdk/launch/builder/docker.py +0 -80
wandb/sdk/launch/builder/kaniko.py +0 -393
wandb/sdk/launch/builder/loader.py +0 -32
wandb/sdk/launch/runner/loader.py +0 -50
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/agent/agent.py CHANGED Viewed

@@ -1,42 +1,69 @@
-"""
-Implementation of launch agent.
-"""
+"""Implementation of launch agent."""
 import logging
 import os
 import pprint
+import threading
 import time
 import traceback
-from typing import Any, Dict, List, Union
+from dataclasses import dataclass
+from multiprocessing import Event
+from multiprocessing.pool import ThreadPool
+from typing import Any, Dict, List, Optional, Union
 import wandb
 import wandb.util as util
 from wandb.apis.internal import Api
+from wandb.errors import CommError
+from wandb.sdk.launch._project_spec import LaunchProject
 from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
+from wandb.sdk.launch.sweeps import SCHEDULER_URI
 from wandb.sdk.lib import runid
+from .. import loader
 from .._project_spec import create_project_from_spec, fetch_and_validate_project
-from ..builder.loader import load_builder
+from ..builder.build import construct_builder_args
 from ..runner.abstract import AbstractRun
-from ..runner.loader import load_backend
 from ..utils import (
     LAUNCH_DEFAULT_PROJECT,
     LOG_PREFIX,
     PROJECT_SYNCHRONOUS,
-    resolve_build_and_registry_config,
+    LaunchDockerError,
+    LaunchError,
 )
 AGENT_POLLING_INTERVAL = 10
+ACTIVE_SWEEP_POLLING_INTERVAL = 1  # more frequent when we know we have jobs
 AGENT_POLLING = "POLLING"
 AGENT_RUNNING = "RUNNING"
 AGENT_KILLED = "KILLED"
+MAX_THREADS = 64
 _logger = logging.getLogger(__name__)
+@dataclass
+class JobAndRunStatus:
+    run_queue_item_id: str
+    run_id: Optional[str] = None
+    project: Optional[str] = None
+    run: Optional[AbstractRun] = None
+    failed_to_start: bool = False
+    completed: bool = False
+    is_scheduler: bool = False
+    @property
+    def job_completed(self) -> bool:
+        return self.completed or self.failed_to_start
+    def update_run_info(self, launch_project: LaunchProject) -> None:
+        self.run_id = launch_project.run_id
+        self.project = launch_project.target_project
 def _convert_access(access: str) -> str:
-    """Converts access string to a value accepted by wandb."""
+    """Convert access string to a value accepted by wandb."""
     access = access.upper()
     assert (
         access == "PROJECT" or access == "USER"
@@ -44,31 +71,94 @@ def _convert_access(access: str) -> str:
     return access
+def _max_from_config(
+    config: Dict[str, Any], key: str, default: int = 1
+) -> Union[int, float]:
+    """Get an integer from the config, or float.inf if -1.
+    Utility for parsing integers from the agent config with a default, infinity
+    handling, and integer parsing. Raises more informative error if parse error.
+    """
+    try:
+        val = config.get(key)
+        if val is None:
+            val = default
+        max_from_config = int(val)
+    except ValueError as e:
+        raise LaunchError(
+            f"Error when parsing LaunchAgent config key: ['{key}': "
+            f"{config.get(key)}]. Error: {str(e)}"
+        )
+    if max_from_config == -1:
+        return float("inf")
+    if max_from_config < 0:
+        raise LaunchError(
+            f"Error when parsing LaunchAgent config key: ['{key}': "
+            f"{config.get(key)}]. Error: negative value."
+        )
+    return max_from_config
+def _job_is_scheduler(run_spec: Dict[str, Any]) -> bool:
+    """Determine whether a job/runSpec is a sweep scheduler."""
+    if not run_spec:
+        _logger.debug("Recieved runSpec in _job_is_scheduler that was empty")
+    if run_spec.get("uri") != SCHEDULER_URI:
+        return False
+    if run_spec.get("resource") == "local-process":
+        # If a scheduler is a local-process (100%), also
+        #    confirm command is in format: [wandb scheduler <sweep>]
+        cmd = run_spec.get("overrides", {}).get("entry_point", [])
+        if len(cmd) < 3:
+            return False
+        if cmd[:2] != ["wandb", "scheduler"]:
+            return False
+    return True
 class LaunchAgent:
     """Launch agent class which polls run given run queues and launches runs for wandb launch."""
     def __init__(self, api: Api, config: Dict[str, Any]):
-        self._entity = config.get("entity")
-        self._project = config.get("project")
+        """Initialize a launch agent.
+        Arguments:
+            api: Api object to use for making requests to the backend.
+            config: Config dictionary for the agent.
+        """
+        self._entity = config["entity"]
+        self._project = config["project"]
         self._api = api
         self._base_url = self._api.settings().get("base_url")
-        self._jobs: Dict[Union[int, str], AbstractRun] = {}
         self._ticks = 0
-        self._running = 0
+        self._jobs: Dict[int, JobAndRunStatus] = {}
+        self._jobs_lock = threading.Lock()
+        self._jobs_event = Event()
+        self._jobs_event.set()
         self._cwd = os.getcwd()
         self._namespace = runid.generate_id()
         self._access = _convert_access("project")
-        max_jobs_from_config = int(config.get("max_jobs", 1))
-        if max_jobs_from_config == -1:
-            self._max_jobs = float("inf")
-        else:
-            self._max_jobs = max_jobs_from_config
+        self._max_jobs = _max_from_config(config, "max_jobs")
+        self._max_schedulers = _max_from_config(config, "max_schedulers")
+        self._pool = ThreadPool(
+            processes=int(min(MAX_THREADS, self._max_jobs + self._max_schedulers)),
+            initargs=(self._jobs, self._jobs_lock),
+        )
         self.default_config: Dict[str, Any] = config
         # serverside creation
         self.gorilla_supports_agents = (
             self._api.launch_agent_introspection() is not None
         )
+        self._gorilla_supports_fail_run_queue_items = (
+            self._api.fail_run_queue_item_introspection()
+        )
         self._queues = config.get("queues", ["default"])
         create_response = self._api.create_launch_agent(
             self._entity,
@@ -78,14 +168,45 @@ class LaunchAgent:
         )
         self._id = create_response["launchAgentId"]
         self._name = ""  # hacky: want to display this to the user but we don't get it back from gql until polling starts. fix later
+        if self._api.entity_is_team(self._entity):
+            wandb.termwarn(
+                f"{LOG_PREFIX}Agent is running on team entity ({self._entity}). Members of this team will be able to run code on this device."
+            )
+    def fail_run_queue_item(self, run_queue_item_id: str) -> None:
+        if self._gorilla_supports_fail_run_queue_items:
+            self._api.fail_run_queue_item(run_queue_item_id)
+    @property
+    def thread_ids(self) -> List[int]:
+        """Returns a list of keys running thread ids for the agent."""
+        with self._jobs_lock:
+            return list(self._jobs.keys())
+    @property
+    def num_running_schedulers(self) -> int:
+        """Return just the number of schedulers."""
+        with self._jobs_lock:
+            return len([x for x in self._jobs if self._jobs[x].is_scheduler])
     @property
-    def job_ids(self) -> List[Union[int, str]]:
-        """Returns a list of keys running job ids for the agent."""
-        return list(self._jobs.keys())
+    def num_running_jobs(self) -> int:
+        """Return the number of jobs not including schedulers."""
+        with self._jobs_lock:
+            return len([x for x in self._jobs if not self._jobs[x].is_scheduler])
     def pop_from_queue(self, queue: str) -> Any:
-        """Pops an item off the runqueue to run as a job."""
+        """Pops an item off the runqueue to run as a job.
+        Arguments:
+            queue: Queue to pop from.
+        Returns:
+            Item popped off the queue.
+        Raises:
+            Exception: if there is an error popping from the queue.
+        """
         try:
             ups = self._api.pop_from_run_queue(
                 queue,
@@ -100,41 +221,77 @@ class LaunchAgent:
     def print_status(self) -> None:
         """Prints the current status of the agent."""
-        if self._project == LAUNCH_DEFAULT_PROJECT:
-            wandb.termlog(
-                f"{LOG_PREFIX}agent {self._name} polling on queues {','.join(self._queues)} for jobs"
-            )
-        else:
-            wandb.termlog(
-                f"{LOG_PREFIX}agent {self._name} polling on project {self._project}, queues {','.join(self._queues)} for jobs"
-            )
+        output_str = "agent "
+        if self._name:
+            output_str += f"{self._name} "
+        if self.num_running_jobs < self._max_jobs:
+            output_str += "polling on "
+            if self._project != LAUNCH_DEFAULT_PROJECT:
+                output_str += f"project {self._project}, "
+            output_str += f"queues {','.join(self._queues)}, "
+        output_str += (
+            f"running {self.num_running_jobs} out of a maximum of {self._max_jobs} jobs"
+        )
+        wandb.termlog(f"{LOG_PREFIX}{output_str}")
+        if self.num_running_jobs > 0:
+            output_str += f": {','.join(str(job_id) for job_id in self.thread_ids)}"
+        _logger.info(output_str)
     def update_status(self, status: str) -> None:
+        """Update the status of the agent.
+        Arguments:
+            status: Status to update the agent to.
+        """
         update_ret = self._api.update_launch_agent_status(
             self._id, status, self.gorilla_supports_agents
         )
         if not update_ret["success"]:
-            wandb.termerror(f"Failed to update agent status to {status}")
+            wandb.termerror(f"{LOG_PREFIX}Failed to update agent status to {status}")
-    def finish_job_id(self, job_id: Union[str, int]) -> None:
+    def finish_thread_id(self, thread_id: int) -> None:
         """Removes the job from our list for now."""
+        job_and_run_status = self._jobs[thread_id]
+        if not job_and_run_status.run_id or not job_and_run_status.project:
+            self.fail_run_queue_item(job_and_run_status.run_queue_item_id)
+        else:
+            run_info = None
+            # sweep runs exist but have no info before they are started
+            # so run_info returned will be None
+            # normal runs just throw a comm error
+            try:
+                run_info = self._api.get_run_info(
+                    self._entity, job_and_run_status.project, job_and_run_status.run_id
+                )
+            except CommError:
+                pass
+            if run_info is None:
+                self.fail_run_queue_item(job_and_run_status.run_queue_item_id)
         # TODO:  keep logs or something for the finished jobs
-        del self._jobs[job_id]
-        self._running -= 1
+        with self._jobs_lock:
+            del self._jobs[thread_id]
         # update status back to polling if no jobs are running
-        if self._running == 0:
+        if len(self.thread_ids) == 0:
             self.update_status(AGENT_POLLING)
-    def _update_finished(self, job_id: Union[int, str]) -> None:
+    def _update_finished(self, thread_id: int) -> None:
         """Check our status enum."""
-        try:
-            if self._jobs[job_id].get_status().state in ["failed", "finished"]:
-                self.finish_job_id(job_id)
-        except Exception:
-            self.finish_job_id(job_id)
+        with self._jobs_lock:
+            job = self._jobs[thread_id]
+        if job.job_completed:
+            self.finish_thread_id(thread_id)
     def run_job(self, job: Dict[str, Any]) -> None:
-        """Sets up project and runs the job."""
+        """Set up project and run the job.
+        Arguments:
+            job: Job to run.
+        """
         _msg = f"{LOG_PREFIX}Launch agent received job:\n{pprint.pformat(job)}\n"
         wandb.termlog(_msg)
         _logger.info(_msg)
@@ -151,81 +308,208 @@ class LaunchAgent:
                 launch_spec["overrides"].get("args", [])
             )
-        project = create_project_from_spec(launch_spec, self._api)
-        _logger.info("Fetching and validating project...")
-        project = fetch_and_validate_project(project, self._api)
-        _logger.info("Fetching resource...")
-        resource = launch_spec.get("resource") or "local-container"
-        backend_config: Dict[str, Any] = {
-            PROJECT_SYNCHRONOUS: False,  # agent always runs async
-        }
-        backend_config["runQueueItemId"] = job["runQueueItemId"]
-        _logger.info("Loading backend")
-        override_build_config = launch_spec.get("build")
-        override_registry_config = launch_spec.get("registry")
-        build_config, registry_config = resolve_build_and_registry_config(
-            self.default_config, override_build_config, override_registry_config
+        self._pool.apply_async(
+            self.thread_run_job,
+            (
+                launch_spec,
+                job,
+                self.default_config,
+                self._api,
+            ),
         )
-        builder = load_builder(build_config)
-        default_runner = self.default_config.get("runner", {}).get("type")
-        if default_runner == resource:
-            backend_config["runner"] = self.default_config.get("runner")
-        backend = load_backend(resource, self._api, backend_config)
-        backend.verify()
-        _logger.info("Backend loaded...")
-        run = backend.run(project, builder, registry_config)
-        if run:
-            self._jobs[run.id] = run
-            self._running += 1
     def loop(self) -> None:
-        """Main loop function for agent."""
+        """Loop infinitely to poll for jobs and run them.
+        Raises:
+            KeyboardInterrupt: if the agent is requested to stop.
+        """
         self.print_status()
         try:
             while True:
                 self._ticks += 1
-                job = None
                 agent_response = self._api.get_launch_agent(
                     self._id, self.gorilla_supports_agents
                 )
-                self._name = agent_response[
-                    "name"
-                ]  # hacky, but we don't return the name on create so this is first time
+                self._name = agent_response["name"]  # hack: first time we get name
                 if agent_response["stopPolling"]:
                     # shutdown process and all jobs if requested from ui
                     raise KeyboardInterrupt
-                if self._running < self._max_jobs:
+                if self.num_running_jobs < self._max_jobs:
                     # only check for new jobs if we're not at max
                     for queue in self._queues:
                         job = self.pop_from_queue(queue)
                         if job:
+                            if _job_is_scheduler(job.get("runSpec")):
+                                # If job is a scheduler, and we are already at the cap, ignore,
+                                #    don't ack, and it will be pushed back onto the queue in 1 min
+                                if self.num_running_schedulers >= self._max_schedulers:
+                                    wandb.termwarn(
+                                        f"{LOG_PREFIX}Agent already running the maximum number "
+                                        f"of sweep schedulers: {self._max_schedulers}. To set "
+                                        "this value use `max_schedulers` key in the agent config"
+                                    )
+                                    continue
                             try:
                                 self.run_job(job)
-                            except Exception:
+                            except Exception as e:
                                 wandb.termerror(
-                                    f"Error running job: {traceback.format_exc()}"
+                                    f"{LOG_PREFIX}Error running job: {traceback.format_exc()}"
                                 )
-                                self._api.ack_run_queue_item(job["runQueueItemId"])
-                for job_id in self.job_ids:
-                    self._update_finished(job_id)
+                                util.sentry_exc(e)
+                                self.fail_run_queue_item(job["runQueueItemId"])
+                for thread_id in self.thread_ids:
+                    self._update_finished(thread_id)
                 if self._ticks % 2 == 0:
-                    if self._running == 0:
+                    if len(self.thread_ids) == 0:
                         self.update_status(AGENT_POLLING)
-                        self.print_status()
                     else:
                         self.update_status(AGENT_RUNNING)
-                time.sleep(AGENT_POLLING_INTERVAL)
+                    self.print_status()
+                if (
+                    self.num_running_jobs == self._max_jobs
+                    or self.num_running_schedulers == 0
+                ):
+                    # all threads busy or no schedulers running
+                    time.sleep(AGENT_POLLING_INTERVAL)
+                else:
+                    time.sleep(ACTIVE_SWEEP_POLLING_INTERVAL)
         except KeyboardInterrupt:
-            # temp: for local, kill all jobs. we don't yet have good handling for different
-            # types of runners in general
-            for _, run in self._jobs.items():
-                if isinstance(run, LocalSubmittedRun):
-                    run.command_proc.kill()
+            self._jobs_event.clear()
             self.update_status(AGENT_KILLED)
             wandb.termlog(f"{LOG_PREFIX}Shutting down, active jobs:")
             self.print_status()
+            self._pool.close()
+            self._pool.join()
+    # Threaded functions
+    def thread_run_job(
+        self,
+        launch_spec: Dict[str, Any],
+        job: Dict[str, Any],
+        default_config: Dict[str, Any],
+        api: Api,
+    ) -> None:
+        thread_id = threading.current_thread().ident
+        assert thread_id is not None
+        try:
+            self._thread_run_job(launch_spec, job, default_config, api, thread_id)
+        except LaunchDockerError as e:
+            wandb.termerror(
+                f"{LOG_PREFIX}agent {self._name} encountered an issue while starting Docker, see above output for details."
+            )
+            self.finish_thread_id(thread_id)
+            util.sentry_exc(e)
+        except Exception as e:
+            wandb.termerror(f"{LOG_PREFIX}Error running job: {traceback.format_exc()}")
+            self.finish_thread_id(thread_id)
+            util.sentry_exc(e)
+    def _thread_run_job(
+        self,
+        launch_spec: Dict[str, Any],
+        job: Dict[str, Any],
+        default_config: Dict[str, Any],
+        api: Api,
+        thread_id: int,
+    ) -> None:
+        job_tracker = JobAndRunStatus(job["runQueueItemId"])
+        with self._jobs_lock:
+            self._jobs[thread_id] = job_tracker
+        project = create_project_from_spec(launch_spec, api)
+        job_tracker.update_run_info(project)
+        _logger.info("Fetching and validating project...")
+        project = fetch_and_validate_project(project, api)
+        _logger.info("Fetching resource...")
+        resource = launch_spec.get("resource") or "local-container"
+        backend_config: Dict[str, Any] = {
+            PROJECT_SYNCHRONOUS: False,  # agent always runs async
+        }
+        _logger.info("Loading backend")
+        override_build_config = launch_spec.get("builder")
+        build_config, registry_config = construct_builder_args(
+            default_config, override_build_config
+        )
+        environment = loader.environment_from_config(
+            default_config.get("environment", {})
+        )
+        registry = loader.registry_from_config(registry_config, environment)
+        builder = loader.builder_from_config(build_config, environment, registry)
+        backend = loader.runner_from_config(resource, api, backend_config, environment)
+        _logger.info("Backend loaded...")
+        api.ack_run_queue_item(job["runQueueItemId"], project.run_id)
+        run = backend.run(project, builder)
+        if _job_is_scheduler(launch_spec):
+            with self._jobs_lock:
+                self._jobs[thread_id].is_scheduler = True
+            wandb.termlog(
+                f"{LOG_PREFIX}Preparing to run sweep scheduler "
+                f"({self.num_running_schedulers}/{self._max_schedulers})"
+            )
+        if not run:
+            with self._jobs_lock:
+                job_tracker.failed_to_start = True
+            return
+        with self._jobs_lock:
+            job_tracker.run = run
+        while self._jobs_event.is_set():
+            if self._check_run_finished(job_tracker):
+                return
+            time.sleep(AGENT_POLLING_INTERVAL)
+        # temp: for local, kill all jobs. we don't yet have good handling for different
+        # types of runners in general
+        if isinstance(run, LocalSubmittedRun):
+            run.command_proc.kill()
+    def _check_run_finished(self, job_tracker: JobAndRunStatus) -> bool:
+        if job_tracker.completed:
+            return True
+        # the run can be done before the run has started
+        # but can also be none if the run failed to start
+        # so if there is no run, either the run hasn't started yet
+        # or it has failed
+        if job_tracker.run is None:
+            if job_tracker.failed_to_start:
+                return True
+            return False
+        known_error = False
+        try:
+            run = job_tracker.run
+            status = run.get_status().state
+            if status in ["stopped", "failed", "finished"]:
+                if job_tracker.is_scheduler:
+                    wandb.termlog(f"{LOG_PREFIX}Scheduler finished with ID: {run.id}")
+                else:
+                    wandb.termlog(f"{LOG_PREFIX}Job finished with ID: {run.id}")
+                with self._jobs_lock:
+                    job_tracker.completed = True
+                return True
+            return False
+        except LaunchError as e:
+            wandb.termerror(
+                f"{LOG_PREFIX}Terminating job {run.id} because it failed to start: {str(e)}"
+            )
+            known_error = True
+            with self._jobs_lock:
+                job_tracker.failed_to_start = True
+        # TODO: make get_status robust to errors for each runner, and handle them
+        # TODO: add sentry to track this case and solve issues
+        except Exception:
+            wandb.termerror(f"{LOG_PREFIX}Error getting status for job {run.id}")
+            wandb.termerror(traceback.format_exc())
+            _logger.info("---")
+            _logger.info("Caught exception while getting status.")
+            _logger.info(f"Job ID: {run.id}")
+            _logger.info(traceback.format_exc())
+            _logger.info("---")
+        return known_error

wandb/sdk/launch/builder/abstract.py CHANGED Viewed

@@ -1,5 +1,9 @@
+"""Abstract plugin class defining the interface needed to build container images for W&B Launch."""
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
+from typing import Any, Dict
+from wandb.sdk.launch.environment.abstract import AbstractEnvironment
+from wandb.sdk.launch.registry.abstract import AbstractRegistry
 from .._project_spec import EntryPoint, LaunchProject
@@ -7,25 +11,75 @@ from .._project_spec import EntryPoint, LaunchProject
 class AbstractBuilder(ABC):
     """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
-    type: str
+    builder_type: str
+    environment: AbstractEnvironment
+    registry: AbstractRegistry
+    builder_config: Dict[str, Any]
+    @abstractmethod
+    def __init__(
+        self,
+        environment: AbstractEnvironment,
+        registry: AbstractRegistry,
+        verify: bool = True,
+    ) -> None:
+        """Initialize a builder.
+        Arguments:
+            builder_config: The builder config.
+            registry: The registry to use.
+            verify: Whether to verify the functionality of the builder.
+        Raises:
+            LaunchError: If the builder cannot be intialized or verified.
+        """
+        raise NotImplementedError
+    @classmethod
+    @abstractmethod
+    def from_config(
+        cls,
+        config: dict,
+        environment: AbstractEnvironment,
+        registry: AbstractRegistry,
+        verify: bool = True,
+    ) -> "AbstractBuilder":
+        """Create a builder from a config dictionary.
+        Arguments:
+            config: The config dictionary.
+            environment: The environment to use.
+            registry: The registry to use.
+            verify: Whether to verify the functionality of the builder.
+            login: Whether to login to the registry immediately.
-    def __init__(self, builder_config: Dict[str, Any]) -> None:
-        self.builder_config = builder_config
+        Returns:
+            The builder.
+        """
+        raise NotImplementedError
     @abstractmethod
     def build_image(
         self,
         launch_project: LaunchProject,
-        registry: Optional[str],
         entrypoint: EntryPoint,
     ) -> str:
         """Build the image for the given project.
-        Args:
+        Arguments:
             launch_project: The project to build.
             build_ctx_path: The path to the build context.
         Returns:
             The image name.
         """
-        pass
+        raise NotImplementedError
+    @abstractmethod
+    def verify(self) -> None:
+        """Verify that the builder can be used to build images.
+        Raises:
+            LaunchError: If the builder cannot be used to build images.
+        """
+        raise NotImplementedError

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl