PyPI - wandb - Versions diffs - 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl - Mend

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl

Files changed (156) hide show

wandb/__init__.py +1 -1
wandb/analytics/sentry.py +1 -0
wandb/apis/importers/base.py +20 -5
wandb/apis/importers/mlflow.py +7 -1
wandb/apis/internal.py +12 -0
wandb/apis/public.py +247 -1387
wandb/apis/reports/_panels.py +58 -35
wandb/beta/workflows.py +6 -7
wandb/cli/cli.py +130 -60
wandb/data_types.py +3 -1
wandb/filesync/dir_watcher.py +21 -27
wandb/filesync/step_checksum.py +8 -8
wandb/filesync/step_prepare.py +23 -10
wandb/filesync/step_upload.py +13 -13
wandb/filesync/upload_job.py +4 -8
wandb/integration/cohere/__init__.py +3 -0
wandb/integration/cohere/cohere.py +21 -0
wandb/integration/cohere/resolver.py +347 -0
wandb/integration/gym/__init__.py +4 -6
wandb/integration/huggingface/__init__.py +3 -0
wandb/integration/huggingface/huggingface.py +18 -0
wandb/integration/huggingface/resolver.py +213 -0
wandb/integration/langchain/wandb_tracer.py +16 -179
wandb/integration/openai/__init__.py +1 -3
wandb/integration/openai/openai.py +11 -143
wandb/integration/openai/resolver.py +111 -38
wandb/integration/sagemaker/config.py +2 -2
wandb/integration/tensorboard/log.py +4 -4
wandb/old/settings.py +24 -7
wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
wandb/proto/wandb_deprecated.py +3 -1
wandb/sdk/__init__.py +1 -1
wandb/sdk/artifacts/__init__.py +0 -0
wandb/sdk/artifacts/artifact.py +2101 -0
wandb/sdk/artifacts/artifact_download_logger.py +42 -0
wandb/sdk/artifacts/artifact_manifest.py +67 -0
wandb/sdk/artifacts/artifact_manifest_entry.py +159 -0
wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +91 -0
wandb/sdk/{internal → artifacts}/artifact_saver.py +6 -5
wandb/sdk/artifacts/artifact_state.py +10 -0
wandb/sdk/{interface/artifacts/artifact_cache.py → artifacts/artifacts_cache.py} +22 -12
wandb/sdk/artifacts/exceptions.py +55 -0
wandb/sdk/artifacts/storage_handler.py +59 -0
wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
wandb/sdk/artifacts/storage_handlers/azure_handler.py +192 -0
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +134 -0
wandb/sdk/artifacts/storage_handlers/multi_handler.py +53 -0
wandb/sdk/artifacts/storage_handlers/s3_handler.py +301 -0
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +67 -0
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +132 -0
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
wandb/sdk/artifacts/storage_layout.py +6 -0
wandb/sdk/artifacts/storage_policies/__init__.py +0 -0
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +61 -0
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +386 -0
wandb/sdk/{interface/artifacts/artifact_storage.py → artifacts/storage_policy.py} +5 -57
wandb/sdk/data_types/_dtypes.py +7 -12
wandb/sdk/data_types/base_types/json_metadata.py +3 -2
wandb/sdk/data_types/base_types/media.py +8 -8
wandb/sdk/data_types/base_types/wb_value.py +12 -13
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +5 -6
wandb/sdk/data_types/helper_types/classes.py +6 -8
wandb/sdk/data_types/helper_types/image_mask.py +5 -6
wandb/sdk/data_types/histogram.py +4 -3
wandb/sdk/data_types/html.py +3 -4
wandb/sdk/data_types/image.py +11 -9
wandb/sdk/data_types/molecule.py +5 -3
wandb/sdk/data_types/object_3d.py +7 -5
wandb/sdk/data_types/plotly.py +3 -2
wandb/sdk/data_types/saved_model.py +11 -11
wandb/sdk/data_types/trace_tree.py +5 -4
wandb/sdk/data_types/utils.py +3 -5
wandb/sdk/data_types/video.py +5 -4
wandb/sdk/integration_utils/auto_logging.py +215 -0
wandb/sdk/interface/interface.py +15 -15
wandb/sdk/internal/file_pusher.py +8 -16
wandb/sdk/internal/file_stream.py +5 -11
wandb/sdk/internal/handler.py +13 -1
wandb/sdk/internal/internal_api.py +287 -13
wandb/sdk/internal/job_builder.py +119 -30
wandb/sdk/internal/sender.py +6 -26
wandb/sdk/internal/settings_static.py +2 -0
wandb/sdk/internal/system/assets/__init__.py +2 -0
wandb/sdk/internal/system/assets/gpu.py +42 -0
wandb/sdk/internal/system/assets/gpu_amd.py +216 -0
wandb/sdk/internal/system/env_probe_helpers.py +13 -0
wandb/sdk/internal/system/system_info.py +3 -3
wandb/sdk/internal/tb_watcher.py +32 -22
wandb/sdk/internal/thread_local_settings.py +18 -0
wandb/sdk/launch/_project_spec.py +57 -11
wandb/sdk/launch/agent/agent.py +147 -65
wandb/sdk/launch/agent/job_status_tracker.py +34 -0
wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
wandb/sdk/launch/builder/abstract.py +5 -1
wandb/sdk/launch/builder/build.py +21 -18
wandb/sdk/launch/builder/docker_builder.py +10 -4
wandb/sdk/launch/builder/kaniko_builder.py +113 -23
wandb/sdk/launch/builder/noop.py +6 -3
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +46 -14
wandb/sdk/launch/environment/aws_environment.py +3 -2
wandb/sdk/launch/environment/azure_environment.py +124 -0
wandb/sdk/launch/environment/gcp_environment.py +2 -4
wandb/sdk/launch/environment/local_environment.py +1 -1
wandb/sdk/launch/errors.py +19 -0
wandb/sdk/launch/github_reference.py +32 -19
wandb/sdk/launch/launch.py +3 -8
wandb/sdk/launch/launch_add.py +6 -2
wandb/sdk/launch/loader.py +21 -2
wandb/sdk/launch/registry/azure_container_registry.py +132 -0
wandb/sdk/launch/registry/elastic_container_registry.py +39 -5
wandb/sdk/launch/registry/google_artifact_registry.py +68 -26
wandb/sdk/launch/registry/local_registry.py +2 -1
wandb/sdk/launch/runner/abstract.py +24 -3
wandb/sdk/launch/runner/kubernetes_runner.py +479 -26
wandb/sdk/launch/runner/local_container.py +103 -51
wandb/sdk/launch/runner/local_process.py +1 -1
wandb/sdk/launch/runner/sagemaker_runner.py +60 -10
wandb/sdk/launch/runner/vertex_runner.py +10 -5
wandb/sdk/launch/sweeps/__init__.py +7 -9
wandb/sdk/launch/sweeps/scheduler.py +307 -77
wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
wandb/sdk/launch/sweeps/utils.py +82 -35
wandb/sdk/launch/utils.py +89 -75
wandb/sdk/lib/_settings_toposort_generated.py +7 -0
wandb/sdk/lib/capped_dict.py +26 -0
wandb/sdk/lib/{git.py → gitlib.py} +76 -59
wandb/sdk/lib/hashutil.py +12 -4
wandb/sdk/lib/paths.py +96 -8
wandb/sdk/lib/sock_client.py +2 -2
wandb/sdk/lib/timer.py +1 -0
wandb/sdk/service/server.py +22 -9
wandb/sdk/service/server_sock.py +1 -1
wandb/sdk/service/service.py +27 -8
wandb/sdk/verify/verify.py +4 -7
wandb/sdk/wandb_config.py +2 -6
wandb/sdk/wandb_init.py +57 -53
wandb/sdk/wandb_require.py +7 -0
wandb/sdk/wandb_run.py +61 -223
wandb/sdk/wandb_settings.py +28 -4
wandb/testing/relay.py +15 -2
wandb/util.py +74 -36
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/METADATA +15 -9
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/RECORD +151 -116
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +1 -0
wandb/integration/langchain/util.py +0 -191
wandb/sdk/interface/artifacts/__init__.py +0 -33
wandb/sdk/interface/artifacts/artifact.py +0 -615
wandb/sdk/interface/artifacts/artifact_manifest.py +0 -131
wandb/sdk/wandb_artifacts.py +0 -2226
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/sweeps/scheduler.py CHANGED Viewed

@@ -9,7 +9,7 @@ import traceback
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import click
 import yaml
@@ -17,7 +17,10 @@ import yaml
 import wandb
 import wandb.apis.public as public
 from wandb.apis.internal import Api
+from wandb.apis.public import Api as PublicApi
+from wandb.apis.public import QueuedRun, Run
 from wandb.errors import CommError
+from wandb.sdk.launch.errors import LaunchError
 from wandb.sdk.launch.launch_add import launch_add
 from wandb.sdk.launch.sweeps import SchedulerError
 from wandb.sdk.launch.sweeps.utils import (
@@ -25,10 +28,13 @@ from wandb.sdk.launch.sweeps.utils import (
     make_launch_sweep_entrypoint,
 )
 from wandb.sdk.lib.runid import generate_id
+from wandb.sdk.wandb_run import Run as SdkRun
 _logger = logging.getLogger(__name__)
 LOG_PREFIX = f"{click.style('sched:', fg='cyan')} "
+DEFAULT_POLLING_SLEEP = 5.0
 class SchedulerState(Enum):
     PENDING = 0
@@ -42,9 +48,29 @@ class SchedulerState(Enum):
 class RunState(Enum):
-    ALIVE = 0
-    DEAD = 1
-    UNKNOWN = 2
+    RUNNING = "running", "alive"
+    PENDING = "pending", "alive"
+    PREEMPTING = "preempting", "alive"
+    CRASHED = "crashed", "dead"
+    FAILED = "failed", "dead"
+    KILLED = "killed", "dead"
+    FINISHED = "finished", "dead"
+    PREEMPTED = "preempted", "dead"
+    # unknown when api.get_run_state fails or returns unexpected state
+    # assumed alive, unless we get unknown 2x then move to failed (dead)
+    UNKNOWN = "unknown", "alive"
+    def __new__(cls: Any, *args: List, **kwds: Any) -> "RunState":
+        obj: "RunState" = object.__new__(cls)
+        obj._value_ = args[0]
+        return obj
+    def __init__(self, _: str, life: str = "unknown") -> None:
+        self._life = life
+    @property
+    def is_alive(self) -> bool:
+        return self._life == "alive"
 @dataclass
@@ -57,7 +83,7 @@ class _Worker:
 class SweepRun:
     id: str
     worker_id: int
-    state: RunState = RunState.ALIVE
+    state: RunState = RunState.RUNNING
     queued_run: Optional[public.QueuedRun] = None
     args: Optional[Dict[str, Any]] = None
     logs: Optional[List[str]] = None
@@ -66,20 +92,24 @@ class SweepRun:
 class Scheduler(ABC):
     """A controller/agent that populates a Launch RunQueue from a hyperparameter sweep."""
+    PLACEHOLDER_URI = "placeholder-uri-scheduler"
+    SWEEP_JOB_TYPE = "sweep-controller"
+    ENTRYPOINT = ["wandb", "scheduler", "WANDB_SWEEP_ID"]
     def __init__(
         self,
         api: Api,
         *args: Optional[Any],
-        num_workers: int = 8,
-        polling_sleep: float = 5.0,
+        polling_sleep: Optional[float] = None,
         sweep_id: Optional[str] = None,
         entity: Optional[str] = None,
         project: Optional[str] = None,
         project_queue: Optional[str] = None,
+        num_workers: Optional[Union[int, str]] = None,
         **kwargs: Optional[Any],
     ):
         self._api = api
-        self._public_api = public.Api()
+        self._public_api = PublicApi()
         self._entity = (
             entity
             or os.environ.get("WANDB_ENTITY")
@@ -100,27 +130,42 @@ class Scheduler(ABC):
             if resp.get("state") == SchedulerState.CANCELLED.name:
                 self._state = SchedulerState.CANCELLED
             self._sweep_config = yaml.safe_load(resp["config"])
+            self._num_runs_launched: int = self._get_num_runs_launched(resp["runs"])
+            if self._num_runs_launched > 0:
+                wandb.termlog(
+                    f"{LOG_PREFIX}Found {self._num_runs_launched} previous valid runs for sweep {self._sweep_id}"
+                )
         except Exception as e:
             raise SchedulerError(
                 f"{LOG_PREFIX}Exception when finding sweep ({sweep_id}) {e}"
             )
+        # Scheduler may receive additional kwargs which will be piped into the launch command
+        self._kwargs: Dict[str, Any] = kwargs
         # Dictionary of the runs being managed by the scheduler
         self._runs: Dict[str, SweepRun] = {}
         # Threading lock to ensure thread-safe access to the runs dictionary
         self._threading_lock: threading.Lock = threading.Lock()
-        self._polling_sleep = polling_sleep
+        self._polling_sleep = polling_sleep or DEFAULT_POLLING_SLEEP
         self._project_queue = project_queue
         # Optionally run multiple workers in (pseudo-)parallel. Workers do not
         # actually run training workloads, they simply send heartbeat messages
         # (emulating a real agent) and add new runs to the launch queue. The
         # launch agent is the one that actually runs the training workloads.
         self._workers: Dict[int, _Worker] = {}
-        self._num_workers = num_workers
-        self._num_runs_launched = 0
-        # Scheduler may receive additional kwargs which will be piped into the launch command
-        self._kwargs: Dict[str, Any] = kwargs
+        # Init wandb scheduler run
+        self._wandb_run = self._init_wandb_run()
+        # Grab params from scheduler wandb run config
+        num_workers = num_workers or self._wandb_run.config.get("scheduler", {}).get(
+            "num_workers"
+        )
+        self._num_workers = int(num_workers) if str(num_workers).isdigit() else 8
+        self._settings_config: Dict[str, Any] = self._wandb_run.config.get(
+            "settings", {}
+        )
     @abstractmethod
     def _get_next_sweep_run(self, worker_id: int) -> Optional[SweepRun]:
@@ -168,7 +213,6 @@ class Scheduler(ABC):
     @property
     def at_runcap(self) -> bool:
         """False if under user-specified cap on # of runs."""
-        # TODO(gst): Count previous runs for resumed sweeps
         run_cap = self._sweep_config.get("run_cap")
         if not run_cap:
             return False
@@ -200,6 +244,18 @@ class Scheduler(ABC):
             _id: w for _id, w in self._workers.items() if _id not in self.busy_workers
         }
+    def _init_wandb_run(self) -> SdkRun:
+        """Controls resume or init logic for a scheduler wandb run."""
+        _type = self._kwargs.get("sweep_type", "sweep")
+        run: SdkRun = wandb.init(
+            name=f"{_type}-scheduler-{self._sweep_id}",
+            job_type=self.SWEEP_JOB_TYPE,
+            # WANDB_RUN_ID = sweep_id for scheduler
+            resume="allow",
+            config=self._kwargs,  # when run as a job, this sets config
+        )
+        return run
     def stop_sweep(self) -> None:
         """Stop the sweep."""
         self._state = SchedulerState.STOPPED
@@ -228,6 +284,7 @@ class Scheduler(ABC):
             self.exit()
             return
+        # For resuming sweeps
         self._load_state()
         self._register_agents()
         self.run()
@@ -238,10 +295,12 @@ class Scheduler(ABC):
         self.state = SchedulerState.RUNNING
         try:
             while True:
-                wandb.termlog(f"{LOG_PREFIX}Polling for new runs to launch")
+                self._update_scheduler_run_state()
                 if not self.is_alive:
                     break
+                wandb.termlog(f"{LOG_PREFIX}Polling for new runs to launch")
                 self._update_run_states()
                 self._poll()
                 if self.state == SchedulerState.FLUSH_RUNS:
@@ -259,8 +318,17 @@ class Scheduler(ABC):
                         self.state = SchedulerState.FLUSH_RUNS
                         break
-                    run: Optional[SweepRun] = self._get_next_sweep_run(worker_id)
-                    if not run:
+                    try:
+                        run: Optional[SweepRun] = self._get_next_sweep_run(worker_id)
+                        if not run:
+                            break
+                    except SchedulerError as e:
+                        raise SchedulerError(e)
+                    except Exception as e:
+                        wandb.termerror(
+                            f"{LOG_PREFIX}Failed to get next sweep run: {e}"
+                        )
+                        self.state = SchedulerState.FAILED
                         break
                     if self._add_to_launch_queue(run):
@@ -278,18 +346,49 @@ class Scheduler(ABC):
             self.exit()
             raise e
         else:
-            wandb.termlog(f"{LOG_PREFIX}Scheduler completed")
+            wandb.termlog(f"{LOG_PREFIX}Scheduler completed successfully")
+            # don't overwrite special states (e.g. STOPPED, FAILED)
+            if self.state in [SchedulerState.RUNNING, SchedulerState.FLUSH_RUNS]:
+                self.state = SchedulerState.COMPLETED
             self.exit()
     def exit(self) -> None:
         self._exit()
-        self._save_state()
+        # _save_state isn't controlled, possibly fails
+        try:
+            self._save_state()
+        except Exception:
+            wandb.termerror(
+                f"{LOG_PREFIX}Failed to save state: {traceback.format_exc()}"
+            )
         if self.state not in [
             SchedulerState.COMPLETED,
             SchedulerState.STOPPED,
         ]:
             self.state = SchedulerState.FAILED
+            self._set_sweep_state("CRASHED")
+        else:
+            self._set_sweep_state("FINISHED")
         self._stop_runs()
+        self._wandb_run.finish()
+    def _get_num_runs_launched(self, runs: List[Dict[str, Any]]) -> int:
+        """Returns the number of valid runs in the sweep."""
+        count = 0
+        for run in runs:
+            # if bad run, shouldn't be counted against run cap
+            if run.get("state", "") in ["killed", "crashed"] and not run.get(
+                "summaryMetrics"
+            ):
+                _logger.debug(
+                    f"excluding run: {run['name']} with state: {run['state']} from run cap \n{run}"
+                )
+                continue
+            count += 1
+        return count
     def _try_load_executable(self) -> bool:
         """Check existance of valid executable for a run.
@@ -297,9 +396,8 @@ class Scheduler(ABC):
         logs and returns False when job is unreachable
         """
         if self._kwargs.get("job"):
-            _public_api = public.Api()
             try:
-                _job_artifact = _public_api.artifact(self._kwargs["job"], type="job")
+                _job_artifact = self._public_api.job(self._kwargs["job"])
                 wandb.termlog(
                     f"{LOG_PREFIX}Successfully loaded job ({_job_artifact.name}) in scheduler"
                 )
@@ -316,12 +414,17 @@ class Scheduler(ABC):
     def _register_agents(self) -> None:
         for worker_id in range(self._num_workers):
             _logger.debug(f"{LOG_PREFIX}Starting AgentHeartbeat worker ({worker_id})")
-            agent_config = self._api.register_agent(
-                f"{socket.gethostname()}-{worker_id}",  # host
-                sweep_id=self._sweep_id,
-                project_name=self._project,
-                entity=self._entity,
-            )
+            try:
+                agent_config = self._api.register_agent(
+                    f"{socket.gethostname()}-{worker_id}",  # host
+                    sweep_id=self._sweep_id,
+                    project_name=self._project,
+                    entity=self._entity,
+                )
+            except Exception as e:
+                _logger.debug(f"failed to register agent: {e}")
+                self.fail_sweep(f"failed to register agent: {e}")
             self._workers[worker_id] = _Worker(
                 agent_config=agent_config,
                 agent_id=agent_config["id"],
@@ -332,6 +435,17 @@ class Scheduler(ABC):
         with self._threading_lock:
             yield from self._runs.items()
+    def _cleanup_runs(self, runs_to_remove: List[str]) -> None:
+        """Helper for removing runs from memory.
+        Can be overloaded to prevent deletion of runs, which is useful
+        for debugging or when polling on completed runs.
+        """
+        with self._threading_lock:
+            for run_id in runs_to_remove:
+                wandb.termlog(f"{LOG_PREFIX}Cleaning up finished run ({run_id})")
+                del self._runs[run_id]
     def _stop_runs(self) -> None:
         to_delete = []
         for run_id, _ in self._yield_runs():
@@ -357,7 +471,7 @@ class Scheduler(ABC):
             )
             return False
-        if run.state == RunState.DEAD:
+        if not run.state.is_alive:
             # run already dead, just delete reference
             return True
@@ -366,82 +480,195 @@ class Scheduler(ABC):
             f"Run:v1:{run_id}:{self._project}:{self._entity}".encode()
         ).decode("utf-8")
-        success: bool = self._api.stop_run(run_id=encoded_run_id)
-        if success:
-            wandb.termlog(f"{LOG_PREFIX}Stopped run {run_id}.")
+        try:
+            success: bool = self._api.stop_run(run_id=encoded_run_id)
+            if success:
+                wandb.termlog(f"{LOG_PREFIX}Stopped run {run_id}.")
+                return True
+        except Exception as e:
+            _logger.debug(f"error stopping run ({run_id}): {e}")
+        return False
+    def _update_scheduler_run_state(self) -> None:
+        """Update the scheduler state from state of scheduler run and sweep state."""
+        state: RunState = self._get_run_state(self._wandb_run.id)
-        return success
+        if state == RunState.KILLED:
+            self.state = SchedulerState.STOPPED
+        elif state in [RunState.FAILED, RunState.CRASHED]:
+            self.state = SchedulerState.FAILED
+        elif state == RunState.FINISHED:
+            self.state = SchedulerState.COMPLETED
+        try:
+            sweep_state = self._api.get_sweep_state(
+                self._sweep_id, self._entity, self._project
+            )
+        except Exception as e:
+            _logger.debug(f"sweep state error: {sweep_state} e: {e}")
+            return
+        if sweep_state in ["FINISHED", "CANCELLED"]:
+            self.state = SchedulerState.COMPLETED
+        elif sweep_state in ["PAUSED", "STOPPED"]:
+            self.state = SchedulerState.FLUSH_RUNS
     def _update_run_states(self) -> None:
         """Iterate through runs.
         Get state from backend and deletes runs if not in running state. Threadsafe.
         """
-        # TODO(gst): move to better constants place
-        end_states = [
-            "crashed",
-            "failed",
-            "killed",
-            "finished",
-            "preempted",
-        ]
-        run_states = ["running", "pending", "preempting"]
-        _runs_to_remove: List[str] = []
+        runs_to_remove: List[str] = []
         for run_id, run in self._yield_runs():
+            run.state = self._get_run_state(run_id, run.state)
             try:
-                _state = self._api.get_run_state(self._entity, self._project, run_id)
-                _rqi_state = run.queued_run.state if run.queued_run else None
-                if not _state or _state in end_states or _rqi_state == "failed":
-                    _logger.debug(
-                        f"({run_id}) run-state:{_state}, rqi-state:{_rqi_state}"
-                    )
-                    run.state = RunState.DEAD
-                    _runs_to_remove.append(run_id)
-                elif _state in run_states:
-                    run.state = RunState.ALIVE
-            except CommError as e:
-                _logger.debug(
-                    f"Issue when getting state for run ({run_id}) with error: {e}"
+                rqi_state = run.queued_run.state if run.queued_run else None
+            except (CommError, LaunchError) as e:
+                _logger.debug(f"Failed to get queued_run.state: {e}")
+                rqi_state = None
+            if not run.state.is_alive or rqi_state == "failed":
+                _logger.debug(f"({run_id}) states: ({run.state}, {rqi_state})")
+                runs_to_remove.append(run_id)
+        self._cleanup_runs(runs_to_remove)
+    def _get_metrics_from_run(self, run_id: str) -> List[Any]:
+        """Use the public api to get metrics from a run.
+        Uses the metric name found in the sweep config, any
+        misspellings will result in an empty list.
+        """
+        try:
+            queued_run: Optional[QueuedRun] = self._runs[run_id].queued_run
+            if not queued_run:
+                return []
+            api_run: Run = self._public_api.run(
+                f"{queued_run.entity}/{queued_run.project}/{run_id}"
+            )
+            metric_name = self._sweep_config["metric"]["name"]
+            history = api_run.scan_history(keys=["_step", metric_name])
+            metrics = [x[metric_name] for x in history]
+            return metrics
+        except Exception as e:
+            _logger.debug(f"[_get_metrics_from_run] {e}")
+        return []
+    def _get_run_info(self, run_id: str) -> Dict[str, Any]:
+        """Use the public api to get info about a run."""
+        try:
+            info: Dict[str, Any] = self._api.get_run_info(
+                self._entity, self._project, run_id
+            )
+            if info:
+                return info
+        except Exception as e:
+            _logger.debug(f"[_get_run_info] {e}")
+        return {}
+    def _get_run_state(
+        self, run_id: str, prev_run_state: RunState = RunState.UNKNOWN
+    ) -> RunState:
+        """Use the public api to get state of a run."""
+        run_state = None
+        try:
+            state = self._api.get_run_state(self._entity, self._project, run_id)
+            run_state = RunState(state)
+        except CommError as e:
+            _logger.debug(f"error getting state for run ({run_id}): {e}")
+            if prev_run_state == RunState.UNKNOWN:
+                # triggers when we get an unknown state for the second time
+                wandb.termwarn(
+                    f"Failed to get runstate for run ({run_id}). Error: {traceback.format_exc()}"
                 )
-                run.state = RunState.UNKNOWN
-                continue
-        # Remove any runs that are dead
-        with self._threading_lock:
-            for run_id in _runs_to_remove:
-                wandb.termlog(f"{LOG_PREFIX}Cleaning up finished run ({run_id})")
-                del self._runs[run_id]
+                run_state = RunState.FAILED
+            else:  # first time we get unknwon state
+                run_state = RunState.UNKNOWN
+        except (AttributeError, ValueError):
+            wandb.termwarn(
+                f"Bad state ({run_state}) for run ({run_id}). Error: {traceback.format_exc()}"
+            )
+            run_state = RunState.UNKNOWN
+        return run_state
-    def _add_to_launch_queue(self, run: SweepRun) -> bool:
-        """Convert a sweeprun into a launch job then push to runqueue."""
-        # job and image first from CLI args, then from sweep config
-        _job = self._kwargs.get("job") or self._sweep_config.get("job")
-        _sweep_config_uri = self._sweep_config.get("image_uri")
-        _image_uri = self._kwargs.get("image_uri") or _sweep_config_uri
-        if _job is None and _image_uri is None:
-            raise SchedulerError(f"{LOG_PREFIX}No 'job' nor 'image_uri' ({run.id})")
-        elif _job is not None and _image_uri is not None:
-            raise SchedulerError(f"{LOG_PREFIX}Sweep has both 'job' and 'image_uri'")
+    def _create_run(self) -> Dict[str, Any]:
+        """Use the public api to create a blank run."""
+        try:
+            run: List[Dict[str, Any]] = self._api.upsert_run(
+                project=self._project,
+                entity=self._entity,
+                sweep_name=self._sweep_id,
+            )
+            if run:
+                return run[0]
+        except Exception as e:
+            _logger.debug(f"[_create_run] {e}")
+            raise SchedulerError(
+                "Error creating run from scheduler, check API connection and CLI version."
+            )
+        return {}
+    def _set_sweep_state(self, state: str) -> None:
+        wandb.termlog(f"{LOG_PREFIX}Updating sweep state to: {state.lower()}")
+        try:
+            self._api.set_sweep_state(sweep=self._sweep_id, state=state)
+        except Exception as e:
+            _logger.debug(f"[set_sweep_state] {e}")
+    def _encode(self, _id: str) -> str:
+        return (
+            base64.b64decode(bytes(_id.encode("utf-8"))).decode("utf-8").split(":")[2]
+        )
+    def _make_entry_and_launch_config(
+        self, run: SweepRun
+    ) -> Tuple[Optional[List[str]], Dict[str, Dict[str, Any]]]:
         args = create_sweep_command_args({"args": run.args})
         entry_point, macro_args = make_launch_sweep_entrypoint(
             args, self._sweep_config.get("command")
         )
+        # handle program macro
+        if entry_point and "${program}" in entry_point:
+            if not self._sweep_config.get("program"):
+                raise SchedulerError(
+                    f"{LOG_PREFIX}Program macro in command has no corresponding 'program' in sweep config."
+                )
+            pidx = entry_point.index("${program}")
+            entry_point[pidx] = self._sweep_config["program"]
         launch_config = {"overrides": {"run_config": args["args_dict"]}}
         if macro_args:  # pipe in hyperparam args as params to launch
             launch_config["overrides"]["args"] = macro_args
         if entry_point:
-            wandb.termwarn(
-                f"{LOG_PREFIX}Sweep command {entry_point} will override"
-                f' {"job" if _job else "image_uri"} entrypoint'
-            )
             unresolved = [x for x in entry_point if str(x).startswith("${")]
             if unresolved:
                 wandb.termwarn(
                     f"{LOG_PREFIX}Sweep command contains unresolved macros: "
                     f"{unresolved}, see launch docs for supported macros."
                 )
+        return entry_point, launch_config
+    def _add_to_launch_queue(self, run: SweepRun) -> bool:
+        """Convert a sweeprun into a launch job then push to runqueue."""
+        # job and image first from CLI args, then from sweep config
+        _job = self._kwargs.get("job") or self._sweep_config.get("job")
+        _sweep_config_uri = self._sweep_config.get("image_uri")
+        _image_uri = self._kwargs.get("image_uri") or _sweep_config_uri
+        if _job is None and _image_uri is None:
+            raise SchedulerError(f"{LOG_PREFIX}No 'job' nor 'image_uri' ({run.id})")
+        elif _job is not None and _image_uri is not None:
+            raise SchedulerError(f"{LOG_PREFIX}Sweep has both 'job' and 'image_uri'")
+        entry_point, launch_config = self._make_entry_and_launch_config(run)
+        if entry_point:
+            wandb.termwarn(
+                f"{LOG_PREFIX}Sweep command {entry_point} will override"
+                f' {"job" if _job else "image_uri"} entrypoint'
+            )
         run_id = run.id or generate_id()
         queued_run = launch_add(
@@ -457,8 +684,11 @@ class Scheduler(ABC):
             resource=self._kwargs.get("resource", None),
             resource_args=self._kwargs.get("resource_args", None),
             author=self._kwargs.get("author"),
+            sweep_id=self._sweep_id,
         )
         run.queued_run = queued_run
+        # TODO(gst): unify run and queued_run state
+        run.state = RunState.RUNNING  # assume it will get picked up
         self._runs[run_id] = run
         wandb.termlog(

wandb/sdk/launch/sweeps/scheduler_sweep.py CHANGED Viewed

@@ -50,6 +50,7 @@ class SweepScheduler(Scheduler):
             return SweepRun(
                 id=_run_id,
+                state=RunState.PENDING,
                 args=command.get("args", {}),
                 logs=command.get("logs", []),
                 worker_id=worker_id,
@@ -62,7 +63,7 @@ class SweepScheduler(Scheduler):
         _run_states: Dict[str, bool] = {}
         for run_id, run in self._yield_runs():
             # Filter out runs that are from a different worker thread
-            if run.worker_id == worker_id and run.state == RunState.ALIVE:
+            if run.worker_id == worker_id and run.state.is_alive:
                 _run_states[run_id] = True
         _logger.debug(f"Sending states: \n{pf(_run_states)}\n")

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl