PyPI - wandb - Versions diffs - 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl - Mend

wandb 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl

Files changed (102) hide show

wandb/__init__.py +1 -1
wandb/analytics/sentry.py +1 -0
wandb/apis/internal.py +3 -0
wandb/apis/public.py +18 -20
wandb/beta/workflows.py +5 -6
wandb/cli/cli.py +27 -27
wandb/data_types.py +2 -0
wandb/integration/langchain/wandb_tracer.py +16 -179
wandb/integration/sagemaker/config.py +2 -2
wandb/integration/tensorboard/log.py +4 -4
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/proto/wandb_deprecated.py +3 -1
wandb/sdk/__init__.py +1 -4
wandb/sdk/artifacts/__init__.py +0 -14
wandb/sdk/artifacts/artifact.py +1757 -277
wandb/sdk/artifacts/artifact_manifest_entry.py +26 -6
wandb/sdk/artifacts/artifact_state.py +10 -0
wandb/sdk/artifacts/artifacts_cache.py +7 -8
wandb/sdk/artifacts/exceptions.py +4 -4
wandb/sdk/artifacts/storage_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/azure_handler.py +16 -6
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/http_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/multi_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/s3_handler.py +35 -32
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +2 -2
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +5 -9
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +2 -2
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +2 -2
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +24 -16
wandb/sdk/artifacts/storage_policy.py +3 -3
wandb/sdk/data_types/_dtypes.py +7 -12
wandb/sdk/data_types/base_types/json_metadata.py +2 -2
wandb/sdk/data_types/base_types/media.py +5 -6
wandb/sdk/data_types/base_types/wb_value.py +12 -13
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +4 -5
wandb/sdk/data_types/helper_types/classes.py +5 -8
wandb/sdk/data_types/helper_types/image_mask.py +4 -5
wandb/sdk/data_types/histogram.py +3 -3
wandb/sdk/data_types/html.py +3 -4
wandb/sdk/data_types/image.py +4 -5
wandb/sdk/data_types/molecule.py +2 -2
wandb/sdk/data_types/object_3d.py +3 -3
wandb/sdk/data_types/plotly.py +2 -2
wandb/sdk/data_types/saved_model.py +7 -8
wandb/sdk/data_types/trace_tree.py +4 -4
wandb/sdk/data_types/video.py +4 -4
wandb/sdk/interface/interface.py +8 -10
wandb/sdk/internal/file_stream.py +2 -3
wandb/sdk/internal/internal_api.py +99 -4
wandb/sdk/internal/job_builder.py +15 -7
wandb/sdk/internal/sender.py +4 -0
wandb/sdk/internal/settings_static.py +1 -0
wandb/sdk/launch/_project_spec.py +9 -7
wandb/sdk/launch/agent/agent.py +115 -58
wandb/sdk/launch/agent/job_status_tracker.py +34 -0
wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
wandb/sdk/launch/builder/abstract.py +5 -1
wandb/sdk/launch/builder/build.py +16 -10
wandb/sdk/launch/builder/docker_builder.py +9 -2
wandb/sdk/launch/builder/kaniko_builder.py +108 -22
wandb/sdk/launch/builder/noop.py +3 -1
wandb/sdk/launch/environment/aws_environment.py +2 -1
wandb/sdk/launch/environment/azure_environment.py +124 -0
wandb/sdk/launch/github_reference.py +30 -18
wandb/sdk/launch/launch.py +1 -1
wandb/sdk/launch/loader.py +15 -0
wandb/sdk/launch/registry/azure_container_registry.py +132 -0
wandb/sdk/launch/registry/elastic_container_registry.py +38 -4
wandb/sdk/launch/registry/google_artifact_registry.py +46 -7
wandb/sdk/launch/runner/abstract.py +19 -3
wandb/sdk/launch/runner/kubernetes_runner.py +111 -47
wandb/sdk/launch/runner/local_container.py +101 -48
wandb/sdk/launch/runner/sagemaker_runner.py +59 -9
wandb/sdk/launch/runner/vertex_runner.py +8 -4
wandb/sdk/launch/sweeps/scheduler.py +102 -27
wandb/sdk/launch/sweeps/utils.py +21 -0
wandb/sdk/launch/utils.py +19 -7
wandb/sdk/lib/_settings_toposort_generated.py +3 -0
wandb/sdk/service/server.py +22 -9
wandb/sdk/service/service.py +27 -8
wandb/sdk/verify/verify.py +6 -9
wandb/sdk/wandb_config.py +2 -4
wandb/sdk/wandb_init.py +2 -0
wandb/sdk/wandb_require.py +7 -0
wandb/sdk/wandb_run.py +32 -35
wandb/sdk/wandb_settings.py +10 -3
wandb/testing/relay.py +15 -2
wandb/util.py +55 -23
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/METADATA +11 -8
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/RECORD +97 -97
wandb/integration/langchain/util.py +0 -191
wandb/sdk/artifacts/invalid_artifact.py +0 -23
wandb/sdk/artifacts/lazy_artifact.py +0 -162
wandb/sdk/artifacts/local_artifact.py +0 -719
wandb/sdk/artifacts/public_artifact.py +0 -1188
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +0 -0
{wandb-0.15.4.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0

wandb/sdk/launch/runner/local_container.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import logging
 import os
 import shlex
-import signal
 import subprocess
 import sys
+import threading
+import time
 from typing import Any, Dict, List, Optional
 import wandb
+from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
 from wandb.sdk.launch.builder.abstract import AbstractBuilder
 from wandb.sdk.launch.environment.abstract import AbstractEnvironment
@@ -28,36 +30,57 @@ _logger = logging.getLogger(__name__)
 class LocalSubmittedRun(AbstractRun):
     """Instance of ``AbstractRun`` corresponding to a subprocess launched to run an entry point command locally."""
-    def __init__(self, command_proc: "subprocess.Popen[bytes]") -> None:
+    def __init__(self) -> None:
         super().__init__()
-        self.command_proc = command_proc
+        self._command_proc: Optional[subprocess.Popen] = None
+        self._stdout: Optional[str] = None
+        self._terminate_flag: bool = False
+        self._thread: Optional[threading.Thread] = None
+    def set_command_proc(self, command_proc: subprocess.Popen) -> None:
+        self._command_proc = command_proc
+    def set_thread(self, thread: threading.Thread) -> None:
+        self._thread = thread
     @property
-    def id(self) -> str:
-        return str(self.command_proc.pid)
+    def id(self) -> Optional[str]:
+        if self._command_proc is None:
+            return None
+        return str(self._command_proc.pid)
     def wait(self) -> bool:
-        return self.command_proc.wait() == 0
+        assert self._thread is not None
+        # if command proc is not set
+        # wait for thread to set it
+        if self._command_proc is None:
+            while self._thread.is_alive():
+                time.sleep(5)
+                # command proc can be updated by another thread
+                if self._command_proc is not None:
+                    return self._command_proc.wait() == 0  # type: ignore
+            return False
+        return self._command_proc.wait() == 0
+    def get_logs(self) -> Optional[str]:
+        return self._stdout
     def cancel(self) -> None:
-        # Interrupt child process if it hasn't already exited
-        if self.command_proc.poll() is None:
-            # Kill the the process tree rooted at the child if it's the leader of its own process
-            # group, otherwise just kill the child
-            try:
-                if self.command_proc.pid == os.getpgid(self.command_proc.pid):
-                    os.killpg(self.command_proc.pid, signal.SIGTERM)
-                else:
-                    self.command_proc.terminate()
-            except OSError:
-                # The child process may have exited before we attempted to terminate it, so we
-                # ignore OSErrors raised during child process termination
-                _msg = f"{LOG_PREFIX}Failed to terminate child process PID {self.command_proc.pid}"
-                _logger.debug(_msg)
-            self.command_proc.wait()
+        # thread is set immediately after starting, should always exist
+        assert self._thread is not None
+        # cancel called before the thread subprocess has started
+        # indicates to thread to not start command proc if not already started
+        self._terminate_flag = True
     def get_status(self) -> Status:
-        exit_code = self.command_proc.poll()
+        assert self._thread is not None, "Failed to get status, self._thread = None"
+        if self._command_proc is None:
+            if self._thread.is_alive():
+                return Status("running")
+            return Status("stopped")
+        exit_code = self._command_proc.poll()
         if exit_code is None:
             return Status("running")
         if exit_code == 0:
@@ -77,12 +100,7 @@ class LocalContainerRunner(AbstractRunner):
         super().__init__(api, backend_config)
         self.environment = environment
-    def run(
-        self,
-        launch_project: LaunchProject,
-        builder: Optional[AbstractBuilder],
-    ) -> Optional[AbstractRun]:
-        synchronous: bool = self.backend_config[PROJECT_SYNCHRONOUS]
+    def _populate_docker_args(self, launch_project: LaunchProject) -> Dict[str, Any]:
         docker_args: Dict[str, Any] = launch_project.resource_args.get(
             "local-container", {}
         )
@@ -95,6 +113,16 @@ class LocalContainerRunner(AbstractRunner):
             if sys.platform == "linux" or sys.platform == "linux2":
                 docker_args["add-host"] = "host.docker.internal:host-gateway"
+        return docker_args
+    def run(
+        self,
+        launch_project: LaunchProject,
+        builder: Optional[AbstractBuilder],
+        job_tracker: Optional[JobAndRunStatusTracker] = None,
+    ) -> Optional[AbstractRun]:
+        docker_args = self._populate_docker_args(launch_project)
+        synchronous: bool = self.backend_config[PROJECT_SYNCHRONOUS]
         entry_point = launch_project.get_single_entry_point()
         env_vars = get_env_vars_dict(launch_project, self._api)
@@ -106,7 +134,7 @@ class LocalContainerRunner(AbstractRunner):
             _, _, port = self._api.settings("base_url").split(":")
             env_vars["WANDB_BASE_URL"] = f"http://host.docker.internal:{port}"
         elif _is_wandb_dev_uri(self._api.settings("base_url")):
-            env_vars["WANDB_BASE_URL"] = "http://host.docker.internal:9002"
+            env_vars["WANDB_BASE_URL"] = "http://host.docker.internal:9001"
         if launch_project.docker_image:
             # user has provided their own docker image
@@ -128,11 +156,7 @@ class LocalContainerRunner(AbstractRunner):
             assert entry_point is not None
             _logger.info("Building docker image...")
             assert builder is not None
-            image_uri = builder.build_image(
-                launch_project,
-                entry_point,
-            )
+            image_uri = builder.build_image(launch_project, entry_point, job_tracker)
             _logger.info(f"Docker image built with uri {image_uri}")
             # entry_cmd and additional_args are empty here because
             # if launch built the container they've been accounted
@@ -167,20 +191,49 @@ def _run_entry_point(command: str, work_dir: Optional[str]) -> AbstractRun:
     if work_dir is None:
         work_dir = os.getcwd()
     env = os.environ.copy()
-    if os.name == "nt":
-        # we are running on windows
-        process = subprocess.Popen(
-            ["cmd", "/c", command], close_fds=True, cwd=work_dir, env=env
-        )
-    else:
-        process = subprocess.Popen(
-            ["bash", "-c", command],
-            close_fds=True,
-            cwd=work_dir,
-            env=env,
-        )
-    return LocalSubmittedRun(process)
+    run = LocalSubmittedRun()
+    thread = threading.Thread(
+        target=_thread_process_runner,
+        args=(run, ["bash", "-c", command], work_dir, env),
+    )
+    run.set_thread(thread)
+    thread.start()
+    return run
+def _thread_process_runner(
+    run: LocalSubmittedRun, args: List[str], work_dir: str, env: Dict[str, str]
+) -> None:
+    # cancel was called before we started the subprocess
+    if run._terminate_flag:
+        return
+    process = subprocess.Popen(
+        args,
+        close_fds=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        universal_newlines=True,
+        bufsize=1,
+        cwd=work_dir,
+        env=env,
+    )
+    run.set_command_proc(process)
+    run._stdout = ""
+    while True:
+        # the agent thread could set the terminate flag
+        if run._terminate_flag:
+            process.terminate()  # type: ignore
+        chunk = os.read(process.stdout.fileno(), 4096)  # type: ignore
+        if not chunk:
+            break
+        index = chunk.find(b"\r")
+        decoded_chunk = chunk.decode()
+        if index != -1:
+            run._stdout += decoded_chunk
+            print(chunk.decode(), end="")
+        else:
+            run._stdout += decoded_chunk + "\r"
+            print(chunk.decode(), end="\r")
 def get_docker_command(

wandb/sdk/launch/runner/sagemaker_runner.py CHANGED Viewed

@@ -8,6 +8,7 @@ if False:
 import wandb
 from wandb.apis.internal import Api
+from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
 from wandb.sdk.launch.builder.abstract import AbstractBuilder
 from wandb.sdk.launch.environment.aws_environment import AwsEnvironment
 from wandb.sdk.launch.errors import LaunchError
@@ -23,9 +24,15 @@ _logger = logging.getLogger(__name__)
 class SagemakerSubmittedRun(AbstractRun):
     """Instance of ``AbstractRun`` corresponding to a subprocess launched to run an entry point command on aws sagemaker."""
-    def __init__(self, training_job_name: str, client: "boto3.Client") -> None:
+    def __init__(
+        self,
+        training_job_name: str,
+        client: "boto3.Client",
+        log_client: Optional["boto3.Client"] = None,
+    ) -> None:
         super().__init__()
         self.client = client
+        self.log_client = log_client
         self.training_job_name = training_job_name
         self._status = Status("running")
@@ -33,6 +40,38 @@ class SagemakerSubmittedRun(AbstractRun):
     def id(self) -> str:
         return f"sagemaker-{self.training_job_name}"
+    def get_logs(self) -> Optional[str]:
+        if self.log_client is None:
+            return None
+        try:
+            describe_res = self.log_client.describe_log_streams(
+                logGroupName="/aws/sagemaker/TrainingJobs",
+                logStreamNamePrefix=self.training_job_name,
+            )
+            if len(describe_res["logStreams"]) == 0:
+                wandb.termwarn(
+                    f"Failed to get logs for training job: {self.training_job_name}"
+                )
+                return None
+            log_name = describe_res["logStreams"][0]["logStreamName"]
+            res = self.log_client.get_log_events(
+                logGroupName="/aws/sagemaker/TrainingJobs",
+                logStreamName=log_name,
+            )
+            return "\n".join(
+                [f'{event["timestamp"]}:{event["message"]}' for event in res["events"]]
+            )
+        except self.log_client.exceptions.ResourceNotFoundException:
+            wandb.termwarn(
+                f"Failed to get logs for training job: {self.training_job_name}"
+            )
+            return None
+        except Exception as e:
+            wandb.termwarn(
+                f"Failed to handle logs for training job: {self.training_job_name} with error {str(e)}"
+            )
+            return None
     def wait(self) -> bool:
         while True:
             status_state = self.get_status().state
@@ -89,6 +128,7 @@ class SageMakerRunner(AbstractRunner):
         self,
         launch_project: LaunchProject,
         builder: Optional[AbstractBuilder],
+        job_tracker: Optional[JobAndRunStatusTracker] = None,
     ) -> Optional[AbstractRun]:
         """Run a project on Amazon Sagemaker.
@@ -128,6 +168,13 @@ class SageMakerRunner(AbstractRunner):
         # Create a sagemaker client to launch the job.
         sagemaker_client = session.client("sagemaker")
+        log_client = None
+        try:
+            log_client = session.client("logs")
+        except Exception as e:
+            wandb.termwarn(
+                f"Failed to connect to cloudwatch logs with error {str(e)}, logs will not be available"
+            )
         # if the user provided the image they want to use, use that, but warn it won't have swappable artifacts
         if (
@@ -146,7 +193,9 @@ class SageMakerRunner(AbstractRunner):
             _logger.info(
                 f"Launching sagemaker job on user supplied image with args: {sagemaker_args}"
             )
-            run = launch_sagemaker_job(launch_project, sagemaker_args, sagemaker_client)
+            run = launch_sagemaker_job(
+                launch_project, sagemaker_args, sagemaker_client, log_client
+            )
             if self.backend_config[PROJECT_SYNCHRONOUS]:
                 run.wait()
             return run
@@ -158,11 +207,9 @@ class SageMakerRunner(AbstractRunner):
             assert builder is not None
             # build our own image
             _logger.info("Building docker image...")
-            image = builder.build_image(
-                launch_project,
-                entry_point,
-            )
+            image = builder.build_image(launch_project, entry_point, job_tracker)
             _logger.info(f"Docker image built with uri {image}")
         launch_project.fill_macros(image)
         _logger.info("Connecting to sagemaker client")
         command_args = get_entry_point_command(
@@ -181,7 +228,9 @@ class SageMakerRunner(AbstractRunner):
             launch_project, self._api, role_arn, image, default_output_path
         )
         _logger.info(f"Launching sagemaker job with args: {sagemaker_args}")
-        run = launch_sagemaker_job(launch_project, sagemaker_args, sagemaker_client)
+        run = launch_sagemaker_job(
+            launch_project, sagemaker_args, sagemaker_client, log_client
+        )
         if self.backend_config[PROJECT_SYNCHRONOUS]:
             run.wait()
         return run
@@ -296,14 +345,15 @@ def launch_sagemaker_job(
     launch_project: LaunchProject,
     sagemaker_args: Dict[str, Any],
     sagemaker_client: "boto3.Client",
+    log_client: Optional["boto3.Client"] = None,
 ) -> SagemakerSubmittedRun:
     training_job_name = sagemaker_args.get("TrainingJobName") or launch_project.run_id
     resp = sagemaker_client.create_training_job(**sagemaker_args)
     if resp.get("TrainingJobArn") is None:
-        raise LaunchError("Unable to create training job")
+        raise LaunchError("Failed to create training job when submitting to SageMaker")
-    run = SagemakerSubmittedRun(training_job_name, sagemaker_client)
+    run = SagemakerSubmittedRun(training_job_name, sagemaker_client, log_client)
     wandb.termlog(
         f"{LOG_PREFIX}Run job submitted with arn: {resp.get('TrainingJobArn')}"
     )

wandb/sdk/launch/runner/vertex_runner.py CHANGED Viewed

@@ -14,6 +14,7 @@ from wandb.apis.internal import Api
 from wandb.util import get_module
 from .._project_spec import LaunchProject, get_entry_point_command
+from ..agent.job_status_tracker import JobAndRunStatusTracker
 from ..builder.abstract import AbstractBuilder
 from ..builder.build import get_env_vars_dict
 from ..environment.gcp_environment import GcpEnvironment
@@ -35,6 +36,10 @@ class VertexSubmittedRun(AbstractRun):
         # numeric ID of the custom training job
         return self._job.name  # type: ignore
+    def get_logs(self) -> Optional[str]:
+        # TODO: implement
+        return None
     @property
     def name(self) -> str:
         return self._job.display_name  # type: ignore
@@ -89,6 +94,7 @@ class VertexRunner(AbstractRunner):
         self,
         launch_project: LaunchProject,
         builder: Optional[AbstractBuilder],
+        job_tracker: Optional[JobAndRunStatusTracker] = None,
     ) -> Optional[AbstractRun]:
         """Run a Vertex job."""
         aiplatform = get_module(  # noqa: F811
@@ -134,10 +140,8 @@ class VertexRunner(AbstractRunner):
         else:
             assert entry_point is not None
             assert builder is not None
-            image_uri = builder.build_image(
-                launch_project,
-                entry_point,
-            )
+            image_uri = builder.build_image(launch_project, entry_point, job_tracker)
         launch_project.fill_macros(image_uri)
         # TODO: how to handle this?
         entry_cmd = get_entry_point_command(entry_point, launch_project.override_args)

wandb/sdk/launch/sweeps/scheduler.py CHANGED Viewed

@@ -130,10 +130,10 @@ class Scheduler(ABC):
             if resp.get("state") == SchedulerState.CANCELLED.name:
                 self._state = SchedulerState.CANCELLED
             self._sweep_config = yaml.safe_load(resp["config"])
-            self._num_runs_launched: int = len(resp["runs"])
+            self._num_runs_launched: int = self._get_num_runs_launched(resp["runs"])
             if self._num_runs_launched > 0:
                 wandb.termlog(
-                    f"{LOG_PREFIX}Found {self._num_runs_launched} previous runs for sweep {self._sweep_id}"
+                    f"{LOG_PREFIX}Found {self._num_runs_launched} previous valid runs for sweep {self._sweep_id}"
                 )
         except Exception as e:
             raise SchedulerError(
@@ -295,10 +295,12 @@ class Scheduler(ABC):
         self.state = SchedulerState.RUNNING
         try:
             while True:
-                wandb.termlog(f"{LOG_PREFIX}Polling for new runs to launch")
+                self._update_scheduler_run_state()
                 if not self.is_alive:
                     break
+                wandb.termlog(f"{LOG_PREFIX}Polling for new runs to launch")
                 self._update_run_states()
                 self._poll()
                 if self.state == SchedulerState.FLUSH_RUNS:
@@ -316,8 +318,17 @@ class Scheduler(ABC):
                         self.state = SchedulerState.FLUSH_RUNS
                         break
-                    run: Optional[SweepRun] = self._get_next_sweep_run(worker_id)
-                    if not run:
+                    try:
+                        run: Optional[SweepRun] = self._get_next_sweep_run(worker_id)
+                        if not run:
+                            break
+                    except SchedulerError as e:
+                        raise SchedulerError(e)
+                    except Exception as e:
+                        wandb.termerror(
+                            f"{LOG_PREFIX}Failed to get next sweep run: {e}"
+                        )
+                        self.state = SchedulerState.FAILED
                         break
                     if self._add_to_launch_queue(run):
@@ -356,10 +367,29 @@ class Scheduler(ABC):
             SchedulerState.STOPPED,
         ]:
             self.state = SchedulerState.FAILED
+            self._set_sweep_state("CRASHED")
+        else:
+            self._set_sweep_state("FINISHED")
         self._stop_runs()
-        self._set_sweep_state("FINISHED")
         self._wandb_run.finish()
+    def _get_num_runs_launched(self, runs: List[Dict[str, Any]]) -> int:
+        """Returns the number of valid runs in the sweep."""
+        count = 0
+        for run in runs:
+            # if bad run, shouldn't be counted against run cap
+            if run.get("state", "") in ["killed", "crashed"] and not run.get(
+                "summaryMetrics"
+            ):
+                _logger.debug(
+                    f"excluding run: {run['name']} with state: {run['state']} from run cap \n{run}"
+                )
+                continue
+            count += 1
+        return count
     def _try_load_executable(self) -> bool:
         """Check existance of valid executable for a run.
@@ -384,12 +414,17 @@ class Scheduler(ABC):
     def _register_agents(self) -> None:
         for worker_id in range(self._num_workers):
             _logger.debug(f"{LOG_PREFIX}Starting AgentHeartbeat worker ({worker_id})")
-            agent_config = self._api.register_agent(
-                f"{socket.gethostname()}-{worker_id}",  # host
-                sweep_id=self._sweep_id,
-                project_name=self._project,
-                entity=self._entity,
-            )
+            try:
+                agent_config = self._api.register_agent(
+                    f"{socket.gethostname()}-{worker_id}",  # host
+                    sweep_id=self._sweep_id,
+                    project_name=self._project,
+                    entity=self._entity,
+                )
+            except Exception as e:
+                _logger.debug(f"failed to register agent: {e}")
+                self.fail_sweep(f"failed to register agent: {e}")
             self._workers[worker_id] = _Worker(
                 agent_config=agent_config,
                 agent_id=agent_config["id"],
@@ -455,6 +490,30 @@ class Scheduler(ABC):
         return False
+    def _update_scheduler_run_state(self) -> None:
+        """Update the scheduler state from state of scheduler run and sweep state."""
+        state: RunState = self._get_run_state(self._wandb_run.id)
+        if state == RunState.KILLED:
+            self.state = SchedulerState.STOPPED
+        elif state in [RunState.FAILED, RunState.CRASHED]:
+            self.state = SchedulerState.FAILED
+        elif state == RunState.FINISHED:
+            self.state = SchedulerState.COMPLETED
+        try:
+            sweep_state = self._api.get_sweep_state(
+                self._sweep_id, self._entity, self._project
+            )
+        except Exception as e:
+            _logger.debug(f"sweep state error: {sweep_state} e: {e}")
+            return
+        if sweep_state in ["FINISHED", "CANCELLED"]:
+            self.state = SchedulerState.COMPLETED
+        elif sweep_state in ["PAUSED", "STOPPED"]:
+            self.state = SchedulerState.FLUSH_RUNS
     def _update_run_states(self) -> None:
         """Iterate through runs.
@@ -530,7 +589,7 @@ class Scheduler(ABC):
                 run_state = RunState.UNKNOWN
         except (AttributeError, ValueError):
             wandb.termwarn(
-                f"Bad state ({state}) for run ({run_id}). Error: {traceback.format_exc()}"
+                f"Bad state ({run_state}) for run ({run_id}). Error: {traceback.format_exc()}"
             )
             run_state = RunState.UNKNOWN
         return run_state
@@ -564,6 +623,35 @@ class Scheduler(ABC):
             base64.b64decode(bytes(_id.encode("utf-8"))).decode("utf-8").split(":")[2]
         )
+    def _make_entry_and_launch_config(
+        self, run: SweepRun
+    ) -> Tuple[Optional[List[str]], Dict[str, Dict[str, Any]]]:
+        args = create_sweep_command_args({"args": run.args})
+        entry_point, macro_args = make_launch_sweep_entrypoint(
+            args, self._sweep_config.get("command")
+        )
+        # handle program macro
+        if entry_point and "${program}" in entry_point:
+            if not self._sweep_config.get("program"):
+                raise SchedulerError(
+                    f"{LOG_PREFIX}Program macro in command has no corresponding 'program' in sweep config."
+                )
+            pidx = entry_point.index("${program}")
+            entry_point[pidx] = self._sweep_config["program"]
+        launch_config = {"overrides": {"run_config": args["args_dict"]}}
+        if macro_args:  # pipe in hyperparam args as params to launch
+            launch_config["overrides"]["args"] = macro_args
+        if entry_point:
+            unresolved = [x for x in entry_point if str(x).startswith("${")]
+            if unresolved:
+                wandb.termwarn(
+                    f"{LOG_PREFIX}Sweep command contains unresolved macros: "
+                    f"{unresolved}, see launch docs for supported macros."
+                )
+        return entry_point, launch_config
     def _add_to_launch_queue(self, run: SweepRun) -> bool:
         """Convert a sweeprun into a launch job then push to runqueue."""
         # job and image first from CLI args, then from sweep config
@@ -575,25 +663,12 @@ class Scheduler(ABC):
         elif _job is not None and _image_uri is not None:
             raise SchedulerError(f"{LOG_PREFIX}Sweep has both 'job' and 'image_uri'")
-        args = create_sweep_command_args({"args": run.args})
-        entry_point, macro_args = make_launch_sweep_entrypoint(
-            args, self._sweep_config.get("command")
-        )
-        launch_config = {"overrides": {"run_config": args["args_dict"]}}
-        if macro_args:  # pipe in hyperparam args as params to launch
-            launch_config["overrides"]["args"] = macro_args
+        entry_point, launch_config = self._make_entry_and_launch_config(run)
         if entry_point:
             wandb.termwarn(
                 f"{LOG_PREFIX}Sweep command {entry_point} will override"
                 f' {"job" if _job else "image_uri"} entrypoint'
             )
-            unresolved = [x for x in entry_point if str(x).startswith("${")]
-            if unresolved:
-                wandb.termwarn(
-                    f"{LOG_PREFIX}Sweep command contains unresolved macros: "
-                    f"{unresolved}, see launch docs for supported macros."
-                )
         run_id = run.id or generate_id()
         queued_run = launch_add(

wandb/sdk/launch/sweeps/utils.py CHANGED Viewed

@@ -291,3 +291,24 @@ def check_job_exists(public_api: PublicApi, job: Optional[str]) -> bool:
         wandb.termerror(f"Failed to load job. {e}")
         return False
     return True
+def get_previous_args(
+    run_spec: Dict[str, Any]
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """Parse through previous scheduler run_spec.
+    returns scheduler_args and settings.
+    """
+    scheduler_args = (
+        run_spec.get("overrides", {}).get("run_config", {}).get("scheduler", {})
+    )
+    # also pipe through top level resource setup
+    if run_spec.get("resource"):
+        scheduler_args["resource"] = run_spec["resource"]
+    if run_spec.get("resource_args"):
+        scheduler_args["resource_args"] = run_spec["resource_args"]
+    settings = run_spec.get("overrides", {}).get("run_config", {}).get("settings", {})
+    return scheduler_args, settings

wandb/sdk/launch/utils.py CHANGED Viewed

@@ -28,7 +28,8 @@ FAILED_PACKAGES_REGEX = re.compile(
 )
 if TYPE_CHECKING:  # pragma: no cover
-    from wandb.sdk.artifacts.public_artifact import Artifact as PublicArtifact
+    from wandb.sdk.artifacts.artifact import Artifact
+    from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
 # TODO: this should be restricted to just Git repos and not S3 and stuff like that
@@ -47,7 +48,7 @@ _WANDB_LOCAL_DEV_URI_REGEX = re.compile(
     r"^https?://localhost"
 )  # for testing, not sure if we wanna keep this
-API_KEY_REGEX = r"WANDB_API_KEY=\w+"
+API_KEY_REGEX = r"WANDB_API_KEY=\w+(-\w+)?"
 MACRO_REGEX = re.compile(r"\$\{(\w+)\}")
@@ -493,7 +494,7 @@ def convert_jupyter_notebook_to_script(fname: str, project_dir: str) -> str:
 def check_and_download_code_artifacts(
     entity: str, project: str, run_name: str, internal_api: Api, project_dir: str
-) -> Optional["PublicArtifact"]:
+) -> Optional["Artifact"]:
     _logger.info("Checking for code artifacts")
     public_api = wandb.PublicApi(
         overrides={"base_url": internal_api.settings("base_url")}
@@ -620,12 +621,23 @@ def make_name_dns_safe(name: str) -> str:
     return resp
-def warn_failed_packages_from_build_logs(log: str, image_uri: str) -> None:
+def warn_failed_packages_from_build_logs(
+    log: str, image_uri: str, api: Api, job_tracker: Optional["JobAndRunStatusTracker"]
+) -> None:
     match = FAILED_PACKAGES_REGEX.search(log)
     if match:
-        wandb.termwarn(
-            f"Failed to install the following packages: {match.group(1)} for image: {image_uri}. Will attempt to launch image without them."
-        )
+        _msg = f"Failed to install the following packages: {match.group(1)} for image: {image_uri}. Will attempt to launch image without them."
+        wandb.termwarn(_msg)
+        if job_tracker is not None:
+            res = job_tracker.saver.save_contents(
+                _msg, "failed-packages.log", "warning"
+            )
+            api.update_run_queue_item_warning(
+                job_tracker.run_queue_item_id,
+                "Some packages were not successfully installed during the build",
+                "build",
+                res,
+            )
 def docker_image_exists(docker_image: str, should_raise: bool = False) -> bool:

wandb 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl

wandb 0.15.4__py3-none-any.whl → 0.15.5__py3-none-any.whl