PyPI - wandb - Versions diffs - 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl - Mend

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl

Files changed (156) hide show

wandb/__init__.py +1 -1
wandb/analytics/sentry.py +1 -0
wandb/apis/importers/base.py +20 -5
wandb/apis/importers/mlflow.py +7 -1
wandb/apis/internal.py +12 -0
wandb/apis/public.py +247 -1387
wandb/apis/reports/_panels.py +58 -35
wandb/beta/workflows.py +6 -7
wandb/cli/cli.py +130 -60
wandb/data_types.py +3 -1
wandb/filesync/dir_watcher.py +21 -27
wandb/filesync/step_checksum.py +8 -8
wandb/filesync/step_prepare.py +23 -10
wandb/filesync/step_upload.py +13 -13
wandb/filesync/upload_job.py +4 -8
wandb/integration/cohere/__init__.py +3 -0
wandb/integration/cohere/cohere.py +21 -0
wandb/integration/cohere/resolver.py +347 -0
wandb/integration/gym/__init__.py +4 -6
wandb/integration/huggingface/__init__.py +3 -0
wandb/integration/huggingface/huggingface.py +18 -0
wandb/integration/huggingface/resolver.py +213 -0
wandb/integration/langchain/wandb_tracer.py +16 -179
wandb/integration/openai/__init__.py +1 -3
wandb/integration/openai/openai.py +11 -143
wandb/integration/openai/resolver.py +111 -38
wandb/integration/sagemaker/config.py +2 -2
wandb/integration/tensorboard/log.py +4 -4
wandb/old/settings.py +24 -7
wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
wandb/proto/wandb_deprecated.py +3 -1
wandb/sdk/__init__.py +1 -1
wandb/sdk/artifacts/__init__.py +0 -0
wandb/sdk/artifacts/artifact.py +2101 -0
wandb/sdk/artifacts/artifact_download_logger.py +42 -0
wandb/sdk/artifacts/artifact_manifest.py +67 -0
wandb/sdk/artifacts/artifact_manifest_entry.py +159 -0
wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +91 -0
wandb/sdk/{internal → artifacts}/artifact_saver.py +6 -5
wandb/sdk/artifacts/artifact_state.py +10 -0
wandb/sdk/{interface/artifacts/artifact_cache.py → artifacts/artifacts_cache.py} +22 -12
wandb/sdk/artifacts/exceptions.py +55 -0
wandb/sdk/artifacts/storage_handler.py +59 -0
wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
wandb/sdk/artifacts/storage_handlers/azure_handler.py +192 -0
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +134 -0
wandb/sdk/artifacts/storage_handlers/multi_handler.py +53 -0
wandb/sdk/artifacts/storage_handlers/s3_handler.py +301 -0
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +67 -0
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +132 -0
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
wandb/sdk/artifacts/storage_layout.py +6 -0
wandb/sdk/artifacts/storage_policies/__init__.py +0 -0
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +61 -0
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +386 -0
wandb/sdk/{interface/artifacts/artifact_storage.py → artifacts/storage_policy.py} +5 -57
wandb/sdk/data_types/_dtypes.py +7 -12
wandb/sdk/data_types/base_types/json_metadata.py +3 -2
wandb/sdk/data_types/base_types/media.py +8 -8
wandb/sdk/data_types/base_types/wb_value.py +12 -13
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +5 -6
wandb/sdk/data_types/helper_types/classes.py +6 -8
wandb/sdk/data_types/helper_types/image_mask.py +5 -6
wandb/sdk/data_types/histogram.py +4 -3
wandb/sdk/data_types/html.py +3 -4
wandb/sdk/data_types/image.py +11 -9
wandb/sdk/data_types/molecule.py +5 -3
wandb/sdk/data_types/object_3d.py +7 -5
wandb/sdk/data_types/plotly.py +3 -2
wandb/sdk/data_types/saved_model.py +11 -11
wandb/sdk/data_types/trace_tree.py +5 -4
wandb/sdk/data_types/utils.py +3 -5
wandb/sdk/data_types/video.py +5 -4
wandb/sdk/integration_utils/auto_logging.py +215 -0
wandb/sdk/interface/interface.py +15 -15
wandb/sdk/internal/file_pusher.py +8 -16
wandb/sdk/internal/file_stream.py +5 -11
wandb/sdk/internal/handler.py +13 -1
wandb/sdk/internal/internal_api.py +287 -13
wandb/sdk/internal/job_builder.py +119 -30
wandb/sdk/internal/sender.py +6 -26
wandb/sdk/internal/settings_static.py +2 -0
wandb/sdk/internal/system/assets/__init__.py +2 -0
wandb/sdk/internal/system/assets/gpu.py +42 -0
wandb/sdk/internal/system/assets/gpu_amd.py +216 -0
wandb/sdk/internal/system/env_probe_helpers.py +13 -0
wandb/sdk/internal/system/system_info.py +3 -3
wandb/sdk/internal/tb_watcher.py +32 -22
wandb/sdk/internal/thread_local_settings.py +18 -0
wandb/sdk/launch/_project_spec.py +57 -11
wandb/sdk/launch/agent/agent.py +147 -65
wandb/sdk/launch/agent/job_status_tracker.py +34 -0
wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
wandb/sdk/launch/builder/abstract.py +5 -1
wandb/sdk/launch/builder/build.py +21 -18
wandb/sdk/launch/builder/docker_builder.py +10 -4
wandb/sdk/launch/builder/kaniko_builder.py +113 -23
wandb/sdk/launch/builder/noop.py +6 -3
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +46 -14
wandb/sdk/launch/environment/aws_environment.py +3 -2
wandb/sdk/launch/environment/azure_environment.py +124 -0
wandb/sdk/launch/environment/gcp_environment.py +2 -4
wandb/sdk/launch/environment/local_environment.py +1 -1
wandb/sdk/launch/errors.py +19 -0
wandb/sdk/launch/github_reference.py +32 -19
wandb/sdk/launch/launch.py +3 -8
wandb/sdk/launch/launch_add.py +6 -2
wandb/sdk/launch/loader.py +21 -2
wandb/sdk/launch/registry/azure_container_registry.py +132 -0
wandb/sdk/launch/registry/elastic_container_registry.py +39 -5
wandb/sdk/launch/registry/google_artifact_registry.py +68 -26
wandb/sdk/launch/registry/local_registry.py +2 -1
wandb/sdk/launch/runner/abstract.py +24 -3
wandb/sdk/launch/runner/kubernetes_runner.py +479 -26
wandb/sdk/launch/runner/local_container.py +103 -51
wandb/sdk/launch/runner/local_process.py +1 -1
wandb/sdk/launch/runner/sagemaker_runner.py +60 -10
wandb/sdk/launch/runner/vertex_runner.py +10 -5
wandb/sdk/launch/sweeps/__init__.py +7 -9
wandb/sdk/launch/sweeps/scheduler.py +307 -77
wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
wandb/sdk/launch/sweeps/utils.py +82 -35
wandb/sdk/launch/utils.py +89 -75
wandb/sdk/lib/_settings_toposort_generated.py +7 -0
wandb/sdk/lib/capped_dict.py +26 -0
wandb/sdk/lib/{git.py → gitlib.py} +76 -59
wandb/sdk/lib/hashutil.py +12 -4
wandb/sdk/lib/paths.py +96 -8
wandb/sdk/lib/sock_client.py +2 -2
wandb/sdk/lib/timer.py +1 -0
wandb/sdk/service/server.py +22 -9
wandb/sdk/service/server_sock.py +1 -1
wandb/sdk/service/service.py +27 -8
wandb/sdk/verify/verify.py +4 -7
wandb/sdk/wandb_config.py +2 -6
wandb/sdk/wandb_init.py +57 -53
wandb/sdk/wandb_require.py +7 -0
wandb/sdk/wandb_run.py +61 -223
wandb/sdk/wandb_settings.py +28 -4
wandb/testing/relay.py +15 -2
wandb/util.py +74 -36
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/METADATA +15 -9
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/RECORD +151 -116
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +1 -0
wandb/integration/langchain/util.py +0 -191
wandb/sdk/interface/artifacts/__init__.py +0 -33
wandb/sdk/interface/artifacts/artifact.py +0 -615
wandb/sdk/interface/artifacts/artifact_manifest.py +0 -131
wandb/sdk/wandb_artifacts.py +0 -2226
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0

wandb/sdk/internal/tb_watcher.py CHANGED Viewed

@@ -58,12 +58,16 @@ def _link_and_save_file(
     interface.publish_files(dict(files=[(GlobStr(glob.escape(file_name)), "live")]))
-def is_tfevents_file_created_by(path: str, hostname: str, start_time: float) -> bool:
-    """Check if a path is a tfevents file created by hostname.
+def is_tfevents_file_created_by(
+    path: str, hostname: Optional[str], start_time: Optional[float]
+) -> bool:
+    """Check if a path is a tfevents file.
+    Optionally checks that it was created by [hostname] after [start_time].
     tensorboard tfevents filename format:
         https://github.com/tensorflow/tensorboard/blob/f3f26b46981da5bd46a5bb93fcf02d9eb7608bc1/tensorboard/summary/writer/event_file_writer.py#L81
-    tensorflow tfevents fielname format:
+    tensorflow tfevents filename format:
         https://github.com/tensorflow/tensorflow/blob/8f597046dc30c14b5413813d02c0e0aed399c177/tensorflow/core/util/events_writer.cc#L68
     """
     if not path:
@@ -77,23 +81,27 @@ def is_tfevents_file_created_by(path: str, hostname: str, start_time: float) ->
     except ValueError:
         return False
     # check the hostname, which may have dots
-    for i, part in enumerate(hostname.split(".")):
+    if hostname is not None:
+        for i, part in enumerate(hostname.split(".")):
+            try:
+                fname_component_part = fname_components[tfevents_idx + 2 + i]
+            except IndexError:
+                return False
+            if part != fname_component_part:
+                return False
+    if start_time is not None:
         try:
-            fname_component_part = fname_components[tfevents_idx + 2 + i]
-        except IndexError:
+            created_time = int(fname_components[tfevents_idx + 1])
+        except (ValueError, IndexError):
             return False
-        if part != fname_component_part:
+        # Ensure that the file is newer than our start time, and that it was
+        # created from the same hostname.
+        # TODO: we should also check the PID (also contained in the tfevents
+        #     filename). Can we assume that our parent pid is the user process
+        #     that wrote these files?
+        if created_time < int(start_time):
             return False
-    try:
-        created_time = int(fname_components[tfevents_idx + 1])
-    except (ValueError, IndexError):
-        return False
-    # Ensure that the file is newer then our start time, and that it was
-    # created from the same hostname.
-    # TODO: we should also check the PID (also contained in the tfevents
-    #     filename). Can we assume that our parent pid is the user process
-    #     that wrote these files?
-    return created_time >= int(start_time)
+    return True
 class TBWatcher:
@@ -136,6 +144,7 @@ class TBWatcher:
             # Note that we strip '/' instead of os.sep, because elsewhere we've
             # converted paths to forward slash.
             namespace = logdir.replace(filename, "").replace(rootdir, "").strip("/")
             # TODO: revisit this heuristic, it exists because we don't know the
             # root log directory until more than one tfevents file is written to
             if len(dirs) == 1 and namespace not in ["train", "validation"]:
@@ -217,12 +226,13 @@ class TBDirWatcher:
         """Check if a path has been modified since launch and contains tfevents."""
         if not path:
             raise ValueError("Path must be a nonempty string")
-        if self._force:
-            return True
         path = self.tf_compat.tf.compat.as_str_any(path)
-        return is_tfevents_file_created_by(
-            path, self._hostname, self._tbwatcher._settings._start_time
-        )
+        if self._force:
+            return is_tfevents_file_created_by(path, None, None)
+        else:
+            return is_tfevents_file_created_by(
+                path, self._hostname, self._tbwatcher._settings._start_time
+            )
     def _loader(
         self, save: bool = True, namespace: Optional[str] = None

wandb/sdk/internal/thread_local_settings.py ADDED Viewed

@@ -0,0 +1,18 @@
+import threading
+from typing import Dict, Optional
+# Context variable for setting API settings (api keys, etc.) for internal and public apis thread-locally
+# TODO: move this into actual settings
+class _ThreadLocalApiSettings(threading.local):
+    api_key: Optional[str]
+    cookies: Optional[Dict]
+    headers: Optional[Dict]
+    def __init__(self) -> None:
+        self.api_key = None
+        self.cookies = None
+        self.headers = None
+_thread_local_api_settings: _ThreadLocalApiSettings = _ThreadLocalApiSettings()

wandb/sdk/launch/_project_spec.py CHANGED Viewed

@@ -7,17 +7,20 @@ import json
 import logging
 import os
 import tempfile
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 import wandb
 import wandb.docker as docker
 from wandb.apis.internal import Api
-from wandb.apis.public import Artifact as PublicArtifact
 from wandb.errors import CommError
+from wandb.sdk.launch import utils
 from wandb.sdk.lib.runid import generate_id
-from . import utils
-from .utils import LOG_PREFIX, LaunchError
+from .errors import LaunchError
+from .utils import LOG_PREFIX, recursive_macro_sub
+if TYPE_CHECKING:
+    from wandb.sdk.artifacts.artifact import Artifact
 _logger = logging.getLogger(__name__)
@@ -59,6 +62,7 @@ class LaunchProject:
         resource: str,
         resource_args: Dict[str, Any],
         run_id: Optional[str],
+        sweep_id: Optional[str] = None,
     ):
         if uri is not None and utils.is_bare_wandb_uri(uri):
             uri = api.settings("base_url") + uri
@@ -67,7 +71,7 @@ class LaunchProject:
         self.job = job
         if job is not None:
             wandb.termlog(f"{LOG_PREFIX}Launching job: {job}")
-        self._job_artifact: Optional[PublicArtifact] = None
+        self._job_artifact: Optional["Artifact"] = None
         self.api = api
         self.launch_spec = launch_spec
         self.target_entity = target_entity
@@ -78,11 +82,12 @@ class LaunchProject:
         # runner, so we need to pop the builder key out
         resource_args_build = resource_args.get(resource, {}).pop("builder", {})
         self.resource = resource
-        self.resource_args = resource_args
+        self.resource_args = resource_args.copy()
+        self.sweep_id = sweep_id
         self.python_version: Optional[str] = launch_spec.get("python_version")
-        self.cuda_base_image: Optional[str] = resource_args_build.get("cuda", {}).get(
-            "base_image"
-        )
+        self.accelerator_base_image: Optional[str] = resource_args_build.get(
+            "accelerator", {}
+        ).get("base_image") or resource_args_build.get("cuda", {}).get("base_image")
         self._base_image: Optional[str] = launch_spec.get("base_image")
         self.docker_image: Optional[str] = docker_config.get(
             "docker_image"
@@ -110,6 +115,9 @@ class LaunchProject:
             self.override_entrypoint = self.add_entry_point(
                 overrides.get("entry_point")  # type: ignore
             )
+        if overrides.get("sweep_id") is not None:
+            _logger.info("Adding override sweep id")
+            self.sweep_id = overrides["sweep_id"]
         if self.docker_image is not None:
             self.source = LaunchSource.DOCKER
             self.project_dir = None
@@ -172,6 +180,43 @@ class LaunchProject:
             assert self.job is not None
             return wandb.util.make_docker_image_name_safe(self.job.split(":")[0])
+    def fill_macros(self, image: str) -> None:
+        """Substitute values for macros in resource arguments.
+        Certain macros can be used in resource args. These macros allow the
+        user to set resource args dynamically in the context of the
+        run being launched. The macros are given in the ${macro} format. The
+        following macros are currently supported:
+        ${project_name} - the name of the project the run is being launched to.
+        ${entity_name} - the owner of the project the run is being launched to.
+        ${run_id} - the id of the run being launched.
+        ${run_name} - the name of the run that is launching.
+        ${image_uri} - the URI of the container image for this run.
+        Additionally, you may use ${<ENV-VAR-NAME>} to refer to the value of any
+        environment variables that you plan to set in the environment of any
+        agents that will receive these resource args.
+        Calling this method will overwrite the contents of self.resource_args
+        with the substituted values.
+        Args:
+            image (str): The image name to fill in for ${image_uri}.
+        Returns:
+            None
+        """
+        update_dict = {
+            "project_name": self.target_project,
+            "entity_name": self.target_entity,
+            "run_id": self.run_id,
+            "run_name": self.name,
+            "image_uri": image,
+        }
+        update_dict.update(os.environ)
+        self.resource_args = recursive_macro_sub(self.resource_args, update_dict)
     def build_required(self) -> bool:
         """Checks the source to see if a build is required."""
         # since the image tag for images built from jobs
@@ -416,6 +461,7 @@ def create_project_from_spec(launch_spec: Dict[str, Any], api: Api) -> LaunchPro
         launch_spec.get("resource", None),
         launch_spec.get("resource_args", {}),
         launch_spec.get("run_id", None),
+        launch_spec.get("sweep_id", {}),
     )
@@ -446,8 +492,8 @@ def fetch_and_validate_project(
         launch_project._fetch_project_local(internal_api=api)
     assert launch_project.project_dir is not None
-    # this prioritizes pip, and we don't support any cases where both are present
-    # conda projects when uploaded to wandb become pip projects via requirements.frozen.txt, wandb doesn't preserve conda envs
+    # this prioritizes pip, and we don't support any cases where both are present. conda projects, when uploaded to
+    # wandb, become pip projects via requirements.frozen.txt; wandb doesn't preserve conda envs
     if os.path.exists(
         os.path.join(launch_project.project_dir, "requirements.txt")
     ) or os.path.exists(

wandb/sdk/launch/agent/agent.py CHANGED Viewed

@@ -5,7 +5,6 @@ import pprint
 import threading
 import time
 import traceback
-from dataclasses import dataclass
 from multiprocessing import Event
 from multiprocessing.pool import ThreadPool
 from typing import Any, Dict, List, Optional, Union
@@ -13,22 +12,18 @@ from typing import Any, Dict, List, Optional, Union
 import wandb
 from wandb.apis.internal import Api
 from wandb.errors import CommError
-from wandb.sdk.launch._project_spec import LaunchProject
+from wandb.sdk.launch.launch_add import launch_add
 from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
-from wandb.sdk.launch.sweeps import SCHEDULER_URI
+from wandb.sdk.launch.sweeps.scheduler import Scheduler
 from wandb.sdk.lib import runid
 from .. import loader
 from .._project_spec import create_project_from_spec, fetch_and_validate_project
 from ..builder.build import construct_builder_args
-from ..runner.abstract import AbstractRun
-from ..utils import (
-    LAUNCH_DEFAULT_PROJECT,
-    LOG_PREFIX,
-    PROJECT_SYNCHRONOUS,
-    LaunchDockerError,
-    LaunchError,
-)
+from ..errors import LaunchDockerError, LaunchError
+from ..utils import LAUNCH_DEFAULT_PROJECT, LOG_PREFIX, PROJECT_SYNCHRONOUS
+from .job_status_tracker import JobAndRunStatusTracker
+from .run_queue_item_file_saver import RunQueueItemFileSaver
 AGENT_POLLING_INTERVAL = 10
 ACTIVE_SWEEP_POLLING_INTERVAL = 1  # more frequent when we know we have jobs
@@ -37,30 +32,13 @@ AGENT_POLLING = "POLLING"
 AGENT_RUNNING = "RUNNING"
 AGENT_KILLED = "KILLED"
-MAX_THREADS = 64
-_logger = logging.getLogger(__name__)
+HIDDEN_AGENT_RUN_TYPE = "sweep-controller"
+MAX_THREADS = 64
-@dataclass
-class JobAndRunStatus:
-    run_queue_item_id: str
-    run_id: Optional[str] = None
-    project: Optional[str] = None
-    entity: Optional[str] = None
-    run: Optional[AbstractRun] = None
-    failed_to_start: bool = False
-    completed_status: Optional[str] = None
-    is_scheduler: bool = False
-    @property
-    def job_completed(self) -> bool:
-        return self.failed_to_start or self.completed_status is not None
+MAX_RESUME_COUNT = 5
-    def update_run_info(self, launch_project: LaunchProject) -> None:
-        self.run_id = launch_project.run_id
-        self.project = launch_project.target_project
-        self.entity = launch_project.target_entity
+_logger = logging.getLogger(__name__)
 def _convert_access(access: str) -> str:
@@ -101,16 +79,21 @@ def _max_from_config(
     return max_from_config
-def _job_is_scheduler(run_spec: Dict[str, Any]) -> bool:
+def _is_scheduler_job(run_spec: Dict[str, Any]) -> bool:
     """Determine whether a job/runSpec is a sweep scheduler."""
     if not run_spec:
-        _logger.debug("Recieved runSpec in _job_is_scheduler that was empty")
+        _logger.debug("Recieved runSpec in _is_scheduler_job that was empty")
-    if run_spec.get("uri") != SCHEDULER_URI:
+    if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
         return False
     if run_spec.get("resource") == "local-process":
-        # If a scheduler is a local-process (100%), also
+        # Any job pushed to a run queue that has a scheduler uri is
+        # allowed to use local-process
+        if run_spec.get("job"):
+            return True
+        # If a scheduler is local-process and run through CLI, also
         #    confirm command is in format: [wandb scheduler <sweep>]
         cmd = run_spec.get("overrides", {}).get("entry_point", [])
         if len(cmd) < 3:
@@ -137,7 +120,7 @@ class LaunchAgent:
         self._api = api
         self._base_url = self._api.settings().get("base_url")
         self._ticks = 0
-        self._jobs: Dict[int, JobAndRunStatus] = {}
+        self._jobs: Dict[int, JobAndRunStatusTracker] = {}
         self._jobs_lock = threading.Lock()
         self._jobs_event = Event()
         self._jobs_event.set()
@@ -169,15 +152,40 @@ class LaunchAgent:
             self.gorilla_supports_agents,
         )
         self._id = create_response["launchAgentId"]
-        self._name = ""  # hacky: want to display this to the user but we don't get it back from gql until polling starts. fix later
         if self._api.entity_is_team(self._entity):
             wandb.termwarn(
                 f"{LOG_PREFIX}Agent is running on team entity ({self._entity}). Members of this team will be able to run code on this device."
             )
-    def fail_run_queue_item(self, run_queue_item_id: str) -> None:
+        agent_response = self._api.get_launch_agent(
+            self._id, self.gorilla_supports_agents
+        )
+        self._name = agent_response["name"]
+        self._init_agent_run()
+    def fail_run_queue_item(
+        self,
+        run_queue_item_id: str,
+        message: str,
+        phase: str,
+        files: Optional[List[str]] = None,
+    ) -> None:
         if self._gorilla_supports_fail_run_queue_items:
-            self._api.fail_run_queue_item(run_queue_item_id)
+            self._api.fail_run_queue_item(run_queue_item_id, message, phase, files)
+    def _init_agent_run(self) -> None:
+        # TODO: has it been long enough that all backends support agents?
+        if self.gorilla_supports_agents:
+            settings = wandb.Settings(silent=True, disable_git=True)
+            self._wandb_run = wandb.init(
+                project=self._project,
+                entity=self._entity,
+                settings=settings,
+                id=self._name,
+                job_type=HIDDEN_AGENT_RUN_TYPE,
+            )
+        else:
+            self._wandb_run = None
     @property
     def thread_ids(self) -> List[int]:
@@ -253,24 +261,43 @@ class LaunchAgent:
         if not update_ret["success"]:
             wandb.termerror(f"{LOG_PREFIX}Failed to update agent status to {status}")
-    def finish_thread_id(self, thread_id: int) -> None:
+    def finish_thread_id(
+        self,
+        thread_id: int,
+        exception: Optional[Union[Exception, LaunchDockerError]] = None,
+    ) -> None:
         """Removes the job from our list for now."""
         job_and_run_status = self._jobs[thread_id]
-        if not job_and_run_status.run_id or not job_and_run_status.project:
-            self.fail_run_queue_item(job_and_run_status.run_queue_item_id)
-        elif job_and_run_status.entity != self._entity:
+        if (
+            job_and_run_status.entity is not None
+            and job_and_run_status.entity != self._entity
+        ):
             _logger.info(
                 "Skipping check for completed run status because run is on a different entity than agent"
             )
+        elif exception is not None:
+            tb_str = traceback.format_exception(
+                type(exception), value=exception, tb=exception.__traceback__
+            )
+            fnames = job_and_run_status.saver.save_contents(
+                "".join(tb_str), "error.log", "error"
+            )
+            self.fail_run_queue_item(
+                job_and_run_status.run_queue_item_id,
+                str(exception),
+                job_and_run_status.err_stage,
+                fnames,
+            )
         elif job_and_run_status.completed_status not in ["stopped", "failed"]:
             _logger.info(
                 "Skipping check for completed run status because run was successful"
             )
-        else:
+        elif job_and_run_status.run is not None:
             run_info = None
             # sweep runs exist but have no info before they are started
             # so run_info returned will be None
             # normal runs just throw a comm error
+            # TODO: make more clear
             try:
                 run_info = self._api.get_run_info(
                     self._entity, job_and_run_status.project, job_and_run_status.run_id
@@ -279,7 +306,22 @@ class LaunchAgent:
             except CommError:
                 pass
             if run_info is None:
-                self.fail_run_queue_item(job_and_run_status.run_queue_item_id)
+                _msg = "The submitted run was not successfully started"
+                fnames = None
+                logs = job_and_run_status.run.get_logs()
+                if logs:
+                    fnames = job_and_run_status.saver.save_contents(
+                        logs, "error.log", "error"
+                    )
+                self.fail_run_queue_item(
+                    job_and_run_status.run_queue_item_id, _msg, "run", fnames
+                )
+        else:
+            _logger.info("Finish thread id had no exception, ror run")
+            wandb._sentry.exception(
+                "launch agent called finish thread id on thread without run or exception"
+            )
         # TODO:  keep logs or something for the finished jobs
         with self._jobs_lock:
@@ -296,7 +338,9 @@ class LaunchAgent:
         if job.job_completed:
             self.finish_thread_id(thread_id)
-    def run_job(self, job: Dict[str, Any]) -> None:
+    def run_job(
+        self, job: Dict[str, Any], queue: str, file_saver: RunQueueItemFileSaver
+    ) -> None:
         """Set up project and run the job.
         Arguments:
@@ -322,6 +366,8 @@ class LaunchAgent:
                 job,
                 self.default_config,
                 self._api,
+                queue,
+                file_saver,
             ),
         )
@@ -367,7 +413,6 @@ class LaunchAgent:
                 agent_response = self._api.get_launch_agent(
                     self._id, self.gorilla_supports_agents
                 )
-                self._name = agent_response["name"]  # hack: first time we get name
                 if agent_response["stopPolling"]:
                     # shutdown process and all jobs if requested from ui
                     raise KeyboardInterrupt
@@ -376,7 +421,10 @@ class LaunchAgent:
                     for queue in self._queues:
                         job = self.pop_from_queue(queue)
                         if job:
-                            if _job_is_scheduler(job.get("runSpec")):
+                            file_saver = RunQueueItemFileSaver(
+                                self._wandb_run, job["runQueueItemId"]
+                            )
+                            if _is_scheduler_job(job.get("runSpec")):
                                 # If job is a scheduler, and we are already at the cap, ignore,
                                 #    don't ack, and it will be pushed back onto the queue in 1 min
                                 if self.num_running_schedulers >= self._max_schedulers:
@@ -388,13 +436,25 @@ class LaunchAgent:
                                     continue
                             try:
-                                self.run_job(job)
+                                self.run_job(job, queue, file_saver)
                             except Exception as e:
                                 wandb.termerror(
                                     f"{LOG_PREFIX}Error running job: {traceback.format_exc()}"
                                 )
                                 wandb._sentry.exception(e)
-                                self.fail_run_queue_item(job["runQueueItemId"])
+                                # always the first phase, because we only enter phase 2 within the thread
+                                files = file_saver.save_contents(
+                                    contents=traceback.format_exc(),
+                                    fname="error.log",
+                                    file_sub_type="error",
+                                )
+                                self.fail_run_queue_item(
+                                    run_queue_item_id=job["runQueueItemId"],
+                                    message=str(e),
+                                    phase="agent",
+                                    files=files,
+                                )
                 for thread_id in self.thread_ids:
                     self._update_finished(thread_id)
@@ -429,20 +489,27 @@ class LaunchAgent:
         job: Dict[str, Any],
         default_config: Dict[str, Any],
         api: Api,
+        queue: str,
+        file_saver: RunQueueItemFileSaver,
     ) -> None:
         thread_id = threading.current_thread().ident
         assert thread_id is not None
+        job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)
+        with self._jobs_lock:
+            self._jobs[thread_id] = job_tracker
         try:
-            self._thread_run_job(launch_spec, job, default_config, api, thread_id)
+            self._thread_run_job(
+                launch_spec, job, default_config, api, queue, thread_id, job_tracker
+            )
         except LaunchDockerError as e:
             wandb.termerror(
                 f"{LOG_PREFIX}agent {self._name} encountered an issue while starting Docker, see above output for details."
             )
-            self.finish_thread_id(thread_id)
+            self.finish_thread_id(thread_id, e)
             wandb._sentry.exception(e)
         except Exception as e:
             wandb.termerror(f"{LOG_PREFIX}Error running job: {traceback.format_exc()}")
-            self.finish_thread_id(thread_id)
+            self.finish_thread_id(thread_id, e)
             wandb._sentry.exception(e)
     def _thread_run_job(
@@ -451,11 +518,10 @@ class LaunchAgent:
         job: Dict[str, Any],
         default_config: Dict[str, Any],
         api: Api,
+        queue: str,
         thread_id: int,
+        job_tracker: JobAndRunStatusTracker,
     ) -> None:
-        job_tracker = JobAndRunStatus(job["runQueueItemId"])
-        with self._jobs_lock:
-            self._jobs[thread_id] = job_tracker
         project = create_project_from_spec(launch_spec, api)
         job_tracker.update_run_info(project)
         _logger.info("Fetching and validating project...")
@@ -480,9 +546,8 @@ class LaunchAgent:
         backend = loader.runner_from_config(resource, api, backend_config, environment)
         _logger.info("Backend loaded...")
         api.ack_run_queue_item(job["runQueueItemId"], project.run_id)
-        run = backend.run(project, builder)
-        if _job_is_scheduler(launch_spec):
+        run = backend.run(project, builder, job_tracker)
+        if _is_scheduler_job(launch_spec):
             with self._jobs_lock:
                 self._jobs[thread_id].is_scheduler = True
             wandb.termlog(
@@ -497,15 +562,17 @@ class LaunchAgent:
         with self._jobs_lock:
             job_tracker.run = run
         while self._jobs_event.is_set():
-            if self._check_run_finished(job_tracker):
+            if self._check_run_finished(job_tracker, launch_spec):
                 return
             time.sleep(AGENT_POLLING_INTERVAL)
         # temp: for local, kill all jobs. we don't yet have good handling for different
         # types of runners in general
-        if isinstance(run, LocalSubmittedRun):
-            run.command_proc.kill()
+        if isinstance(run, LocalSubmittedRun) and run._command_proc is not None:
+            run._command_proc.kill()
-    def _check_run_finished(self, job_tracker: JobAndRunStatus) -> bool:
+    def _check_run_finished(
+        self, job_tracker: JobAndRunStatusTracker, launch_spec: Dict[str, Any]
+    ) -> bool:
         if job_tracker.completed_status:
             return True
@@ -522,13 +589,28 @@ class LaunchAgent:
         try:
             run = job_tracker.run
             status = run.get_status().state
-            if status in ["stopped", "failed", "finished"]:
+            if status in ["stopped", "failed", "finished", "preempted"]:
                 if job_tracker.is_scheduler:
                     wandb.termlog(f"{LOG_PREFIX}Scheduler finished with ID: {run.id}")
                 else:
                     wandb.termlog(f"{LOG_PREFIX}Job finished with ID: {run.id}")
                 with self._jobs_lock:
                     job_tracker.completed_status = status
+                if status == "preempted":
+                    config = launch_spec.copy()
+                    config["run_id"] = job_tracker.run_id
+                    config["_resume_count"] = config.get("_resume_count", 0) + 1
+                    if config["_resume_count"] > MAX_RESUME_COUNT:
+                        wandb.termlog(
+                            f"{LOG_PREFIX}Run {job_tracker.run_id} has already resumed {MAX_RESUME_COUNT} times."
+                        )
+                        return True
+                    wandb.termlog(f"{LOG_PREFIX}Requeueing run {job_tracker.run_id}.")
+                    launch_add(
+                        config=config,
+                        project_queue=self._project,
+                        queue_name=job_tracker.queue,
+                    )
                 return True
             return False
         except LaunchError as e:

wandb/sdk/launch/agent/job_status_tracker.py ADDED Viewed

@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import Optional
+from wandb.sdk.launch._project_spec import LaunchProject
+from ..runner.abstract import AbstractRun
+from .run_queue_item_file_saver import RunQueueItemFileSaver
+@dataclass
+class JobAndRunStatusTracker:
+    run_queue_item_id: str
+    queue: str
+    saver: RunQueueItemFileSaver
+    run_id: Optional[str] = None
+    project: Optional[str] = None
+    entity: Optional[str] = None
+    run: Optional[AbstractRun] = None
+    failed_to_start: bool = False
+    completed_status: Optional[str] = None
+    is_scheduler: bool = False
+    err_stage: str = "agent"
+    @property
+    def job_completed(self) -> bool:
+        return self.failed_to_start or self.completed_status is not None
+    def update_run_info(self, launch_project: LaunchProject) -> None:
+        self.run_id = launch_project.run_id
+        self.project = launch_project.target_project
+        self.entity = launch_project.target_entity
+    def set_err_stage(self, stage: str) -> None:
+        self.err_stage = stage

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl

wandb 0.15.3__py3-none-any.whl → 0.15.5__py3-none-any.whl