PyPI - wandb - Versions diffs - 0.16.5__py3-none-any.whl → 0.17.0__py3-none-any.whl - Mend

wandb 0.16.5-py3-none-any.whl → 0.17.0-py3-none-any.whl

Files changed (194)

package_readme.md +95 -0
wandb/__init__.py +2 -3
wandb/agents/pyagent.py +0 -1
wandb/analytics/sentry.py +2 -1
wandb/apis/importers/internals/internal.py +0 -1
wandb/apis/importers/internals/protocols.py +30 -56
wandb/apis/importers/mlflow.py +13 -26
wandb/apis/importers/wandb.py +8 -14
wandb/apis/internal.py +0 -3
wandb/apis/public/api.py +55 -3
wandb/apis/public/artifacts.py +1 -0
wandb/apis/public/files.py +1 -0
wandb/apis/public/history.py +1 -0
wandb/apis/public/jobs.py +17 -4
wandb/apis/public/projects.py +1 -0
wandb/apis/public/reports.py +1 -0
wandb/apis/public/runs.py +15 -17
wandb/apis/public/sweeps.py +1 -0
wandb/apis/public/teams.py +1 -0
wandb/apis/public/users.py +1 -0
wandb/apis/reports/v1/_blocks.py +3 -7
wandb/apis/reports/v2/gql.py +1 -0
wandb/apis/reports/v2/interface.py +3 -4
wandb/apis/reports/v2/internal.py +5 -8
wandb/cli/cli.py +95 -22
wandb/data_types.py +9 -6
wandb/docker/__init__.py +1 -1
wandb/env.py +38 -8
wandb/errors/__init__.py +5 -0
wandb/errors/term.py +10 -2
wandb/filesync/step_checksum.py +1 -4
wandb/filesync/step_prepare.py +4 -24
wandb/filesync/step_upload.py +4 -106
wandb/filesync/upload_job.py +0 -76
wandb/integration/catboost/catboost.py +1 -1
wandb/integration/fastai/__init__.py +1 -0
wandb/integration/huggingface/resolver.py +2 -2
wandb/integration/keras/__init__.py +1 -0
wandb/integration/keras/callbacks/metrics_logger.py +1 -1
wandb/integration/keras/keras.py +7 -7
wandb/integration/langchain/wandb_tracer.py +1 -0
wandb/integration/lightning/fabric/logger.py +1 -3
wandb/integration/metaflow/metaflow.py +41 -6
wandb/integration/openai/fine_tuning.py +77 -40
wandb/integration/prodigy/prodigy.py +1 -1
wandb/old/summary.py +1 -1
wandb/plot/confusion_matrix.py +1 -1
wandb/plot/pr_curve.py +2 -1
wandb/plot/roc_curve.py +2 -1
wandb/{plots → plot}/utils.py +13 -25
wandb/proto/v3/wandb_internal_pb2.py +364 -332
wandb/proto/v3/wandb_settings_pb2.py +2 -2
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_internal_pb2.py +322 -316
wandb/proto/v4/wandb_settings_pb2.py +2 -2
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/proto/wandb_deprecated.py +7 -1
wandb/proto/wandb_internal_codegen.py +3 -29
wandb/sdk/artifacts/artifact.py +51 -20
wandb/sdk/artifacts/artifact_download_logger.py +1 -0
wandb/sdk/artifacts/artifact_file_cache.py +18 -4
wandb/sdk/artifacts/artifact_instance_cache.py +1 -0
wandb/sdk/artifacts/artifact_manifest.py +1 -0
wandb/sdk/artifacts/artifact_manifest_entry.py +7 -3
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
wandb/sdk/artifacts/artifact_saver.py +18 -27
wandb/sdk/artifacts/artifact_state.py +1 -0
wandb/sdk/artifacts/artifact_ttl.py +1 -0
wandb/sdk/artifacts/exceptions.py +1 -0
wandb/sdk/artifacts/storage_handlers/azure_handler.py +1 -0
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +13 -18
wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -0
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -0
wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -0
wandb/sdk/artifacts/storage_handlers/s3_handler.py +5 -3
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +1 -0
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -0
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -0
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +3 -42
wandb/sdk/artifacts/storage_policy.py +2 -12
wandb/sdk/data_types/_dtypes.py +8 -8
wandb/sdk/data_types/base_types/media.py +3 -6
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
wandb/sdk/data_types/image.py +1 -1
wandb/sdk/data_types/video.py +1 -1
wandb/sdk/integration_utils/auto_logging.py +5 -6
wandb/sdk/integration_utils/data_logging.py +10 -6
wandb/sdk/interface/interface.py +86 -38
wandb/sdk/interface/interface_shared.py +7 -13
wandb/sdk/internal/datastore.py +1 -1
wandb/sdk/internal/file_pusher.py +2 -5
wandb/sdk/internal/file_stream.py +5 -18
wandb/sdk/internal/handler.py +18 -2
wandb/sdk/internal/internal.py +0 -1
wandb/sdk/internal/internal_api.py +1 -129
wandb/sdk/internal/internal_util.py +0 -1
wandb/sdk/internal/job_builder.py +159 -45
wandb/sdk/internal/profiler.py +1 -0
wandb/sdk/internal/progress.py +0 -28
wandb/sdk/internal/run.py +1 -0
wandb/sdk/internal/sender.py +1 -2
wandb/sdk/internal/system/assets/gpu_amd.py +44 -44
wandb/sdk/internal/system/assets/gpu_apple.py +56 -11
wandb/sdk/internal/system/assets/interfaces.py +6 -8
wandb/sdk/internal/system/assets/open_metrics.py +2 -2
wandb/sdk/internal/system/assets/trainium.py +1 -3
wandb/sdk/launch/__init__.py +9 -1
wandb/sdk/launch/_launch.py +9 -24
wandb/sdk/launch/_launch_add.py +1 -3
wandb/sdk/launch/_project_spec.py +188 -241
wandb/sdk/launch/agent/agent.py +115 -48
wandb/sdk/launch/agent/config.py +80 -14
wandb/sdk/launch/builder/abstract.py +69 -1
wandb/sdk/launch/builder/build.py +156 -555
wandb/sdk/launch/builder/context_manager.py +235 -0
wandb/sdk/launch/builder/docker_builder.py +8 -23
wandb/sdk/launch/builder/kaniko_builder.py +161 -159
wandb/sdk/launch/builder/noop.py +1 -0
wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
wandb/sdk/launch/create_job.py +68 -63
wandb/sdk/launch/environment/abstract.py +1 -0
wandb/sdk/launch/environment/gcp_environment.py +1 -0
wandb/sdk/launch/environment/local_environment.py +1 -0
wandb/sdk/launch/inputs/files.py +148 -0
wandb/sdk/launch/inputs/internal.py +217 -0
wandb/sdk/launch/inputs/manage.py +95 -0
wandb/sdk/launch/loader.py +1 -0
wandb/sdk/launch/registry/abstract.py +1 -0
wandb/sdk/launch/registry/azure_container_registry.py +1 -0
wandb/sdk/launch/registry/elastic_container_registry.py +1 -0
wandb/sdk/launch/registry/google_artifact_registry.py +2 -1
wandb/sdk/launch/registry/local_registry.py +1 -0
wandb/sdk/launch/runner/abstract.py +1 -0
wandb/sdk/launch/runner/kubernetes_monitor.py +4 -1
wandb/sdk/launch/runner/kubernetes_runner.py +9 -10
wandb/sdk/launch/runner/local_container.py +2 -3
wandb/sdk/launch/runner/local_process.py +8 -29
wandb/sdk/launch/runner/sagemaker_runner.py +21 -20
wandb/sdk/launch/runner/vertex_runner.py +8 -7
wandb/sdk/launch/sweeps/scheduler.py +7 -4
wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
wandb/sdk/launch/sweeps/utils.py +3 -3
wandb/sdk/launch/utils.py +33 -140
wandb/sdk/lib/_settings_toposort_generated.py +1 -5
wandb/sdk/lib/fsm.py +8 -12
wandb/sdk/lib/gitlib.py +4 -4
wandb/sdk/lib/import_hooks.py +1 -1
wandb/sdk/lib/lazyloader.py +0 -1
wandb/sdk/lib/proto_util.py +23 -2
wandb/sdk/lib/redirect.py +19 -14
wandb/sdk/lib/retry.py +3 -2
wandb/sdk/lib/run_moment.py +7 -1
wandb/sdk/lib/tracelog.py +1 -1
wandb/sdk/service/service.py +19 -16
wandb/sdk/verify/verify.py +2 -1
wandb/sdk/wandb_init.py +16 -63
wandb/sdk/wandb_manager.py +2 -2
wandb/sdk/wandb_require.py +5 -0
wandb/sdk/wandb_run.py +164 -90
wandb/sdk/wandb_settings.py +2 -48
wandb/sdk/wandb_setup.py +1 -1
wandb/sklearn/__init__.py +1 -0
wandb/sklearn/plot/__init__.py +1 -0
wandb/sklearn/plot/classifier.py +11 -12
wandb/sklearn/plot/clusterer.py +2 -1
wandb/sklearn/plot/regressor.py +1 -0
wandb/sklearn/plot/shared.py +1 -0
wandb/sklearn/utils.py +1 -0
wandb/testing/relay.py +4 -4
wandb/trigger.py +1 -0
wandb/util.py +67 -54
wandb/wandb_controller.py +2 -3
wandb/wandb_torch.py +1 -2
{wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/METADATA +67 -70
{wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/RECORD +178 -188
{wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/WHEEL +1 -2
wandb/bin/apple_gpu_stats +0 -0
wandb/catboost/__init__.py +0 -9
wandb/fastai/__init__.py +0 -9
wandb/keras/__init__.py +0 -18
wandb/lightgbm/__init__.py +0 -9
wandb/plots/__init__.py +0 -6
wandb/plots/explain_text.py +0 -36
wandb/plots/heatmap.py +0 -81
wandb/plots/named_entity.py +0 -43
wandb/plots/part_of_speech.py +0 -50
wandb/plots/plot_definitions.py +0 -768
wandb/plots/precision_recall.py +0 -121
wandb/plots/roc.py +0 -103
wandb/sacred/__init__.py +0 -3
wandb/xgboost/__init__.py +0 -9
wandb-0.16.5.dist-info/top_level.txt +0 -1
{wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/entry_points.txt +0 -0
{wandb-0.16.5.dist-info → wandb-0.17.0.dist-info/licenses}/LICENSE +0 -0

wandb/sdk/launch/agent/agent.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Implementation of launch agent."""
 import asyncio
 import logging
 import os
@@ -8,7 +9,9 @@ import time
 import traceback
 from dataclasses import dataclass
 from multiprocessing import Event
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
+import yaml
 import wandb
 from wandb.apis.internal import Api
@@ -17,11 +20,11 @@ from wandb.sdk.launch._launch_add import launch_add
 from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
 from wandb.sdk.launch.runner.local_process import LocalProcessRunner
 from wandb.sdk.launch.sweeps.scheduler import Scheduler
+from wandb.sdk.launch.utils import LAUNCH_CONFIG_FILE, resolve_build_and_registry_config
 from wandb.sdk.lib import runid
 from .. import loader
 from .._project_spec import LaunchProject
-from ..builder.build import construct_agent_configs
 from ..errors import LaunchDockerError, LaunchError
 from ..utils import (
     LAUNCH_DEFAULT_PROJECT,
@@ -45,7 +48,10 @@ MAX_RESUME_COUNT = 5
 RUN_INFO_GRACE_PERIOD = 60
-MAX_WAIT_RUN_STOPPED = 60
+DEFAULT_STOPPED_RUN_TIMEOUT = 60
+DEFAULT_PRINT_INTERVAL = 5 * 60
+VERBOSE_PRINT_INTERVAL = 20
 _env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
 if _env_timeout:
@@ -105,30 +111,54 @@ def _max_from_config(
     return max_from_config
-def _is_scheduler_job(run_spec: Dict[str, Any]) -> bool:
-    """Determine whether a job/runSpec is a sweep scheduler."""
-    if not run_spec:
-        _logger.debug("Recieved runSpec in _is_scheduler_job that was empty")
+class InternalAgentLogger:
+    def __init__(self, verbosity=0):
+        self._print_to_terminal = verbosity >= 2
-    if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
-        return False
+    def error(self, message: str):
+        if self._print_to_terminal:
+            wandb.termerror(f"{LOG_PREFIX}{message}")
+        _logger.error(f"{LOG_PREFIX}{message}")
-    if run_spec.get("resource") == "local-process":
-        # Any job pushed to a run queue that has a scheduler uri is
-        # allowed to use local-process
-        if run_spec.get("job"):
-            return True
+    def warn(self, message: str):
+        if self._print_to_terminal:
+            wandb.termwarn(f"{LOG_PREFIX}{message}")
+        _logger.warn(f"{LOG_PREFIX}{message}")
-        # If a scheduler is local-process and run through CLI, also
-        #    confirm command is in format: [wandb scheduler <sweep>]
-        cmd = run_spec.get("overrides", {}).get("entry_point", [])
-        if len(cmd) < 3:
-            return False
+    def info(self, message: str):
+        if self._print_to_terminal:
+            wandb.termlog(f"{LOG_PREFIX}{message}")
+        _logger.info(f"{LOG_PREFIX}{message}")
-        if cmd[:2] != ["wandb", "scheduler"]:
-            return False
+    def debug(self, message: str):
+        if self._print_to_terminal:
+            wandb.termlog(f"{LOG_PREFIX}{message}")
+        _logger.debug(f"{LOG_PREFIX}{message}")
-    return True
+def construct_agent_configs(
+    launch_config: Optional[Dict] = None,
+    build_config: Optional[Dict] = None,
+) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any], Dict[str, Any]]:
+    registry_config = None
+    environment_config = None
+    if launch_config is not None:
+        build_config = launch_config.get("builder")
+        registry_config = launch_config.get("registry")
+    default_launch_config = None
+    if os.path.exists(os.path.expanduser(LAUNCH_CONFIG_FILE)):
+        with open(os.path.expanduser(LAUNCH_CONFIG_FILE)) as f:
+            default_launch_config = (
+                yaml.safe_load(f) or {}
+            )  # In case the config is empty, we want it to be {} instead of None.
+        environment_config = default_launch_config.get("environment")
+    build_config, registry_config = resolve_build_and_registry_config(
+        default_launch_config, build_config, registry_config
+    )
+    return environment_config, build_config, registry_config
 class LaunchAgent:
@@ -170,7 +200,7 @@ class LaunchAgent:
             config: Config dictionary for the agent.
         """
         self._entity = config["entity"]
-        self._project = config.get("project", LAUNCH_DEFAULT_PROJECT)
+        self._project = LAUNCH_DEFAULT_PROJECT
         self._api = api
         self._base_url = self._api.settings().get("base_url")
         self._ticks = 0
@@ -184,7 +214,13 @@ class LaunchAgent:
         self._max_jobs = _max_from_config(config, "max_jobs")
         self._max_schedulers = _max_from_config(config, "max_schedulers")
         self._secure_mode = config.get("secure_mode", False)
+        self._verbosity = config.get("verbosity", 0)
+        self._internal_logger = InternalAgentLogger(verbosity=self._verbosity)
+        self._last_status_print_time = 0.0
         self.default_config: Dict[str, Any] = config
+        self._stopped_run_timeout = config.get(
+            "stopped_run_timeout", DEFAULT_STOPPED_RUN_TIMEOUT
+        )
         # Get agent version from env var if present, otherwise wandb version
         self.version: str = "wandb@" + wandb.__version__
@@ -228,6 +264,33 @@ class LaunchAgent:
         self._name = agent_response["name"]
         self._init_agent_run()
+    def _is_scheduler_job(self, run_spec: Dict[str, Any]) -> bool:
+        """Determine whether a job/runSpec is a sweep scheduler."""
+        if not run_spec:
+            self._internal_logger.debug(
+                "Received runSpec in _is_scheduler_job that was empty"
+            )
+        if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
+            return False
+        if run_spec.get("resource") == "local-process":
+            # Any job pushed to a run queue that has a scheduler uri is
+            # allowed to use local-process
+            if run_spec.get("job"):
+                return True
+            # If a scheduler is local-process and run through CLI, also
+            #    confirm command is in format: [wandb scheduler <sweep>]
+            cmd = run_spec.get("overrides", {}).get("entry_point", [])
+            if len(cmd) < 3:
+                return False
+            if cmd[:2] != ["wandb", "scheduler"]:
+                return False
+        return True
     async def fail_run_queue_item(
         self,
         run_queue_item_id: str,
@@ -241,6 +304,8 @@ class LaunchAgent:
     def _init_agent_run(self) -> None:
         # TODO: has it been long enough that all backends support agents?
+        self._wandb_run = None
         if self.gorilla_supports_agents:
             settings = wandb.Settings(silent=True, disable_git=True)
             self._wandb_run = wandb.init(
@@ -250,8 +315,6 @@ class LaunchAgent:
                 id=self._name,
                 job_type=HIDDEN_AGENT_RUN_TYPE,
             )
-        else:
-            self._wandb_run = None
     @property
     def thread_ids(self) -> List[int]:
@@ -298,14 +361,12 @@ class LaunchAgent:
     def print_status(self) -> None:
         """Prints the current status of the agent."""
+        self._last_status_print_time = time.time()
         output_str = "agent "
         if self._name:
             output_str += f"{self._name} "
         if self.num_running_jobs < self._max_jobs:
-            output_str += "polling on "
-            if self._project != LAUNCH_DEFAULT_PROJECT:
-                output_str += f"project {self._project}, "
-            output_str += f"queues {','.join(self._queues)}, "
+            output_str += f"polling on queues {','.join(self._queues)}, "
         output_str += (
             f"running {self.num_running_jobs} out of a maximum of {self._max_jobs} jobs"
         )
@@ -344,8 +405,8 @@ class LaunchAgent:
             if run_state.lower() != "pending":
                 return True
         except CommError:
-            _logger.info(
-                f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run"
+            self._internal_logger.info(
+                f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run",
             )
         return False
@@ -361,8 +422,8 @@ class LaunchAgent:
             job_and_run_status.entity is not None
             and job_and_run_status.entity != self._entity
         ):
-            _logger.info(
-                "Skipping check for completed run status because run is on a different entity than agent"
+            self._internal_logger.info(
+                "Skipping check for completed run status because run is on a different entity than agent",
             )
         elif exception is not None:
             tb_str = traceback.format_exception(
@@ -378,8 +439,8 @@ class LaunchAgent:
                 fnames,
             )
         elif job_and_run_status.project is None or job_and_run_status.run_id is None:
-            _logger.error(
-                f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}"
+            self._internal_logger.info(
+                f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}",
             )
             wandb.termerror(
                 "Missing project or run id on thread called finish thread id"
@@ -397,7 +458,6 @@ class LaunchAgent:
             # We retry for 60 seconds with an exponential backoff in case
             # upsert run is taking a while.
             logs = None
-            start_time = time.time()
             interval = 1
             while True:
                 called_init = self._check_run_exists_and_inited(
@@ -406,7 +466,7 @@ class LaunchAgent:
                     job_and_run_status.run_id,
                     job_and_run_status.run_queue_item_id,
                 )
-                if called_init or time.time() - start_time > RUN_INFO_GRACE_PERIOD:
+                if called_init or interval > RUN_INFO_GRACE_PERIOD:
                     break
                 if not called_init:
                     # Fetch the logs now if we don't get run info on the
@@ -430,7 +490,9 @@ class LaunchAgent:
                     job_and_run_status.run_queue_item_id, _msg, "run", fnames
                 )
         else:
-            _logger.info(f"Finish thread id {thread_id} had no exception and no run")
+            self._internal_logger.info(
+                f"Finish thread id {thread_id} had no exception and no run"
+            )
             wandb._sentry.exception(
                 "launch agent called finish thread id on thread without run or exception"
             )
@@ -458,7 +520,7 @@ class LaunchAgent:
         await self.update_status(AGENT_RUNNING)
         # parse job
-        _logger.info("Parsing launch spec")
+        self._internal_logger.info("Parsing launch spec")
         launch_spec = job["runSpec"]
         # Abort if this job attempts to override secure mode
@@ -511,6 +573,10 @@ class LaunchAgent:
             KeyboardInterrupt: if the agent is requested to stop.
         """
         self.print_status()
+        if self._verbosity == 0:
+            print_interval = DEFAULT_PRINT_INTERVAL
+        else:
+            print_interval = VERBOSE_PRINT_INTERVAL
         try:
             while True:
                 job = None
@@ -532,7 +598,7 @@ class LaunchAgent:
                             file_saver = RunQueueItemFileSaver(
                                 self._wandb_run, job["runQueueItemId"]
                             )
-                            if _is_scheduler_job(job.get("runSpec", {})):
+                            if self._is_scheduler_job(job.get("runSpec", {})):
                                 # If job is a scheduler, and we are already at the cap, ignore,
                                 #    don't ack, and it will be pushed back onto the queue in 1 min
                                 if self.num_running_schedulers >= self._max_schedulers:
@@ -567,6 +633,7 @@ class LaunchAgent:
                         await self.update_status(AGENT_POLLING)
                     else:
                         await self.update_status(AGENT_RUNNING)
+                if time.time() - self._last_status_print_time > print_interval:
                     self.print_status()
                 if self.num_running_jobs == self._max_jobs or job is None:
@@ -634,21 +701,21 @@ class LaunchAgent:
         await self.check_sweep_state(launch_spec, api)
         job_tracker.update_run_info(project)
-        _logger.info("Fetching and validating project...")
+        self._internal_logger.info("Fetching and validating project...")
         project.fetch_and_validate_project()
-        _logger.info("Fetching resource...")
+        self._internal_logger.info("Fetching resource...")
         resource = launch_spec.get("resource") or "local-container"
         backend_config: Dict[str, Any] = {
             PROJECT_SYNCHRONOUS: False,  # agent always runs async
         }
-        _logger.info("Loading backend")
+        self._internal_logger.info("Loading backend")
         override_build_config = launch_spec.get("builder")
         _, build_config, registry_config = construct_agent_configs(
             default_config, override_build_config
         )
         image_uri = project.docker_image
-        entrypoint = project.get_single_entry_point()
+        entrypoint = project.get_job_entry_point()
         environment = loader.environment_from_config(
             default_config.get("environment", {})
         )
@@ -661,13 +728,13 @@ class LaunchAgent:
             assert entrypoint is not None
             image_uri = await builder.build_image(project, entrypoint, job_tracker)
-        _logger.info("Backend loaded...")
+        self._internal_logger.info("Backend loaded...")
         if isinstance(backend, LocalProcessRunner):
             run = await backend.run(project, image_uri)
         else:
             assert image_uri
             run = await backend.run(project, image_uri)
-        if _is_scheduler_job(launch_spec):
+        if self._is_scheduler_job(launch_spec):
             with self._jobs_lock:
                 self._jobs[thread_id].is_scheduler = True
             wandb.termlog(
@@ -700,7 +767,7 @@ class LaunchAgent:
                 if stopped_time is None:
                     stopped_time = time.time()
                 else:
-                    if time.time() - stopped_time > MAX_WAIT_RUN_STOPPED:
+                    if time.time() - stopped_time > self._stopped_run_timeout:
                         await run.cancel()
             await asyncio.sleep(AGENT_POLLING_INTERVAL)
@@ -720,7 +787,7 @@ class LaunchAgent:
                     project=launch_spec["project"],
                 )
             except Exception as e:
-                _logger.debug(f"Fetch sweep state error: {e}")
+                self._internal_logger.debug(f"Fetch sweep state error: {e}")
                 state = None
             if state != "RUNNING" and state != "PAUSED":

wandb/sdk/launch/agent/config.py CHANGED Viewed

@@ -80,17 +80,7 @@ class RegistryConfig(BaseModel):
     @validator("uri")  # type: ignore
     @classmethod
     def validate_uri(cls, uri: str) -> str:
-        for regex in [
-            GCP_ARTIFACT_REGISTRY_URI_REGEX,
-            AZURE_CONTAINER_REGISTRY_URI_REGEX,
-            ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
-        ]:
-            if regex.match(uri):
-                return uri
-        raise ValueError(
-            "Invalid uri. URI must be a repository URI for an "
-            "ECR, ACR, or GCP Artifact Registry."
-        )
+        return validate_registry_uri(uri)
 class EnvironmentConfig(BaseModel):
@@ -186,6 +176,14 @@ class BuilderConfig(BaseModel):
         """Right now there are no required fields for docker builds."""
         return values
+    @validator("destination")  # type: ignore
+    @classmethod
+    def validate_destination(cls, destination: Optional[str]) -> Optional[str]:
+        """Validate that the destination is a valid container registry URI."""
+        if destination is None:
+            return None
+        return validate_registry_uri(destination)
 class AgentConfig(BaseModel):
     """Configuration for the Launch agent."""
@@ -194,9 +192,6 @@ class AgentConfig(BaseModel):
         default=[],
         description="The queues to use for this agent.",
     )
-    project: Optional[str] = Field(
-        description="The W&B project to use for this agent.",
-    )
     entity: Optional[str] = Field(
         description="The W&B entity to use for this agent.",
     )
@@ -225,6 +220,77 @@ class AgentConfig(BaseModel):
         None,
         description="The builder to use.",
     )
+    verbosity: Optional[int] = Field(
+        0,
+        description="How verbose to print, 0 = default, 1 = verbose, 2 = very verbose",
+    )
+    stopped_run_timeout: Optional[int] = Field(
+        60,
+        description="How many seconds to wait after receiving the stop command before forcibly cancelling a run.",
+    )
     class Config:
         extra = "forbid"
+def validate_registry_uri(uri: str) -> str:
+    """Validate that the registry URI is a valid container registry URI.
+    The URI should resolve to an image name in a container registry. The recognized
+    formats are for ECR, ACR, and GCP Artifact Registry. If the URI does not match
+    any of these formats, a warning is printed indicating the registry type is not
+    recognized and the agent can't guarantee that images can be pushed.
+    If the format is recognized but does not resolve to an image name, an
+    error is raised. For example, if the URI is an ECR URI but does not include
+    an image name or includes a tag as well as an image name, an error is raised.
+    """
+    tag_msg = (
+        "Destination for built images may not include a tag, but the URI provided "
+        "includes the suffix '{tag}'. Please remove the tag and try again. The agent "
+        "will automatically tag each image with a unique hash of the source code."
+    )
+    if uri.startswith("https://"):
+        uri = uri[8:]
+    match = GCP_ARTIFACT_REGISTRY_URI_REGEX.match(uri)
+    if match:
+        if match.group("tag"):
+            raise ValueError(tag_msg.format(tag=match.group("tag")))
+        if not match.group("image_name"):
+            raise ValueError(
+                "An image name must be specified in the URI for a GCP Artifact Registry. "
+                "Please provide a uri with the format "
+                "'https://<region>-docker.pkg.dev/<project>/<repository>/<image>'."
+            )
+        return uri
+    match = AZURE_CONTAINER_REGISTRY_URI_REGEX.match(uri)
+    if match:
+        if match.group("tag"):
+            raise ValueError(tag_msg.format(tag=match.group("tag")))
+        if not match.group("repository"):
+            raise ValueError(
+                "A repository name must be specified in the URI for an "
+                "Azure Container Registry. Please provide a uri with the format "
+                "'https://<registry-name>.azurecr.io/<repository>'."
+            )
+        return uri
+    match = ELASTIC_CONTAINER_REGISTRY_URI_REGEX.match(uri)
+    if match:
+        if match.group("tag"):
+            raise ValueError(tag_msg.format(tag=match.group("tag")))
+        if not match.group("repository"):
+            raise ValueError(
+                "A repository name must be specified in the URI for an "
+                "Elastic Container Registry. Please provide a uri with the format "
+                "'https://<account-id>.dkr.ecr.<region>.amazonaws.com/<repository>'."
+            )
+        return uri
+    wandb.termwarn(
+        f"Unable to recognize registry type in URI {uri}. You are responsible "
+        "for ensuring the agent can push images to this registry."
+    )
+    return uri

wandb/sdk/launch/builder/abstract.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Abstract plugin class defining the interface needed to build container images for W&B Launch."""
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, Optional
@@ -6,6 +7,12 @@ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
 from wandb.sdk.launch.registry.abstract import AbstractRegistry
 from .._project_spec import EntryPoint, LaunchProject
+from ..registry.anon import AnonynmousRegistry
+from ..utils import (
+    AZURE_CONTAINER_REGISTRY_URI_REGEX,
+    ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
+    GCP_ARTIFACT_REGISTRY_URI_REGEX,
+)
 if TYPE_CHECKING:
     from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
@@ -34,7 +41,7 @@ class AbstractBuilder(ABC):
             verify: Whether to verify the functionality of the builder.
         Raises:
-            LaunchError: If the builder cannot be intialized or verified.
+            LaunchError: If the builder cannot be initialized or verified.
         """
         raise NotImplementedError
@@ -86,3 +93,64 @@ class AbstractBuilder(ABC):
             LaunchError: If the builder cannot be used to build images.
         """
         raise NotImplementedError
+def registry_from_uri(uri: str) -> AbstractRegistry:
+    """Create a registry helper object from a uri.
+    This function parses the URI and determines which supported registry it
+    belongs to. It then creates a registry helper object for that registry.
+    The supported remote registry types are:
+    - Azure Container Registry
+    - Google Container Registry
+    - AWS Elastic Container Registry
+    The format of the URI is as follows:
+    - Azure Container Registry: <registry-name>.azurecr.io/<repo-name>/<image-name>
+    - Google Container Registry: <location>-docker.pkg.dev/<project-id>/<repo-name>/<image-name>
+    - AWS Elastic Container Registry: <account-id>.dkr.ecr.<region>.amazonaws.com/<repo-name>/<image-name>
+    Our classification of the registry is based on the domain name. For example,
+    if the uri contains `.azurecr.io`, we classify it as an Azure
+    Container Registry. If the uri contains `.dkr.ecr`, we classify
+    it as an AWS Elastic Container Registry. If the uri contains
+    `-docker.pkg.dev`, we classify it as a Google Artifact Registry.
+    This function will attempt to load the approriate cloud helpers for the
+    `https://` prefix is optional for all of the above.
+    Arguments:
+        uri: The uri to create a registry from.
+    Returns:
+        The registry.
+    Raises:
+        LaunchError: If the registry helper cannot be loaded for the given URI.
+    """
+    if uri.startswith("https://"):
+        uri = uri[len("https://") :]
+    if AZURE_CONTAINER_REGISTRY_URI_REGEX.match(uri) is not None:
+        from wandb.sdk.launch.registry.azure_container_registry import (
+            AzureContainerRegistry,
+        )
+        return AzureContainerRegistry(uri=uri)
+    elif GCP_ARTIFACT_REGISTRY_URI_REGEX.match(uri) is not None:
+        from wandb.sdk.launch.registry.google_artifact_registry import (
+            GoogleArtifactRegistry,
+        )
+        return GoogleArtifactRegistry(uri=uri)
+    elif ELASTIC_CONTAINER_REGISTRY_URI_REGEX.match(uri) is not None:
+        from wandb.sdk.launch.registry.elastic_container_registry import (
+            ElasticContainerRegistry,
+        )
+        return ElasticContainerRegistry(uri=uri)
+    return AnonynmousRegistry(uri=uri)

wandb 0.16.5__py3-none-any.whl → 0.17.0__py3-none-any.whl

wandb 0.16.5-py3-none-any.whl → 0.17.0-py3-none-any.whl