wandb 0.16.5__py3-none-any.whl → 0.17.0rc1__py3-none-any.whl

Files changed (141)
  1. package_readme.md +95 -0
  2. wandb/__init__.py +2 -2
  3. wandb/agents/pyagent.py +0 -1
  4. wandb/analytics/sentry.py +2 -1
  5. wandb/apis/importers/internals/protocols.py +30 -56
  6. wandb/apis/importers/mlflow.py +13 -26
  7. wandb/apis/importers/wandb.py +8 -14
  8. wandb/apis/public/api.py +1 -0
  9. wandb/apis/public/artifacts.py +1 -0
  10. wandb/apis/public/files.py +1 -0
  11. wandb/apis/public/history.py +1 -0
  12. wandb/apis/public/jobs.py +1 -0
  13. wandb/apis/public/projects.py +1 -0
  14. wandb/apis/public/reports.py +1 -0
  15. wandb/apis/public/runs.py +1 -0
  16. wandb/apis/public/sweeps.py +1 -0
  17. wandb/apis/public/teams.py +1 -0
  18. wandb/apis/public/users.py +1 -0
  19. wandb/apis/reports/v1/_blocks.py +2 -6
  20. wandb/apis/reports/v2/gql.py +1 -0
  21. wandb/apis/reports/v2/interface.py +3 -4
  22. wandb/apis/reports/v2/internal.py +5 -8
  23. wandb/cli/cli.py +7 -4
  24. wandb/data_types.py +3 -3
  25. wandb/env.py +35 -5
  26. wandb/errors/__init__.py +5 -0
  27. wandb/integration/catboost/catboost.py +1 -1
  28. wandb/integration/fastai/__init__.py +1 -0
  29. wandb/integration/keras/__init__.py +1 -0
  30. wandb/integration/keras/keras.py +6 -6
  31. wandb/integration/langchain/wandb_tracer.py +1 -0
  32. wandb/integration/lightning/fabric/logger.py +1 -3
  33. wandb/integration/metaflow/metaflow.py +41 -6
  34. wandb/integration/openai/fine_tuning.py +77 -40
  35. wandb/keras/__init__.py +1 -0
  36. wandb/proto/v3/wandb_internal_pb2.py +364 -332
  37. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  38. wandb/proto/v4/wandb_internal_pb2.py +322 -316
  39. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  40. wandb/proto/wandb_internal_codegen.py +0 -25
  41. wandb/sdk/artifacts/artifact.py +41 -13
  42. wandb/sdk/artifacts/artifact_download_logger.py +1 -0
  43. wandb/sdk/artifacts/artifact_file_cache.py +18 -4
  44. wandb/sdk/artifacts/artifact_instance_cache.py +1 -0
  45. wandb/sdk/artifacts/artifact_manifest.py +1 -0
  46. wandb/sdk/artifacts/artifact_manifest_entry.py +1 -0
  47. wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
  48. wandb/sdk/artifacts/artifact_saver.py +21 -21
  49. wandb/sdk/artifacts/artifact_state.py +1 -0
  50. wandb/sdk/artifacts/artifact_ttl.py +1 -0
  51. wandb/sdk/artifacts/exceptions.py +1 -0
  52. wandb/sdk/artifacts/storage_handlers/azure_handler.py +1 -0
  53. wandb/sdk/artifacts/storage_handlers/gcs_handler.py +13 -18
  54. wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -0
  55. wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -0
  56. wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -0
  57. wandb/sdk/artifacts/storage_handlers/s3_handler.py +5 -3
  58. wandb/sdk/artifacts/storage_handlers/tracking_handler.py +1 -0
  59. wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -0
  60. wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -0
  61. wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +1 -0
  62. wandb/sdk/artifacts/storage_policy.py +1 -0
  63. wandb/sdk/data_types/base_types/media.py +3 -6
  64. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
  65. wandb/sdk/integration_utils/auto_logging.py +5 -6
  66. wandb/sdk/integration_utils/data_logging.py +5 -1
  67. wandb/sdk/interface/interface.py +72 -37
  68. wandb/sdk/interface/interface_shared.py +7 -13
  69. wandb/sdk/internal/datastore.py +1 -1
  70. wandb/sdk/internal/handler.py +18 -2
  71. wandb/sdk/internal/internal.py +0 -1
  72. wandb/sdk/internal/internal_util.py +0 -1
  73. wandb/sdk/internal/job_builder.py +4 -3
  74. wandb/sdk/internal/profiler.py +1 -0
  75. wandb/sdk/internal/run.py +1 -0
  76. wandb/sdk/internal/sender.py +1 -1
  77. wandb/sdk/internal/system/assets/gpu_amd.py +44 -44
  78. wandb/sdk/internal/system/assets/gpu_apple.py +56 -11
  79. wandb/sdk/internal/system/assets/interfaces.py +6 -8
  80. wandb/sdk/internal/system/assets/open_metrics.py +2 -2
  81. wandb/sdk/internal/system/assets/trainium.py +1 -3
  82. wandb/sdk/launch/_launch.py +5 -0
  83. wandb/sdk/launch/_project_spec.py +10 -23
  84. wandb/sdk/launch/agent/agent.py +81 -37
  85. wandb/sdk/launch/agent/config.py +80 -11
  86. wandb/sdk/launch/builder/abstract.py +1 -0
  87. wandb/sdk/launch/builder/build.py +28 -1
  88. wandb/sdk/launch/builder/docker_builder.py +1 -0
  89. wandb/sdk/launch/builder/kaniko_builder.py +149 -134
  90. wandb/sdk/launch/builder/noop.py +1 -0
  91. wandb/sdk/launch/create_job.py +61 -48
  92. wandb/sdk/launch/environment/abstract.py +1 -0
  93. wandb/sdk/launch/environment/gcp_environment.py +1 -0
  94. wandb/sdk/launch/environment/local_environment.py +1 -0
  95. wandb/sdk/launch/loader.py +1 -0
  96. wandb/sdk/launch/registry/abstract.py +1 -0
  97. wandb/sdk/launch/registry/azure_container_registry.py +1 -0
  98. wandb/sdk/launch/registry/elastic_container_registry.py +1 -0
  99. wandb/sdk/launch/registry/google_artifact_registry.py +1 -0
  100. wandb/sdk/launch/registry/local_registry.py +1 -0
  101. wandb/sdk/launch/runner/abstract.py +1 -0
  102. wandb/sdk/launch/runner/kubernetes_monitor.py +4 -1
  103. wandb/sdk/launch/runner/kubernetes_runner.py +4 -3
  104. wandb/sdk/launch/runner/sagemaker_runner.py +11 -10
  105. wandb/sdk/launch/sweeps/scheduler.py +4 -1
  106. wandb/sdk/launch/sweeps/scheduler_sweep.py +1 -0
  107. wandb/sdk/launch/sweeps/utils.py +1 -1
  108. wandb/sdk/launch/utils.py +21 -3
  109. wandb/sdk/lib/_settings_toposort_generated.py +1 -0
  110. wandb/sdk/lib/fsm.py +8 -12
  111. wandb/sdk/lib/gitlib.py +4 -4
  112. wandb/sdk/lib/lazyloader.py +0 -1
  113. wandb/sdk/lib/proto_util.py +1 -1
  114. wandb/sdk/lib/retry.py +3 -2
  115. wandb/sdk/lib/run_moment.py +7 -1
  116. wandb/sdk/service/service.py +17 -15
  117. wandb/sdk/verify/verify.py +2 -1
  118. wandb/sdk/wandb_init.py +2 -8
  119. wandb/sdk/wandb_manager.py +2 -2
  120. wandb/sdk/wandb_require.py +5 -0
  121. wandb/sdk/wandb_run.py +64 -46
  122. wandb/sdk/wandb_settings.py +2 -1
  123. wandb/sklearn/__init__.py +1 -0
  124. wandb/sklearn/plot/__init__.py +1 -0
  125. wandb/sklearn/plot/classifier.py +1 -0
  126. wandb/sklearn/plot/clusterer.py +1 -0
  127. wandb/sklearn/plot/regressor.py +1 -0
  128. wandb/sklearn/plot/shared.py +1 -0
  129. wandb/sklearn/utils.py +1 -0
  130. wandb/testing/relay.py +4 -4
  131. wandb/trigger.py +1 -0
  132. wandb/util.py +40 -17
  133. wandb/wandb_controller.py +0 -1
  134. wandb/wandb_torch.py +1 -2
  135. {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/METADATA +68 -69
  136. {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/RECORD +139 -140
  137. {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/WHEEL +1 -2
  138. wandb/bin/apple_gpu_stats +0 -0
  139. wandb-0.16.5.dist-info/top_level.txt +0 -1
  140. {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info}/entry_points.txt +0 -0
  141. {wandb-0.16.5.dist-info → wandb-0.17.0rc1.dist-info/licenses}/LICENSE +0 -0
@@ -291,7 +291,11 @@ def _infer_single_example_keyed_processor(
     ):
         # assume this is a class
         if class_labels_table is not None:
-            processors["class"] = lambda n, d, p: class_labels_table.index_ref(d[0]) if d[0] < len(class_labels_table.data) else d[0]  # type: ignore
+            processors["class"] = (
+                lambda n, d, p: class_labels_table.index_ref(d[0])
+                if d[0] < len(class_labels_table.data)
+                else d[0]
+            )  # type: ignore
         else:
             processors["val"] = lambda n, d, p: d[0]
     elif len(shape) == 1:
@@ -25,6 +25,7 @@ from typing import (
     Union,
 )

+from wandb import termwarn
 from wandb.proto import wandb_internal_pb2 as pb
 from wandb.proto import wandb_telemetry_pb2 as tpb
 from wandb.sdk.artifacts.artifact import Artifact

@@ -423,7 +424,7 @@ class InterfaceBase:
             job_info=job_info,
             metadata=metadata,
         )
-        use_artifact.partial.source_info.source.ParseFromString(src_str)
+        use_artifact.partial.source_info.source.ParseFromString(src_str)  # type: ignore[arg-type]

         return use_artifact

@@ -447,16 +448,27 @@ class InterfaceBase:
             path = artifact.get_entry("wandb-job.json").download()
             with open(path) as f:
                 job_info = json.load(f)
+
         except Exception as e:
             logger.warning(
                 f"Failed to download partial job info from artifact {artifact}, : {e}"
             )
-        use_artifact = self._make_proto_use_artifact(
-            use_artifact=use_artifact,
-            job_name=artifact.name,
-            job_info=job_info,
-            metadata=artifact.metadata,
-        )
+            termwarn(
+                f"Failed to download partial job info from artifact {artifact}, : {e}"
+            )
+            return
+
+        try:
+            use_artifact = self._make_proto_use_artifact(
+                use_artifact=use_artifact,
+                job_name=artifact.name,
+                job_info=job_info,
+                metadata=artifact.metadata,
+            )
+        except Exception as e:
+            logger.warning(f"Failed to construct use artifact proto: {e}")
+            termwarn(f"Failed to construct use artifact proto: {e}")
+            return

         self._publish_use_artifact(use_artifact)

@@ -504,11 +516,15 @@ class InterfaceBase:
         artifact_id: str,
         download_root: str,
         allow_missing_references: bool,
+        skip_cache: bool,
+        path_prefix: Optional[str],
     ) -> MailboxHandle:
         download_artifact = pb.DownloadArtifactRequest()
         download_artifact.artifact_id = artifact_id
         download_artifact.download_root = download_root
         download_artifact.allow_missing_references = allow_missing_references
+        download_artifact.skip_cache = skip_cache
+        download_artifact.path_prefix = path_prefix or ""
         resp = self._deliver_download_artifact(download_artifact)
         return resp

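The two new fields carry download options through the internal interface. A minimal sketch of exercising them end to end, assuming the public Artifact.download() in this release accepts matching skip_cache and path_prefix keywords (the artifact name below is hypothetical):

import wandb

# Hedged sketch: keyword names assumed to mirror the new DownloadArtifactRequest fields.
api = wandb.Api()
artifact = api.artifact("my-entity/my-project/dataset:latest")  # hypothetical artifact
root = artifact.download(
    skip_cache=True,        # bypass the shared artifact file cache
    path_prefix="images/",  # only fetch manifest entries under this prefix
)
print(root)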
@@ -717,6 +733,55 @@ class InterfaceBase:
     def _publish_keepalive(self, keepalive: pb.KeepaliveRequest) -> None:
         raise NotImplementedError

+    def publish_job_input(
+        self,
+        include_paths: List[List[str]],
+        exclude_paths: List[List[str]],
+        run_config: bool = False,
+        file_path: str = "",
+    ):
+        """Publishes a request to add inputs to the job.
+
+        If run_config is True, the wandb.config will be added as a job input.
+        If file_path is provided, the file at file_path will be added as a job
+        input.
+
+        The paths provided as arguments are sequences of dictionary keys that
+        specify a path within the wandb.config. If a path is included, the
+        corresponding field will be treated as a job input. If a path is
+        excluded, the corresponding field will not be treated as a job input.
+
+        Args:
+            include_paths: paths within config to include as job inputs.
+            exclude_paths: paths within config to exclude as job inputs.
+            run_config: bool indicating whether wandb.config is the input source.
+            file_path: path to file to include as a job input.
+        """
+        if run_config and file_path:
+            raise ValueError(
+                "run_config and file_path are mutually exclusive arguments."
+            )
+        request = pb.JobInputRequest()
+        include_records = [pb.JobInputPath(path=path) for path in include_paths]
+        exclude_records = [pb.JobInputPath(path=path) for path in exclude_paths]
+        request.include_paths.extend(include_records)
+        request.exclude_paths.extend(exclude_records)
+        source = pb.JobInputSource(
+            run_config=pb.JobInputSource.RunConfigSource(),
+        )
+        if run_config:
+            source.run_config.CopyFrom(pb.JobInputSource.RunConfigSource())
+        else:
+            source.file.CopyFrom(
+                pb.JobInputSource.ConfigFileSource(path=file_path),
+            )
+
+        return self._publish_job_input(request)
+
+    @abstractmethod
+    def _publish_job_input(self, request: pb.JobInputRequest) -> MailboxHandle:
+        raise NotImplementedError
+
     def join(self) -> None:
         # Drop indicates that the internal process has already been shutdown
         if self._drop:
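A rough usage sketch of the new method, grounded in the docstring above; the config keys ("trainer", "seed") are invented for illustration and `interface` stands for any concrete InterfaceBase implementation:

# Hedged sketch: declare which wandb.config fields count as job inputs.
interface.publish_job_input(
    include_paths=[["trainer"]],          # everything under config["trainer"] is an input
    exclude_paths=[["trainer", "seed"]],  # ...except config["trainer"]["seed"]
    run_config=True,                      # source is wandb.config, so file_path stays empty
)

Note that run_config and file_path are mutually exclusive (the ValueError above), so exactly one input source is chosen per call.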
@@ -767,36 +832,6 @@ class InterfaceBase:
         run_start.run.CopyFrom(run_pb)
         return self._deliver_run_start(run_start)

-    def publish_launch_wandb_config_parameters(
-        self, include_paths: List[List[str]], exclude_paths: List[List[str]]
-    ):
-        """Tells the internal process to treat wandb.config fields as job inputs.
-
-        The paths provided as arguments are sequences of dictionary keys that
-        specify a path within the wandb.config. If a path is included, the
-        corresponding field will be treated as a job input. If a path is
-        excluded, the corresponding field will not be treated as a job input.
-
-        Args:
-            include_paths: paths within config to include as job inputs.
-            exclude_paths: paths within config to exclude as job inputs.
-
-        Returns:
-            None
-        """
-        config_parameters = pb.LaunchWandbConfigParametersRecord()
-        include_records = [pb.ConfigFilterPath(path=path) for path in include_paths]
-        exclude_records = [pb.ConfigFilterPath(path=path) for path in exclude_paths]
-        config_parameters.include_paths.extend(include_records)
-        config_parameters.exclude_paths.extend(exclude_records)
-        return self._publish_launch_wandb_config_parameters(config_parameters)
-
-    @abstractmethod
-    def _publish_launch_wandb_config_parameters(
-        self, config_parameters: pb.LaunchWandbConfigParametersRecord
-    ) -> None:
-        raise NotImplementedError
-
     @abstractmethod
     def _deliver_run_start(self, run_start: pb.RunStartRequest) -> MailboxHandle:
         raise NotImplementedError
@@ -100,6 +100,10 @@ class InterfaceShared(InterfaceBase):
         rec = self._make_record(telemetry=telem)
         self._publish(rec)

+    def _publish_job_input(self, job_input: pb.JobInputRequest) -> MailboxHandle:
+        record = self._make_request(job_input=job_input)
+        return self._deliver_record(record)
+
     def _make_stats(self, stats_dict: dict) -> pb.StatsRecord:
         stats = pb.StatsRecord()
         stats.stats_type = pb.StatsRecord.StatsType.SYSTEM

@@ -147,6 +151,7 @@ class InterfaceShared(InterfaceBase):
         telemetry_record: Optional[pb.TelemetryRecordRequest] = None,
         get_system_metrics: Optional[pb.GetSystemMetricsRequest] = None,
         python_packages: Optional[pb.PythonPackagesRequest] = None,
+        job_input: Optional[pb.JobInputRequest] = None,
     ) -> pb.Record:
         request = pb.Request()
         if login:

@@ -207,6 +212,8 @@ class InterfaceShared(InterfaceBase):
             request.sync.CopyFrom(sync)
         elif python_packages:
             request.python_packages.CopyFrom(python_packages)
+        elif job_input:
+            request.job_input.CopyFrom(job_input)
         else:
             raise Exception("Invalid request")
         record = self._make_record(request=request)

@@ -239,9 +246,6 @@ class InterfaceShared(InterfaceBase):
         use_artifact: Optional[pb.UseArtifactRecord] = None,
         output: Optional[pb.OutputRecord] = None,
         output_raw: Optional[pb.OutputRawRecord] = None,
-        launch_wandb_config_parameters: Optional[
-            pb.LaunchWandbConfigParametersRecord
-        ] = None,
     ) -> pb.Record:
         record = pb.Record()
         if run:

@@ -286,8 +290,6 @@ class InterfaceShared(InterfaceBase):
             record.output.CopyFrom(output)
         elif output_raw:
             record.output_raw.CopyFrom(output_raw)
-        elif launch_wandb_config_parameters:
-            record.wandb_config_parameters.CopyFrom(launch_wandb_config_parameters)
         else:
             raise Exception("Invalid record")
         return record

@@ -417,14 +419,6 @@ class InterfaceShared(InterfaceBase):
         rec = self._make_record(alert=proto_alert)
         self._publish(rec)

-    def _publish_launch_wandb_config_parameters(
-        self, launch_wandb_config_parameters: pb.LaunchWandbConfigParametersRecord
-    ) -> None:
-        rec = self._make_record(
-            launch_wandb_config_parameters=launch_wandb_config_parameters
-        )
-        self._publish(rec)
-
     def _communicate_status(
         self, status: pb.StatusRequest
     ) -> Optional[pb.StatusResponse]:
@@ -52,7 +52,7 @@ try:
     bytes("", "ascii")

     def strtobytes(x):
-        """strtobytes."""
+        """Strtobytes."""
         return bytes(x, "iso8859-1")

     # def bytestostr(x):
@@ -50,6 +50,18 @@ SummaryDict = Dict[str, Any]

 logger = logging.getLogger(__name__)

+# Update (March 5, 2024): Since ~2020/2021, when constructing the summary
+# object, we had replaced the artifact path for media types with the latest
+# artifact path. The primary purpose of this was to support live updating of
+# media objects in the UI (since the default artifact path was fully qualified
+# and would not update). However, in March of 2024, a bug was discovered with
+# this approach which causes this path to be incorrect in cases where the media
+# object is logged to another artifact before being logged to the run. Setting
+# this to `False` disables this copy behavior. The impact is that users will
+# need to refresh to see updates. Ironically, this updating behavior is not
+# currently supported in the UI, so the impact of this change is minimal.
+REPLACE_SUMMARY_ART_PATH_WITH_LATEST = False
+

 def _dict_nested_set(target: Dict[str, Any], key_list: Sequence[str], v: Any) -> None:
     # recurse down the dictionary structure:

@@ -371,7 +383,11 @@ class HandleManager:
             updated = True
             return updated
         # If the dict is a media object, update the pointer to the latest alias
-        elif isinstance(v, dict) and handler_util.metric_is_wandb_dict(v):
+        elif (
+            REPLACE_SUMMARY_ART_PATH_WITH_LATEST
+            and isinstance(v, dict)
+            and handler_util.metric_is_wandb_dict(v)
+        ):
             if "_latest_artifact_path" in v and "artifact_path" in v:
                 # TODO: Make non-destructive?
                 v["artifact_path"] = v["_latest_artifact_path"]

@@ -381,7 +397,7 @@ class HandleManager:
     def _update_summary_media_objects(self, v: Dict[str, Any]) -> Dict[str, Any]:
         # For now, non-recursive - just top level
         for nk, nv in v.items():
-            if (
+            if REPLACE_SUMMARY_ART_PATH_WITH_LATEST and (
                 isinstance(nv, dict)
                 and handler_util.metric_is_wandb_dict(nv)
                 and "_latest_artifact_path" in nv
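For reference, a standalone sketch of the copy step that REPLACE_SUMMARY_ART_PATH_WITH_LATEST now disables; the dict mimics a media object's summary entry and the paths are invented:

# Hedged sketch of the old behavior: point the summary at the ":latest" alias
# instead of the fully qualified artifact path.
REPLACE_SUMMARY_ART_PATH_WITH_LATEST = False  # new default in this release

summary_entry = {
    "_type": "image-file",
    "artifact_path": "wandb-artifact://my-entity/my-project/run-media:v3/img.png",
    "_latest_artifact_path": "wandb-artifact://my-entity/my-project/run-media:latest/img.png",
}

if REPLACE_SUMMARY_ART_PATH_WITH_LATEST and "_latest_artifact_path" in summary_entry:
    summary_entry["artifact_path"] = summary_entry["_latest_artifact_path"]  # no longer runs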
@@ -12,7 +12,6 @@ Threads:

 """

-
 import atexit
 import logging
 import os

@@ -4,7 +4,6 @@ Collection of classes to support the internal process.

 """

-
 import logging
 import queue
 import sys

@@ -1,4 +1,5 @@
 """job builder."""
+
 import json
 import logging
 import os

@@ -105,9 +106,9 @@ class JobBuilder:
         self._disable = settings.disable_job_creation
         self._partial_source = None
         self._aliases = []
-        self._source_type: Optional[
-            Literal["repo", "artifact", "image"]
-        ] = settings.job_source  # type: ignore[assignment]
+        self._source_type: Optional[Literal["repo", "artifact", "image"]] = (
+            settings.job_source  # type: ignore[assignment]
+        )
         self._is_notebook_run = self._get_is_notebook_run()
         self._verbose = verbose

@@ -1,4 +1,5 @@
 """Integration with pytorch profiler."""
+
 import os

 import wandb
wandb/sdk/internal/run.py CHANGED
@@ -4,6 +4,7 @@
 Semi-stubbed run for internal process use.

 """
+
 from wandb._globals import _datatypes_set_callback

 from .. import wandb_run
@@ -910,7 +910,7 @@ class SendManager:
         is_wandb_init = self._run is None

         # save start time of a run
-        self._start_time = run.start_time.ToMicroseconds() // 1e6
+        self._start_time = int(run.start_time.ToMicroseconds() // 1e6)

         # update telemetry
         if run.telemetry:
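The int() wrapper matters because ToMicroseconds() returns an int but floor-dividing by the float literal 1e6 produces a float; a quick worked check with an invented timestamp:

# Before: float start time; after: integral seconds.
micros = 1_713_000_000_500_000            # hypothetical run start, in microseconds
assert micros // 1e6 == 1_713_000_000.0   # float result of the old expression
assert int(micros // 1e6) == 1_713_000_000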
@@ -28,14 +28,6 @@ logger = logging.getLogger(__name__)
 ROCM_SMI_CMD: Final[str] = shutil.which("rocm-smi") or "/usr/bin/rocm-smi"


-def get_rocm_smi_stats() -> Dict[str, Any]:
-    command = [str(ROCM_SMI_CMD), "-a", "--json"]
-    output = subprocess.check_output(command, universal_newlines=True).strip()
-    if "No AMD GPUs specified" in output:
-        return {}
-    return json.loads(output.split("\n")[0])  # type: ignore
-
-
 _StatsKeys = Literal[
     "gpu",
     "memoryAllocated",

@@ -49,6 +41,48 @@ _Stats = Dict[_StatsKeys, float]
 _InfoDict = Dict[str, Union[int, List[Dict[str, Any]]]]


+def get_rocm_smi_stats() -> Dict[str, Any]:
+    command = [str(ROCM_SMI_CMD), "-a", "--json"]
+    output = subprocess.check_output(command, universal_newlines=True).strip()
+    if "No AMD GPUs specified" in output:
+        return {}
+    return json.loads(output.split("\n")[0])  # type: ignore
+
+
+def parse_stats(stats: Dict[str, str]) -> _Stats:
+    """Parse stats from rocm-smi output."""
+    parsed_stats: _Stats = {}
+
+    try:
+        parsed_stats["gpu"] = float(stats.get("GPU use (%)"))  # type: ignore
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU usage as float")
+    try:
+        parsed_stats["memoryAllocated"] = float(stats.get("GPU memory use (%)"))  # type: ignore
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU memory allocation as float")
+    try:
+        parsed_stats["temp"] = float(stats.get("Temperature (Sensor memory) (C)"))  # type: ignore
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU temperature as float")
+    try:
+        parsed_stats["powerWatts"] = float(
+            stats.get("Average Graphics Package Power (W)")  # type: ignore
+        )
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU power as float")
+    try:
+        parsed_stats["powerPercent"] = (
+            float(stats.get("Average Graphics Package Power (W)"))  # type: ignore
+            / float(stats.get("Max Graphics Package Power (W)"))  # type: ignore
+            * 100
+        )
+    except (TypeError, ValueError):
+        logger.warning("Could not parse GPU average/max power as float")
+
+    return parsed_stats
+
+
 class GPUAMDStats:
     """Stats for AMD GPU devices."""

@@ -58,40 +92,6 @@ class GPUAMDStats:
     def __init__(self) -> None:
         self.samples = deque()

-    @staticmethod
-    def parse_stats(stats: Dict[str, str]) -> _Stats:
-        """Parse stats from rocm-smi output."""
-        parsed_stats: _Stats = {}
-
-        try:
-            parsed_stats["gpu"] = float(stats.get("GPU use (%)"))  # type: ignore
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU usage as float")
-        try:
-            parsed_stats["memoryAllocated"] = float(stats.get("GPU memory use (%)"))  # type: ignore
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU memory allocation as float")
-        try:
-            parsed_stats["temp"] = float(stats.get("Temperature (Sensor memory) (C)"))  # type: ignore
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU temperature as float")
-        try:
-            parsed_stats["powerWatts"] = float(
-                stats.get("Average Graphics Package Power (W)")  # type: ignore
-            )
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU power as float")
-        try:
-            parsed_stats["powerPercent"] = (
-                float(stats.get("Average Graphics Package Power (W)"))  # type: ignore
-                / float(stats.get("Max Graphics Package Power (W)"))  # type: ignore
-                * 100
-            )
-        except (TypeError, ValueError):
-            logger.warning("Could not parse GPU average/max power as float")
-
-        return parsed_stats
-
     def sample(self) -> None:
         try:
             raw_stats = get_rocm_smi_stats()

@@ -103,7 +103,7 @@ class GPUAMDStats:

         for card_key in card_keys:
             card_stats = raw_stats[card_key]
-            stats = self.parse_stats(card_stats)
+            stats = parse_stats(card_stats)
             if stats:
                 cards.append(stats)

@@ -183,7 +183,7 @@ class GPUAMD:

         can_read_rocm_smi = False
         try:
-            if get_rocm_smi_stats():
+            if parse_stats(get_rocm_smi_stats()):
                 can_read_rocm_smi = True
         except Exception:
             pass
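To make the parsing above concrete, a hedged example of feeding the now module-level parse_stats a rocm-smi style card dict; the field names match the keys read above, the values are invented:

# Hedged sketch: missing or non-numeric fields are skipped with a warning.
card_stats = {
    "GPU use (%)": "13",
    "GPU memory use (%)": "42",
    "Temperature (Sensor memory) (C)": "56.0",
    "Average Graphics Package Power (W)": "101.0",
    "Max Graphics Package Power (W)": "300.0",
}
parsed = parse_stats(card_stats)
# parsed -> {"gpu": 13.0, "memoryAllocated": 42.0, "temp": 56.0,
#            "powerWatts": 101.0, "powerPercent": 101 / 300 * 100 (about 33.67)}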
@@ -37,6 +37,12 @@ class _Stats(TypedDict):
     # cpuWaitMs: float


+def get_apple_gpu_path() -> pathlib.Path:
+    return (
+        pathlib.Path(sys.modules["wandb"].__path__[0]) / "bin" / "apple_gpu_stats"
+    ).resolve()
+
+
 class GPUAppleStats:
     """Apple GPU stats available on Arm Macs."""

@@ -49,9 +55,7 @@ class GPUAppleStats:

     def __init__(self) -> None:
         self.samples = deque()
-        self.binary_path = (
-            pathlib.Path(sys.modules["wandb"].__path__[0]) / "bin" / "apple_gpu_stats"
-        ).resolve()
+        self.binary_path = get_apple_gpu_path()

     def sample(self) -> None:
         try:

@@ -63,22 +67,47 @@ class GPUAppleStats:
             )[0]
             raw_stats = json.loads(output)

+            temp_keys = [
+                "m1Gpu1",
+                "m1Gpu2",
+                "m1Gpu3",
+                "m1Gpu4",
+                "m2Gpu1",
+                "m2Gpu2",
+                "m3Gpu1",
+                "m3Gpu2",
+                "m3Gpu3",
+                "m3Gpu4",
+                "m3Gpu5",
+                "m3Gpu6",
+                "m3Gpu7",
+                "m3Gpu8",
+            ]
+            temp, count = 0, 0
+            for k in temp_keys:
+                if raw_stats.get(k, 0) > 0:
+                    temp += raw_stats[k]
+                    count += 1
+
             stats: _Stats = {
                 "gpu": raw_stats["utilization"],
-                "memoryAllocated": raw_stats["mem_used"],
-                "temp": raw_stats["temperature"],
-                "powerWatts": raw_stats["power"],
-                "powerPercent": (raw_stats["power"] / self.MAX_POWER_WATTS) * 100,
+                "memoryAllocated": (
+                    raw_stats["inUseSystemMemory"]
+                    / raw_stats["allocatedSystemMemory"]
+                    * 100
+                ),
+                "powerWatts": raw_stats["systemPower"],
+                "powerPercent": (raw_stats["systemPower"] / self.MAX_POWER_WATTS) * 100,
+                "temp": temp / count if count > 0 else 0,
                 # TODO: this stat could be useful eventually, it was consistently
                 # 0 in my experimentation and requires a frontend change
                 # so leaving it out for now.
                 # "cpuWaitMs": raw_stats["cpu_wait_ms"],
             }
-
             self.samples.append(stats)

         except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
-            logger.exception(f"GPU stats error: {e}")
+            logger.exception("GPU stats error: %s", e)

     def clear(self) -> None:
         self.samples.clear()
@@ -116,6 +145,7 @@ class GPUApple:
         telemetry_record = telemetry.TelemetryRecord()
         telemetry_record.env.m1_gpu = True
         interface._publish_telemetry(telemetry_record)
+        self.binary_path = get_apple_gpu_path()

     @classmethod
     def is_available(cls) -> bool:

@@ -128,5 +158,20 @@ class GPUApple:
         self.metrics_monitor.finish()

     def probe(self) -> dict:
-        # todo: make this actually meaningful
-        return {self.name: {"type": "arm", "vendor": "Apple"}}
+        try:
+            command = [str(self.binary_path), "--json"]
+            output = (
+                subprocess.check_output(command, universal_newlines=True)
+                .strip()
+                .split("\n")
+            )[0]
+            raw_stats = json.loads(output)
+            return {
+                self.name: {
+                    "type": raw_stats["name"],
+                    "vendor": raw_stats["vendor"],
+                }
+            }
+        except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
+            logger.exception("GPU stats error: %s", e)
+            return {self.name: {"type": "arm", "vendor": "Apple"}}
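A hedged walk-through of the two new derived metrics in the sample() change above, using an invented apple_gpu_stats payload whose keys match those read in the diff:

# Hedged sketch: memory is reported as a percentage of allocated system memory,
# and temperature is the mean of whichever per-cluster sensors report > 0.
raw_stats = {
    "utilization": 37.5,
    "inUseSystemMemory": 4 * 1024**3,       # bytes, invented
    "allocatedSystemMemory": 16 * 1024**3,  # bytes, invented
    "systemPower": 8.2,                     # watts, invented
    "m1Gpu1": 48.0,
    "m1Gpu2": 51.0,
}

memory_allocated = (
    raw_stats["inUseSystemMemory"] / raw_stats["allocatedSystemMemory"] * 100
)  # 25.0
sensors = [
    v for k, v in raw_stats.items()
    if k.startswith(("m1Gpu", "m2Gpu", "m3Gpu")) and v > 0
]
temp = sum(sensors) / len(sensors) if sensors else 0  # 49.5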
@@ -68,8 +68,7 @@ class Asset(Protocol):
     metrics: List[Metric]
     metrics_monitor: "MetricsMonitor"

-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        ...  # pragma: no cover
+    def __init__(self, *args: Any, **kwargs: Any) -> None: ...  # pragma: no cover

     @classmethod
     def is_available(cls) -> bool:

@@ -90,14 +89,13 @@ class Asset(Protocol):


 class Interface(Protocol):
-    def publish_stats(self, stats: dict) -> None:
-        ...  # pragma: no cover
+    def publish_stats(self, stats: dict) -> None: ...  # pragma: no cover

-    def _publish_telemetry(self, telemetry: "TelemetryRecord") -> None:
-        ...  # pragma: no cover
+    def _publish_telemetry(
+        self, telemetry: "TelemetryRecord"
+    ) -> None: ...  # pragma: no cover

-    def publish_files(self, files_dict: "FilesDict") -> None:
-        ...  # pragma: no cover
+    def publish_files(self, files_dict: "FilesDict") -> None: ...  # pragma: no cover


 class MetricsMonitor:
@@ -65,13 +65,13 @@ def _setup_requests_session() -> requests.Session:


 def _nested_dict_to_tuple(
-    nested_dict: Mapping[str, Mapping[str, str]]
+    nested_dict: Mapping[str, Mapping[str, str]],
 ) -> Tuple[Tuple[str, Tuple[str, str]], ...]:
     return tuple((k, *v.items()) for k, v in nested_dict.items())  # type: ignore


 def _tuple_to_nested_dict(
-    nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...]
+    nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...],
 ) -> Dict[str, Dict[str, str]]:
     return {k: dict(v) for k, *v in nested_tuple}

@@ -197,9 +197,7 @@ class NeuronCoreStats:
             entry["report"]
             for entry in raw_stats["neuron_runtime_data"]
             if self._is_matching_entry(entry)
-        ][
-            0
-        ]  # there should be only one entry with the pid
+        ][0]  # there should be only one entry with the pid

         neuroncores_in_use = neuron_runtime_data["neuroncore_counters"][
             "neuroncores_in_use"
@@ -62,6 +62,7 @@ def resolve_agent_config(  # noqa: C901
     max_jobs: Optional[int],
     queues: Optional[Tuple[str]],
     config: Optional[str],
+    verbosity: Optional[int],
 ) -> Tuple[Dict[str, Any], Api]:
     """Resolve the agent config.

@@ -72,6 +73,7 @@ def resolve_agent_config(  # noqa: C901
         max_jobs (int): The max number of jobs.
         queues (Tuple[str]): The queues.
         config (str): The config.
+        verbosity (int): How verbose to print, 0 or None = default, 1 = print status every 20 seconds, 2 = also print debugging information

     Returns:
         Tuple[Dict[str, Any], Api]: The resolved config and api.

@@ -83,6 +85,7 @@ def resolve_agent_config(  # noqa: C901
         "queues": [],
         "registry": {},
         "builder": {},
+        "verbosity": 0,
     }
     user_set_project = False
     resolved_config: Dict[str, Any] = defaults

@@ -123,6 +126,8 @@ def resolve_agent_config(  # noqa: C901
         resolved_config.update({"max_jobs": int(max_jobs)})
     if queues:
         resolved_config.update({"queues": list(queues)})
+    if verbosity:
+        resolved_config.update({"verbosity": int(verbosity)})
     # queue -> queues
     if resolved_config.get("queue"):
         if isinstance(resolved_config.get("queue"), str):
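A hedged sketch of the precedence this gives the new setting (built-in default, then the agent config file, then the CLI argument); the concrete values are invented and this is not the actual resolve_agent_config body:

# Hedged sketch of the merge order implied by the hunks above.
defaults = {"verbosity": 0}        # built-in default
file_config = {"verbosity": 1}     # hypothetical value read from the agent config file
cli_verbosity = 2                  # hypothetical value passed on the command line

resolved = dict(defaults)
resolved.update(file_config)       # config file overrides defaults
if cli_verbosity:                  # CLI value wins when truthy
    resolved["verbosity"] = int(cli_verbosity)
assert resolved["verbosity"] == 2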