PyPI - wandb - Versions diffs - wandb-0.15.3-py3-none-any.whl → wandb-0.15.5-py3-none-any.whl - Mend

wandb-0.15.3-py3-none-any.whl → wandb-0.15.5-py3-none-any.whl

Files changed (156) hide show

wandb/__init__.py +1 -1
wandb/analytics/sentry.py +1 -0
wandb/apis/importers/base.py +20 -5
wandb/apis/importers/mlflow.py +7 -1
wandb/apis/internal.py +12 -0
wandb/apis/public.py +247 -1387
wandb/apis/reports/_panels.py +58 -35
wandb/beta/workflows.py +6 -7
wandb/cli/cli.py +130 -60
wandb/data_types.py +3 -1
wandb/filesync/dir_watcher.py +21 -27
wandb/filesync/step_checksum.py +8 -8
wandb/filesync/step_prepare.py +23 -10
wandb/filesync/step_upload.py +13 -13
wandb/filesync/upload_job.py +4 -8
wandb/integration/cohere/__init__.py +3 -0
wandb/integration/cohere/cohere.py +21 -0
wandb/integration/cohere/resolver.py +347 -0
wandb/integration/gym/__init__.py +4 -6
wandb/integration/huggingface/__init__.py +3 -0
wandb/integration/huggingface/huggingface.py +18 -0
wandb/integration/huggingface/resolver.py +213 -0
wandb/integration/langchain/wandb_tracer.py +16 -179
wandb/integration/openai/__init__.py +1 -3
wandb/integration/openai/openai.py +11 -143
wandb/integration/openai/resolver.py +111 -38
wandb/integration/sagemaker/config.py +2 -2
wandb/integration/tensorboard/log.py +4 -4
wandb/old/settings.py +24 -7
wandb/proto/v3/wandb_telemetry_pb2.py +12 -12
wandb/proto/v4/wandb_telemetry_pb2.py +12 -12
wandb/proto/wandb_deprecated.py +3 -1
wandb/sdk/__init__.py +1 -1
wandb/sdk/artifacts/__init__.py +0 -0
wandb/sdk/artifacts/artifact.py +2101 -0
wandb/sdk/artifacts/artifact_download_logger.py +42 -0
wandb/sdk/artifacts/artifact_manifest.py +67 -0
wandb/sdk/artifacts/artifact_manifest_entry.py +159 -0
wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +91 -0
wandb/sdk/{internal → artifacts}/artifact_saver.py +6 -5
wandb/sdk/artifacts/artifact_state.py +10 -0
wandb/sdk/{interface/artifacts/artifact_cache.py → artifacts/artifacts_cache.py} +22 -12
wandb/sdk/artifacts/exceptions.py +55 -0
wandb/sdk/artifacts/storage_handler.py +59 -0
wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
wandb/sdk/artifacts/storage_handlers/azure_handler.py +192 -0
wandb/sdk/artifacts/storage_handlers/gcs_handler.py +224 -0
wandb/sdk/artifacts/storage_handlers/http_handler.py +112 -0
wandb/sdk/artifacts/storage_handlers/local_file_handler.py +134 -0
wandb/sdk/artifacts/storage_handlers/multi_handler.py +53 -0
wandb/sdk/artifacts/storage_handlers/s3_handler.py +301 -0
wandb/sdk/artifacts/storage_handlers/tracking_handler.py +67 -0
wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +132 -0
wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
wandb/sdk/artifacts/storage_layout.py +6 -0
wandb/sdk/artifacts/storage_policies/__init__.py +0 -0
wandb/sdk/artifacts/storage_policies/s3_bucket_policy.py +61 -0
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +386 -0
wandb/sdk/{interface/artifacts/artifact_storage.py → artifacts/storage_policy.py} +5 -57
wandb/sdk/data_types/_dtypes.py +7 -12
wandb/sdk/data_types/base_types/json_metadata.py +3 -2
wandb/sdk/data_types/base_types/media.py +8 -8
wandb/sdk/data_types/base_types/wb_value.py +12 -13
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +5 -6
wandb/sdk/data_types/helper_types/classes.py +6 -8
wandb/sdk/data_types/helper_types/image_mask.py +5 -6
wandb/sdk/data_types/histogram.py +4 -3
wandb/sdk/data_types/html.py +3 -4
wandb/sdk/data_types/image.py +11 -9
wandb/sdk/data_types/molecule.py +5 -3
wandb/sdk/data_types/object_3d.py +7 -5
wandb/sdk/data_types/plotly.py +3 -2
wandb/sdk/data_types/saved_model.py +11 -11
wandb/sdk/data_types/trace_tree.py +5 -4
wandb/sdk/data_types/utils.py +3 -5
wandb/sdk/data_types/video.py +5 -4
wandb/sdk/integration_utils/auto_logging.py +215 -0
wandb/sdk/interface/interface.py +15 -15
wandb/sdk/internal/file_pusher.py +8 -16
wandb/sdk/internal/file_stream.py +5 -11
wandb/sdk/internal/handler.py +13 -1
wandb/sdk/internal/internal_api.py +287 -13
wandb/sdk/internal/job_builder.py +119 -30
wandb/sdk/internal/sender.py +6 -26
wandb/sdk/internal/settings_static.py +2 -0
wandb/sdk/internal/system/assets/__init__.py +2 -0
wandb/sdk/internal/system/assets/gpu.py +42 -0
wandb/sdk/internal/system/assets/gpu_amd.py +216 -0
wandb/sdk/internal/system/env_probe_helpers.py +13 -0
wandb/sdk/internal/system/system_info.py +3 -3
wandb/sdk/internal/tb_watcher.py +32 -22
wandb/sdk/internal/thread_local_settings.py +18 -0
wandb/sdk/launch/_project_spec.py +57 -11
wandb/sdk/launch/agent/agent.py +147 -65
wandb/sdk/launch/agent/job_status_tracker.py +34 -0
wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
wandb/sdk/launch/builder/abstract.py +5 -1
wandb/sdk/launch/builder/build.py +21 -18
wandb/sdk/launch/builder/docker_builder.py +10 -4
wandb/sdk/launch/builder/kaniko_builder.py +113 -23
wandb/sdk/launch/builder/noop.py +6 -3
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +46 -14
wandb/sdk/launch/environment/aws_environment.py +3 -2
wandb/sdk/launch/environment/azure_environment.py +124 -0
wandb/sdk/launch/environment/gcp_environment.py +2 -4
wandb/sdk/launch/environment/local_environment.py +1 -1
wandb/sdk/launch/errors.py +19 -0
wandb/sdk/launch/github_reference.py +32 -19
wandb/sdk/launch/launch.py +3 -8
wandb/sdk/launch/launch_add.py +6 -2
wandb/sdk/launch/loader.py +21 -2
wandb/sdk/launch/registry/azure_container_registry.py +132 -0
wandb/sdk/launch/registry/elastic_container_registry.py +39 -5
wandb/sdk/launch/registry/google_artifact_registry.py +68 -26
wandb/sdk/launch/registry/local_registry.py +2 -1
wandb/sdk/launch/runner/abstract.py +24 -3
wandb/sdk/launch/runner/kubernetes_runner.py +479 -26
wandb/sdk/launch/runner/local_container.py +103 -51
wandb/sdk/launch/runner/local_process.py +1 -1
wandb/sdk/launch/runner/sagemaker_runner.py +60 -10
wandb/sdk/launch/runner/vertex_runner.py +10 -5
wandb/sdk/launch/sweeps/__init__.py +7 -9
wandb/sdk/launch/sweeps/scheduler.py +307 -77
wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
wandb/sdk/launch/sweeps/utils.py +82 -35
wandb/sdk/launch/utils.py +89 -75
wandb/sdk/lib/_settings_toposort_generated.py +7 -0
wandb/sdk/lib/capped_dict.py +26 -0
wandb/sdk/lib/{git.py → gitlib.py} +76 -59
wandb/sdk/lib/hashutil.py +12 -4
wandb/sdk/lib/paths.py +96 -8
wandb/sdk/lib/sock_client.py +2 -2
wandb/sdk/lib/timer.py +1 -0
wandb/sdk/service/server.py +22 -9
wandb/sdk/service/server_sock.py +1 -1
wandb/sdk/service/service.py +27 -8
wandb/sdk/verify/verify.py +4 -7
wandb/sdk/wandb_config.py +2 -6
wandb/sdk/wandb_init.py +57 -53
wandb/sdk/wandb_require.py +7 -0
wandb/sdk/wandb_run.py +61 -223
wandb/sdk/wandb_settings.py +28 -4
wandb/testing/relay.py +15 -2
wandb/util.py +74 -36
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/METADATA +15 -9
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/RECORD +151 -116
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/entry_points.txt +1 -0
wandb/integration/langchain/util.py +0 -191
wandb/sdk/interface/artifacts/__init__.py +0 -33
wandb/sdk/interface/artifacts/artifact.py +0 -615
wandb/sdk/interface/artifacts/artifact_manifest.py +0 -131
wandb/sdk/wandb_artifacts.py +0 -2226
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/LICENSE +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/WHEEL +0 -0
{wandb-0.15.3.dist-info → wandb-0.15.5.dist-info}/top_level.txt +0 -0

wandb/sdk/internal/job_builder.py CHANGED Viewed

@@ -1,20 +1,23 @@
 """job builder."""
 import json
+import logging
 import os
 import sys
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from wandb.sdk.artifacts.artifact import Artifact
 from wandb.sdk.data_types._dtypes import TypeRegistry
 from wandb.sdk.lib.filenames import DIFF_FNAME, METADATA_FNAME, REQUIREMENTS_FNAME
-from wandb.sdk.wandb_artifacts import Artifact
 from wandb.util import make_artifact_name_safe
 from .settings_static import SettingsStatic
 if sys.version_info >= (3, 8):
-    from typing import TypedDict
+    from typing import Literal, TypedDict
 else:
-    from typing_extensions import TypedDict
+    from typing_extensions import Literal, TypedDict
+_logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from wandb.proto.wandb_internal_pb2 import ArtifactRecord
@@ -32,11 +35,13 @@ class GitInfo(TypedDict):
 class GitSourceDict(TypedDict):
     git: GitInfo
     entrypoint: List[str]
+    notebook: bool
 class ArtifactSourceDict(TypedDict):
     artifact: str
     entrypoint: List[str]
+    notebook: bool
 class ImageSourceDict(TypedDict):
@@ -71,6 +76,7 @@ class JobBuilder:
     _summary: Optional[Dict[str, Any]]
     _logged_code_artifact: Optional[ArtifactInfoForJob]
     _disable: bool
+    _aliases: List[str]
     def __init__(self, settings: SettingsStatic):
         self._settings = settings
@@ -80,6 +86,10 @@ class JobBuilder:
         self._summary = None
         self._logged_code_artifact = None
         self._disable = settings.disable_job_creation
+        self._aliases = []
+        self._source_type: Optional[
+            Literal["repo", "artifact", "image"]
+        ] = settings.get("job_source")
     def set_config(self, config: Dict[str, Any]) -> None:
         self._config = config
@@ -107,19 +117,50 @@ class JobBuilder:
             )
     def _build_repo_job(
-        self, metadata: Dict[str, Any], program_relpath: str
-    ) -> Tuple[Artifact, GitSourceDict]:
+        self, metadata: Dict[str, Any], program_relpath: str, root: Optional[str]
+    ) -> Tuple[Optional[Artifact], Optional[GitSourceDict]]:
         git_info: Dict[str, str] = metadata.get("git", {})
         remote = git_info.get("remote")
         commit = git_info.get("commit")
         assert remote is not None
         assert commit is not None
+        if self._is_notebook_run():
+            if not os.path.exists(
+                os.path.join(os.getcwd(), os.path.basename(program_relpath))
+            ):
+                return None, None
+            if root is None or self._settings._jupyter_root is None:
+                _logger.info("target path does not exist, exiting")
+                return None, None
+            assert self._settings._jupyter_root is not None
+            # git notebooks set the root to the git root,
+            # jupyter_root contains the path where the jupyter notebook was started
+            # program_relpath contains the path from jupyter_root to the file
+            # full program path here is actually the relpath from the program to the git root
+            full_program_path = os.path.join(
+                os.path.relpath(str(self._settings._jupyter_root), root),
+                program_relpath,
+            )
+            full_program_path = os.path.normpath(full_program_path)
+            # if the notebook server is started above the git repo need to clear all the ..s
+            if full_program_path.startswith(".."):
+                split_path = full_program_path.split("/")
+                count_dots = 0
+                for p in split_path:
+                    if p == "..":
+                        count_dots += 1
+                full_program_path = "/".join(split_path[2 * count_dots :])
+        else:
+            full_program_path = program_relpath
         # TODO: update executable to a method that supports pex
         source: GitSourceDict = {
             "entrypoint": [
                 os.path.basename(sys.executable),
-                program_relpath,
+                full_program_path,
             ],
+            "notebook": self._is_notebook_run(),
             "git": {
                 "remote": remote,
                 "commit": commit,
@@ -132,22 +173,40 @@ class JobBuilder:
         if os.path.exists(os.path.join(self._settings.files_dir, DIFF_FNAME)):
             artifact.add_file(
                 os.path.join(self._settings.files_dir, DIFF_FNAME),
-                name="diff.patch",
+                name=DIFF_FNAME,
             )
         return artifact, source
     def _build_artifact_job(
-        self, program_relpath: str
-    ) -> Tuple[Artifact, ArtifactSourceDict]:
+        self, metadata: Dict[str, Any], program_relpath: str
+    ) -> Tuple[Optional[Artifact], Optional[ArtifactSourceDict]]:
         assert isinstance(self._logged_code_artifact, dict)
+        # TODO: should we just always exit early if the path doesn't exist?
+        if self._is_notebook_run() and not self._is_colab_run():
+            full_program_relpath = os.path.relpath(program_relpath, os.getcwd())
+            # if the resolved path doesn't exist, then we shouldn't make a job because it will fail
+            if not os.path.exists(full_program_relpath):
+                # when users call log code in a notebook the code artifact starts
+                # at the directory the notebook is in instead of the jupyter
+                # core
+                if os.path.exists(os.path.basename(program_relpath)):
+                    full_program_relpath = os.path.basename(program_relpath)
+                else:
+                    _logger.info("target path does not exist, exiting")
+                    return None, None
+        else:
+            full_program_relpath = program_relpath
+        entrypoint = [
+            os.path.basename(sys.executable),
+            full_program_relpath,
+        ]
         # TODO: update executable to a method that supports pex
         source: ArtifactSourceDict = {
-            "entrypoint": [
-                os.path.basename(sys.executable),
-                program_relpath,
-            ],
+            "entrypoint": entrypoint,
+            "notebook": self._is_notebook_run(),
             "artifact": f"wandb-artifact://_id/{self._logged_code_artifact['id']}",
         }
         name = make_artifact_name_safe(f"job-{self._logged_code_artifact['name']}")
         artifact = JobArtifact(name)
@@ -158,14 +217,27 @@ class JobBuilder:
     ) -> Tuple[Artifact, ImageSourceDict]:
         image_name = metadata.get("docker")
         assert isinstance(image_name, str)
-        name = make_artifact_name_safe(f"job-{image_name}")
+        raw_image_name = image_name
+        if ":" in image_name:
+            raw_image_name, tag = image_name.split(":")
+            self._aliases += [tag]
+        name = make_artifact_name_safe(f"job-{raw_image_name}")
         artifact = JobArtifact(name)
         source: ImageSourceDict = {
             "image": image_name,
         }
         return artifact, source
+    def _is_notebook_run(self) -> bool:
+        return hasattr(self._settings, "_jupyter") and bool(self._settings._jupyter)
+    def _is_colab_run(self) -> bool:
+        return hasattr(self._settings, "_colab") and bool(self._settings._colab)
     def build(self) -> Optional[Artifact]:
+        _logger.info("Attempting to build job artifact")
         if not os.path.exists(
             os.path.join(self._settings.files_dir, REQUIREMENTS_FNAME)
         ):
@@ -181,23 +253,40 @@ class JobBuilder:
         program_relpath: Optional[str] = metadata.get("codePath")
+        source_type = self._source_type
+        if self._is_notebook_run():
+            _logger.info("run is notebook based run")
+            program_relpath = metadata.get("program")
+        if not source_type:
+            if self._has_git_job_ingredients(metadata, program_relpath):
+                _logger.info("is repo sourced job")
+                source_type = "repo"
+            elif self._has_artifact_job_ingredients(program_relpath):
+                _logger.info("is artifact sourced job")
+                source_type = "artifact"
+            elif self._has_image_job_ingredients(metadata):
+                _logger.info("is image sourced job")
+                source_type = "image"
+        if not source_type:
+            _logger.info("no source found")
+            return None
         artifact = None
-        source_type = None
         source: Optional[
             Union[GitSourceDict, ArtifactSourceDict, ImageSourceDict]
         ] = None
-        if self._has_git_job_ingredients(metadata, program_relpath):
+        if source_type == "repo":
             assert program_relpath is not None
-            artifact, source = self._build_repo_job(metadata, program_relpath)
-            source_type = "repo"
-        elif self._has_artifact_job_ingredients(program_relpath):
+            root: Optional[str] = metadata.get("root")
+            artifact, source = self._build_repo_job(metadata, program_relpath, root)
+        elif source_type == "artifact":
             assert program_relpath is not None
-            artifact, source = self._build_artifact_job(program_relpath)
-            source_type = "artifact"
-        elif self._has_image_job_ingredients(metadata):
+            artifact, source = self._build_artifact_job(metadata, program_relpath)
+        elif source_type == "image":
             artifact, source = self._build_image_job(metadata)
-            source_type = "image"
         if artifact is None or source_type is None or source is None:
             return None
@@ -213,7 +302,7 @@ class JobBuilder:
             "output_types": output_types,
             "runtime": runtime,
         }
+        _logger.info("adding wandb-job metadata file")
         with artifact.new_file("wandb-job.json") as f:
             f.write(json.dumps(source_info, indent=4))
@@ -238,11 +327,11 @@ class JobBuilder:
         self, metadata: Dict[str, Any], program_relpath: Optional[str]
     ) -> bool:
         git_info: Dict[str, str] = metadata.get("git", {})
-        return (
-            git_info.get("remote") is not None
-            and git_info.get("commit") is not None
-            and program_relpath is not None
-        )
+        if program_relpath is None:
+            return False
+        if self._is_notebook_run() and metadata.get("root") is None:
+            return False
+        return git_info.get("remote") is not None and git_info.get("commit") is not None
     def _has_artifact_job_ingredients(self, program_relpath: Optional[str]) -> bool:
         return self._logged_code_artifact is not None and program_relpath is not None

wandb/sdk/internal/sender.py CHANGED Viewed

@@ -30,16 +30,10 @@ from wandb.errors import CommError, UsageError
 from wandb.errors.util import ProtobufErrorHandler
 from wandb.filesync.dir_watcher import DirWatcher
 from wandb.proto import wandb_internal_pb2
+from wandb.sdk.artifacts import artifact_saver
 from wandb.sdk.interface import interface
 from wandb.sdk.interface.interface_queue import InterfaceQueue
-from wandb.sdk.internal import (
-    artifact_saver,
-    context,
-    datastore,
-    file_stream,
-    internal_api,
-    update,
-)
+from wandb.sdk.internal import context, datastore, file_stream, internal_api, update
 from wandb.sdk.internal.file_pusher import FilePusher
 from wandb.sdk.internal.job_builder import JobBuilder
 from wandb.sdk.internal.settings_static import SettingsDict, SettingsStatic
@@ -486,24 +480,6 @@ class SendManager:
                 result.response.check_version_response.delete_message = delete_message
         self._respond_result(result)
-    def _send_request_attach(
-        self,
-        req: wandb_internal_pb2.AttachRequest,
-        resp: wandb_internal_pb2.AttachResponse,
-    ) -> None:
-        attach_id = req.attach_id
-        assert attach_id
-        assert self._run
-        resp.run.CopyFrom(self._run)
-    def send_request_attach(self, record: "Record") -> None:
-        assert record.control.req_resp or record.control.mailbox_slot
-        result = proto_util._result_from_record(record)
-        self._send_request_attach(
-            record.request.attach, result.response.attach_response
-        )
-        self._respond_result(result)
     def send_request_stop_status(self, record: "Record") -> None:
         result = proto_util._result_from_record(record)
         status_resp = result.response.stop_status_response
@@ -1632,6 +1608,10 @@ class SendManager:
             # TODO: this should be removed when the latest tag is handled
             # by the backend (WB-12116)
             proto_artifact.aliases.append("latest")
+            # add docker image tag
+            for alias in self._job_builder._aliases:
+                proto_artifact.aliases.append(alias)
             proto_artifact.user_created = True
             proto_artifact.use_after_commit = True
             proto_artifact.finalize = True

wandb/sdk/internal/settings_static.py CHANGED Viewed

@@ -8,6 +8,7 @@ class SettingsStatic:
     # TODO(jhr): figure out how to share type defs with sdk/wandb_settings.py
     _offline: Optional[bool]
     _sync: bool
+    _disable_setproctitle: bool
     _disable_stats: Optional[bool]
     _disable_meta: Optional[bool]
     _flow_control: bool
@@ -65,6 +66,7 @@ class SettingsStatic:
     disable_job_creation: bool
     _async_upload_concurrency_limit: Optional[int]
     _extra_http_headers: Optional[Mapping[str, str]]
+    job_source: Optional[str]
     # TODO(jhr): clean this up, it is only in SettingsStatic and not in Settings
     _log_level: int

wandb/sdk/internal/system/assets/__init__.py CHANGED Viewed

@@ -3,6 +3,7 @@ __all__ = (
     "CPU",
     "Disk",
     "GPU",
+    "GPUAMD",
     "GPUApple",
     "IPU",
     "Memory",
@@ -16,6 +17,7 @@ from .asset_registry import asset_registry
 from .cpu import CPU
 from .disk import Disk
 from .gpu import GPU
+from .gpu_amd import GPUAMD
 from .gpu_apple import GPUApple
 from .ipu import IPU
 from .memory import Memory

wandb/sdk/internal/system/assets/gpu.py CHANGED Viewed

@@ -137,6 +137,47 @@ class GPUMemoryAllocated:
         return stats
+class GPUMemoryAllocatedBytes:
+    """GPU memory allocated in bytes for each GPU."""
+    # name = "memory_allocated"
+    name = "gpu.{}.memoryAllocatedBytes"
+    # samples: Deque[Tuple[datetime.datetime, float]]
+    samples: "Deque[List[float]]"
+    def __init__(self, pid: int) -> None:
+        self.pid = pid
+        self.samples = deque([])
+    def sample(self) -> None:
+        memory_allocated = []
+        device_count = pynvml.nvmlDeviceGetCount()  # type: ignore
+        for i in range(device_count):
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)  # type: ignore
+            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)  # type: ignore
+            memory_allocated.append(memory_info.used)
+        self.samples.append(memory_allocated)
+    def clear(self) -> None:
+        self.samples.clear()
+    def aggregate(self) -> dict:
+        if not self.samples:
+            return {}
+        stats = {}
+        device_count = pynvml.nvmlDeviceGetCount()  # type: ignore
+        for i in range(device_count):
+            samples = [sample[i] for sample in self.samples]
+            aggregate = aggregate_mean(samples)
+            stats[self.name.format(i)] = aggregate
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)  # type: ignore
+            if gpu_in_use_by_this_process(handle, self.pid):
+                stats[self.name.format(f"process.{i}")] = aggregate
+        return stats
 class GPUUtilization:
     """GPU utilization in percent for each GPU."""
@@ -314,6 +355,7 @@ class GPU:
         self.name = self.__class__.__name__.lower()
         self.metrics: List[Metric] = [
             GPUMemoryAllocated(settings._stats_pid),
+            GPUMemoryAllocatedBytes(settings._stats_pid),
             GPUMemoryUtilization(settings._stats_pid),
             GPUUtilization(settings._stats_pid),
             GPUTemperature(settings._stats_pid),

wandb/sdk/internal/system/assets/gpu_amd.py ADDED Viewed

@@ -0,0 +1,216 @@
+import json
+import logging
+import shutil
+import subprocess
+import sys
+import threading
+from collections import deque
+from typing import TYPE_CHECKING, Any, Dict, List, Union
+if sys.version_info >= (3, 8):
+    from typing import Final, Literal
+else:
+    from typing_extensions import Final, Literal
+from wandb.sdk.lib import telemetry
+from .aggregators import aggregate_mean
+from .asset_registry import asset_registry
+from .interfaces import Interface, Metric, MetricsMonitor
+if TYPE_CHECKING:
+    from typing import Deque
+    from wandb.sdk.internal.settings_static import SettingsStatic
+logger = logging.getLogger(__name__)
+ROCM_SMI_CMD: Final[str] = shutil.which("rocm-smi") or "/usr/bin/rocm-smi"
+def get_rocm_smi_stats() -> Dict[str, Any]:
+    command = [str(ROCM_SMI_CMD), "-a", "--json"]
+    output = (
+        subprocess.check_output(command, universal_newlines=True).strip().split("\n")
+    )[0]
+    return json.loads(output)  # type: ignore
+_StatsKeys = Literal[
+    "gpu",
+    "memoryAllocated",
+    "temp",
+    "powerWatts",
+    "powerPercent",
+]
+_Stats = Dict[_StatsKeys, float]
+_InfoDict = Dict[str, Union[int, List[Dict[str, Any]]]]
+class GPUAMDStats:
+    """Stats for AMD GPU devices."""
+    name = "gpu.{gpu_id}.{key}"
+    samples: "Deque[List[_Stats]]"
+    def __init__(self) -> None:
+        self.samples = deque()
+    @staticmethod
+    def parse_stats(stats: Dict[str, str]) -> _Stats:
+        """Parse stats from rocm-smi output."""
+        parsed_stats: _Stats = {}
+        try:
+            parsed_stats["gpu"] = float(stats.get("GPU use (%)"))  # type: ignore
+        except (TypeError, ValueError):
+            logger.warning("Could not parse GPU usage as float")
+        try:
+            parsed_stats["memoryAllocated"] = float(stats.get("GPU memory use (%)"))  # type: ignore
+        except (TypeError, ValueError):
+            logger.warning("Could not parse GPU memory allocation as float")
+        try:
+            parsed_stats["temp"] = float(stats.get("Temperature (Sensor memory) (C)"))  # type: ignore
+        except (TypeError, ValueError):
+            logger.warning("Could not parse GPU temperature as float")
+        try:
+            parsed_stats["powerWatts"] = float(
+                stats.get("Average Graphics Package Power (W)")  # type: ignore
+            )
+        except (TypeError, ValueError):
+            logger.warning("Could not parse GPU power as float")
+        try:
+            parsed_stats["powerPercent"] = (
+                float(stats.get("Average Graphics Package Power (W)"))  # type: ignore
+                / float(stats.get("Max Graphics Package Power (W)"))  # type: ignore
+                * 100
+            )
+        except (TypeError, ValueError):
+            logger.warning("Could not parse GPU average/max power as float")
+        return parsed_stats
+    def sample(self) -> None:
+        try:
+            raw_stats = get_rocm_smi_stats()
+            cards = []
+            card_keys = [
+                key for key in sorted(raw_stats.keys()) if key.startswith("card")
+            ]
+            for card_key in card_keys:
+                card_stats = raw_stats[card_key]
+                stats = self.parse_stats(card_stats)
+                if stats:
+                    cards.append(stats)
+            if cards:
+                self.samples.append(cards)
+        except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
+            logger.exception(f"GPU stats error: {e}")
+    def clear(self) -> None:
+        self.samples.clear()
+    def aggregate(self) -> dict:
+        if not self.samples:
+            return {}
+        stats = {}
+        device_count = len(self.samples[0])
+        for i in range(device_count):
+            samples = [sample[i] for sample in self.samples]
+            for key in samples[0].keys():
+                samples_key = [s[key] for s in samples]
+                aggregate = aggregate_mean(samples_key)
+                stats[self.name.format(gpu_id=i, key=key)] = aggregate
+        return stats
+@asset_registry.register
+class GPUAMD:
+    """GPUAMD is a class for monitoring AMD GPU devices.
+    Uses AMD's rocm_smi tool to get GPU stats.
+    For the list of supported environments and devices, see
+    https://github.com/RadeonOpenCompute/ROCm/blob/develop/docs/deploy/
+    """
+    def __init__(
+        self,
+        interface: "Interface",
+        settings: "SettingsStatic",
+        shutdown_event: threading.Event,
+    ) -> None:
+        self.name = self.__class__.__name__.lower()
+        self.metrics: List[Metric] = [
+            GPUAMDStats(),
+        ]
+        self.metrics_monitor = MetricsMonitor(
+            self.name,
+            self.metrics,
+            interface,
+            settings,
+            shutdown_event,
+        )
+        telemetry_record = telemetry.TelemetryRecord()
+        telemetry_record.env.amd_gpu = True
+        interface._publish_telemetry(telemetry_record)
+    @classmethod
+    def is_available(cls) -> bool:
+        rocm_smi_available = shutil.which(ROCM_SMI_CMD) is not None
+        if rocm_smi_available:
+            try:
+                _ = get_rocm_smi_stats()
+                return True
+            except Exception:
+                pass
+        return False
+    def start(self) -> None:
+        self.metrics_monitor.start()
+    def finish(self) -> None:
+        self.metrics_monitor.finish()
+    def probe(self) -> dict:
+        info: _InfoDict = {}
+        try:
+            stats = get_rocm_smi_stats()
+            info["gpu_count"] = len(
+                [key for key in stats.keys() if key.startswith("card")]
+            )
+            key_mapping = {
+                "id": "GPU ID",
+                "unique_id": "Unique ID",
+                "vbios_version": "VBIOS version",
+                "performance_level": "Performance Level",
+                "gpu_overdrive": "GPU OverDrive value (%)",
+                "gpu_memory_overdrive": "GPU Memory OverDrive value (%)",
+                "max_power": "Max Graphics Package Power (W)",
+                "series": "Card series",
+                "model": "Card model",
+                "vendor": "Card vendor",
+                "sku": "Card SKU",
+                "sclk_range": "Valid sclk range",
+                "mclk_range": "Valid mclk range",
+            }
+            info["gpu_devices"] = [
+                {k: stats[key][v] for k, v in key_mapping.items() if stats[key].get(v)}
+                for key in stats.keys()
+                if key.startswith("card")
+            ]
+        except Exception as e:
+            logger.exception(f"GPUAMD probe error: {e}")
+        return info

wandb/sdk/internal/system/env_probe_helpers.py ADDED Viewed

@@ -0,0 +1,13 @@
+import logging
+from sentry_sdk.integrations.aws_lambda import get_lambda_bootstrap  # type: ignore
+logger = logging.getLogger(__name__)
+def is_aws_lambda() -> bool:
+    """Check if we are running in a lambda environment."""
+    lambda_bootstrap = get_lambda_bootstrap()
+    if not lambda_bootstrap or not hasattr(lambda_bootstrap, "handle_event_request"):
+        return False
+    return True

wandb/sdk/internal/system/system_info.py CHANGED Viewed

@@ -18,7 +18,7 @@ from wandb.sdk.lib.filenames import (
     METADATA_FNAME,
     REQUIREMENTS_FNAME,
 )
-from wandb.sdk.lib.git import GitRepo
+from wandb.sdk.lib.gitlib import GitRepo
 from .assets.interfaces import Interface
@@ -142,8 +142,8 @@ class SystemInfo:
                         os.path.relpath(patch_path, start=self.settings.files_dir)
                     )
-            upstream_commit = self.git.get_upstream_fork_point()  # type: ignore
-            if upstream_commit and upstream_commit != self.git.repo.head.commit:
+            upstream_commit = self.git.get_upstream_fork_point()
+            if upstream_commit and upstream_commit != self.git.repo.head.commit:  # type: ignore
                 sha = upstream_commit.hexsha
                 upstream_patch_path = os.path.join(
                     self.settings.files_dir, f"upstream_diff_{sha}.patch"

wandb-0.15.3-py3-none-any.whl → wandb-0.15.5-py3-none-any.whl

wandb-0.15.3-py3-none-any.whl → wandb-0.15.5-py3-none-any.whl