PyPI - wandb - Versions diffs - 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl - Mend

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

Files changed (228) hide show

wandb/__init__.py +2 -3
wandb/apis/__init__.py +1 -3
wandb/apis/importers/__init__.py +4 -0
wandb/apis/importers/base.py +312 -0
wandb/apis/importers/mlflow.py +113 -0
wandb/apis/internal.py +29 -2
wandb/apis/normalize.py +6 -5
wandb/apis/public.py +163 -180
wandb/apis/reports/_templates.py +6 -12
wandb/apis/reports/report.py +1 -1
wandb/apis/reports/runset.py +1 -3
wandb/apis/reports/util.py +12 -10
wandb/beta/workflows.py +57 -34
wandb/catboost/__init__.py +1 -2
wandb/cli/cli.py +215 -133
wandb/data_types.py +63 -56
wandb/docker/__init__.py +78 -16
wandb/docker/auth.py +21 -22
wandb/env.py +0 -1
wandb/errors/__init__.py +8 -116
wandb/errors/term.py +1 -1
wandb/fastai/__init__.py +1 -2
wandb/filesync/dir_watcher.py +8 -5
wandb/filesync/step_prepare.py +76 -75
wandb/filesync/step_upload.py +1 -2
wandb/integration/catboost/__init__.py +1 -3
wandb/integration/catboost/catboost.py +8 -14
wandb/integration/fastai/__init__.py +7 -13
wandb/integration/gym/__init__.py +35 -4
wandb/integration/keras/__init__.py +3 -3
wandb/integration/keras/callbacks/metrics_logger.py +9 -8
wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
wandb/integration/keras/callbacks/tables_builder.py +31 -19
wandb/integration/kfp/kfp_patch.py +20 -17
wandb/integration/kfp/wandb_logging.py +1 -2
wandb/integration/lightgbm/__init__.py +21 -19
wandb/integration/prodigy/prodigy.py +6 -7
wandb/integration/sacred/__init__.py +9 -12
wandb/integration/sagemaker/__init__.py +1 -3
wandb/integration/sagemaker/auth.py +0 -1
wandb/integration/sagemaker/config.py +1 -1
wandb/integration/sagemaker/resources.py +1 -1
wandb/integration/sb3/sb3.py +8 -4
wandb/integration/tensorboard/__init__.py +1 -3
wandb/integration/tensorboard/log.py +8 -8
wandb/integration/tensorboard/monkeypatch.py +11 -9
wandb/integration/tensorflow/__init__.py +1 -3
wandb/integration/xgboost/__init__.py +4 -6
wandb/integration/yolov8/__init__.py +7 -0
wandb/integration/yolov8/yolov8.py +250 -0
wandb/jupyter.py +31 -35
wandb/lightgbm/__init__.py +1 -2
wandb/old/settings.py +2 -2
wandb/plot/bar.py +1 -2
wandb/plot/confusion_matrix.py +1 -3
wandb/plot/histogram.py +1 -2
wandb/plot/line.py +1 -2
wandb/plot/line_series.py +4 -4
wandb/plot/pr_curve.py +17 -20
wandb/plot/roc_curve.py +1 -3
wandb/plot/scatter.py +1 -2
wandb/proto/v3/wandb_server_pb2.py +85 -39
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_server_pb2.py +51 -39
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/sdk/__init__.py +1 -3
wandb/sdk/backend/backend.py +1 -1
wandb/sdk/data_types/_dtypes.py +38 -30
wandb/sdk/data_types/base_types/json_metadata.py +1 -3
wandb/sdk/data_types/base_types/media.py +17 -17
wandb/sdk/data_types/base_types/wb_value.py +33 -26
wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
wandb/sdk/data_types/helper_types/classes.py +1 -1
wandb/sdk/data_types/helper_types/image_mask.py +12 -12
wandb/sdk/data_types/histogram.py +5 -4
wandb/sdk/data_types/html.py +1 -2
wandb/sdk/data_types/image.py +11 -11
wandb/sdk/data_types/molecule.py +3 -6
wandb/sdk/data_types/object_3d.py +1 -2
wandb/sdk/data_types/plotly.py +1 -2
wandb/sdk/data_types/saved_model.py +10 -8
wandb/sdk/data_types/video.py +1 -1
wandb/sdk/integration_utils/data_logging.py +5 -5
wandb/sdk/interface/artifacts.py +288 -266
wandb/sdk/interface/interface.py +2 -3
wandb/sdk/interface/interface_grpc.py +1 -1
wandb/sdk/interface/interface_queue.py +1 -1
wandb/sdk/interface/interface_relay.py +1 -1
wandb/sdk/interface/interface_shared.py +1 -2
wandb/sdk/interface/interface_sock.py +1 -1
wandb/sdk/interface/message_future.py +1 -1
wandb/sdk/interface/message_future_poll.py +1 -1
wandb/sdk/interface/router.py +1 -1
wandb/sdk/interface/router_queue.py +1 -1
wandb/sdk/interface/router_relay.py +1 -1
wandb/sdk/interface/router_sock.py +1 -1
wandb/sdk/interface/summary_record.py +1 -1
wandb/sdk/internal/artifacts.py +1 -1
wandb/sdk/internal/datastore.py +2 -3
wandb/sdk/internal/file_pusher.py +5 -3
wandb/sdk/internal/file_stream.py +22 -19
wandb/sdk/internal/handler.py +5 -4
wandb/sdk/internal/internal.py +1 -1
wandb/sdk/internal/internal_api.py +115 -55
wandb/sdk/internal/job_builder.py +1 -3
wandb/sdk/internal/profiler.py +1 -1
wandb/sdk/internal/progress.py +4 -6
wandb/sdk/internal/sample.py +1 -3
wandb/sdk/internal/sender.py +28 -16
wandb/sdk/internal/settings_static.py +5 -5
wandb/sdk/internal/system/assets/__init__.py +1 -0
wandb/sdk/internal/system/assets/cpu.py +3 -9
wandb/sdk/internal/system/assets/disk.py +2 -4
wandb/sdk/internal/system/assets/gpu.py +6 -18
wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
wandb/sdk/internal/system/assets/interfaces.py +50 -22
wandb/sdk/internal/system/assets/ipu.py +1 -3
wandb/sdk/internal/system/assets/memory.py +7 -13
wandb/sdk/internal/system/assets/network.py +4 -8
wandb/sdk/internal/system/assets/open_metrics.py +283 -0
wandb/sdk/internal/system/assets/tpu.py +1 -4
wandb/sdk/internal/system/assets/trainium.py +26 -14
wandb/sdk/internal/system/system_info.py +2 -3
wandb/sdk/internal/system/system_monitor.py +52 -20
wandb/sdk/internal/tb_watcher.py +12 -13
wandb/sdk/launch/_project_spec.py +54 -65
wandb/sdk/launch/agent/agent.py +374 -90
wandb/sdk/launch/builder/abstract.py +61 -7
wandb/sdk/launch/builder/build.py +81 -110
wandb/sdk/launch/builder/docker_builder.py +181 -0
wandb/sdk/launch/builder/kaniko_builder.py +419 -0
wandb/sdk/launch/builder/noop.py +31 -12
wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
wandb/sdk/launch/environment/abstract.py +28 -0
wandb/sdk/launch/environment/aws_environment.py +276 -0
wandb/sdk/launch/environment/gcp_environment.py +271 -0
wandb/sdk/launch/environment/local_environment.py +65 -0
wandb/sdk/launch/github_reference.py +3 -8
wandb/sdk/launch/launch.py +38 -29
wandb/sdk/launch/launch_add.py +6 -8
wandb/sdk/launch/loader.py +230 -0
wandb/sdk/launch/registry/abstract.py +54 -0
wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
wandb/sdk/launch/registry/local_registry.py +62 -0
wandb/sdk/launch/runner/abstract.py +1 -16
wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
wandb/sdk/launch/runner/local_container.py +46 -22
wandb/sdk/launch/runner/local_process.py +1 -4
wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
wandb/sdk/launch/sweeps/__init__.py +3 -2
wandb/sdk/launch/sweeps/scheduler.py +132 -39
wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
wandb/sdk/launch/utils.py +101 -30
wandb/sdk/launch/wandb_reference.py +2 -7
wandb/sdk/lib/_settings_toposort_generate.py +166 -0
wandb/sdk/lib/_settings_toposort_generated.py +201 -0
wandb/sdk/lib/apikey.py +2 -4
wandb/sdk/lib/config_util.py +4 -1
wandb/sdk/lib/console.py +1 -3
wandb/sdk/lib/deprecate.py +3 -3
wandb/sdk/lib/file_stream_utils.py +7 -5
wandb/sdk/lib/filenames.py +1 -1
wandb/sdk/lib/filesystem.py +61 -5
wandb/sdk/lib/git.py +1 -3
wandb/sdk/lib/import_hooks.py +4 -7
wandb/sdk/lib/ipython.py +8 -5
wandb/sdk/lib/lazyloader.py +1 -3
wandb/sdk/lib/mailbox.py +14 -4
wandb/sdk/lib/proto_util.py +10 -5
wandb/sdk/lib/redirect.py +15 -22
wandb/sdk/lib/reporting.py +1 -3
wandb/sdk/lib/retry.py +4 -5
wandb/sdk/lib/runid.py +1 -3
wandb/sdk/lib/server.py +15 -9
wandb/sdk/lib/sock_client.py +1 -1
wandb/sdk/lib/sparkline.py +1 -1
wandb/sdk/lib/wburls.py +1 -1
wandb/sdk/service/port_file.py +1 -2
wandb/sdk/service/service.py +36 -13
wandb/sdk/service/service_base.py +12 -1
wandb/sdk/verify/verify.py +5 -7
wandb/sdk/wandb_artifacts.py +142 -177
wandb/sdk/wandb_config.py +5 -8
wandb/sdk/wandb_helper.py +1 -1
wandb/sdk/wandb_init.py +24 -13
wandb/sdk/wandb_login.py +9 -9
wandb/sdk/wandb_manager.py +39 -4
wandb/sdk/wandb_metric.py +2 -6
wandb/sdk/wandb_require.py +4 -15
wandb/sdk/wandb_require_helpers.py +1 -9
wandb/sdk/wandb_run.py +95 -141
wandb/sdk/wandb_save.py +1 -3
wandb/sdk/wandb_settings.py +149 -54
wandb/sdk/wandb_setup.py +66 -46
wandb/sdk/wandb_summary.py +13 -10
wandb/sdk/wandb_sweep.py +6 -7
wandb/sdk/wandb_watch.py +1 -1
wandb/sklearn/calculate/confusion_matrix.py +1 -1
wandb/sklearn/calculate/learning_curve.py +1 -1
wandb/sklearn/calculate/summary_metrics.py +1 -3
wandb/sklearn/plot/__init__.py +1 -1
wandb/sklearn/plot/classifier.py +27 -18
wandb/sklearn/plot/clusterer.py +4 -5
wandb/sklearn/plot/regressor.py +4 -4
wandb/sklearn/plot/shared.py +2 -2
wandb/sync/__init__.py +1 -3
wandb/sync/sync.py +4 -5
wandb/testing/relay.py +11 -10
wandb/trigger.py +1 -1
wandb/util.py +106 -81
wandb/viz.py +4 -4
wandb/wandb_agent.py +50 -50
wandb/wandb_controller.py +2 -3
wandb/wandb_run.py +1 -2
wandb/wandb_torch.py +1 -1
wandb/xgboost/__init__.py +1 -2
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
wandb/sdk/launch/builder/docker.py +0 -80
wandb/sdk/launch/builder/kaniko.py +0 -393
wandb/sdk/launch/builder/loader.py +0 -32
wandb/sdk/launch/runner/loader.py +0 -50
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
{wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0

wandb/sdk/internal/system/assets/disk.py CHANGED Viewed

@@ -18,9 +18,7 @@ if TYPE_CHECKING:
 class DiskUsage:
-    """
-    Total system disk usage in percent.
-    """
+    """Total system disk usage in percent."""
     # name = "disk_usage"
     name = "disk"
@@ -62,7 +60,7 @@ class Disk:
     @classmethod
     def is_available(cls) -> bool:
-        """Return a new instance of the CPU metrics"""
+        """Return a new instance of the CPU metrics."""
         return psutil is not None
     def probe(self) -> dict:

wandb/sdk/internal/system/assets/gpu.py CHANGED Viewed

@@ -55,9 +55,7 @@ def gpu_in_use_by_this_process(gpu_handle: "GPUHandle", pid: int) -> bool:
 class GPUMemoryUtilization:
-    """
-    GPU memory utilization in percent for each GPU.
-    """
+    """GPU memory utilization in percent for each GPU."""
     # name = "memory_utilization"
     name = "gpu.{}.memory"
@@ -99,9 +97,7 @@ class GPUMemoryUtilization:
 class GPUMemoryAllocated:
-    """
-    GPU memory allocated in percent for each GPU.
-    """
+    """GPU memory allocated in percent for each GPU."""
     # name = "memory_allocated"
     name = "gpu.{}.memoryAllocated"
@@ -142,9 +138,7 @@ class GPUMemoryAllocated:
 class GPUUtilization:
-    """
-    GPU utilization in percent for each GPU.
-    """
+    """GPU utilization in percent for each GPU."""
     # name = "gpu_utilization"
     name = "gpu.{}.gpu"
@@ -186,9 +180,7 @@ class GPUUtilization:
 class GPUTemperature:
-    """
-    GPU temperature in Celsius for each GPU.
-    """
+    """GPU temperature in Celsius for each GPU."""
     # name = "gpu_temperature"
     name = "gpu.{}.temp"
@@ -233,9 +225,7 @@ class GPUTemperature:
 class GPUPowerUsageWatts:
-    """
-    GPU power usage in Watts for each GPU.
-    """
+    """GPU power usage in Watts for each GPU."""
     name = "gpu.{}.powerWatts"
     # samples: Deque[Tuple[datetime.datetime, float]]
@@ -273,9 +263,7 @@ class GPUPowerUsageWatts:
 class GPUPowerUsagePercent:
-    """
-    GPU power usage in percent for each GPU.
-    """
+    """GPU power usage in percent for each GPU."""
     name = "gpu.{}.powerPercent"
     # samples: Deque[Tuple[datetime.datetime, float]]

wandb/sdk/internal/system/assets/gpu_apple.py CHANGED Viewed

@@ -34,13 +34,11 @@ class _Stats(TypedDict):
     temp: float
     powerWatts: float  # noqa: N815
     powerPercent: float  # noqa: N815
-    # cpuWaitMs: float  # noqa: N815
+    # cpuWaitMs: float
 class GPUAppleStats:
-    """
-    Apple GPU stats available on Arm Macs.
-    """
+    """Apple GPU stats available on Arm Macs."""
     name = "gpu.0.{}"
     samples: "Deque[_Stats]"

wandb/sdk/internal/system/assets/interfaces.py CHANGED Viewed

@@ -26,29 +26,43 @@ logger = logging.getLogger(__name__)
 class Metric(Protocol):
-    """
-    Base protocol for individual metrics
-    """
+    """Base protocol for individual metrics."""
     name: str
     # samples: Sequence[Tuple[TimeStamp, Sample]]
     samples: "Deque[Any]"
     def sample(self) -> None:
+        """Sample the metric."""
         ...  # pragma: no cover
     def clear(self) -> None:
+        """Clear the samples."""
         ...  # pragma: no cover
     def aggregate(self) -> dict:
+        """Aggregate the samples."""
+        ...  # pragma: no cover
+@runtime_checkable
+class SetupTeardown(Protocol):
+    """Protocol for classes that require setup and teardown."""
+    def setup(self) -> None:
+        """Extra setup required for the metric beyond __init__."""
+        ...  # pragma: no cover
+    def teardown(self) -> None:
+        """Extra teardown required for the metric."""
         ...  # pragma: no cover
 @runtime_checkable
 class Asset(Protocol):
-    """
-    Base protocol to encapsulate everything relating to an "Asset"
-    e.g. CPU, GPU, TPU, Network, I/O etc.
+    """Base protocol encapsulate everything relating to an "Asset".
+    An asset can be CPU, GPU, TPU, Network, I/O etc.
     """
     name: str
@@ -60,19 +74,19 @@ class Asset(Protocol):
     @classmethod
     def is_available(cls) -> bool:
-        """Check if the resource is available"""
+        """Check if the resource is available."""
         ...  # pragma: no cover
     def start(self) -> None:
-        """Start monitoring the resource"""
+        """Start monitoring the resource."""
         ...  # pragma: no cover
     def finish(self) -> None:
-        """finish monitoring the resource"""
+        """Finish monitoring the resource."""
         ...  # pragma: no cover
     def probe(self) -> dict:
-        """Get static information about the resource"""
+        """Get static information about the resource."""
         ...  # pragma: no cover
@@ -88,9 +102,7 @@ class Interface(Protocol):
 class MetricsMonitor:
-    """
-    Takes care of collecting, sampling, serializing, and publishing a set of metrics.
-    """
+    """Takes care of collecting, sampling, serializing, and publishing a set of metrics."""
     def __init__(
         self,
@@ -119,7 +131,7 @@ class MetricsMonitor:
         )
     def monitor(self) -> None:
-        """Poll the Asset metrics"""
+        """Poll the Asset metrics."""
         while not self._shutdown_event.is_set():
             for _ in range(self.samples_to_aggregate):
                 for metric in self.metrics:
@@ -133,7 +145,7 @@ class MetricsMonitor:
             self.publish()
     def aggregate(self) -> dict:
-        """Return a dict of metrics"""
+        """Return a dict of metrics."""
         aggregated_metrics = {}
         for metric in self.metrics:
             try:
@@ -147,7 +159,7 @@ class MetricsMonitor:
         return aggregated_metrics
     def publish(self) -> None:
-        """Publish the Asset metrics"""
+        """Publish the Asset metrics."""
         try:
             aggregated_metrics = self.aggregate()
             if aggregated_metrics:
@@ -158,21 +170,37 @@ class MetricsMonitor:
             logger.error(f"Failed to publish metrics: {e}")
     def start(self) -> None:
-        if self._process is None and not self._shutdown_event.is_set():
+        if (self._process is not None) or self._shutdown_event.is_set():
+            return None
+        thread_name = f"{self.asset_name[:15]}"  # thread names are limited to 15 chars
+        try:
+            for metric in self.metrics:
+                if isinstance(metric, SetupTeardown):
+                    metric.setup()
             self._process = threading.Thread(
                 target=self.monitor,
                 daemon=True,
-                name=f"{self.asset_name}",
+                name=thread_name,
             )
             self._process.start()
-            logger.info(f"Started {self._process.name}")
+            logger.info(f"Started {thread_name} monitoring")
+        except Exception as e:
+            logger.warning(f"Failed to start {thread_name} monitoring: {e}")
+            self._process = None
     def finish(self) -> None:
         if self._process is None:
             return None
+        thread_name = f"{self.asset_name[:15]}"
         try:
             self._process.join()
-            logger.info(f"Joined {self._process.name}")
+            logger.info(f"Joined {thread_name} monitor")
+            for metric in self.metrics:
+                if isinstance(metric, SetupTeardown):
+                    metric.teardown()
         except Exception as e:
-            logger.warning(f"Failed to join {self._process.name}: {e}")
-        self._process = None
+            logger.warning(f"Failed to finish {thread_name} monitoring: {e}")
+        finally:
+            self._process = None

wandb/sdk/internal/system/assets/ipu.py CHANGED Viewed

@@ -20,9 +20,7 @@ if TYPE_CHECKING:
 class IPUStats:
-    """
-    Stats for Graphcore IPU devices
-    """
+    """Stats for Graphcore IPU devices."""
     name = "ipu.{}.{}"
     samples: "Deque[dict]"

wandb/sdk/internal/system/assets/memory.py CHANGED Viewed

@@ -18,8 +18,8 @@ if TYPE_CHECKING:
 class ProcessMemoryRSS:
-    """
-    Memory resident set size (RSS) in MB.
+    """Memory resident set size (RSS) in MB.
     RSS is the portion of memory occupied by a process that is held in main memory (RAM).
     """
@@ -49,9 +49,7 @@ class ProcessMemoryRSS:
 class ProcessMemoryPercent:
-    """
-    Process memory usage in percent.
-    """
+    """Process memory usage in percent."""
     # name = "process_memory_percent"
     name = "proc.memory.percent"
@@ -79,9 +77,7 @@ class ProcessMemoryPercent:
 class MemoryPercent:
-    """
-    Total system memory usage in percent.
-    """
+    """Total system memory usage in percent."""
     # name = "memory_percent"
     name = "memory"
@@ -104,9 +100,7 @@ class MemoryPercent:
 class MemoryAvailable:
-    """
-    Total system memory available in MB.
-    """
+    """Total system memory available in MB."""
     # name = "memory_available"
     name = "proc.memory.availableMB"
@@ -159,11 +153,11 @@ class Memory:
     @classmethod
     def is_available(cls) -> bool:
-        """Return a new instance of the CPU metrics"""
+        """Return a new instance of the CPU metrics."""
         return psutil is not None
     def probe(self) -> dict:
-        """Return a dict of the hardware information"""
+        """Return a dict of the hardware information."""
         # total available memory in gigabytes
         return {
             "memory": {

wandb/sdk/internal/system/assets/network.py CHANGED Viewed

@@ -18,9 +18,7 @@ if TYPE_CHECKING:
 class NetworkSent:
-    """
-    Network bytes sent.
-    """
+    """Network bytes sent."""
     name = "network.sent"
     samples: "Deque[float]"
@@ -45,9 +43,7 @@ class NetworkSent:
 class NetworkRecv:
-    """
-    Network bytes received.
-    """
+    """Network bytes received."""
     name = "network.recv"
     samples: "Deque[float]"
@@ -101,11 +97,11 @@ class Network:
     @classmethod
     def is_available(cls) -> bool:
-        """Return a new instance of the CPU metrics"""
+        """Return a new instance of the CPU metrics."""
         return psutil is not None
     def probe(self) -> dict:
-        """Return a dict of the hardware information"""
+        """Return a dict of the hardware information."""
         # net_if_addrs = psutil.net_if_addrs()
         # return {

wandb/sdk/internal/system/assets/open_metrics.py ADDED Viewed

@@ -0,0 +1,283 @@
+import logging
+import multiprocessing as mp
+import re
+import sys
+from collections import defaultdict, deque
+from functools import lru_cache
+from hashlib import md5
+from types import ModuleType
+from typing import TYPE_CHECKING, Dict, List, Mapping, Tuple, Union
+if sys.version_info >= (3, 8):
+    from typing import Final
+else:
+    from typing_extensions import Final
+import requests
+import requests.adapters
+import urllib3
+import wandb
+from wandb.sdk.lib import telemetry
+from .aggregators import aggregate_last, aggregate_mean
+from .interfaces import Interface, Metric, MetricsMonitor
+if TYPE_CHECKING:
+    from typing import Deque, Optional
+    from wandb.sdk.internal.settings_static import SettingsStatic
+_PREFIX: Final[str] = "openmetrics"
+_REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry(
+    backoff_factor=1,
+    total=3,
+    status_forcelist=(408, 409, 429, 500, 502, 503, 504),
+)
+_REQUEST_POOL_CONNECTIONS = 4
+_REQUEST_POOL_MAXSIZE = 4
+_REQUEST_TIMEOUT = 3
+logger = logging.getLogger(__name__)
+prometheus_client_parser: "Optional[ModuleType]" = None
+try:
+    import prometheus_client.parser  # type: ignore
+    prometheus_client_parser = prometheus_client.parser
+except ImportError:
+    pass
+def _setup_requests_session() -> requests.Session:
+    session = requests.Session()
+    adapter = requests.adapters.HTTPAdapter(
+        max_retries=_REQUEST_RETRY_STRATEGY,
+        pool_connections=_REQUEST_POOL_CONNECTIONS,
+        pool_maxsize=_REQUEST_POOL_MAXSIZE,
+    )
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+def _nested_dict_to_tuple(
+    nested_dict: Mapping[str, Mapping[str, str]]
+) -> Tuple[Tuple[str, Tuple[str, str]], ...]:
+    return tuple((k, *v.items()) for k, v in nested_dict.items())  # type: ignore
+def _tuple_to_nested_dict(
+    nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...]
+) -> Dict[str, Dict[str, str]]:
+    return {k: dict(v) for k, *v in nested_tuple}
+@lru_cache(maxsize=128)
+def _should_capture_metric(
+    metric_name: str,
+    metric_labels: Tuple[str, ...],
+    filters: Tuple[Tuple[str, Tuple[str, str]], ...],
+) -> bool:
+    # we use tuples to make the function arguments hashable => usable with lru_cache
+    should_capture = False
+    if not filters:
+        return should_capture
+    # self.filters keys are regexes, check the name against them
+    # and for the first match, check the labels against the label filters.
+    # assume that if at least one label filter doesn't match, the metric
+    # should not be captured.
+    # it's up to the user to make sure that the filters are not conflicting etc.
+    metric_labels_dict = {t[0]: t[1] for t in metric_labels}
+    filters_dict = _tuple_to_nested_dict(filters)
+    for metric_name_regex, label_filters in filters_dict.items():
+        if not re.match(metric_name_regex, metric_name):
+            continue
+        should_capture = True
+        for label, label_filter in label_filters.items():
+            if not re.match(label_filter, metric_labels_dict.get(label, "")):
+                should_capture = False
+                break
+        break
+    return should_capture
+class OpenMetricsMetric:
+    """Container for all the COUNTER and GAUGE metrics extracted from an OpenMetrics endpoint."""
+    def __init__(
+        self, name: str, url: str, filters: Mapping[str, Mapping[str, str]]
+    ) -> None:
+        self.name = name
+        self.url = url
+        self.filters = filters
+        self.filters_tuple = _nested_dict_to_tuple(filters)
+        self._session: Optional["requests.Session"] = None
+        self.samples: "Deque[dict]" = deque([])
+        # {"<metric name>": {"<labels hash>": <index>}}
+        self.label_map: "Dict[str, Dict[str, int]]" = defaultdict(dict)
+        # {"<labels hash>": <labels>}
+        self.label_hashes: "Dict[str, dict]" = {}
+    def setup(self) -> None:
+        if self._session is not None:
+            return
+        self._session = _setup_requests_session()
+    def teardown(self) -> None:
+        if self._session is None:
+            return
+        self._session.close()
+        self._session = None
+    def parse_open_metrics_endpoint(self) -> Dict[str, Union[str, int, float]]:
+        assert prometheus_client_parser is not None
+        assert self._session is not None
+        response = self._session.get(self.url, timeout=_REQUEST_TIMEOUT)
+        response.raise_for_status()
+        text = response.text
+        measurement = {}
+        for family in prometheus_client_parser.text_string_to_metric_families(text):
+            if family.type not in ("counter", "gauge"):
+                # todo: add support for other metric types?
+                # todo: log warning about that?
+                continue
+            for sample in family.samples:
+                name, labels, value = sample.name, sample.labels, sample.value
+                if not _should_capture_metric(
+                    name,
+                    tuple(labels.items()),
+                    self.filters_tuple,
+                ):
+                    continue
+                # md5 hash of the labels
+                label_hash = md5(str(labels).encode("utf-8")).hexdigest()
+                if label_hash not in self.label_map[name]:
+                    # store the index of the label hash in the label map
+                    self.label_map[name][label_hash] = len(self.label_map[name])
+                    # store the labels themselves
+                    self.label_hashes[label_hash] = labels
+                index = self.label_map[name][label_hash]
+                measurement[f"{name}.{index}"] = value
+        return measurement
+    def sample(self) -> None:
+        s = self.parse_open_metrics_endpoint()
+        self.samples.append(s)
+    def clear(self) -> None:
+        self.samples.clear()
+    def aggregate(self) -> dict:
+        if not self.samples:
+            return {}
+        prefix = f"{_PREFIX}.{self.name}."
+        stats = {}
+        for key in self.samples[0].keys():
+            samples = [s[key] for s in self.samples if key in s]
+            if samples and all(isinstance(s, (int, float)) for s in samples):
+                stats[f"{prefix}{key}"] = aggregate_mean(samples)
+            else:
+                stats[f"{prefix}{key}"] = aggregate_last(samples)
+        return stats
+class OpenMetrics:
+    # Poll an OpenMetrics endpoint, parse the response and return a dict of metrics
+    # Implements the same Protocol interface as Asset
+    def __init__(
+        self,
+        interface: "Interface",
+        settings: "SettingsStatic",
+        shutdown_event: mp.synchronize.Event,
+        name: str,
+        url: str,
+    ) -> None:
+        self.name = name
+        self.url = url
+        self.interface = interface
+        self.settings = settings
+        self.shutdown_event = shutdown_event
+        self.metrics: List[Metric] = [
+            OpenMetricsMetric(name, url, settings._stats_open_metrics_filters)
+        ]
+        self.metrics_monitor: "MetricsMonitor" = MetricsMonitor(
+            asset_name=self.name,
+            metrics=self.metrics,
+            interface=interface,
+            settings=settings,
+            shutdown_event=shutdown_event,
+        )
+        telemetry_record = telemetry.TelemetryRecord()
+        telemetry_record.feature.open_metrics = True
+        interface._publish_telemetry(telemetry_record)
+    @classmethod
+    def is_available(cls, url: str) -> bool:
+        _is_available: bool = False
+        ret = prometheus_client_parser is not None
+        if not ret:
+            wandb.termwarn(
+                "Monitoring OpenMetrics endpoints requires the `prometheus_client` package. "
+                "To install it, run `pip install prometheus_client`.",
+                repeat=False,
+            )
+            return _is_available
+        # check if the endpoint is available and is a valid OpenMetrics endpoint
+        _session: Optional[requests.Session] = None
+        try:
+            assert prometheus_client_parser is not None
+            _session = _setup_requests_session()
+            response = _session.get(url, timeout=_REQUEST_TIMEOUT)
+            response.raise_for_status()
+            # check if the response is a valid OpenMetrics response
+            # text_string_to_metric_families returns a generator
+            if list(
+                prometheus_client_parser.text_string_to_metric_families(response.text)
+            ):
+                _is_available = True
+        except Exception as e:
+            logger.debug(
+                f"OpenMetrics endpoint {url} is not available: {e}", exc_info=True
+            )
+        if _session is not None:
+            try:
+                _session.close()
+            except Exception:
+                pass
+        return _is_available
+    def start(self) -> None:
+        self.metrics_monitor.start()
+    def finish(self) -> None:
+        self.metrics_monitor.finish()
+    def probe(self) -> dict:
+        # todo: also return self.label_hashes
+        return {self.name: self.url}

wandb/sdk/internal/system/assets/tpu.py CHANGED Viewed

@@ -17,9 +17,7 @@ logger = logging.getLogger(__name__)
 class TPUUtilization:
-    """
-    Google Cloud TPU utilization in percent.
-    """
+    """Google Cloud TPU utilization in percent."""
     name = "tpu"
     samples: "Deque[float]"
@@ -130,7 +128,6 @@ class TPU:
     @classmethod
     def is_available(cls) -> bool:
         if os.environ.get("TPU_NAME", False) is False:
             return False

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl

wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl