wandb 0.20.1__py3-none-win32.whl → 0.20.2rc20250616__py3-none-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. wandb/__init__.py +3 -6
  2. wandb/__init__.pyi +1 -1
  3. wandb/analytics/sentry.py +2 -2
  4. wandb/apis/importers/internals/internal.py +0 -3
  5. wandb/apis/public/api.py +2 -2
  6. wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
  7. wandb/apis/public/registries/registries_search.py +2 -2
  8. wandb/apis/public/registries/registry.py +19 -18
  9. wandb/bin/gpu_stats.exe +0 -0
  10. wandb/bin/wandb-core +0 -0
  11. wandb/cli/beta.py +1 -7
  12. wandb/cli/cli.py +0 -30
  13. wandb/env.py +0 -6
  14. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  15. wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
  16. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  17. wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
  18. wandb/proto/v5/wandb_settings_pb2.py +2 -2
  19. wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
  20. wandb/proto/v6/wandb_settings_pb2.py +2 -2
  21. wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
  22. wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
  23. wandb/sdk/backend/backend.py +1 -1
  24. wandb/sdk/internal/handler.py +1 -69
  25. wandb/sdk/lib/printer.py +6 -7
  26. wandb/sdk/lib/progress.py +1 -3
  27. wandb/sdk/lib/service/ipc_support.py +13 -0
  28. wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
  29. wandb/sdk/lib/service/service_port_file.py +105 -0
  30. wandb/sdk/lib/service/service_process.py +111 -0
  31. wandb/sdk/lib/service/service_token.py +164 -0
  32. wandb/sdk/lib/sock_client.py +8 -12
  33. wandb/sdk/wandb_init.py +0 -3
  34. wandb/sdk/wandb_require.py +9 -20
  35. wandb/sdk/wandb_run.py +0 -24
  36. wandb/sdk/wandb_settings.py +0 -9
  37. wandb/sdk/wandb_setup.py +2 -13
  38. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
  39. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +42 -68
  40. wandb/sdk/internal/flow_control.py +0 -263
  41. wandb/sdk/internal/internal.py +0 -401
  42. wandb/sdk/internal/internal_util.py +0 -97
  43. wandb/sdk/internal/system/__init__.py +0 -0
  44. wandb/sdk/internal/system/assets/__init__.py +0 -25
  45. wandb/sdk/internal/system/assets/aggregators.py +0 -31
  46. wandb/sdk/internal/system/assets/asset_registry.py +0 -20
  47. wandb/sdk/internal/system/assets/cpu.py +0 -163
  48. wandb/sdk/internal/system/assets/disk.py +0 -210
  49. wandb/sdk/internal/system/assets/gpu.py +0 -416
  50. wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
  51. wandb/sdk/internal/system/assets/interfaces.py +0 -205
  52. wandb/sdk/internal/system/assets/ipu.py +0 -177
  53. wandb/sdk/internal/system/assets/memory.py +0 -166
  54. wandb/sdk/internal/system/assets/network.py +0 -125
  55. wandb/sdk/internal/system/assets/open_metrics.py +0 -293
  56. wandb/sdk/internal/system/assets/tpu.py +0 -154
  57. wandb/sdk/internal/system/assets/trainium.py +0 -393
  58. wandb/sdk/internal/system/env_probe_helpers.py +0 -13
  59. wandb/sdk/internal/system/system_info.py +0 -248
  60. wandb/sdk/internal/system/system_monitor.py +0 -224
  61. wandb/sdk/internal/writer.py +0 -204
  62. wandb/sdk/lib/service_token.py +0 -93
  63. wandb/sdk/service/__init__.py +0 -0
  64. wandb/sdk/service/_startup_debug.py +0 -22
  65. wandb/sdk/service/port_file.py +0 -53
  66. wandb/sdk/service/server.py +0 -107
  67. wandb/sdk/service/server_sock.py +0 -286
  68. wandb/sdk/service/service.py +0 -252
  69. wandb/sdk/service/streams.py +0 -425
  70. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
  71. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
  72. {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
@@ -1,293 +0,0 @@
1
- import logging
2
- import re
3
- import threading
4
- from collections import defaultdict, deque
5
- from functools import lru_cache
6
- from types import ModuleType
7
- from typing import TYPE_CHECKING, Dict, Final, List, Mapping, Sequence, Tuple, Union
8
-
9
- import requests
10
- import requests.adapters
11
- import urllib3
12
-
13
- import wandb
14
- from wandb.sdk.lib import hashutil, telemetry
15
-
16
- from .aggregators import aggregate_last, aggregate_mean
17
- from .interfaces import Interface, Metric, MetricsMonitor
18
-
19
- if TYPE_CHECKING:
20
- from typing import Deque, Optional
21
-
22
- from wandb.sdk.internal.settings_static import SettingsStatic
23
-
24
-
25
- _PREFIX: Final[str] = "openmetrics"
26
-
27
- _REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry(
28
- backoff_factor=1,
29
- total=3,
30
- status_forcelist=(408, 409, 429, 500, 502, 503, 504),
31
- )
32
- _REQUEST_POOL_CONNECTIONS = 4
33
- _REQUEST_POOL_MAXSIZE = 4
34
- _REQUEST_TIMEOUT = 3
35
-
36
-
37
- logger = logging.getLogger(__name__)
38
-
39
-
40
- prometheus_client_parser: "Optional[ModuleType]" = None
41
- try:
42
- import prometheus_client.parser # type: ignore
43
-
44
- prometheus_client_parser = prometheus_client.parser
45
- except ImportError:
46
- pass
47
-
48
-
49
- def _setup_requests_session() -> requests.Session:
50
- session = requests.Session()
51
- adapter = requests.adapters.HTTPAdapter(
52
- max_retries=_REQUEST_RETRY_STRATEGY,
53
- pool_connections=_REQUEST_POOL_CONNECTIONS,
54
- pool_maxsize=_REQUEST_POOL_MAXSIZE,
55
- )
56
- session.mount("http://", adapter)
57
- session.mount("https://", adapter)
58
- return session
59
-
60
-
61
- def _nested_dict_to_tuple(
62
- nested_dict: Mapping[str, Mapping[str, str]],
63
- ) -> Tuple[Tuple[str, Tuple[str, str]], ...]:
64
- return tuple((k, *v.items()) for k, v in nested_dict.items()) # type: ignore
65
-
66
-
67
- def _tuple_to_nested_dict(
68
- nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...],
69
- ) -> Dict[str, Dict[str, str]]:
70
- return {k: dict(v) for k, *v in nested_tuple}
71
-
72
-
73
- @lru_cache(maxsize=128)
74
- def _should_capture_metric(
75
- endpoint_name: str,
76
- metric_name: str,
77
- metric_labels: Tuple[str, ...],
78
- filters: Tuple[Tuple[str, Tuple[str, str]], ...],
79
- ) -> bool:
80
- # we use tuples to make the function arguments hashable => usable with lru_cache
81
- should_capture = False
82
-
83
- if not filters:
84
- return should_capture
85
-
86
- # self.filters keys are regexes, check the name against them
87
- # and for the first match, check the labels against the label filters.
88
- # assume that if at least one label filter doesn't match, the metric
89
- # should not be captured.
90
- # it's up to the user to make sure that the filters are not conflicting etc.
91
- metric_labels_dict = {t[0]: t[1] for t in metric_labels}
92
- filters_dict = _tuple_to_nested_dict(filters)
93
- for metric_name_regex, label_filters in filters_dict.items():
94
- if not re.match(metric_name_regex, f"{endpoint_name}.{metric_name}"):
95
- continue
96
-
97
- should_capture = True
98
-
99
- for label, label_filter in label_filters.items():
100
- if not re.match(label_filter, metric_labels_dict.get(label, "")):
101
- should_capture = False
102
- break
103
- break
104
-
105
- return should_capture
106
-
107
-
108
- class OpenMetricsMetric:
109
- """Container for all the COUNTER and GAUGE metrics extracted from an OpenMetrics endpoint."""
110
-
111
- def __init__(
112
- self,
113
- name: str,
114
- url: str,
115
- filters: Union[Mapping[str, Mapping[str, str]], Sequence[str], None],
116
- ) -> None:
117
- self.name = name # user-defined name for the endpoint
118
- self.url = url # full URL
119
-
120
- # - filters can be a dict {"<metric regex>": {"<label>": "<filter regex>"}}
121
- # or a sequence of metric regexes. we convert the latter to a dict
122
- # to make it easier to work with.
123
- # - the metric regexes are matched against the full metric name,
124
- # i.e. "<endpoint name>.<metric name>".
125
- # - by default, all metrics are captured.
126
- self.filters = (
127
- filters
128
- if isinstance(filters, Mapping)
129
- else {k: {} for k in filters or [".*"]}
130
- )
131
- self.filters_tuple = _nested_dict_to_tuple(self.filters) if self.filters else ()
132
-
133
- self._session: Optional[requests.Session] = None
134
- self.samples: Deque[dict] = deque([])
135
- # {"<metric name>": {"<labels hash>": <index>}}
136
- self.label_map: Dict[str, Dict[str, int]] = defaultdict(dict)
137
- # {"<labels hash>": <labels>}
138
- self.label_hashes: Dict[str, dict] = {}
139
-
140
- def setup(self) -> None:
141
- if self._session is not None:
142
- return
143
-
144
- self._session = _setup_requests_session()
145
-
146
- def teardown(self) -> None:
147
- if self._session is None:
148
- return
149
-
150
- self._session.close()
151
- self._session = None
152
-
153
- def parse_open_metrics_endpoint(self) -> Dict[str, Union[str, int, float]]:
154
- assert prometheus_client_parser is not None
155
- assert self._session is not None
156
-
157
- response = self._session.get(self.url, timeout=_REQUEST_TIMEOUT)
158
- response.raise_for_status()
159
-
160
- text = response.text
161
- measurement = {}
162
- for family in prometheus_client_parser.text_string_to_metric_families(text):
163
- if family.type not in ("counter", "gauge"):
164
- # todo: add support for other metric types?
165
- # todo: log warning about that?
166
- continue
167
- for sample in family.samples:
168
- name, labels, value = sample.name, sample.labels, sample.value
169
-
170
- if not _should_capture_metric(
171
- self.name,
172
- name,
173
- tuple(labels.items()),
174
- self.filters_tuple,
175
- ):
176
- continue
177
-
178
- # md5 hash of the labels
179
- label_hash = hashutil._md5(str(labels).encode("utf-8")).hexdigest()
180
- if label_hash not in self.label_map[name]:
181
- # store the index of the label hash in the label map
182
- self.label_map[name][label_hash] = len(self.label_map[name])
183
- # store the labels themselves
184
- self.label_hashes[label_hash] = labels
185
- index = self.label_map[name][label_hash]
186
- measurement[f"{name}.{index}"] = value
187
-
188
- return measurement
189
-
190
- def sample(self) -> None:
191
- s = self.parse_open_metrics_endpoint()
192
- self.samples.append(s)
193
-
194
- def clear(self) -> None:
195
- self.samples.clear()
196
-
197
- def aggregate(self) -> dict:
198
- if not self.samples:
199
- return {}
200
-
201
- prefix = f"{_PREFIX}.{self.name}."
202
-
203
- stats = {}
204
- for key in self.samples[0].keys():
205
- samples = [s[key] for s in self.samples if key in s]
206
- if samples and all(isinstance(s, (int, float)) for s in samples):
207
- stats[f"{prefix}{key}"] = aggregate_mean(samples)
208
- else:
209
- stats[f"{prefix}{key}"] = aggregate_last(samples)
210
- return stats
211
-
212
-
213
- class OpenMetrics:
214
- # Poll an OpenMetrics endpoint, parse the response and return a dict of metrics
215
- # Implements the same Protocol interface as Asset
216
-
217
- def __init__(
218
- self,
219
- interface: "Interface",
220
- settings: "SettingsStatic",
221
- shutdown_event: threading.Event,
222
- name: str,
223
- url: str,
224
- ) -> None:
225
- self.name = name
226
- self.url = url
227
- self.interface = interface
228
- self.settings = settings
229
- self.shutdown_event = shutdown_event
230
-
231
- self.metrics: List[Metric] = [
232
- OpenMetricsMetric(name, url, settings.x_stats_open_metrics_filters)
233
- ]
234
-
235
- self.metrics_monitor: MetricsMonitor = MetricsMonitor(
236
- asset_name=self.name,
237
- metrics=self.metrics,
238
- interface=interface,
239
- settings=settings,
240
- shutdown_event=shutdown_event,
241
- )
242
-
243
- telemetry_record = telemetry.TelemetryRecord()
244
- telemetry_record.feature.open_metrics = True
245
- interface._publish_telemetry(telemetry_record)
246
-
247
- @classmethod
248
- def is_available(cls, url: str) -> bool:
249
- _is_available: bool = False
250
-
251
- ret = prometheus_client_parser is not None
252
- if not ret:
253
- wandb.termwarn(
254
- "Monitoring OpenMetrics endpoints requires the `prometheus_client` package. "
255
- "To install it, run `pip install prometheus_client`.",
256
- repeat=False,
257
- )
258
- return _is_available
259
- # check if the endpoint is available and is a valid OpenMetrics endpoint
260
- _session: Optional[requests.Session] = None
261
- try:
262
- assert prometheus_client_parser is not None
263
- _session = _setup_requests_session()
264
- response = _session.get(url, timeout=_REQUEST_TIMEOUT)
265
- response.raise_for_status()
266
-
267
- # check if the response is a valid OpenMetrics response
268
- # text_string_to_metric_families returns a generator
269
- if list(
270
- prometheus_client_parser.text_string_to_metric_families(response.text)
271
- ):
272
- _is_available = True
273
- except Exception as e:
274
- logger.debug(
275
- f"OpenMetrics endpoint {url} is not available: {e}", exc_info=True
276
- )
277
-
278
- if _session is not None:
279
- try:
280
- _session.close()
281
- except Exception:
282
- pass
283
- return _is_available
284
-
285
- def start(self) -> None:
286
- self.metrics_monitor.start()
287
-
288
- def finish(self) -> None:
289
- self.metrics_monitor.finish()
290
-
291
- def probe(self) -> dict:
292
- # todo: also return self.label_hashes
293
- return {self.name: self.url}
@@ -1,154 +0,0 @@
1
- import logging
2
- import os
3
- import threading
4
- from collections import deque
5
- from typing import TYPE_CHECKING, List, Optional
6
-
7
- from .aggregators import aggregate_mean
8
- from .asset_registry import asset_registry
9
- from .interfaces import Interface, Metric, MetricsMonitor
10
-
11
- if TYPE_CHECKING:
12
- from typing import Deque
13
-
14
- from wandb.sdk.internal.settings_static import SettingsStatic
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
- class TPUUtilization:
20
- """Google Cloud TPU utilization in percent."""
21
-
22
- name = "tpu"
23
- samples: "Deque[float]"
24
-
25
- def __init__(
26
- self,
27
- service_addr: str,
28
- duration_ms: int = 100,
29
- ) -> None:
30
- self.samples = deque([])
31
-
32
- self.duration_ms = duration_ms
33
- self.service_addr = service_addr
34
-
35
- try:
36
- from tensorflow.python.profiler import profiler_client # type: ignore
37
-
38
- self._profiler_client = profiler_client
39
- except ImportError:
40
- logger.warning(
41
- "Unable to import `tensorflow.python.profiler.profiler_client`. "
42
- "TPU metrics will not be reported."
43
- )
44
- self._profiler_client = None
45
-
46
- def sample(self) -> None:
47
- result = self._profiler_client.monitor(
48
- self.service_addr, duration_ms=self.duration_ms, level=2
49
- )
50
-
51
- self.samples.append(
52
- float(result.split("Utilization ")[1].split(": ")[1].split("%")[0])
53
- )
54
-
55
- def clear(self) -> None:
56
- self.samples.clear()
57
-
58
- def aggregate(self) -> dict:
59
- if not self.samples:
60
- return {}
61
- aggregate = aggregate_mean(self.samples)
62
- return {self.name: aggregate}
63
-
64
-
65
- @asset_registry.register
66
- class TPU:
67
- def __init__(
68
- self,
69
- interface: "Interface",
70
- settings: "SettingsStatic",
71
- shutdown_event: threading.Event,
72
- ) -> None:
73
- self.name = self.__class__.__name__.lower()
74
- self.service_addr = self.get_service_addr()
75
- self.metrics: List[Metric] = [TPUUtilization(self.service_addr)]
76
-
77
- self.metrics_monitor = MetricsMonitor(
78
- self.name,
79
- self.metrics,
80
- interface,
81
- settings,
82
- shutdown_event,
83
- )
84
-
85
- @staticmethod
86
- def get_service_addr(
87
- service_addr: Optional[str] = None,
88
- tpu_name: Optional[str] = None,
89
- compute_zone: Optional[str] = None,
90
- core_project: Optional[str] = None,
91
- ) -> str:
92
- if service_addr is not None:
93
- if tpu_name is not None:
94
- logger.warning(
95
- "Both service_addr and tpu_name arguments provided. "
96
- "Ignoring tpu_name and using service_addr."
97
- )
98
- else:
99
- tpu_name = tpu_name or os.environ.get("TPU_NAME")
100
- if tpu_name is None:
101
- raise Exception("Required environment variable TPU_NAME.")
102
- compute_zone = compute_zone or os.environ.get("CLOUDSDK_COMPUTE_ZONE")
103
- core_project = core_project or os.environ.get("CLOUDSDK_CORE_PROJECT")
104
- try:
105
- from tensorflow.python.distribute.cluster_resolver import ( # type: ignore
106
- tpu_cluster_resolver,
107
- )
108
-
109
- service_addr = tpu_cluster_resolver.TPUClusterResolver(
110
- [tpu_name], zone=compute_zone, project=core_project
111
- ).get_master()
112
- except (ValueError, TypeError):
113
- raise ValueError(
114
- "Failed to find TPU. Try specifying TPU zone "
115
- "(via CLOUDSDK_COMPUTE_ZONE environment variable)"
116
- " and GCP project (via CLOUDSDK_CORE_PROJECT "
117
- "environment variable)."
118
- )
119
- service_addr = service_addr.replace("grpc://", "").replace(":8470", ":8466")
120
- return service_addr
121
-
122
- def start(self) -> None:
123
- if self.metrics:
124
- self.metrics_monitor.start()
125
-
126
- def finish(self) -> None:
127
- self.metrics_monitor.finish()
128
-
129
- @classmethod
130
- def is_available(cls) -> bool:
131
- if os.environ.get("TPU_NAME", False) is False:
132
- return False
133
-
134
- try:
135
- from tensorflow.python.distribute.cluster_resolver import ( # noqa: F401
136
- tpu_cluster_resolver,
137
- )
138
- from tensorflow.python.profiler import profiler_client # noqa: F401
139
-
140
- cls.get_service_addr()
141
- except (
142
- ImportError,
143
- TypeError,
144
- AttributeError,
145
- ValueError,
146
- ): # Saw type error when iterating paths on colab...
147
- # TODO: Saw error in sentry where module 'tensorflow.python.pywrap_tensorflow'
148
- # has no attribute 'TFE_DEVICE_PLACEMENT_EXPLICIT'
149
- return False
150
-
151
- return True
152
-
153
- def probe(self) -> dict:
154
- return {self.name: {"service_address": self.service_addr}}