wandb 0.20.1__py3-none-win32.whl → 0.20.2rc20250616__py3-none-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +3 -6
- wandb/__init__.pyi +1 -1
- wandb/analytics/sentry.py +2 -2
- wandb/apis/importers/internals/internal.py +0 -3
- wandb/apis/public/api.py +2 -2
- wandb/apis/public/registries/{utils.py → _utils.py} +12 -12
- wandb/apis/public/registries/registries_search.py +2 -2
- wandb/apis/public/registries/registry.py +19 -18
- wandb/bin/gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/beta.py +1 -7
- wandb/cli/cli.py +0 -30
- wandb/env.py +0 -6
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +42 -1
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/internal/handler.py +1 -69
- wandb/sdk/lib/printer.py +6 -7
- wandb/sdk/lib/progress.py +1 -3
- wandb/sdk/lib/service/ipc_support.py +13 -0
- wandb/sdk/lib/{service_connection.py → service/service_connection.py} +20 -56
- wandb/sdk/lib/service/service_port_file.py +105 -0
- wandb/sdk/lib/service/service_process.py +111 -0
- wandb/sdk/lib/service/service_token.py +164 -0
- wandb/sdk/lib/sock_client.py +8 -12
- wandb/sdk/wandb_init.py +0 -3
- wandb/sdk/wandb_require.py +9 -20
- wandb/sdk/wandb_run.py +0 -24
- wandb/sdk/wandb_settings.py +0 -9
- wandb/sdk/wandb_setup.py +2 -13
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/METADATA +1 -3
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/RECORD +42 -68
- wandb/sdk/internal/flow_control.py +0 -263
- wandb/sdk/internal/internal.py +0 -401
- wandb/sdk/internal/internal_util.py +0 -97
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +0 -25
- wandb/sdk/internal/system/assets/aggregators.py +0 -31
- wandb/sdk/internal/system/assets/asset_registry.py +0 -20
- wandb/sdk/internal/system/assets/cpu.py +0 -163
- wandb/sdk/internal/system/assets/disk.py +0 -210
- wandb/sdk/internal/system/assets/gpu.py +0 -416
- wandb/sdk/internal/system/assets/gpu_amd.py +0 -233
- wandb/sdk/internal/system/assets/interfaces.py +0 -205
- wandb/sdk/internal/system/assets/ipu.py +0 -177
- wandb/sdk/internal/system/assets/memory.py +0 -166
- wandb/sdk/internal/system/assets/network.py +0 -125
- wandb/sdk/internal/system/assets/open_metrics.py +0 -293
- wandb/sdk/internal/system/assets/tpu.py +0 -154
- wandb/sdk/internal/system/assets/trainium.py +0 -393
- wandb/sdk/internal/system/env_probe_helpers.py +0 -13
- wandb/sdk/internal/system/system_info.py +0 -248
- wandb/sdk/internal/system/system_monitor.py +0 -224
- wandb/sdk/internal/writer.py +0 -204
- wandb/sdk/lib/service_token.py +0 -93
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +0 -22
- wandb/sdk/service/port_file.py +0 -53
- wandb/sdk/service/server.py +0 -107
- wandb/sdk/service/server_sock.py +0 -286
- wandb/sdk/service/service.py +0 -252
- wandb/sdk/service/streams.py +0 -425
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/WHEEL +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.1.dist-info → wandb-0.20.2rc20250616.dist-info}/licenses/LICENSE +0 -0
@@ -1,205 +0,0 @@
|
|
1
|
-
import datetime
|
2
|
-
import logging
|
3
|
-
import threading
|
4
|
-
from typing import (
|
5
|
-
TYPE_CHECKING,
|
6
|
-
Any,
|
7
|
-
List,
|
8
|
-
Optional,
|
9
|
-
Protocol,
|
10
|
-
TypeVar,
|
11
|
-
runtime_checkable,
|
12
|
-
)
|
13
|
-
|
14
|
-
if TYPE_CHECKING:
|
15
|
-
from typing import Deque
|
16
|
-
|
17
|
-
from wandb.proto.wandb_telemetry_pb2 import TelemetryRecord
|
18
|
-
from wandb.sdk.interface.interface import FilesDict
|
19
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
20
|
-
|
21
|
-
import psutil
|
22
|
-
|
23
|
-
TimeStamp = TypeVar("TimeStamp", bound=datetime.datetime)
|
24
|
-
|
25
|
-
|
26
|
-
logger = logging.getLogger(__name__)
|
27
|
-
|
28
|
-
|
29
|
-
class Metric(Protocol):
|
30
|
-
"""Base protocol for individual metrics."""
|
31
|
-
|
32
|
-
name: str
|
33
|
-
# samples: Sequence[Tuple[TimeStamp, Sample]]
|
34
|
-
samples: "Deque[Any]"
|
35
|
-
|
36
|
-
def sample(self) -> None:
|
37
|
-
"""Sample the metric."""
|
38
|
-
... # pragma: no cover
|
39
|
-
|
40
|
-
def clear(self) -> None:
|
41
|
-
"""Clear the samples."""
|
42
|
-
... # pragma: no cover
|
43
|
-
|
44
|
-
def aggregate(self) -> dict:
|
45
|
-
"""Aggregate the samples."""
|
46
|
-
... # pragma: no cover
|
47
|
-
|
48
|
-
|
49
|
-
@runtime_checkable
|
50
|
-
class SetupTeardown(Protocol):
|
51
|
-
"""Protocol for classes that require setup and teardown."""
|
52
|
-
|
53
|
-
def setup(self) -> None:
|
54
|
-
"""Extra setup required for the metric beyond __init__."""
|
55
|
-
... # pragma: no cover
|
56
|
-
|
57
|
-
def teardown(self) -> None:
|
58
|
-
"""Extra teardown required for the metric."""
|
59
|
-
... # pragma: no cover
|
60
|
-
|
61
|
-
|
62
|
-
@runtime_checkable
|
63
|
-
class Asset(Protocol):
|
64
|
-
"""Base protocol encapsulating everything relating to an "Asset".
|
65
|
-
|
66
|
-
An asset can be CPU, GPU, TPU, Network, I/O etc.
|
67
|
-
"""
|
68
|
-
|
69
|
-
name: str
|
70
|
-
metrics: List[Metric]
|
71
|
-
metrics_monitor: "MetricsMonitor"
|
72
|
-
|
73
|
-
def __init__(self, *args: Any, **kwargs: Any) -> None: ... # pragma: no cover
|
74
|
-
|
75
|
-
@classmethod
|
76
|
-
def is_available(cls) -> bool:
|
77
|
-
"""Check if the resource is available."""
|
78
|
-
... # pragma: no cover
|
79
|
-
|
80
|
-
def start(self) -> None:
|
81
|
-
"""Start monitoring the resource."""
|
82
|
-
... # pragma: no cover
|
83
|
-
|
84
|
-
def finish(self) -> None:
|
85
|
-
"""Finish monitoring the resource."""
|
86
|
-
... # pragma: no cover
|
87
|
-
|
88
|
-
def probe(self) -> dict:
|
89
|
-
"""Get static information about the resource."""
|
90
|
-
... # pragma: no cover
|
91
|
-
|
92
|
-
|
93
|
-
class Interface(Protocol):
|
94
|
-
def publish_stats(self, stats: dict) -> None: ... # pragma: no cover
|
95
|
-
|
96
|
-
def _publish_telemetry(
|
97
|
-
self, telemetry: "TelemetryRecord"
|
98
|
-
) -> None: ... # pragma: no cover
|
99
|
-
|
100
|
-
def publish_files(self, files_dict: "FilesDict") -> None: ... # pragma: no cover
|
101
|
-
|
102
|
-
|
103
|
-
class MetricsMonitor:
|
104
|
-
"""Takes care of collecting, sampling, serializing, and publishing a set of metrics."""
|
105
|
-
|
106
|
-
def __init__(
|
107
|
-
self,
|
108
|
-
asset_name: str,
|
109
|
-
metrics: List[Metric],
|
110
|
-
interface: Interface,
|
111
|
-
settings: "SettingsStatic",
|
112
|
-
shutdown_event: threading.Event,
|
113
|
-
) -> None:
|
114
|
-
self.metrics = metrics
|
115
|
-
self.asset_name = asset_name
|
116
|
-
self._interface = interface
|
117
|
-
self._process: Optional[threading.Thread] = None
|
118
|
-
self._shutdown_event: threading.Event = shutdown_event
|
119
|
-
|
120
|
-
self.sampling_interval: float = float(
|
121
|
-
max(
|
122
|
-
0.1,
|
123
|
-
settings.x_stats_sampling_interval,
|
124
|
-
)
|
125
|
-
) # seconds
|
126
|
-
self.samples_to_aggregate = 1
|
127
|
-
|
128
|
-
def monitor(self) -> None:
|
129
|
-
"""Poll the Asset metrics."""
|
130
|
-
while not self._shutdown_event.is_set():
|
131
|
-
for _ in range(self.samples_to_aggregate):
|
132
|
-
for metric in self.metrics:
|
133
|
-
try:
|
134
|
-
metric.sample()
|
135
|
-
except psutil.NoSuchProcess:
|
136
|
-
logger.info(f"Process {metric.name} has exited.")
|
137
|
-
self._shutdown_event.set()
|
138
|
-
break
|
139
|
-
except Exception:
|
140
|
-
logger.exception("Failed to sample metric.")
|
141
|
-
self._shutdown_event.wait(self.sampling_interval)
|
142
|
-
if self._shutdown_event.is_set():
|
143
|
-
break
|
144
|
-
self.publish()
|
145
|
-
|
146
|
-
def aggregate(self) -> dict:
|
147
|
-
"""Return a dict of metrics."""
|
148
|
-
aggregated_metrics = {}
|
149
|
-
for metric in self.metrics:
|
150
|
-
try:
|
151
|
-
serialized_metric = metric.aggregate()
|
152
|
-
aggregated_metrics.update(serialized_metric)
|
153
|
-
# aggregated_metrics = wandb.util.merge_dicts(
|
154
|
-
# aggregated_metrics, metric.serialize()
|
155
|
-
# )
|
156
|
-
except Exception:
|
157
|
-
logger.exception("Failed to serialize metric.")
|
158
|
-
return aggregated_metrics
|
159
|
-
|
160
|
-
def publish(self) -> None:
|
161
|
-
"""Publish the Asset metrics."""
|
162
|
-
try:
|
163
|
-
aggregated_metrics = self.aggregate()
|
164
|
-
if aggregated_metrics:
|
165
|
-
self._interface.publish_stats(aggregated_metrics)
|
166
|
-
for metric in self.metrics:
|
167
|
-
metric.clear()
|
168
|
-
except Exception:
|
169
|
-
logger.exception("Failed to publish metrics.")
|
170
|
-
|
171
|
-
def start(self) -> None:
|
172
|
-
if (self._process is not None) or self._shutdown_event.is_set():
|
173
|
-
return None
|
174
|
-
|
175
|
-
thread_name = f"{self.asset_name[:15]}" # thread names are limited to 15 chars
|
176
|
-
try:
|
177
|
-
for metric in self.metrics:
|
178
|
-
if isinstance(metric, SetupTeardown):
|
179
|
-
metric.setup()
|
180
|
-
self._process = threading.Thread(
|
181
|
-
target=self.monitor,
|
182
|
-
daemon=True,
|
183
|
-
name=thread_name,
|
184
|
-
)
|
185
|
-
self._process.start()
|
186
|
-
logger.info(f"Started {thread_name} monitoring")
|
187
|
-
except Exception as e:
|
188
|
-
logger.warning(f"Failed to start {thread_name} monitoring: {e}")
|
189
|
-
self._process = None
|
190
|
-
|
191
|
-
def finish(self) -> None:
|
192
|
-
if self._process is None:
|
193
|
-
return None
|
194
|
-
|
195
|
-
thread_name = f"{self.asset_name[:15]}"
|
196
|
-
try:
|
197
|
-
self._process.join()
|
198
|
-
logger.info(f"Joined {thread_name} monitor")
|
199
|
-
for metric in self.metrics:
|
200
|
-
if isinstance(metric, SetupTeardown):
|
201
|
-
metric.teardown()
|
202
|
-
except Exception as e:
|
203
|
-
logger.warning(f"Failed to finish {thread_name} monitoring: {e}")
|
204
|
-
finally:
|
205
|
-
self._process = None
|
@@ -1,177 +0,0 @@
|
|
1
|
-
import threading
|
2
|
-
from collections import deque
|
3
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
|
4
|
-
|
5
|
-
try:
|
6
|
-
import gcipuinfo # type: ignore
|
7
|
-
except ImportError:
|
8
|
-
gcipuinfo = None
|
9
|
-
|
10
|
-
import wandb
|
11
|
-
|
12
|
-
from .aggregators import aggregate_mean
|
13
|
-
from .asset_registry import asset_registry
|
14
|
-
from .interfaces import Interface, Metric, MetricsMonitor
|
15
|
-
|
16
|
-
if TYPE_CHECKING:
|
17
|
-
from typing import Deque
|
18
|
-
|
19
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
20
|
-
|
21
|
-
|
22
|
-
class IPUStats:
|
23
|
-
"""Stats for Graphcore IPU devices."""
|
24
|
-
|
25
|
-
name = "ipu.{}.{}"
|
26
|
-
samples: "Deque[dict]"
|
27
|
-
|
28
|
-
# The metrics that change over time.
|
29
|
-
# Only these are returned on each invocation
|
30
|
-
# to avoid sending a load of unnecessary data.
|
31
|
-
variable_metric_keys = {
|
32
|
-
"average board temp",
|
33
|
-
"average die temp",
|
34
|
-
"clock",
|
35
|
-
"ipu power",
|
36
|
-
"ipu utilisation",
|
37
|
-
"ipu utilisation (session)",
|
38
|
-
}
|
39
|
-
|
40
|
-
def __init__(self, pid: int, gc_ipu_info: Optional[Any] = None) -> None:
|
41
|
-
self.samples: Deque[dict] = deque()
|
42
|
-
|
43
|
-
if gc_ipu_info is None:
|
44
|
-
if not gcipuinfo:
|
45
|
-
raise ImportError(
|
46
|
-
"Monitoring IPU stats requires gcipuinfo to be installed"
|
47
|
-
)
|
48
|
-
|
49
|
-
self._gc_ipu_info = gcipuinfo.gcipuinfo()
|
50
|
-
else:
|
51
|
-
self._gc_ipu_info = gc_ipu_info
|
52
|
-
self._gc_ipu_info.setUpdateMode(True)
|
53
|
-
|
54
|
-
self._pid = pid
|
55
|
-
self._devices_called: Set[str] = set()
|
56
|
-
|
57
|
-
@staticmethod
|
58
|
-
def parse_metric(key: str, value: str) -> Optional[Tuple[str, Union[int, float]]]:
|
59
|
-
metric_suffixes = {
|
60
|
-
"temp": "C",
|
61
|
-
"clock": "MHz",
|
62
|
-
"power": "W",
|
63
|
-
"utilisation": "%",
|
64
|
-
"utilisation (session)": "%",
|
65
|
-
"speed": "GT/s",
|
66
|
-
}
|
67
|
-
|
68
|
-
for metric, suffix in metric_suffixes.items():
|
69
|
-
if key.endswith(metric) and value.endswith(suffix):
|
70
|
-
value = value[: -len(suffix)]
|
71
|
-
key = f"{key} ({suffix})"
|
72
|
-
|
73
|
-
try:
|
74
|
-
float_value = float(value)
|
75
|
-
num_value = int(float_value) if float_value.is_integer() else float_value
|
76
|
-
except ValueError:
|
77
|
-
return None
|
78
|
-
|
79
|
-
return key, num_value
|
80
|
-
|
81
|
-
def sample(self) -> None:
|
82
|
-
try:
|
83
|
-
stats = {}
|
84
|
-
devices = self._gc_ipu_info.getDevices()
|
85
|
-
|
86
|
-
for device in devices:
|
87
|
-
device_metrics: Dict[str, str] = dict(device)
|
88
|
-
|
89
|
-
pid = device_metrics.get("user process id")
|
90
|
-
if pid is None or int(pid) != self._pid:
|
91
|
-
continue
|
92
|
-
|
93
|
-
device_id = device_metrics.get("id")
|
94
|
-
initial_call = device_id not in self._devices_called
|
95
|
-
if device_id is not None:
|
96
|
-
self._devices_called.add(device_id)
|
97
|
-
|
98
|
-
for key, value in device_metrics.items():
|
99
|
-
log_metric = initial_call or key in self.variable_metric_keys
|
100
|
-
if not log_metric:
|
101
|
-
continue
|
102
|
-
parsed = self.parse_metric(key, value)
|
103
|
-
if parsed is None:
|
104
|
-
continue
|
105
|
-
parsed_key, parsed_value = parsed
|
106
|
-
stats[self.name.format(device_id, parsed_key)] = parsed_value
|
107
|
-
|
108
|
-
self.samples.append(stats)
|
109
|
-
|
110
|
-
except Exception as e:
|
111
|
-
wandb.termwarn(f"IPU stats error {e}", repeat=False)
|
112
|
-
|
113
|
-
def clear(self) -> None:
|
114
|
-
self.samples.clear()
|
115
|
-
|
116
|
-
def aggregate(self) -> dict:
|
117
|
-
if not self.samples:
|
118
|
-
return {}
|
119
|
-
stats = {}
|
120
|
-
for key in self.samples[0].keys():
|
121
|
-
samples = [s[key] for s in self.samples if key in s]
|
122
|
-
aggregate = aggregate_mean(samples)
|
123
|
-
stats[key] = aggregate
|
124
|
-
return stats
|
125
|
-
|
126
|
-
|
127
|
-
@asset_registry.register
|
128
|
-
class IPU:
|
129
|
-
def __init__(
|
130
|
-
self,
|
131
|
-
interface: "Interface",
|
132
|
-
settings: "SettingsStatic",
|
133
|
-
shutdown_event: threading.Event,
|
134
|
-
) -> None:
|
135
|
-
self.name = self.__class__.__name__.lower()
|
136
|
-
self.metrics: List[Metric] = [
|
137
|
-
IPUStats(settings.x_stats_pid),
|
138
|
-
]
|
139
|
-
self.metrics_monitor = MetricsMonitor(
|
140
|
-
self.name,
|
141
|
-
self.metrics,
|
142
|
-
interface,
|
143
|
-
settings,
|
144
|
-
shutdown_event,
|
145
|
-
)
|
146
|
-
|
147
|
-
@classmethod
|
148
|
-
def is_available(cls) -> bool:
|
149
|
-
return gcipuinfo is not None
|
150
|
-
|
151
|
-
def start(self) -> None:
|
152
|
-
self.metrics_monitor.start()
|
153
|
-
|
154
|
-
def finish(self) -> None:
|
155
|
-
self.metrics_monitor.finish()
|
156
|
-
|
157
|
-
def probe(self) -> dict:
|
158
|
-
device_data = self.metrics[0]._gc_ipu_info.getDevices() # type: ignore
|
159
|
-
device_count = len(device_data)
|
160
|
-
devices = []
|
161
|
-
for i, device in enumerate(device_data):
|
162
|
-
device_metrics: Dict[str, str] = dict(device)
|
163
|
-
devices.append(
|
164
|
-
{
|
165
|
-
"id": device_metrics.get("id") or i,
|
166
|
-
"board ipu index": device_metrics.get("board ipu index"),
|
167
|
-
"board type": device_metrics.get("board type") or "unknown",
|
168
|
-
}
|
169
|
-
)
|
170
|
-
|
171
|
-
return {
|
172
|
-
self.name: {
|
173
|
-
"device_count": device_count,
|
174
|
-
"devices": devices,
|
175
|
-
"vendor": "Graphcore",
|
176
|
-
}
|
177
|
-
}
|
@@ -1,166 +0,0 @@
|
|
1
|
-
import threading
|
2
|
-
from collections import deque
|
3
|
-
from typing import TYPE_CHECKING, List, Optional
|
4
|
-
|
5
|
-
try:
|
6
|
-
import psutil
|
7
|
-
except ImportError:
|
8
|
-
psutil = None
|
9
|
-
|
10
|
-
from .aggregators import aggregate_mean
|
11
|
-
from .asset_registry import asset_registry
|
12
|
-
from .interfaces import Interface, Metric, MetricsMonitor
|
13
|
-
|
14
|
-
if TYPE_CHECKING:
|
15
|
-
from typing import Deque
|
16
|
-
|
17
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
18
|
-
|
19
|
-
|
20
|
-
class ProcessMemoryRSS:
|
21
|
-
"""Memory resident set size (RSS) in MB.
|
22
|
-
|
23
|
-
RSS is the portion of memory occupied by a process that is held in main memory (RAM).
|
24
|
-
"""
|
25
|
-
|
26
|
-
# name = "memory_rss"
|
27
|
-
name = "proc.memory.rssMB"
|
28
|
-
samples: "Deque[float]"
|
29
|
-
|
30
|
-
def __init__(self, pid: int) -> None:
|
31
|
-
self.pid = pid
|
32
|
-
self.process: Optional[psutil.Process] = None
|
33
|
-
self.samples = deque([])
|
34
|
-
|
35
|
-
def sample(self) -> None:
|
36
|
-
if self.process is None:
|
37
|
-
self.process = psutil.Process(self.pid)
|
38
|
-
|
39
|
-
self.samples.append(self.process.memory_info().rss / 1024 / 1024)
|
40
|
-
|
41
|
-
def clear(self) -> None:
|
42
|
-
self.samples.clear()
|
43
|
-
|
44
|
-
def aggregate(self) -> dict:
|
45
|
-
if not self.samples:
|
46
|
-
return {}
|
47
|
-
aggregate = aggregate_mean(self.samples)
|
48
|
-
return {self.name: aggregate}
|
49
|
-
|
50
|
-
|
51
|
-
class ProcessMemoryPercent:
|
52
|
-
"""Process memory usage in percent."""
|
53
|
-
|
54
|
-
# name = "process_memory_percent"
|
55
|
-
name = "proc.memory.percent"
|
56
|
-
samples: "Deque[float]"
|
57
|
-
|
58
|
-
def __init__(self, pid: int) -> None:
|
59
|
-
self.pid = pid
|
60
|
-
self.process: Optional[psutil.Process] = None
|
61
|
-
self.samples = deque([])
|
62
|
-
|
63
|
-
def sample(self) -> None:
|
64
|
-
if self.process is None:
|
65
|
-
self.process = psutil.Process(self.pid)
|
66
|
-
|
67
|
-
self.samples.append(self.process.memory_percent())
|
68
|
-
|
69
|
-
def clear(self) -> None:
|
70
|
-
self.samples.clear()
|
71
|
-
|
72
|
-
def aggregate(self) -> dict:
|
73
|
-
if not self.samples:
|
74
|
-
return {}
|
75
|
-
aggregate = aggregate_mean(self.samples)
|
76
|
-
return {self.name: aggregate}
|
77
|
-
|
78
|
-
|
79
|
-
class MemoryPercent:
|
80
|
-
"""Total system memory usage in percent."""
|
81
|
-
|
82
|
-
# name = "memory_percent"
|
83
|
-
name = "memory"
|
84
|
-
samples: "Deque[float]"
|
85
|
-
|
86
|
-
def __init__(self) -> None:
|
87
|
-
self.samples = deque([])
|
88
|
-
|
89
|
-
def sample(self) -> None:
|
90
|
-
self.samples.append(psutil.virtual_memory().percent)
|
91
|
-
|
92
|
-
def clear(self) -> None:
|
93
|
-
self.samples.clear()
|
94
|
-
|
95
|
-
def aggregate(self) -> dict:
|
96
|
-
if not self.samples:
|
97
|
-
return {}
|
98
|
-
aggregate = aggregate_mean(self.samples)
|
99
|
-
return {self.name: aggregate}
|
100
|
-
|
101
|
-
|
102
|
-
class MemoryAvailable:
|
103
|
-
"""Total system memory available in MB."""
|
104
|
-
|
105
|
-
# name = "memory_available"
|
106
|
-
name = "proc.memory.availableMB"
|
107
|
-
samples: "Deque[float]"
|
108
|
-
|
109
|
-
def __init__(self) -> None:
|
110
|
-
self.samples = deque([])
|
111
|
-
|
112
|
-
def sample(self) -> None:
|
113
|
-
self.samples.append(psutil.virtual_memory().available / 1024 / 1024)
|
114
|
-
|
115
|
-
def clear(self) -> None:
|
116
|
-
self.samples.clear()
|
117
|
-
|
118
|
-
def aggregate(self) -> dict:
|
119
|
-
if not self.samples:
|
120
|
-
return {}
|
121
|
-
aggregate = aggregate_mean(self.samples)
|
122
|
-
return {self.name: aggregate}
|
123
|
-
|
124
|
-
|
125
|
-
@asset_registry.register
|
126
|
-
class Memory:
|
127
|
-
def __init__(
|
128
|
-
self,
|
129
|
-
interface: "Interface",
|
130
|
-
settings: "SettingsStatic",
|
131
|
-
shutdown_event: threading.Event,
|
132
|
-
) -> None:
|
133
|
-
self.name = self.__class__.__name__.lower()
|
134
|
-
self.metrics: List[Metric] = [
|
135
|
-
MemoryAvailable(),
|
136
|
-
MemoryPercent(),
|
137
|
-
ProcessMemoryRSS(settings.x_stats_pid),
|
138
|
-
ProcessMemoryPercent(settings.x_stats_pid),
|
139
|
-
]
|
140
|
-
self.metrics_monitor = MetricsMonitor(
|
141
|
-
self.name,
|
142
|
-
self.metrics,
|
143
|
-
interface,
|
144
|
-
settings,
|
145
|
-
shutdown_event,
|
146
|
-
)
|
147
|
-
|
148
|
-
def start(self) -> None:
|
149
|
-
self.metrics_monitor.start()
|
150
|
-
|
151
|
-
def finish(self) -> None:
|
152
|
-
self.metrics_monitor.finish()
|
153
|
-
|
154
|
-
@classmethod
|
155
|
-
def is_available(cls) -> bool:
|
156
|
-
"""Return True if psutil is importable, so memory metrics can be collected."""
|
157
|
-
return psutil is not None
|
158
|
-
|
159
|
-
def probe(self) -> dict:
|
160
|
-
"""Return a dict of the hardware information."""
|
161
|
-
# total physical memory in bytes, as reported by psutil.virtual_memory().total
|
162
|
-
return {
|
163
|
-
"memory": {
|
164
|
-
"total": psutil.virtual_memory().total,
|
165
|
-
}
|
166
|
-
}
|
@@ -1,125 +0,0 @@
|
|
1
|
-
import threading
|
2
|
-
from collections import deque
|
3
|
-
from typing import TYPE_CHECKING, List
|
4
|
-
|
5
|
-
try:
|
6
|
-
import psutil
|
7
|
-
except ImportError:
|
8
|
-
psutil = None
|
9
|
-
|
10
|
-
from .aggregators import aggregate_mean
|
11
|
-
from .asset_registry import asset_registry
|
12
|
-
from .interfaces import Interface, Metric, MetricsMonitor
|
13
|
-
|
14
|
-
if TYPE_CHECKING:
|
15
|
-
from typing import Deque
|
16
|
-
|
17
|
-
from wandb.sdk.internal.settings_static import SettingsStatic
|
18
|
-
|
19
|
-
|
20
|
-
class NetworkSent:
|
21
|
-
"""Network bytes sent."""
|
22
|
-
|
23
|
-
name = "network.sent"
|
24
|
-
samples: "Deque[float]"
|
25
|
-
|
26
|
-
def __init__(self) -> None:
|
27
|
-
self.samples = deque([])
|
28
|
-
self.sent_init = psutil.net_io_counters().bytes_sent
|
29
|
-
|
30
|
-
def sample(self) -> None:
|
31
|
-
self.samples.append(psutil.net_io_counters().bytes_sent - self.sent_init)
|
32
|
-
|
33
|
-
def clear(self) -> None:
|
34
|
-
self.samples.clear()
|
35
|
-
|
36
|
-
def aggregate(self) -> dict:
|
37
|
-
if not self.samples:
|
38
|
-
return {}
|
39
|
-
aggregate = aggregate_mean(self.samples)
|
40
|
-
# todo: this is an adapter for the legacy metrics system
|
41
|
-
# return {"network": {self.name: aggregate}}
|
42
|
-
return {self.name: aggregate}
|
43
|
-
|
44
|
-
|
45
|
-
class NetworkRecv:
|
46
|
-
"""Network bytes received."""
|
47
|
-
|
48
|
-
name = "network.recv"
|
49
|
-
samples: "Deque[float]"
|
50
|
-
|
51
|
-
def __init__(self) -> None:
|
52
|
-
self.samples = deque([])
|
53
|
-
self.recv_init = psutil.net_io_counters().bytes_recv
|
54
|
-
|
55
|
-
def sample(self) -> None:
|
56
|
-
self.samples.append(psutil.net_io_counters().bytes_recv - self.recv_init)
|
57
|
-
|
58
|
-
def clear(self) -> None:
|
59
|
-
self.samples.clear()
|
60
|
-
|
61
|
-
def aggregate(self) -> dict:
|
62
|
-
if not self.samples:
|
63
|
-
return {}
|
64
|
-
aggregate = aggregate_mean(self.samples)
|
65
|
-
# todo: this is an adapter for the legacy metrics system
|
66
|
-
# return {"network": {self.name: aggregate}}
|
67
|
-
|
68
|
-
return {self.name: aggregate}
|
69
|
-
|
70
|
-
|
71
|
-
@asset_registry.register
|
72
|
-
class Network:
|
73
|
-
def __init__(
|
74
|
-
self,
|
75
|
-
interface: "Interface",
|
76
|
-
settings: "SettingsStatic",
|
77
|
-
shutdown_event: threading.Event,
|
78
|
-
) -> None:
|
79
|
-
self.name = self.__class__.__name__.lower()
|
80
|
-
self.metrics: List[Metric] = [
|
81
|
-
NetworkSent(),
|
82
|
-
NetworkRecv(),
|
83
|
-
]
|
84
|
-
self.metrics_monitor = MetricsMonitor(
|
85
|
-
self.name,
|
86
|
-
self.metrics,
|
87
|
-
interface,
|
88
|
-
settings,
|
89
|
-
shutdown_event,
|
90
|
-
)
|
91
|
-
|
92
|
-
def start(self) -> None:
|
93
|
-
self.metrics_monitor.start()
|
94
|
-
|
95
|
-
def finish(self) -> None:
|
96
|
-
self.metrics_monitor.finish()
|
97
|
-
|
98
|
-
@classmethod
|
99
|
-
def is_available(cls) -> bool:
|
100
|
-
"""Return True if psutil is importable, so network metrics can be collected."""
|
101
|
-
return psutil is not None
|
102
|
-
|
103
|
-
def probe(self) -> dict:
|
104
|
-
"""Return a dict of the hardware information (currently always empty)."""
|
105
|
-
# net_if_addrs = psutil.net_if_addrs()
|
106
|
-
|
107
|
-
# return {
|
108
|
-
# self.name: {
|
109
|
-
# "interfaces": {
|
110
|
-
# k: {
|
111
|
-
# "addresses": [
|
112
|
-
# {
|
113
|
-
# "address": v.address,
|
114
|
-
# "netmask": v.netmask,
|
115
|
-
# "broadcast": v.broadcast,
|
116
|
-
# "ptp": v.ptp,
|
117
|
-
# }
|
118
|
-
# for v in v
|
119
|
-
# ]
|
120
|
-
# }
|
121
|
-
# for k, v in net_if_addrs.items()
|
122
|
-
# }
|
123
|
-
# }
|
124
|
-
# }
|
125
|
-
return {}
|