wandb 0.13.10__py3-none-any.whl → 0.14.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- wandb/__init__.py +2 -3
- wandb/apis/__init__.py +1 -3
- wandb/apis/importers/__init__.py +4 -0
- wandb/apis/importers/base.py +312 -0
- wandb/apis/importers/mlflow.py +113 -0
- wandb/apis/internal.py +29 -2
- wandb/apis/normalize.py +6 -5
- wandb/apis/public.py +163 -180
- wandb/apis/reports/_templates.py +6 -12
- wandb/apis/reports/report.py +1 -1
- wandb/apis/reports/runset.py +1 -3
- wandb/apis/reports/util.py +12 -10
- wandb/beta/workflows.py +57 -34
- wandb/catboost/__init__.py +1 -2
- wandb/cli/cli.py +215 -133
- wandb/data_types.py +63 -56
- wandb/docker/__init__.py +78 -16
- wandb/docker/auth.py +21 -22
- wandb/env.py +0 -1
- wandb/errors/__init__.py +8 -116
- wandb/errors/term.py +1 -1
- wandb/fastai/__init__.py +1 -2
- wandb/filesync/dir_watcher.py +8 -5
- wandb/filesync/step_prepare.py +76 -75
- wandb/filesync/step_upload.py +1 -2
- wandb/integration/catboost/__init__.py +1 -3
- wandb/integration/catboost/catboost.py +8 -14
- wandb/integration/fastai/__init__.py +7 -13
- wandb/integration/gym/__init__.py +35 -4
- wandb/integration/keras/__init__.py +3 -3
- wandb/integration/keras/callbacks/metrics_logger.py +9 -8
- wandb/integration/keras/callbacks/model_checkpoint.py +9 -9
- wandb/integration/keras/callbacks/tables_builder.py +31 -19
- wandb/integration/kfp/kfp_patch.py +20 -17
- wandb/integration/kfp/wandb_logging.py +1 -2
- wandb/integration/lightgbm/__init__.py +21 -19
- wandb/integration/prodigy/prodigy.py +6 -7
- wandb/integration/sacred/__init__.py +9 -12
- wandb/integration/sagemaker/__init__.py +1 -3
- wandb/integration/sagemaker/auth.py +0 -1
- wandb/integration/sagemaker/config.py +1 -1
- wandb/integration/sagemaker/resources.py +1 -1
- wandb/integration/sb3/sb3.py +8 -4
- wandb/integration/tensorboard/__init__.py +1 -3
- wandb/integration/tensorboard/log.py +8 -8
- wandb/integration/tensorboard/monkeypatch.py +11 -9
- wandb/integration/tensorflow/__init__.py +1 -3
- wandb/integration/xgboost/__init__.py +4 -6
- wandb/integration/yolov8/__init__.py +7 -0
- wandb/integration/yolov8/yolov8.py +250 -0
- wandb/jupyter.py +31 -35
- wandb/lightgbm/__init__.py +1 -2
- wandb/old/settings.py +2 -2
- wandb/plot/bar.py +1 -2
- wandb/plot/confusion_matrix.py +1 -3
- wandb/plot/histogram.py +1 -2
- wandb/plot/line.py +1 -2
- wandb/plot/line_series.py +4 -4
- wandb/plot/pr_curve.py +17 -20
- wandb/plot/roc_curve.py +1 -3
- wandb/plot/scatter.py +1 -2
- wandb/proto/v3/wandb_server_pb2.py +85 -39
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_server_pb2.py +51 -39
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/sdk/__init__.py +1 -3
- wandb/sdk/backend/backend.py +1 -1
- wandb/sdk/data_types/_dtypes.py +38 -30
- wandb/sdk/data_types/base_types/json_metadata.py +1 -3
- wandb/sdk/data_types/base_types/media.py +17 -17
- wandb/sdk/data_types/base_types/wb_value.py +33 -26
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +91 -125
- wandb/sdk/data_types/helper_types/classes.py +1 -1
- wandb/sdk/data_types/helper_types/image_mask.py +12 -12
- wandb/sdk/data_types/histogram.py +5 -4
- wandb/sdk/data_types/html.py +1 -2
- wandb/sdk/data_types/image.py +11 -11
- wandb/sdk/data_types/molecule.py +3 -6
- wandb/sdk/data_types/object_3d.py +1 -2
- wandb/sdk/data_types/plotly.py +1 -2
- wandb/sdk/data_types/saved_model.py +10 -8
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/data_logging.py +5 -5
- wandb/sdk/interface/artifacts.py +288 -266
- wandb/sdk/interface/interface.py +2 -3
- wandb/sdk/interface/interface_grpc.py +1 -1
- wandb/sdk/interface/interface_queue.py +1 -1
- wandb/sdk/interface/interface_relay.py +1 -1
- wandb/sdk/interface/interface_shared.py +1 -2
- wandb/sdk/interface/interface_sock.py +1 -1
- wandb/sdk/interface/message_future.py +1 -1
- wandb/sdk/interface/message_future_poll.py +1 -1
- wandb/sdk/interface/router.py +1 -1
- wandb/sdk/interface/router_queue.py +1 -1
- wandb/sdk/interface/router_relay.py +1 -1
- wandb/sdk/interface/router_sock.py +1 -1
- wandb/sdk/interface/summary_record.py +1 -1
- wandb/sdk/internal/artifacts.py +1 -1
- wandb/sdk/internal/datastore.py +2 -3
- wandb/sdk/internal/file_pusher.py +5 -3
- wandb/sdk/internal/file_stream.py +22 -19
- wandb/sdk/internal/handler.py +5 -4
- wandb/sdk/internal/internal.py +1 -1
- wandb/sdk/internal/internal_api.py +115 -55
- wandb/sdk/internal/job_builder.py +1 -3
- wandb/sdk/internal/profiler.py +1 -1
- wandb/sdk/internal/progress.py +4 -6
- wandb/sdk/internal/sample.py +1 -3
- wandb/sdk/internal/sender.py +28 -16
- wandb/sdk/internal/settings_static.py +5 -5
- wandb/sdk/internal/system/assets/__init__.py +1 -0
- wandb/sdk/internal/system/assets/cpu.py +3 -9
- wandb/sdk/internal/system/assets/disk.py +2 -4
- wandb/sdk/internal/system/assets/gpu.py +6 -18
- wandb/sdk/internal/system/assets/gpu_apple.py +2 -4
- wandb/sdk/internal/system/assets/interfaces.py +50 -22
- wandb/sdk/internal/system/assets/ipu.py +1 -3
- wandb/sdk/internal/system/assets/memory.py +7 -13
- wandb/sdk/internal/system/assets/network.py +4 -8
- wandb/sdk/internal/system/assets/open_metrics.py +283 -0
- wandb/sdk/internal/system/assets/tpu.py +1 -4
- wandb/sdk/internal/system/assets/trainium.py +26 -14
- wandb/sdk/internal/system/system_info.py +2 -3
- wandb/sdk/internal/system/system_monitor.py +52 -20
- wandb/sdk/internal/tb_watcher.py +12 -13
- wandb/sdk/launch/_project_spec.py +54 -65
- wandb/sdk/launch/agent/agent.py +374 -90
- wandb/sdk/launch/builder/abstract.py +61 -7
- wandb/sdk/launch/builder/build.py +81 -110
- wandb/sdk/launch/builder/docker_builder.py +181 -0
- wandb/sdk/launch/builder/kaniko_builder.py +419 -0
- wandb/sdk/launch/builder/noop.py +31 -12
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +70 -20
- wandb/sdk/launch/environment/abstract.py +28 -0
- wandb/sdk/launch/environment/aws_environment.py +276 -0
- wandb/sdk/launch/environment/gcp_environment.py +271 -0
- wandb/sdk/launch/environment/local_environment.py +65 -0
- wandb/sdk/launch/github_reference.py +3 -8
- wandb/sdk/launch/launch.py +38 -29
- wandb/sdk/launch/launch_add.py +6 -8
- wandb/sdk/launch/loader.py +230 -0
- wandb/sdk/launch/registry/abstract.py +54 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +163 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +203 -0
- wandb/sdk/launch/registry/local_registry.py +62 -0
- wandb/sdk/launch/runner/abstract.py +1 -16
- wandb/sdk/launch/runner/{kubernetes.py → kubernetes_runner.py} +83 -95
- wandb/sdk/launch/runner/local_container.py +46 -22
- wandb/sdk/launch/runner/local_process.py +1 -4
- wandb/sdk/launch/runner/{aws.py → sagemaker_runner.py} +53 -212
- wandb/sdk/launch/runner/{gcp_vertex.py → vertex_runner.py} +38 -55
- wandb/sdk/launch/sweeps/__init__.py +3 -2
- wandb/sdk/launch/sweeps/scheduler.py +132 -39
- wandb/sdk/launch/sweeps/scheduler_sweep.py +80 -89
- wandb/sdk/launch/utils.py +101 -30
- wandb/sdk/launch/wandb_reference.py +2 -7
- wandb/sdk/lib/_settings_toposort_generate.py +166 -0
- wandb/sdk/lib/_settings_toposort_generated.py +201 -0
- wandb/sdk/lib/apikey.py +2 -4
- wandb/sdk/lib/config_util.py +4 -1
- wandb/sdk/lib/console.py +1 -3
- wandb/sdk/lib/deprecate.py +3 -3
- wandb/sdk/lib/file_stream_utils.py +7 -5
- wandb/sdk/lib/filenames.py +1 -1
- wandb/sdk/lib/filesystem.py +61 -5
- wandb/sdk/lib/git.py +1 -3
- wandb/sdk/lib/import_hooks.py +4 -7
- wandb/sdk/lib/ipython.py +8 -5
- wandb/sdk/lib/lazyloader.py +1 -3
- wandb/sdk/lib/mailbox.py +14 -4
- wandb/sdk/lib/proto_util.py +10 -5
- wandb/sdk/lib/redirect.py +15 -22
- wandb/sdk/lib/reporting.py +1 -3
- wandb/sdk/lib/retry.py +4 -5
- wandb/sdk/lib/runid.py +1 -3
- wandb/sdk/lib/server.py +15 -9
- wandb/sdk/lib/sock_client.py +1 -1
- wandb/sdk/lib/sparkline.py +1 -1
- wandb/sdk/lib/wburls.py +1 -1
- wandb/sdk/service/port_file.py +1 -2
- wandb/sdk/service/service.py +36 -13
- wandb/sdk/service/service_base.py +12 -1
- wandb/sdk/verify/verify.py +5 -7
- wandb/sdk/wandb_artifacts.py +142 -177
- wandb/sdk/wandb_config.py +5 -8
- wandb/sdk/wandb_helper.py +1 -1
- wandb/sdk/wandb_init.py +24 -13
- wandb/sdk/wandb_login.py +9 -9
- wandb/sdk/wandb_manager.py +39 -4
- wandb/sdk/wandb_metric.py +2 -6
- wandb/sdk/wandb_require.py +4 -15
- wandb/sdk/wandb_require_helpers.py +1 -9
- wandb/sdk/wandb_run.py +95 -141
- wandb/sdk/wandb_save.py +1 -3
- wandb/sdk/wandb_settings.py +149 -54
- wandb/sdk/wandb_setup.py +66 -46
- wandb/sdk/wandb_summary.py +13 -10
- wandb/sdk/wandb_sweep.py +6 -7
- wandb/sdk/wandb_watch.py +1 -1
- wandb/sklearn/calculate/confusion_matrix.py +1 -1
- wandb/sklearn/calculate/learning_curve.py +1 -1
- wandb/sklearn/calculate/summary_metrics.py +1 -3
- wandb/sklearn/plot/__init__.py +1 -1
- wandb/sklearn/plot/classifier.py +27 -18
- wandb/sklearn/plot/clusterer.py +4 -5
- wandb/sklearn/plot/regressor.py +4 -4
- wandb/sklearn/plot/shared.py +2 -2
- wandb/sync/__init__.py +1 -3
- wandb/sync/sync.py +4 -5
- wandb/testing/relay.py +11 -10
- wandb/trigger.py +1 -1
- wandb/util.py +106 -81
- wandb/viz.py +4 -4
- wandb/wandb_agent.py +50 -50
- wandb/wandb_controller.py +2 -3
- wandb/wandb_run.py +1 -2
- wandb/wandb_torch.py +1 -1
- wandb/xgboost/__init__.py +1 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/METADATA +6 -2
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/RECORD +224 -209
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/WHEEL +1 -1
- wandb/sdk/launch/builder/docker.py +0 -80
- wandb/sdk/launch/builder/kaniko.py +0 -393
- wandb/sdk/launch/builder/loader.py +0 -32
- wandb/sdk/launch/runner/loader.py +0 -50
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/LICENSE +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.13.10.dist-info → wandb-0.14.0.dist-info}/top_level.txt +0 -0
@@ -18,9 +18,7 @@ if TYPE_CHECKING:
|
|
18
18
|
|
19
19
|
|
20
20
|
class DiskUsage:
|
21
|
-
"""
|
22
|
-
Total system disk usage in percent.
|
23
|
-
"""
|
21
|
+
"""Total system disk usage in percent."""
|
24
22
|
|
25
23
|
# name = "disk_usage"
|
26
24
|
name = "disk"
|
@@ -62,7 +60,7 @@ class Disk:
|
|
62
60
|
|
63
61
|
@classmethod
|
64
62
|
def is_available(cls) -> bool:
|
65
|
-
"""Return a new instance of the CPU metrics"""
|
63
|
+
"""Return a new instance of the CPU metrics."""
|
66
64
|
return psutil is not None
|
67
65
|
|
68
66
|
def probe(self) -> dict:
|
@@ -55,9 +55,7 @@ def gpu_in_use_by_this_process(gpu_handle: "GPUHandle", pid: int) -> bool:
|
|
55
55
|
|
56
56
|
|
57
57
|
class GPUMemoryUtilization:
|
58
|
-
"""
|
59
|
-
GPU memory utilization in percent for each GPU.
|
60
|
-
"""
|
58
|
+
"""GPU memory utilization in percent for each GPU."""
|
61
59
|
|
62
60
|
# name = "memory_utilization"
|
63
61
|
name = "gpu.{}.memory"
|
@@ -99,9 +97,7 @@ class GPUMemoryUtilization:
|
|
99
97
|
|
100
98
|
|
101
99
|
class GPUMemoryAllocated:
|
102
|
-
"""
|
103
|
-
GPU memory allocated in percent for each GPU.
|
104
|
-
"""
|
100
|
+
"""GPU memory allocated in percent for each GPU."""
|
105
101
|
|
106
102
|
# name = "memory_allocated"
|
107
103
|
name = "gpu.{}.memoryAllocated"
|
@@ -142,9 +138,7 @@ class GPUMemoryAllocated:
|
|
142
138
|
|
143
139
|
|
144
140
|
class GPUUtilization:
|
145
|
-
"""
|
146
|
-
GPU utilization in percent for each GPU.
|
147
|
-
"""
|
141
|
+
"""GPU utilization in percent for each GPU."""
|
148
142
|
|
149
143
|
# name = "gpu_utilization"
|
150
144
|
name = "gpu.{}.gpu"
|
@@ -186,9 +180,7 @@ class GPUUtilization:
|
|
186
180
|
|
187
181
|
|
188
182
|
class GPUTemperature:
|
189
|
-
"""
|
190
|
-
GPU temperature in Celsius for each GPU.
|
191
|
-
"""
|
183
|
+
"""GPU temperature in Celsius for each GPU."""
|
192
184
|
|
193
185
|
# name = "gpu_temperature"
|
194
186
|
name = "gpu.{}.temp"
|
@@ -233,9 +225,7 @@ class GPUTemperature:
|
|
233
225
|
|
234
226
|
|
235
227
|
class GPUPowerUsageWatts:
|
236
|
-
"""
|
237
|
-
GPU power usage in Watts for each GPU.
|
238
|
-
"""
|
228
|
+
"""GPU power usage in Watts for each GPU."""
|
239
229
|
|
240
230
|
name = "gpu.{}.powerWatts"
|
241
231
|
# samples: Deque[Tuple[datetime.datetime, float]]
|
@@ -273,9 +263,7 @@ class GPUPowerUsageWatts:
|
|
273
263
|
|
274
264
|
|
275
265
|
class GPUPowerUsagePercent:
|
276
|
-
"""
|
277
|
-
GPU power usage in percent for each GPU.
|
278
|
-
"""
|
266
|
+
"""GPU power usage in percent for each GPU."""
|
279
267
|
|
280
268
|
name = "gpu.{}.powerPercent"
|
281
269
|
# samples: Deque[Tuple[datetime.datetime, float]]
|
@@ -34,13 +34,11 @@ class _Stats(TypedDict):
|
|
34
34
|
temp: float
|
35
35
|
powerWatts: float # noqa: N815
|
36
36
|
powerPercent: float # noqa: N815
|
37
|
-
# cpuWaitMs: float
|
37
|
+
# cpuWaitMs: float
|
38
38
|
|
39
39
|
|
40
40
|
class GPUAppleStats:
|
41
|
-
"""
|
42
|
-
Apple GPU stats available on Arm Macs.
|
43
|
-
"""
|
41
|
+
"""Apple GPU stats available on Arm Macs."""
|
44
42
|
|
45
43
|
name = "gpu.0.{}"
|
46
44
|
samples: "Deque[_Stats]"
|
@@ -26,29 +26,43 @@ logger = logging.getLogger(__name__)
|
|
26
26
|
|
27
27
|
|
28
28
|
class Metric(Protocol):
|
29
|
-
"""
|
30
|
-
Base protocol for individual metrics
|
31
|
-
"""
|
29
|
+
"""Base protocol for individual metrics."""
|
32
30
|
|
33
31
|
name: str
|
34
32
|
# samples: Sequence[Tuple[TimeStamp, Sample]]
|
35
33
|
samples: "Deque[Any]"
|
36
34
|
|
37
35
|
def sample(self) -> None:
|
36
|
+
"""Sample the metric."""
|
38
37
|
... # pragma: no cover
|
39
38
|
|
40
39
|
def clear(self) -> None:
|
40
|
+
"""Clear the samples."""
|
41
41
|
... # pragma: no cover
|
42
42
|
|
43
43
|
def aggregate(self) -> dict:
|
44
|
+
"""Aggregate the samples."""
|
45
|
+
... # pragma: no cover
|
46
|
+
|
47
|
+
|
48
|
+
@runtime_checkable
|
49
|
+
class SetupTeardown(Protocol):
|
50
|
+
"""Protocol for classes that require setup and teardown."""
|
51
|
+
|
52
|
+
def setup(self) -> None:
|
53
|
+
"""Extra setup required for the metric beyond __init__."""
|
54
|
+
... # pragma: no cover
|
55
|
+
|
56
|
+
def teardown(self) -> None:
|
57
|
+
"""Extra teardown required for the metric."""
|
44
58
|
... # pragma: no cover
|
45
59
|
|
46
60
|
|
47
61
|
@runtime_checkable
|
48
62
|
class Asset(Protocol):
|
49
|
-
"""
|
50
|
-
|
51
|
-
|
63
|
+
"""Base protocol encapsulate everything relating to an "Asset".
|
64
|
+
|
65
|
+
An asset can be CPU, GPU, TPU, Network, I/O etc.
|
52
66
|
"""
|
53
67
|
|
54
68
|
name: str
|
@@ -60,19 +74,19 @@ class Asset(Protocol):
|
|
60
74
|
|
61
75
|
@classmethod
|
62
76
|
def is_available(cls) -> bool:
|
63
|
-
"""Check if the resource is available"""
|
77
|
+
"""Check if the resource is available."""
|
64
78
|
... # pragma: no cover
|
65
79
|
|
66
80
|
def start(self) -> None:
|
67
|
-
"""Start monitoring the resource"""
|
81
|
+
"""Start monitoring the resource."""
|
68
82
|
... # pragma: no cover
|
69
83
|
|
70
84
|
def finish(self) -> None:
|
71
|
-
"""
|
85
|
+
"""Finish monitoring the resource."""
|
72
86
|
... # pragma: no cover
|
73
87
|
|
74
88
|
def probe(self) -> dict:
|
75
|
-
"""Get static information about the resource"""
|
89
|
+
"""Get static information about the resource."""
|
76
90
|
... # pragma: no cover
|
77
91
|
|
78
92
|
|
@@ -88,9 +102,7 @@ class Interface(Protocol):
|
|
88
102
|
|
89
103
|
|
90
104
|
class MetricsMonitor:
|
91
|
-
"""
|
92
|
-
Takes care of collecting, sampling, serializing, and publishing a set of metrics.
|
93
|
-
"""
|
105
|
+
"""Takes care of collecting, sampling, serializing, and publishing a set of metrics."""
|
94
106
|
|
95
107
|
def __init__(
|
96
108
|
self,
|
@@ -119,7 +131,7 @@ class MetricsMonitor:
|
|
119
131
|
)
|
120
132
|
|
121
133
|
def monitor(self) -> None:
|
122
|
-
"""Poll the Asset metrics"""
|
134
|
+
"""Poll the Asset metrics."""
|
123
135
|
while not self._shutdown_event.is_set():
|
124
136
|
for _ in range(self.samples_to_aggregate):
|
125
137
|
for metric in self.metrics:
|
@@ -133,7 +145,7 @@ class MetricsMonitor:
|
|
133
145
|
self.publish()
|
134
146
|
|
135
147
|
def aggregate(self) -> dict:
|
136
|
-
"""Return a dict of metrics"""
|
148
|
+
"""Return a dict of metrics."""
|
137
149
|
aggregated_metrics = {}
|
138
150
|
for metric in self.metrics:
|
139
151
|
try:
|
@@ -147,7 +159,7 @@ class MetricsMonitor:
|
|
147
159
|
return aggregated_metrics
|
148
160
|
|
149
161
|
def publish(self) -> None:
|
150
|
-
"""Publish the Asset metrics"""
|
162
|
+
"""Publish the Asset metrics."""
|
151
163
|
try:
|
152
164
|
aggregated_metrics = self.aggregate()
|
153
165
|
if aggregated_metrics:
|
@@ -158,21 +170,37 @@ class MetricsMonitor:
|
|
158
170
|
logger.error(f"Failed to publish metrics: {e}")
|
159
171
|
|
160
172
|
def start(self) -> None:
|
161
|
-
if self._process is None
|
173
|
+
if (self._process is not None) or self._shutdown_event.is_set():
|
174
|
+
return None
|
175
|
+
|
176
|
+
thread_name = f"{self.asset_name[:15]}" # thread names are limited to 15 chars
|
177
|
+
try:
|
178
|
+
for metric in self.metrics:
|
179
|
+
if isinstance(metric, SetupTeardown):
|
180
|
+
metric.setup()
|
162
181
|
self._process = threading.Thread(
|
163
182
|
target=self.monitor,
|
164
183
|
daemon=True,
|
165
|
-
name=
|
184
|
+
name=thread_name,
|
166
185
|
)
|
167
186
|
self._process.start()
|
168
|
-
logger.info(f"Started {
|
187
|
+
logger.info(f"Started {thread_name} monitoring")
|
188
|
+
except Exception as e:
|
189
|
+
logger.warning(f"Failed to start {thread_name} monitoring: {e}")
|
190
|
+
self._process = None
|
169
191
|
|
170
192
|
def finish(self) -> None:
|
171
193
|
if self._process is None:
|
172
194
|
return None
|
195
|
+
|
196
|
+
thread_name = f"{self.asset_name[:15]}"
|
173
197
|
try:
|
174
198
|
self._process.join()
|
175
|
-
logger.info(f"Joined {
|
199
|
+
logger.info(f"Joined {thread_name} monitor")
|
200
|
+
for metric in self.metrics:
|
201
|
+
if isinstance(metric, SetupTeardown):
|
202
|
+
metric.teardown()
|
176
203
|
except Exception as e:
|
177
|
-
logger.warning(f"Failed to
|
178
|
-
|
204
|
+
logger.warning(f"Failed to finish {thread_name} monitoring: {e}")
|
205
|
+
finally:
|
206
|
+
self._process = None
|
@@ -18,8 +18,8 @@ if TYPE_CHECKING:
|
|
18
18
|
|
19
19
|
|
20
20
|
class ProcessMemoryRSS:
|
21
|
-
"""
|
22
|
-
|
21
|
+
"""Memory resident set size (RSS) in MB.
|
22
|
+
|
23
23
|
RSS is the portion of memory occupied by a process that is held in main memory (RAM).
|
24
24
|
"""
|
25
25
|
|
@@ -49,9 +49,7 @@ class ProcessMemoryRSS:
|
|
49
49
|
|
50
50
|
|
51
51
|
class ProcessMemoryPercent:
|
52
|
-
"""
|
53
|
-
Process memory usage in percent.
|
54
|
-
"""
|
52
|
+
"""Process memory usage in percent."""
|
55
53
|
|
56
54
|
# name = "process_memory_percent"
|
57
55
|
name = "proc.memory.percent"
|
@@ -79,9 +77,7 @@ class ProcessMemoryPercent:
|
|
79
77
|
|
80
78
|
|
81
79
|
class MemoryPercent:
|
82
|
-
"""
|
83
|
-
Total system memory usage in percent.
|
84
|
-
"""
|
80
|
+
"""Total system memory usage in percent."""
|
85
81
|
|
86
82
|
# name = "memory_percent"
|
87
83
|
name = "memory"
|
@@ -104,9 +100,7 @@ class MemoryPercent:
|
|
104
100
|
|
105
101
|
|
106
102
|
class MemoryAvailable:
|
107
|
-
"""
|
108
|
-
Total system memory available in MB.
|
109
|
-
"""
|
103
|
+
"""Total system memory available in MB."""
|
110
104
|
|
111
105
|
# name = "memory_available"
|
112
106
|
name = "proc.memory.availableMB"
|
@@ -159,11 +153,11 @@ class Memory:
|
|
159
153
|
|
160
154
|
@classmethod
|
161
155
|
def is_available(cls) -> bool:
|
162
|
-
"""Return a new instance of the CPU metrics"""
|
156
|
+
"""Return a new instance of the CPU metrics."""
|
163
157
|
return psutil is not None
|
164
158
|
|
165
159
|
def probe(self) -> dict:
|
166
|
-
"""Return a dict of the hardware information"""
|
160
|
+
"""Return a dict of the hardware information."""
|
167
161
|
# total available memory in gigabytes
|
168
162
|
return {
|
169
163
|
"memory": {
|
@@ -18,9 +18,7 @@ if TYPE_CHECKING:
|
|
18
18
|
|
19
19
|
|
20
20
|
class NetworkSent:
|
21
|
-
"""
|
22
|
-
Network bytes sent.
|
23
|
-
"""
|
21
|
+
"""Network bytes sent."""
|
24
22
|
|
25
23
|
name = "network.sent"
|
26
24
|
samples: "Deque[float]"
|
@@ -45,9 +43,7 @@ class NetworkSent:
|
|
45
43
|
|
46
44
|
|
47
45
|
class NetworkRecv:
|
48
|
-
"""
|
49
|
-
Network bytes received.
|
50
|
-
"""
|
46
|
+
"""Network bytes received."""
|
51
47
|
|
52
48
|
name = "network.recv"
|
53
49
|
samples: "Deque[float]"
|
@@ -101,11 +97,11 @@ class Network:
|
|
101
97
|
|
102
98
|
@classmethod
|
103
99
|
def is_available(cls) -> bool:
|
104
|
-
"""Return a new instance of the CPU metrics"""
|
100
|
+
"""Return a new instance of the CPU metrics."""
|
105
101
|
return psutil is not None
|
106
102
|
|
107
103
|
def probe(self) -> dict:
|
108
|
-
"""Return a dict of the hardware information"""
|
104
|
+
"""Return a dict of the hardware information."""
|
109
105
|
# net_if_addrs = psutil.net_if_addrs()
|
110
106
|
|
111
107
|
# return {
|
@@ -0,0 +1,283 @@
|
|
1
|
+
import logging
|
2
|
+
import multiprocessing as mp
|
3
|
+
import re
|
4
|
+
import sys
|
5
|
+
from collections import defaultdict, deque
|
6
|
+
from functools import lru_cache
|
7
|
+
from hashlib import md5
|
8
|
+
from types import ModuleType
|
9
|
+
from typing import TYPE_CHECKING, Dict, List, Mapping, Tuple, Union
|
10
|
+
|
11
|
+
if sys.version_info >= (3, 8):
|
12
|
+
from typing import Final
|
13
|
+
else:
|
14
|
+
from typing_extensions import Final
|
15
|
+
|
16
|
+
import requests
|
17
|
+
import requests.adapters
|
18
|
+
import urllib3
|
19
|
+
|
20
|
+
import wandb
|
21
|
+
from wandb.sdk.lib import telemetry
|
22
|
+
|
23
|
+
from .aggregators import aggregate_last, aggregate_mean
|
24
|
+
from .interfaces import Interface, Metric, MetricsMonitor
|
25
|
+
|
26
|
+
if TYPE_CHECKING:
|
27
|
+
from typing import Deque, Optional
|
28
|
+
|
29
|
+
from wandb.sdk.internal.settings_static import SettingsStatic
|
30
|
+
|
31
|
+
|
32
|
+
_PREFIX: Final[str] = "openmetrics"
|
33
|
+
|
34
|
+
_REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry(
|
35
|
+
backoff_factor=1,
|
36
|
+
total=3,
|
37
|
+
status_forcelist=(408, 409, 429, 500, 502, 503, 504),
|
38
|
+
)
|
39
|
+
_REQUEST_POOL_CONNECTIONS = 4
|
40
|
+
_REQUEST_POOL_MAXSIZE = 4
|
41
|
+
_REQUEST_TIMEOUT = 3
|
42
|
+
|
43
|
+
|
44
|
+
logger = logging.getLogger(__name__)
|
45
|
+
|
46
|
+
|
47
|
+
prometheus_client_parser: "Optional[ModuleType]" = None
|
48
|
+
try:
|
49
|
+
import prometheus_client.parser # type: ignore
|
50
|
+
|
51
|
+
prometheus_client_parser = prometheus_client.parser
|
52
|
+
except ImportError:
|
53
|
+
pass
|
54
|
+
|
55
|
+
|
56
|
+
def _setup_requests_session() -> requests.Session:
|
57
|
+
session = requests.Session()
|
58
|
+
adapter = requests.adapters.HTTPAdapter(
|
59
|
+
max_retries=_REQUEST_RETRY_STRATEGY,
|
60
|
+
pool_connections=_REQUEST_POOL_CONNECTIONS,
|
61
|
+
pool_maxsize=_REQUEST_POOL_MAXSIZE,
|
62
|
+
)
|
63
|
+
session.mount("http://", adapter)
|
64
|
+
session.mount("https://", adapter)
|
65
|
+
return session
|
66
|
+
|
67
|
+
|
68
|
+
def _nested_dict_to_tuple(
|
69
|
+
nested_dict: Mapping[str, Mapping[str, str]]
|
70
|
+
) -> Tuple[Tuple[str, Tuple[str, str]], ...]:
|
71
|
+
return tuple((k, *v.items()) for k, v in nested_dict.items()) # type: ignore
|
72
|
+
|
73
|
+
|
74
|
+
def _tuple_to_nested_dict(
|
75
|
+
nested_tuple: Tuple[Tuple[str, Tuple[str, str]], ...]
|
76
|
+
) -> Dict[str, Dict[str, str]]:
|
77
|
+
return {k: dict(v) for k, *v in nested_tuple}
|
78
|
+
|
79
|
+
|
80
|
+
@lru_cache(maxsize=128)
|
81
|
+
def _should_capture_metric(
|
82
|
+
metric_name: str,
|
83
|
+
metric_labels: Tuple[str, ...],
|
84
|
+
filters: Tuple[Tuple[str, Tuple[str, str]], ...],
|
85
|
+
) -> bool:
|
86
|
+
# we use tuples to make the function arguments hashable => usable with lru_cache
|
87
|
+
should_capture = False
|
88
|
+
|
89
|
+
if not filters:
|
90
|
+
return should_capture
|
91
|
+
|
92
|
+
# self.filters keys are regexes, check the name against them
|
93
|
+
# and for the first match, check the labels against the label filters.
|
94
|
+
# assume that if at least one label filter doesn't match, the metric
|
95
|
+
# should not be captured.
|
96
|
+
# it's up to the user to make sure that the filters are not conflicting etc.
|
97
|
+
metric_labels_dict = {t[0]: t[1] for t in metric_labels}
|
98
|
+
filters_dict = _tuple_to_nested_dict(filters)
|
99
|
+
for metric_name_regex, label_filters in filters_dict.items():
|
100
|
+
if not re.match(metric_name_regex, metric_name):
|
101
|
+
continue
|
102
|
+
|
103
|
+
should_capture = True
|
104
|
+
|
105
|
+
for label, label_filter in label_filters.items():
|
106
|
+
if not re.match(label_filter, metric_labels_dict.get(label, "")):
|
107
|
+
should_capture = False
|
108
|
+
break
|
109
|
+
break
|
110
|
+
|
111
|
+
return should_capture
|
112
|
+
|
113
|
+
|
114
|
+
class OpenMetricsMetric:
|
115
|
+
"""Container for all the COUNTER and GAUGE metrics extracted from an OpenMetrics endpoint."""
|
116
|
+
|
117
|
+
def __init__(
|
118
|
+
self, name: str, url: str, filters: Mapping[str, Mapping[str, str]]
|
119
|
+
) -> None:
|
120
|
+
self.name = name
|
121
|
+
self.url = url
|
122
|
+
self.filters = filters
|
123
|
+
self.filters_tuple = _nested_dict_to_tuple(filters)
|
124
|
+
self._session: Optional["requests.Session"] = None
|
125
|
+
self.samples: "Deque[dict]" = deque([])
|
126
|
+
# {"<metric name>": {"<labels hash>": <index>}}
|
127
|
+
self.label_map: "Dict[str, Dict[str, int]]" = defaultdict(dict)
|
128
|
+
# {"<labels hash>": <labels>}
|
129
|
+
self.label_hashes: "Dict[str, dict]" = {}
|
130
|
+
|
131
|
+
def setup(self) -> None:
|
132
|
+
if self._session is not None:
|
133
|
+
return
|
134
|
+
|
135
|
+
self._session = _setup_requests_session()
|
136
|
+
|
137
|
+
def teardown(self) -> None:
|
138
|
+
if self._session is None:
|
139
|
+
return
|
140
|
+
|
141
|
+
self._session.close()
|
142
|
+
self._session = None
|
143
|
+
|
144
|
+
def parse_open_metrics_endpoint(self) -> Dict[str, Union[str, int, float]]:
|
145
|
+
assert prometheus_client_parser is not None
|
146
|
+
assert self._session is not None
|
147
|
+
|
148
|
+
response = self._session.get(self.url, timeout=_REQUEST_TIMEOUT)
|
149
|
+
response.raise_for_status()
|
150
|
+
|
151
|
+
text = response.text
|
152
|
+
measurement = {}
|
153
|
+
for family in prometheus_client_parser.text_string_to_metric_families(text):
|
154
|
+
if family.type not in ("counter", "gauge"):
|
155
|
+
# todo: add support for other metric types?
|
156
|
+
# todo: log warning about that?
|
157
|
+
continue
|
158
|
+
for sample in family.samples:
|
159
|
+
name, labels, value = sample.name, sample.labels, sample.value
|
160
|
+
|
161
|
+
if not _should_capture_metric(
|
162
|
+
name,
|
163
|
+
tuple(labels.items()),
|
164
|
+
self.filters_tuple,
|
165
|
+
):
|
166
|
+
continue
|
167
|
+
|
168
|
+
# md5 hash of the labels
|
169
|
+
label_hash = md5(str(labels).encode("utf-8")).hexdigest()
|
170
|
+
if label_hash not in self.label_map[name]:
|
171
|
+
# store the index of the label hash in the label map
|
172
|
+
self.label_map[name][label_hash] = len(self.label_map[name])
|
173
|
+
# store the labels themselves
|
174
|
+
self.label_hashes[label_hash] = labels
|
175
|
+
index = self.label_map[name][label_hash]
|
176
|
+
measurement[f"{name}.{index}"] = value
|
177
|
+
|
178
|
+
return measurement
|
179
|
+
|
180
|
+
def sample(self) -> None:
|
181
|
+
s = self.parse_open_metrics_endpoint()
|
182
|
+
self.samples.append(s)
|
183
|
+
|
184
|
+
def clear(self) -> None:
|
185
|
+
self.samples.clear()
|
186
|
+
|
187
|
+
def aggregate(self) -> dict:
|
188
|
+
if not self.samples:
|
189
|
+
return {}
|
190
|
+
|
191
|
+
prefix = f"{_PREFIX}.{self.name}."
|
192
|
+
|
193
|
+
stats = {}
|
194
|
+
for key in self.samples[0].keys():
|
195
|
+
samples = [s[key] for s in self.samples if key in s]
|
196
|
+
if samples and all(isinstance(s, (int, float)) for s in samples):
|
197
|
+
stats[f"{prefix}{key}"] = aggregate_mean(samples)
|
198
|
+
else:
|
199
|
+
stats[f"{prefix}{key}"] = aggregate_last(samples)
|
200
|
+
return stats
|
201
|
+
|
202
|
+
|
203
|
+
class OpenMetrics:
|
204
|
+
# Poll an OpenMetrics endpoint, parse the response and return a dict of metrics
|
205
|
+
# Implements the same Protocol interface as Asset
|
206
|
+
|
207
|
+
def __init__(
|
208
|
+
self,
|
209
|
+
interface: "Interface",
|
210
|
+
settings: "SettingsStatic",
|
211
|
+
shutdown_event: mp.synchronize.Event,
|
212
|
+
name: str,
|
213
|
+
url: str,
|
214
|
+
) -> None:
|
215
|
+
self.name = name
|
216
|
+
self.url = url
|
217
|
+
self.interface = interface
|
218
|
+
self.settings = settings
|
219
|
+
self.shutdown_event = shutdown_event
|
220
|
+
|
221
|
+
self.metrics: List[Metric] = [
|
222
|
+
OpenMetricsMetric(name, url, settings._stats_open_metrics_filters)
|
223
|
+
]
|
224
|
+
|
225
|
+
self.metrics_monitor: "MetricsMonitor" = MetricsMonitor(
|
226
|
+
asset_name=self.name,
|
227
|
+
metrics=self.metrics,
|
228
|
+
interface=interface,
|
229
|
+
settings=settings,
|
230
|
+
shutdown_event=shutdown_event,
|
231
|
+
)
|
232
|
+
|
233
|
+
telemetry_record = telemetry.TelemetryRecord()
|
234
|
+
telemetry_record.feature.open_metrics = True
|
235
|
+
interface._publish_telemetry(telemetry_record)
|
236
|
+
|
237
|
+
@classmethod
|
238
|
+
def is_available(cls, url: str) -> bool:
|
239
|
+
_is_available: bool = False
|
240
|
+
|
241
|
+
ret = prometheus_client_parser is not None
|
242
|
+
if not ret:
|
243
|
+
wandb.termwarn(
|
244
|
+
"Monitoring OpenMetrics endpoints requires the `prometheus_client` package. "
|
245
|
+
"To install it, run `pip install prometheus_client`.",
|
246
|
+
repeat=False,
|
247
|
+
)
|
248
|
+
return _is_available
|
249
|
+
# check if the endpoint is available and is a valid OpenMetrics endpoint
|
250
|
+
_session: Optional[requests.Session] = None
|
251
|
+
try:
|
252
|
+
assert prometheus_client_parser is not None
|
253
|
+
_session = _setup_requests_session()
|
254
|
+
response = _session.get(url, timeout=_REQUEST_TIMEOUT)
|
255
|
+
response.raise_for_status()
|
256
|
+
|
257
|
+
# check if the response is a valid OpenMetrics response
|
258
|
+
# text_string_to_metric_families returns a generator
|
259
|
+
if list(
|
260
|
+
prometheus_client_parser.text_string_to_metric_families(response.text)
|
261
|
+
):
|
262
|
+
_is_available = True
|
263
|
+
except Exception as e:
|
264
|
+
logger.debug(
|
265
|
+
f"OpenMetrics endpoint {url} is not available: {e}", exc_info=True
|
266
|
+
)
|
267
|
+
|
268
|
+
if _session is not None:
|
269
|
+
try:
|
270
|
+
_session.close()
|
271
|
+
except Exception:
|
272
|
+
pass
|
273
|
+
return _is_available
|
274
|
+
|
275
|
+
def start(self) -> None:
|
276
|
+
self.metrics_monitor.start()
|
277
|
+
|
278
|
+
def finish(self) -> None:
|
279
|
+
self.metrics_monitor.finish()
|
280
|
+
|
281
|
+
def probe(self) -> dict:
|
282
|
+
# todo: also return self.label_hashes
|
283
|
+
return {self.name: self.url}
|
@@ -17,9 +17,7 @@ logger = logging.getLogger(__name__)
|
|
17
17
|
|
18
18
|
|
19
19
|
class TPUUtilization:
|
20
|
-
"""
|
21
|
-
Google Cloud TPU utilization in percent.
|
22
|
-
"""
|
20
|
+
"""Google Cloud TPU utilization in percent."""
|
23
21
|
|
24
22
|
name = "tpu"
|
25
23
|
samples: "Deque[float]"
|
@@ -130,7 +128,6 @@ class TPU:
|
|
130
128
|
|
131
129
|
@classmethod
|
132
130
|
def is_available(cls) -> bool:
|
133
|
-
|
134
131
|
if os.environ.get("TPU_NAME", False) is False:
|
135
132
|
return False
|
136
133
|
|