ob-metaflow-extensions 1.1.42rc1__tar.gz → 1.1.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of ob-metaflow-extensions might be problematic.
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/__init__.py +0 -7
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/__init__.py +1 -0
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/gpu.py +701 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/SOURCES.txt +2 -1
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/setup.py +1 -1
- ob-metaflow-extensions-1.1.42rc1/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -27
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/README.md +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/remote_config.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.42rc1 → ob-metaflow-extensions-1.1.43}/setup.cfg +0 -0
--- ob-metaflow-extensions-1.1.42rc1/metaflow_extensions/outerbounds/__init__.py
+++ ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/__init__.py
@@ -2,13 +2,6 @@ import metaflow.metaflow_config_funcs
 
 from metaflow_extensions.outerbounds.remote_config import init_config
 
-from metaflow_extensions.outerbounds.plugins.perimeters import (
-    override_metaflow_profile_with_perimeter,
-)
-
-override_metaflow_profile_with_perimeter()
-
-
 # we want to override OSS Metaflow's initialization behavior with our own to support remote configs
 # we're reassigning the METAFLOW_CONFIG variable because all downstream settings rely on it and
 # users still have the power to override them with environment variables
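The three comment lines retained above describe the remote-config override that supersedes the removed perimeter hook. A rough sketch of the pattern they describe, purely illustrative (everything here except `init_config` and the comments' claims is an assumption, not code from the package):

    # Hypothetical illustration, not the package's actual code: swap in the
    # extension's init_config before Metaflow computes METAFLOW_CONFIG from it.
    import metaflow.metaflow_config_funcs
    from metaflow_extensions.outerbounds.remote_config import init_config

    metaflow.metaflow_config_funcs.init_config = init_config
    # Downstream, Metaflow derives METAFLOW_CONFIG (and everything built on it)
    # from init_config(), so all settings now flow through the remote config;
    # per the comments above, environment variables still override values.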
--- /dev/null
+++ ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/__init__.py
@@ -0,0 +1 @@
+from .gpu import gpu_profile
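Since the new package's `__init__.py` re-exports `gpu_profile`, the decorator can be imported without naming the module file. A minimal sketch, assuming the distribution is installed and importable as laid out in the diff:

    # Both imports should resolve to the same object; paths taken from the diff above.
    from metaflow_extensions.outerbounds.profilers import gpu_profile
    from metaflow_extensions.outerbounds.profilers.gpu import gpu_profile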
--- /dev/null
+++ ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/gpu.py
@@ -0,0 +1,701 @@
+from metaflow.cards import Markdown, Table, VegaChart
+from functools import wraps
+import threading
+from datetime import datetime
+from metaflow import current
+from metaflow.cards import Table, Markdown, VegaChart, Image
+import time
+from typing import List, Dict, Any, Union
+import re
+import os
+import uuid
+import json
+import sys
+from tempfile import TemporaryDirectory
+from subprocess import check_output, Popen
+from datetime import datetime, timedelta
+from functools import wraps
+from collections import namedtuple
+
+# Card plot styles
+MEM_COLOR = "#0c64d6"
+GPU_COLOR = "#ff69b4"
+
+NVIDIA_TS_FORMAT = "%Y/%m/%d %H:%M:%S"
+
+
+DRIVER_VER = re.compile(b"Driver Version: (.+?) ")
+CUDA_VER = re.compile(b"CUDA Version:(.*) ")
+
+MONITOR_FIELDS = [
+    "timestamp",
+    "gpu_utilization",
+    "memory_used",
+    "memory_total",
+]
+
+MONITOR = """nvidia-smi --query-gpu=pci.bus_id,timestamp,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits -l {interval};"""
+ProcessUUID = namedtuple("ProcessUUID", ["uuid", "start_time", "end_time"])
+
+
+def _get_uuid(time_duration=600):
+    frmt_str = "%Y-%m-%d-%H-%M-%S"
+    # Create a datetime range using the current date as the start date and
+    # time_duration seconds later as the end date
+    start_date = datetime.now()
+    end_date = start_date + timedelta(seconds=time_duration)
+    datetime_range = (
+        start_date.strftime(frmt_str) + "_" + end_date.strftime(frmt_str)
+    )
+    uuid_str = uuid.uuid4().hex.replace("-", "") + "_" + datetime_range
+    return ProcessUUID(uuid_str, start_date, end_date)
+
+
+class AsyncProcessManager:
+    """
+    This class is responsible for managing the nvidia-smi subprocesses
+    """
+
+    processes: Dict[str, Dict] = {
+        # "procid": {
+        #     "proc": subprocess.Popen,
+        #     "started": time.time()
+        # }
+    }
+
+    @classmethod
+    def _register_process(cls, procid, proc):
+        cls.processes[procid] = {
+            "proc": proc,
+            "started": time.time(),
+        }
+
+    @classmethod
+    def get(cls, procid):
+        proc_dict = cls.processes.get(procid, None)
+        if proc_dict is not None:
+            return proc_dict["proc"], proc_dict["started"]
+        return None, None
+
+    @classmethod
+    def spawn(cls, procid, cmd, file):
+        proc = Popen(cmd, stdout=file)
+        cls._register_process(procid, proc)
+
+    @classmethod
+    def remove(cls, procid, delete_item=True):
+        if procid in cls.processes:
+            if cls.processes[procid]["proc"].stdout is not None:
+                cls.processes[procid]["proc"].stdout.close()
+            cls.processes[procid]["proc"].terminate()
+            cls.processes[procid]["proc"].wait()
+            if delete_item:
+                del cls.processes[procid]
+
+    @classmethod
+    def cleanup(cls):
+        for procid in cls.processes:
+            cls.remove(procid, delete_item=False)
+        cls.processes.clear()
+
+    @classmethod
+    def is_running(cls, procid):
+        if procid not in cls.processes:
+            return False
+        return cls.processes[procid]["proc"].poll() is None
+
+
+def _parse_timestamp(timestamp):
+    try:
+        ts = timestamp.split(".")[0]
+        return datetime.strptime(ts, NVIDIA_TS_FORMAT)
+    except ValueError:
+        return None
+
+
+class GPUMonitor:
+    """
+    The `GPUMonitor` class is designed to monitor GPU usage.
+
+    When an instance of `GPUMonitor` is created, it initializes with a specified `interval` and `duration`.
+    The `duration` is the time period it will run the nvidia-smi command for and the `interval` is the time period between each reading.
+    The class exposes a `_monitor_update_thread` method which runs as a background thread that continuously updates the GPU usage readings.
+    It will keep running until the `_finished` flag is set to `True`.
+
+    The class will statefully manage the spawned nvidia-smi processes.
+    It will start a new nvidia-smi process after the current one has run for the specified `duration`.
+    At any time this class will only maintain readings for the `_current_process` and will have all the aggregated
+    readings for the past processes stored in the `_past_readings` dictionary.
+    When a process runs to completion, its readings are appended to the `_past_readings` dictionary and a new process is started.
+
+    If the caller of this class wishes to read the GPU usage, they can call the `read` method which will return the readings in a dictionary format.
+    The `read` method will aggregate the readings from the `_current_readings` and `_past_readings`.
+    """
+
+    _started_processes: List[ProcessUUID] = []
+
+    _current_process: Union[ProcessUUID, None] = None
+
+    _current_readings: Dict[str, Any] = {}
+
+    _past_readings: Dict[str, Any] = {}
+
+    def __init__(self, interval=1, duration=300) -> None:
+        self._tempdir = TemporaryDirectory(prefix="gpu_card_monitor", dir="./")
+        self._interval = interval
+        self._duration = duration
+        self._finished = False
+
+    @property
+    def _current_file(self):
+        if self._current_process is None:
+            return None
+        return os.path.join(self._tempdir.name, self._current_process.uuid + ".csv")
+
+    def get_file_name(self, uuid):
+        return os.path.join(self._tempdir.name, uuid + ".csv")
+
+    def create_new_monitor(self):
+        uuid = _get_uuid(self._duration)
+        file = open(self.get_file_name(uuid.uuid), "w")
+        cmd = MONITOR.format(interval=self._interval, time_duration=self._duration)
+        AsyncProcessManager.spawn(uuid.uuid, ["bash", "-c", cmd], file)
+        self._started_processes.append(uuid)
+        self._current_process = uuid
+        return uuid
+
+    def clear_current_monitor(self):
+        if self._current_process is None:
+            return
+        AsyncProcessManager.remove(self._current_process.uuid)
+        self._current_process = None
+
+    def current_process_has_ended(self):
+        if self._current_process is None:
+            return True
+        return datetime.now() > self._current_process.end_time
+
+    def current_process_is_running(self):
+        if self._current_process is None:
+            return False
+        return AsyncProcessManager.is_running(self._current_process.uuid)
+
+    def _read_monitor(self):
+        """
+        Reads the monitor file and returns the readings in a dictionary format
+        """
+        all_readings = []
+        if self._current_file is None:
+            return None
+        # Extract everything from the CSV file and store it in a list of dictionaries
+        all_fields = ["gpu_id"] + MONITOR_FIELDS
+        with open(self._current_file, "r") as _monitor_out:
+            for line in _monitor_out.readlines():
+                data = {}
+                fields = [f.strip() for f in line.split(",")]
+                if len(fields) == len(all_fields):
+                    # strip subsecond resolution from timestamps as it doesn't align across devices
+                    for idx, _f in enumerate(all_fields):
+                        data[_f] = fields[idx]
+                    all_readings.append(data)
+                else:
+                    # expect that the last line may be truncated
+                    break
+
+        # Convert to dictionary format
+        devdata = {}
+        for reading in all_readings:
+            gpu_id = reading["gpu_id"]
+            if "timestamp" not in reading:
+                continue
+            if _parse_timestamp(reading["timestamp"]) is None:
+                continue
+            reading["timestamp"] = reading["timestamp"].split(".")[0]
+            if gpu_id not in devdata:
+                devdata[gpu_id] = {}
+
+            for i, field in enumerate(MONITOR_FIELDS):
+                if field not in devdata[gpu_id]:
+                    devdata[gpu_id][field] = []
+                devdata[gpu_id][field].append(reading[field])
+        return devdata
+
+    def _update_readings(self):
+        """
+        Core update function that checks if the current process has ended and if so, creates a new monitor;
+        otherwise it sets the current readings to the readings from the monitor file
+        """
+        if self.current_process_has_ended() or not self.current_process_is_running():
+            self._update_past_readings()
+            self.clear_current_monitor()
+            self.create_new_monitor()
+            # Sleep for 1 second to allow the new process to start so we can make a reading
+            time.sleep(1)
+
+        readings = self._read_monitor()
+        if readings is None:
+            return
+        self._current_readings = readings
+
+    @staticmethod
+    def _make_full_reading(current, past):
+        if current is None:
+            return past
+        for gpu_id in current:
+            if gpu_id not in past:
+                past[gpu_id] = {}
+            for field in MONITOR_FIELDS:
+                if field not in past[gpu_id]:
+                    past[gpu_id][field] = []
+                past[gpu_id][field].extend(current[gpu_id][field])
+        return past
+
+    def read(self):
+        return self._make_full_reading(
+            self._current_readings, json.loads(json.dumps(self._past_readings))
+        )
+
+    def _update_past_readings(self):
+        if self._current_readings is None:
+            return
+        self._past_readings = self._make_full_reading(
+            self._current_readings, json.loads(json.dumps(self._past_readings))
+        )
+        self._current_readings = None
+
+    def cleanup(self):
+        self._finished = True
+        AsyncProcessManager.cleanup()
+        self._tempdir.cleanup()
+
+    def _monitor_update_thread(self):
+        while not self._finished:
+            self._update_readings()
+            time.sleep(self._interval)
+
+
+def _get_ts_range(_range):
+    if _range == "":
+        return "*No readings available*"
+    return "*Time range of charts: %s*" % _range
+
+
+def _update_utilization(results, md_dict):
+    for device, data in results["profile"].items():
+        if device not in md_dict:
+            print(
+                "Device %s not found in the GPU card layout. Skipping..." % device,
+                file=sys.stderr,
+            )
+            continue
+        md_dict[device]["gpu"].update(
+            "%2.1f%%" % max(map(float, data["gpu_utilization"]))
+        )
+        md_dict[device]["memory"].update("%dMB" % max(map(float, data["memory_used"])))
+
+
+def _update_charts(results, md_dict):
+    for device, data in results["profile"].items():
+        try:
+            if device not in md_dict:
+                continue
+            gpu_plot, mem_plot, ts_range = profile_plots(
+                device,
+                data["timestamp"],
+                data["gpu_utilization"],
+                data["memory_used"],
+                data["memory_total"],
+            )
+            md_dict[device]["gpu"].update(gpu_plot)
+            md_dict[device]["memory"].update(mem_plot)
+            md_dict[device]["reading_duration"].update(_get_ts_range(ts_range))
+        except ValueError as e:
+            # This is thrown when the date is unparsable. We can just safely ignore this.
+            print("ValueError: Could not parse date \n%s" % str(e), file=sys.stderr)
+
+
+# This code is adapted from: https://github.com/outerbounds/monitorbench
+class GPUProfiler:
+    def __init__(self, interval=1, monitor_batch_duration=200):
+        self.driver_ver, self.cuda_ver, self.error = self._read_versions()
+        (
+            self.interconnect_data,
+            self.interconnect_legend,
+        ) = self._read_multi_gpu_interconnect()
+        if self.error:
+            self.devices = []
+            return
+        else:
+            self.devices = self._read_devices()
+            self._monitor = GPUMonitor(
+                interval=interval, duration=monitor_batch_duration
+            )
+            self._monitor_thread = threading.Thread(
+                target=self._monitor._monitor_update_thread, daemon=True
+            )
+            self._monitor_thread.start()
+            self._interval = interval
+
+            self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
+            self._card_created = False
+
+    def finish(self):
+        ret = {
+            "error": self.error,
+            "cuda_version": self.cuda_ver,
+            "driver_version": self.driver_ver,
+        }
+        if self.error:
+            return ret
+        else:
+            ret["devices"] = self.devices
+            ret["profile"] = self._monitor.read()
+            ret["interconnect"] = {
+                "data": self.interconnect_data,
+                "legend": self.interconnect_legend,
+            }
+            self._monitor.cleanup()
+            return ret
+
+    def _make_reading(self):
+        ret = {
+            "error": self.error,
+            "cuda_version": self.cuda_ver,
+            "driver_version": self.driver_ver,
+        }
+        if self.error:
+            return ret
+        else:
+            ret["devices"] = self.devices
+            ret["profile"] = self._monitor.read()
+            ret["interconnect"] = {
+                "data": self.interconnect_data,
+                "legend": self.interconnect_legend,
+            }
+            return ret
+
+    def _update_card(self):
+        if len(self.devices) == 0:
+            current.card["gpu_profile"].clear()
+            current.card["gpu_profile"].append(
+                Markdown("## GPU profile failed: %s" % self.error)
+            )
+            current.card["gpu_profile"].refresh()
+
+            return
+
+        while True:
+            readings = self._make_reading()
+            if readings is None:
+                print("GPU Profiler readings are none", file=sys.stderr)
+                time.sleep(self._interval)
+                continue
+            _update_utilization(readings, self._card_comps["max_utilization"])
+            _update_charts(readings, self._card_comps["charts"])
+            current.card["gpu_profile"].refresh()
+            time.sleep(self._interval)
+
+    def _setup_card(self, artifact_name):
+        from metaflow import current
+
+        results = self._make_reading()
+        els = current.card["gpu_profile"]
+
+        def _drivers():
+            els.append(Markdown("## Drivers"))
+            els.append(
+                Table(
+                    [[results["cuda_version"], results["driver_version"]]],
+                    headers=["CUDA version", "NVIDIA driver version"],
+                )
+            )
+
+        def _devices():
+            els.append(Markdown("## Devices"))
+            rows = [
+                [d["device_id"], d["name"], d["memory"]] for d in results["devices"]
+            ]
+            els.append(Table(rows, headers=["Device ID", "Device type", "GPU memory"]))
+
+        def _interconnect():
+            if results["interconnect"]["data"] and results["interconnect"]["legend"]:
+                els.append(Markdown("## Interconnect"))
+                interconnect_data = results["interconnect"]["data"]
+                rows = list(interconnect_data.values())
+                rows = [list(transpose_row) for transpose_row in list(zip(*rows))]
+                els.append(Table(rows, headers=list(interconnect_data.keys())))
+                els.append(Markdown("#### Legend"))
+                els.append(
+                    Table(
+                        [list(results["interconnect"]["legend"].values())],
+                        headers=list(results["interconnect"]["legend"].keys()),
+                    )
+                )
+
+        def _utilization():
+            els.append(Markdown("## Maximum utilization"))
+            rows = {}
+            for d in results["devices"]:
+                rows[d["device_id"]] = {
+                    "gpu": Markdown("0%"),
+                    "memory": Markdown("0MB"),
+                }
+            _rows = [[Markdown(k)] + list(v.values()) for k, v in rows.items()]
+            els.append(
+                Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
+            )
+            els.append(
+                Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
+            )
+            return rows
+
+        def _plots():
+            els.append(Markdown("## GPU utilization and memory usage over time"))
+
+            rows = {}
+            for d in results["devices"]:
+                gpu_plot, mem_plot, ts_range = profile_plots(
+                    d["device_id"], [], [], [], []
+                )
+                rows[d["device_id"]] = {
+                    "gpu": VegaChart(gpu_plot),
+                    "memory": VegaChart(mem_plot),
+                    "reading_duration": Markdown(_get_ts_range(ts_range)),
+                }
+            for k, v in rows.items():
+                els.append(Markdown("### GPU utilization for device: %s" % k))
+                els.append(v["reading_duration"])
+                els.append(
+                    Table(
+                        data=[
+                            [Markdown("GPU utilization"), v["gpu"]],
+                            [Markdown("Memory usage"), v["memory"]],
+                        ]
+                    )
+                )
+            return rows
+
+        _drivers()
+        _devices()
+        _interconnect()
+        self._card_comps["max_utilization"] = _utilization()
+        self._card_comps["charts"] = _plots()
+
+    def _read_versions(self):
+        def parse(r, s):
+            return r.search(s).group(1).strip().decode("utf-8")
+
+        try:
+            out = check_output(["nvidia-smi"])
+            return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
+        except FileNotFoundError:
+            return None, None, "nvidia-smi not found"
+        except AttributeError:
+            return None, None, "nvidia-smi output is unexpected"
+        except:
+            return None, None, "nvidia-smi error"
+
+    def _read_devices(self):
+        out = check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=name,pci.bus_id,memory.total",
+                "--format=csv,noheader",
+            ]
+        )
+        return [
+            dict(
+                zip(("name", "device_id", "memory"), (x.strip() for x in l.split(",")))
+            )
+            for l in out.decode("utf-8").splitlines()
+        ]
+
+    def _read_multi_gpu_interconnect(self):
+        """
+        parse output of `nvidia-smi topo -m`, such as this sample:
+
+                GPU0    GPU1    CPU Affinity    NUMA Affinity
+        GPU0     X      NV2     0-23            N/A
+        GPU1    NV2      X      0-23            N/A
+
+        returns two dictionaries describing multi-GPU topology:
+            data: {index: [GPU0, GPU1, ...], GPU0: [X, NV2, ...], GPU1: [NV2, X, ...], ...}
+            legend_items: {X: 'Same PCI', NV2: 'NVLink 2', ...}
+        """
+        try:
+            import re
+
+            ansi_escape = re.compile(r"(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]")
+
+            out = check_output(["nvidia-smi", "topo", "-m"])
+            rows = out.decode("utf-8").split("\n")
+
+            header = ansi_escape.sub("", rows[0]).split("\t")[1:]
+            data = {}
+            data["index"] = []
+            data |= {k: [] for k in header}
+
+            for i, row in enumerate(rows[1:]):
+                row = ansi_escape.sub("", row).split()
+                if len(row) == 0:
+                    continue
+                if row[0].startswith("GPU"):
+                    data["index"].append(row[0])
+                    for key, val in zip(header, row[1:]):
+                        data[key].append(val)
+                elif row[0].startswith("Legend"):
+                    break
+
+            legend_items = {}
+            for legend_row in rows[i:]:
+                if legend_row == "" or legend_row.startswith("Legend"):
+                    continue
+                res = legend_row.strip().split(" = ")
+                legend_items[res[0].strip()] = res[1].strip()
+
+            return data, legend_items
+
+        except:
+            return None, None
+
+
+class gpu_profile:
+    def __init__(
+        self,
+        include_artifacts=True,
+        artifact_prefix="gpu_profile_",
+        interval=1,
+    ):
+        self.include_artifacts = include_artifacts
+        self.artifact_prefix = artifact_prefix
+        self.interval = interval
+
+    def __call__(self, f):
+        @wraps(f)
+        def func(s):
+            prof = GPUProfiler(interval=self.interval)
+            if self.include_artifacts:
+                setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
+
+            current.card["gpu_profile"].append(
+                Markdown("# GPU profile for `%s`" % current.pathspec)
+            )
+            current.card["gpu_profile"].append(
+                Markdown(
+                    "_Started at: %s_"
+                    % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
+                )
+            )
+            prof._setup_card(self.artifact_prefix + "data")
+            current.card["gpu_profile"].refresh()
+            update_thread = threading.Thread(target=prof._update_card, daemon=True)
+            update_thread.start()
+
+            try:
+                f(s)
+            finally:
+                try:
+                    results = prof.finish()
+                except:
+                    results = {"error": "couldn't read profiler results"}
+                if self.include_artifacts:
+                    setattr(s, self.artifact_prefix + "data", results)
+
+        from metaflow import card
+
+        return card(type="blank", id="gpu_profile", refresh_interval=self.interval)(
+            func
+        )
+
+
+def translate_to_vegalite(
+    tstamps,
+    vals,
+    description,
+    y_label,
+    legend,
+    line_color=None,
+    percentage_format=False,
+):
+    # Preprocessing for Vega-Lite
+    # Assuming tstamps is a list of datetime objects and vals is a list of values
+    data = [{"tstamps": str(t), "vals": v} for t, v in zip(tstamps, vals)]
+
+    # Base Vega-Lite spec
+    vega_lite_spec = {
+        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
+        "description": description,
+        "data": {"values": data},
+        "width": 600,
+        "height": 400,
+        "encoding": {
+            "x": {"field": "tstamps", "type": "temporal", "axis": {"title": "Time"}},
+            "y": {
+                "field": "vals",
+                "type": "quantitative",
+                "axis": {
+                    "title": y_label,
+                    **({"format": "%"} if percentage_format else {}),
+                },
+            },
+        },
+        "layer": [
+            {
+                "mark": {
+                    "type": "line",
+                    "color": line_color if line_color else "blue",
+                    "tooltip": True,
+                    "description": legend,  # Adding legend as description
+                },
+                "encoding": {"tooltip": [{"field": "tstamps"}, {"field": "vals"}]},
+            }
+        ],
+    }
+
+    return vega_lite_spec
+
+
+def profile_plots(device_id, ts, gpu, mem_used, mem_total):
+    tstamps = [datetime.strptime(t, NVIDIA_TS_FORMAT) for t in ts]
+    gpu = [i / 100 for i in list(map(float, gpu))]
+    mem = [float(used) / float(total) for used, total in zip(mem_used, mem_total)]
+    time_stamp_range = ""
+    if len(tstamps) > 1:
+        max_time = max(tstamps).strftime(NVIDIA_TS_FORMAT)
+        min_time = min(tstamps).strftime(NVIDIA_TS_FORMAT)
+        time_stamp_range = "%s to %s" % (min_time, max_time)
+
+    gpu_plot = translate_to_vegalite(
+        tstamps,
+        gpu,
+        "GPU utilization",
+        "GPU utilization",
+        "device: %s" % device_id,
+        line_color=GPU_COLOR,
+        percentage_format=True,
+    )
+    mem_plot = translate_to_vegalite(
+        tstamps,
+        mem,
+        "Percentage memory utilization",
+        "Percentage memory utilization",
+        "device: %s" % device_id,
+        line_color=MEM_COLOR,
+        percentage_format=True,
+    )
+    return gpu_plot, mem_plot, time_stamp_range
+
+
+if __name__ == "__main__":
+    prof = GPUProfiler(monitor_batch_duration=10)
+
+    def _write_json_file(data, filename):
+        with open(filename, "w") as f:
+            json.dump(data, f, indent=4)
+
+    import time
+
+    for i in range(15):
+        time.sleep(1)
+        _write_json_file(prof._monitor.read(), "gpu_profile.json")
+
+    print(json.dumps(prof.finish()))
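The module above boils down to one user-facing piece, the `gpu_profile` step decorator: it spawns batched `nvidia-smi --query-gpu=... -l <interval>` pollers, each emitting CSV rows roughly of the form `<pci.bus_id>, 2024/01/01 00:00:00.000, <util %>, <mem used MB>, <mem total MB>` (matching `MONITOR_FIELDS`; the timestamp shape is inferred from `NVIDIA_TS_FORMAT` plus the subsecond stripping in `_read_monitor`), aggregates the readings, and renders them into a live "gpu_profile" card. A minimal usage sketch; the flow and step names are illustrative, and the decorator ordering follows the usual Metaflow convention of placing function decorators above `@step`:

    # Hypothetical flow exercising the new decorator; not part of the package.
    from metaflow import FlowSpec, step
    from metaflow_extensions.outerbounds.profilers.gpu import gpu_profile

    class GPUProfileFlow(FlowSpec):
        @gpu_profile(interval=1)  # poll nvidia-smi every second
        @step
        def start(self):
            # ... GPU work here; with include_artifacts=True (the default),
            # self.gpu_profile_num_gpus and self.gpu_profile_data get set.
            self.next(self.end)

        @step
        def end(self):
            pass

    if __name__ == "__main__":
        GPUProfileFlow()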
--- ob-metaflow-extensions-1.1.42rc1/ob_metaflow_extensions.egg-info/SOURCES.txt
+++ ob-metaflow-extensions-1.1.43/ob_metaflow_extensions.egg-info/SOURCES.txt
@@ -5,9 +5,10 @@ metaflow_extensions/outerbounds/remote_config.py
 metaflow_extensions/outerbounds/config/__init__.py
 metaflow_extensions/outerbounds/plugins/__init__.py
 metaflow_extensions/outerbounds/plugins/auth_server.py
-metaflow_extensions/outerbounds/plugins/perimeters.py
 metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py
 metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py
+metaflow_extensions/outerbounds/profilers/__init__.py
+metaflow_extensions/outerbounds/profilers/gpu.py
 metaflow_extensions/outerbounds/toplevel/__init__.py
 metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py
 metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import fcntl
|
|
3
|
-
from os import path
|
|
4
|
-
import json
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def override_metaflow_profile_with_perimeter():
|
|
8
|
-
# If OBP_CONFIG_DIR is set, use that, otherwise use METAFLOW_HOME
|
|
9
|
-
# If neither are set, use ~/.metaflowconfig
|
|
10
|
-
obp_config_dir = path.expanduser(
|
|
11
|
-
os.environ.get(
|
|
12
|
-
"OBP_CONFIG_DIR", os.environ.get("METAFLOW_HOME", "~/.metaflowconfig")
|
|
13
|
-
)
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
file_path = os.path.join(obp_config_dir, "ob_config.json")
|
|
17
|
-
|
|
18
|
-
if os.path.exists(file_path):
|
|
19
|
-
# Acquire a shared read lock on the file
|
|
20
|
-
fd = os.open(file_path, os.O_RDONLY)
|
|
21
|
-
fcntl.flock(fd, fcntl.LOCK_SH)
|
|
22
|
-
|
|
23
|
-
with open(file_path, "r") as f:
|
|
24
|
-
ob_config = json.loads(f.read())
|
|
25
|
-
|
|
26
|
-
if "OB_CURRENT_PERIMETER" in ob_config:
|
|
27
|
-
os.environ["METAFLOW_PROFILE"] = ob_config["OB_CURRENT_PERIMETER"]
|
|
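For reference, the deleted helper selected a Metaflow profile from an `ob_config.json` file. A sketch of the file shape it consumed, as implied by the code above (the perimeter name is a made-up example):

    # Illustrative only: the ob_config.json shape the removed hook read.
    import json

    example_ob_config = {"OB_CURRENT_PERIMETER": "my-perimeter"}  # hypothetical value
    # The removed hook mirrored this value into os.environ["METAFLOW_PROFILE"]
    # at import time; per this diff, that no longer happens automatically.
    print(json.dumps(example_ob_config))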