ob-metaflow-extensions 1.1.42rc0__tar.gz → 1.1.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/PKG-INFO +1 -1
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/__init__.py +9 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/config/__init__.py +2 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/__init__.py +79 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/auth_server.py +16 -24
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/__init__.py +1 -0
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/gpu.py +701 -0
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/remote_config.py +108 -0
- ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +1 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/SOURCES.txt +4 -1
- ob-metaflow-extensions-1.1.43/ob_metaflow_extensions.egg-info/requires.txt +3 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/setup.py +2 -2
- ob-metaflow-extensions-1.1.42rc0/metaflow_extensions/outerbounds/__init__.py +0 -5
- ob-metaflow-extensions-1.1.42rc0/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -27
- ob-metaflow-extensions-1.1.42rc0/ob_metaflow_extensions.egg-info/requires.txt +0 -3
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/README.md +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/setup.cfg +0 -0
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/__init__.py ADDED

@@ -0,0 +1,9 @@
+import metaflow.metaflow_config_funcs
+
+from metaflow_extensions.outerbounds.remote_config import init_config
+
+# we want to override OSS Metaflow's initialization behavior with our own to support remote configs
+# we're reassigning the METAFLOW_CONFIG variable because all downstream settings rely on it and
+# users still have the power to override them with environment variables
+metaflow.metaflow_config_funcs.METAFLOW_CONFIG = init_config()
+metaflow.metaflow_config_funcs.init_config = init_config
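Note: the two assignments above monkey-patch Metaflow at import time. A minimal sketch of the observable effect (a hypothetical check, not part of the package; it assumes the extension is installed so that `metaflow_extensions` is loaded when `metaflow` is imported):

```python
# Hypothetical sanity check -- not part of the package. With the extension
# installed, importing metaflow triggers this __init__.py, so both the
# config dict and the init hook now come from the extension.
import metaflow  # loads metaflow_extensions.outerbounds as a side effect
import metaflow.metaflow_config_funcs as funcs

print(type(funcs.METAFLOW_CONFIG))   # the dict returned by init_config()
print(funcs.init_config.__module__)  # metaflow_extensions.outerbounds.remote_config
```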
{ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/__init__.py

@@ -1,4 +1,5 @@
 import os
+import json
 import tempfile
 from contextlib import contextmanager
 
@@ -161,3 +162,81 @@ class ObpAzureAuthProvider(object):
 
 
 AZURE_CLIENT_PROVIDERS_DESC = [("obp", ".ObpAzureAuthProvider")]
+
+import threading
+import time
+
+_gcp_client_cache = dict()
+
+
+def _get_cache_key():
+    return os.getpid(), threading.get_ident()
+
+
+class ObpGcpAuthProvider(object):
+    name = "obp"
+
+    @staticmethod
+    def get_gs_storage_client(*args, **kwargs):
+
+        import sys
+        from metaflow_extensions.outerbounds.plugins.auth_server import get_token
+
+        cache_key = _get_cache_key()
+        if _gcp_client_cache.get(cache_key):
+            # Don't cache the client for more than 5 minutes as it may have
+            # expired.
+            if _gcp_client_cache[cache_key]._created_at < time.time() - 300:
+                del _gcp_client_cache[cache_key]
+            else:
+                return _gcp_client_cache[cache_key]
+
+        from hashlib import sha256
+        from metaflow.util import get_username
+
+        user = get_username()
+
+        token_info = get_token("/generate/gcp")
+        token_file = "/tmp/obp_token." + sha256(user.encode("utf-8")).hexdigest()[:16]
+        credentials_file = (
+            "/tmp/obp_credentials." + sha256(user.encode("utf-8")).hexdigest()[:16]
+        )
+
+        with tempfile.NamedTemporaryFile("w", delete=False) as f:
+            f.write(token_info["token"])
+            tmp_token_file = f.name
+        os.rename(tmp_token_file, token_file)
+
+        credentials_json = {
+            "type": "external_account",
+            "audience": f"//iam.googleapis.com/projects/{token_info['gcpProjectNumber']}/locations/global/workloadIdentityPools/{token_info['gcpWorkloadIdentityPool']}/providers/{token_info['gcpWorkloadIdentityPoolProvider']}",
+            "subject_token_type": "urn:ietf:params:oauth:token-type:jwt",
+            "token_url": "https://sts.googleapis.com/v1/token",
+            "service_account_impersonation_url": f"https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/{token_info['gcpServiceAccountEmail']}:generateAccessToken",
+            "credential_source": {
+                "file": token_file,
+                "format": {"type": "text"},
+            },
+        }
+
+        with tempfile.NamedTemporaryFile("w", delete=False) as f:
+            f.write(json.dumps(credentials_json))
+            tmp_credentials_file = f.name
+        os.rename(tmp_credentials_file, credentials_file)
+
+        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file
+        from google.cloud import storage
+
+        storage_client = storage.Client(project=token_info["gcpProjectId"])
+        storage_client._created_at = time.time()
+        _gcp_client_cache[cache_key] = storage_client
+        return storage_client
+
+    @staticmethod
+    def get_credentials(scopes, *args, **kwargs):
+        import google.auth
+
+        return google.auth.default(scopes=scopes)
+
+
+GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
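For context, a sketch of the consumer side once `get_gs_storage_client` has written the `external_account` credentials file and exported `GOOGLE_APPLICATION_CREDENTIALS` (assumes `google-cloud-storage` is installed; the scope and bucket listing are illustrative placeholders):

```python
# A minimal sketch, assuming the provider above has already run in this
# process. google.auth discovers the "external_account" JSON through
# GOOGLE_APPLICATION_CREDENTIALS, exchanges the short-lived JWT at the STS
# token_url, and impersonates the workload-identity service account --
# no long-lived key file is ever written to disk.
import google.auth
from google.cloud import storage

credentials, project_id = google.auth.default(
    scopes=["https://www.googleapis.com/auth/devstorage.read_write"]
)
client = storage.Client(project=project_id, credentials=credentials)
print([b.name for b in client.list_buckets()])  # placeholder call
```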
{ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/metaflow_extensions/outerbounds/plugins/auth_server.py

@@ -1,37 +1,22 @@
-import os
-import json
-import requests
 from urllib.parse import urlparse
 
+import requests
+
 
 def read_mf_config():
-    #
-
-
-    path_to_config = os.path.join(home, "config.json")
-    if profile:
-        path_to_config = os.path.join(home, "config_%s.json" % profile)
-    path_to_config = os.path.expanduser(path_to_config)
-    config = {}
-    if os.path.exists(path_to_config):
-        with open(path_to_config, encoding="utf-8") as f:
-            return json.load(f)
-    elif profile:
-        from metaflow.exception import MetaflowException
-
-        raise MetaflowException(
-            "Unable to locate METAFLOW_PROFILE '%s' in '%s')" % (profile, home)
-        )
-    return config
+    # this should be overridden with the resolved remote config here:
+    # obp-python-packages/ob-metaflow-extensions/metaflow_extensions/outerbounds/__init__.py
+    from metaflow.metaflow_config_funcs import METAFLOW_CONFIG
 
+    return METAFLOW_CONFIG
 
-
+
+def get_token_url_and_headers(url_path):
     from metaflow.metaflow_config import (
         SERVICE_HEADERS,
-        from_conf,
         SERVICE_URL,
     )
-    from metaflow.
+    from metaflow.metaflow_config import SERVICE_HEADERS, SERVICE_URL
 
     # Infer auth host from metadata service URL, unless it has been
     # specified explicitly. Take the MDS host and replace first part of
@@ -44,6 +29,13 @@ def get_token(url_path):
     assert url_path.startswith("/")
     url = "https://" + authServer + url_path
     headers = SERVICE_HEADERS
+    return url, headers
+
+
+def get_token(url_path):
+    from metaflow.exception import MetaflowException
+
+    url, headers = get_token_url_and_headers(url_path)
     try:
         r = requests.get(url, headers=headers)
         r.raise_for_status()
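The refactor splits URL and header construction out of `get_token` so other callers can reuse it. A small sketch of the new seam (assumes a configured deployment; `/generate/gcp` is the path the GCP provider above uses):

```python
# Sketch only: get_token_url_and_headers derives the auth host from the
# metadata-service URL and reuses the standard SERVICE_HEADERS, so a caller
# can inspect or reuse the request target without actually fetching a token.
from metaflow_extensions.outerbounds.plugins.auth_server import (
    get_token,
    get_token_url_and_headers,
)

url, headers = get_token_url_and_headers("/generate/gcp")
print(url)  # https://<auth-host>/generate/gcp
token_info = get_token("/generate/gcp")  # performs the GET and returns the JSON body
```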
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/__init__.py ADDED

@@ -0,0 +1 @@
+from .gpu import gpu_profile
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/profilers/gpu.py ADDED

@@ -0,0 +1,701 @@
+from metaflow.cards import Markdown, Table, VegaChart
+from functools import wraps
+import threading
+from datetime import datetime
+from metaflow import current
+from metaflow.cards import Table, Markdown, VegaChart, Image
+import time
+from typing import List, Dict, Any, Union
+import re
+import os
+import uuid
+import json
+import sys
+from tempfile import TemporaryDirectory
+from subprocess import check_output, Popen
+from datetime import datetime, timedelta
+from functools import wraps
+from collections import namedtuple
+
+# Card plot styles
+MEM_COLOR = "#0c64d6"
+GPU_COLOR = "#ff69b4"
+
+NVIDIA_TS_FORMAT = "%Y/%m/%d %H:%M:%S"
+
+
+DRIVER_VER = re.compile(b"Driver Version: (.+?) ")
+CUDA_VER = re.compile(b"CUDA Version:(.*) ")
+
+MONITOR_FIELDS = [
+    "timestamp",
+    "gpu_utilization",
+    "memory_used",
+    "memory_total",
+]
+
+MONITOR = """nvidia-smi --query-gpu=pci.bus_id,timestamp,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits -l {interval};"""
+ProcessUUID = namedtuple("ProcessUUID", ["uuid", "start_time", "end_time"])
+
+
+def _get_uuid(time_duration=600):
+    frmt_str = "%Y-%m-%d-%H-%M-%S"
+    # Create a datetime range between the timerange values using current date as start date and time_duration as end date
+    start_date = datetime.now()
+    end_date = start_date + timedelta(seconds=time_duration)
+    datetime_range = start_date = (
+        datetime.now().strftime(frmt_str) + "_" + end_date.strftime(frmt_str)
+    )
+    uuid_str = uuid.uuid4().hex.replace("-", "") + "_" + datetime_range
+    return ProcessUUID(uuid_str, start_date, end_date)
+
+
+class AsyncProcessManager:
+    """
+    This class is responsible for managing the nvidia-smi subprocesses
+    """
+
+    processes: Dict[str, Dict] = {
+        # "procid": {
+        #     "proc": subprocess.Popen,
+        #     "started": time.time()
+        # }
+    }
+
+    @classmethod
+    def _register_process(cls, procid, proc):
+        cls.processes[procid] = {
+            "proc": proc,
+            "started": time.time(),
+        }
+
+    @classmethod
+    def get(cls, procid):
+        proc_dict = cls.processes.get(procid, None)
+        if proc_dict is not None:
+            return proc_dict["proc"], proc_dict["started"]
+        return None, None
+
+    @classmethod
+    def spawn(cls, procid, cmd, file):
+        proc = Popen(cmd, stdout=file)
+        cls._register_process(procid, proc)
+
+    @classmethod
+    def remove(cls, procid, delete_item=True):
+        if procid in cls.processes:
+            if cls.processes[procid]["proc"].stdout is not None:
+                cls.processes[procid]["proc"].stdout.close()
+            cls.processes[procid]["proc"].terminate()
+            cls.processes[procid]["proc"].wait()
+            if delete_item:
+                del cls.processes[procid]
+
+    @classmethod
+    def cleanup(cls):
+        for procid in cls.processes:
+            cls.remove(procid, delete_item=False)
+        cls.processes.clear()
+
+    @classmethod
+    def is_running(cls, procid):
+        if procid not in cls.processes:
+            return False
+        return cls.processes[procid]["proc"].poll() is None
+
+
+def _parse_timestamp(timestamp):
+    try:
+        ts = timestamp.split(".")[0]
+        return datetime.strptime(ts, NVIDIA_TS_FORMAT)
+    except ValueError:
+        return None
+
+
+class GPUMonitor:
+    """
+    The `GPUMonitor` class is designed to monitor GPU usage.
+
+    When an instance of `GPUMonitor` is created, it initializes with a specified `interval` and `duration`.
+    The `duration` is the time period it will run the nvidia-smi command for, and the `interval` is the time period between each reading.
+    The class exposes a `_monitor_update_thread` method which runs as a background thread that continuously updates the GPU usage readings.
+    It will keep running until the `_finished` flag is set to `True`.
+
+    The class will statefully manage the spawned nvidia-smi processes.
+    It will start a new nvidia-smi process after the current one has run for the specified `duration`.
+    At a time this class will only maintain readings for the `_current_process` and will have all the aggregated
+    readings for the past processes stored in the `_past_readings` dictionary.
+    When a process completes, the readings are appended to the `_past_readings` dictionary and a new process is started.
+
+    If the caller of this class wishes to read the GPU usage, they can call the `read` method which will return the readings in a dictionary format.
+    The `read` method will aggregate the readings from the `_current_readings` and `_past_readings`.
+    """
+
+    _started_processes: List[ProcessUUID] = []
+
+    _current_process: Union[ProcessUUID, None] = None
+
+    _current_readings: Dict[str, Any] = {}
+
+    _past_readings: Dict[str, Any] = {}
+
+    def __init__(self, interval=1, duration=300) -> None:
+        self._tempdir = TemporaryDirectory(prefix="gpu_card_monitor", dir="./")
+        self._interval = interval
+        self._duration = duration
+        self._finished = False
+
+    @property
+    def _current_file(self):
+        if self._current_process is None:
+            return None
+        return os.path.join(self._tempdir.name, self._current_process.uuid + ".csv")
+
+    def get_file_name(self, uuid):
+        return os.path.join(self._tempdir.name, uuid + ".csv")
+
+    def create_new_monitor(self):
+        uuid = _get_uuid(self._duration)
+        file = open(self.get_file_name(uuid.uuid), "w")
+        cmd = MONITOR.format(interval=self._interval, time_duration=self._duration)
+        AsyncProcessManager.spawn(uuid.uuid, ["bash", "-c", cmd], file)
+        self._started_processes.append(uuid)
+        self._current_process = uuid
+        return uuid
+
+    def clear_current_monitor(self):
+        if self._current_process is None:
+            return
+        AsyncProcessManager.remove(self._current_process.uuid)
+        self._current_process = None
+
+    def current_process_has_ended(self):
+        if self._current_process is None:
+            return True
+        return datetime.now() > self._current_process.end_time
+
+    def current_process_is_running(self):
+        if self._current_process is None:
+            return False
+        return AsyncProcessManager.is_running(self._current_process.uuid)
+
+    def _read_monitor(self):
+        """
+        Reads the monitor file and returns the readings in a dictionary format
+        """
+        all_readings = []
+        if self._current_file is None:
+            return None
+        # Extract everything from the CSV file and store it in a list of dictionaries
+        all_fields = ["gpu_id"] + MONITOR_FIELDS
+        with open(self._current_file, "r") as _monitor_out:
+            for line in _monitor_out.readlines():
+                data = {}
+                fields = [f.strip() for f in line.split(",")]
+                if len(fields) == len(all_fields):
+                    # strip subsecond resolution from timestamps that doesn't align across devices
+                    for idx, _f in enumerate(all_fields):
+                        data[_f] = fields[idx]
+                    all_readings.append(data)
+                else:
+                    # expect that the last line may be truncated
+                    break
+
+        # Convert to dictionary format
+        devdata = {}
+        for reading in all_readings:
+            gpu_id = reading["gpu_id"]
+            if "timestamp" not in reading:
+                continue
+            if _parse_timestamp(reading["timestamp"]) is None:
+                continue
+            reading["timestamp"] = reading["timestamp"].split(".")[0]
+            if gpu_id not in devdata:
+                devdata[gpu_id] = {}
+
+            for i, field in enumerate(MONITOR_FIELDS):
+                if field not in devdata[gpu_id]:
+                    devdata[gpu_id][field] = []
+                devdata[gpu_id][field].append(reading[field])
+        return devdata
+
+    def _update_readings(self):
+        """
+        Core update function that checks if the current process has ended and if so, it will create a new monitor
+        otherwise sets the current readings to the readings from the monitor file
+        """
+        if self.current_process_has_ended() or not self.current_process_is_running():
+            self._update_past_readings()
+            self.clear_current_monitor()
+            self.create_new_monitor()
+            # Sleep for 1 second to allow the new process to start so we can make a reading
+            time.sleep(1)
+
+        readings = self._read_monitor()
+        if readings is None:
+            return
+        self._current_readings = readings
+
+    @staticmethod
+    def _make_full_reading(current, past):
+        if current is None:
+            return past
+        for gpu_id in current:
+            if gpu_id not in past:
+                past[gpu_id] = {}
+            for field in MONITOR_FIELDS:
+                if field not in past[gpu_id]:
+                    past[gpu_id][field] = []
+                past[gpu_id][field].extend(current[gpu_id][field])
+        return past
+
+    def read(self):
+        return self._make_full_reading(
+            self._current_readings, json.loads(json.dumps(self._past_readings))
+        )
+
+    def _update_past_readings(self):
+        if self._current_readings is None:
+            return
+        self._past_readings = self._make_full_reading(
+            self._current_readings, json.loads(json.dumps(self._past_readings))
+        )
+        self._current_readings = None
+
+    def cleanup(self):
+        self._finished = True
+        AsyncProcessManager.cleanup()
+        self._tempdir.cleanup()
+
+    def _monitor_update_thread(self):
+        while not self._finished:
+            self._update_readings()
+            time.sleep(self._interval)
+
+
+def _get_ts_range(_range):
+    if _range == "":
+        return "*No readings available*"
+    return "*Time range of charts: %s*" % _range
+
+
+def _update_utilization(results, md_dict):
+    for device, data in results["profile"].items():
+        if device not in md_dict:
+            print(
+                "Device %s not found in the GPU card layout. Skipping..." % device,
+                file=sys.stderr,
+            )
+            continue
+        md_dict[device]["gpu"].update(
+            "%2.1f%%" % max(map(float, data["gpu_utilization"]))
+        )
+        md_dict[device]["memory"].update("%dMB" % max(map(float, data["memory_used"])))
+
+
+def _update_charts(results, md_dict):
+    for device, data in results["profile"].items():
+        try:
+            if device not in md_dict:
+                continue
+            gpu_plot, mem_plot, ts_range = profile_plots(
+                device,
+                data["timestamp"],
+                data["gpu_utilization"],
+                data["memory_used"],
+                data["memory_total"],
+            )
+            md_dict[device]["gpu"].update(gpu_plot)
+            md_dict[device]["memory"].update(mem_plot)
+            md_dict[device]["reading_duration"].update(_get_ts_range(ts_range))
+        except ValueError as e:
+            # This is thrown when the date is unparsable. We can just safely ignore this.
+            print("ValueError: Could not parse date \n%s" % str(e), file=sys.stderr)
+
+
+# This code is adapted from: https://github.com/outerbounds/monitorbench
+class GPUProfiler:
+    def __init__(self, interval=1, monitor_batch_duration=200):
+        self.driver_ver, self.cuda_ver, self.error = self._read_versions()
+        (
+            self.interconnect_data,
+            self.interconnect_legend,
+        ) = self._read_multi_gpu_interconnect()
+        if self.error:
+            self.devices = []
+            return
+        else:
+            self.devices = self._read_devices()
+        self._monitor = GPUMonitor(
+            interval=interval, duration=monitor_batch_duration
+        )
+        self._monitor_thread = threading.Thread(
+            target=self._monitor._monitor_update_thread, daemon=True
+        )
+        self._monitor_thread.start()
+        self._interval = interval
+
+        self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
+        self._card_created = False
+
+    def finish(self):
+        ret = {
+            "error": self.error,
+            "cuda_version": self.cuda_ver,
+            "driver_version": self.driver_ver,
+        }
+        if self.error:
+            return ret
+        else:
+            ret["devices"] = self.devices
+            ret["profile"] = self._monitor.read()
+            ret["interconnect"] = {
+                "data": self.interconnect_data,
+                "legend": self.interconnect_legend,
+            }
+            self._monitor.cleanup()
+            return ret
+
+    def _make_reading(self):
+        ret = {
+            "error": self.error,
+            "cuda_version": self.cuda_ver,
+            "driver_version": self.driver_ver,
+        }
+        if self.error:
+            return ret
+        else:
+            ret["devices"] = self.devices
+            ret["profile"] = self._monitor.read()
+            ret["interconnect"] = {
+                "data": self.interconnect_data,
+                "legend": self.interconnect_legend,
+            }
+            return ret
+
+    def _update_card(self):
+        if len(self.devices) == 0:
+            current.card["gpu_profile"].clear()
+            current.card["gpu_profile"].append(
+                Markdown("## GPU profile failed: %s" % self.error)
+            )
+            current.card["gpu_profile"].refresh()
+
+            return
+
+        while True:
+            readings = self._make_reading()
+            if readings is None:
+                print("GPU Profiler readings are none", file=sys.stderr)
+                time.sleep(self._interval)
+                continue
+            _update_utilization(readings, self._card_comps["max_utilization"])
+            _update_charts(readings, self._card_comps["charts"])
+            current.card["gpu_profile"].refresh()
+            time.sleep(self._interval)
+
+    def _setup_card(self, artifact_name):
+        from metaflow import current
+
+        results = self._make_reading()
+        els = current.card["gpu_profile"]
+
+        def _drivers():
+            els.append(Markdown("## Drivers"))
+            els.append(
+                Table(
+                    [[results["cuda_version"], results["driver_version"]]],
+                    headers=["NVidia driver version", "CUDA version"],
+                )
+            )
+
+        def _devices():
+            els.append(Markdown("## Devices"))
+            rows = [
+                [d["device_id"], d["name"], d["memory"]] for d in results["devices"]
+            ]
+            els.append(Table(rows, headers=["Device ID", "Device type", "GPU memory"]))
+
+        def _interconnect():
+            if results["interconnect"]["data"] and results["interconnect"]["legend"]:
+                els.append(Markdown("## Interconnect"))
+                interconnect_data = results["interconnect"]["data"]
+                rows = list(interconnect_data.values())
+                rows = [list(transpose_row) for transpose_row in list(zip(*rows))]
+                els.append(Table(rows, headers=list(interconnect_data.keys())))
+                els.append(Markdown("#### Legend"))
+                els.append(
+                    Table(
+                        [list(results["interconnect"]["legend"].values())],
+                        headers=list(results["interconnect"]["legend"].keys()),
+                    )
+                )
+
+        def _utilization():
+            els.append(Markdown("## Maximum utilization"))
+            rows = {}
+            for d in results["devices"]:
+                rows[d["device_id"]] = {
+                    "gpu": Markdown("0%"),
+                    "memory": Markdown("0MB"),
+                }
+            _rows = [[Markdown(k)] + list(v.values()) for k, v in rows.items()]
+            els.append(
+                Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
+            )
+            els.append(
+                Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
+            )
+            return rows
+
+        def _plots():
+            els.append(Markdown("## GPU utilization and memory usage over time"))
+
+            rows = {}
+            for d in results["devices"]:
+                gpu_plot, mem_plot, ts_range = profile_plots(
+                    d["device_id"], [], [], [], []
+                )
+                rows[d["device_id"]] = {
+                    "gpu": VegaChart(gpu_plot),
+                    "memory": VegaChart(mem_plot),
+                    "reading_duration": Markdown(_get_ts_range(ts_range)),
+                }
+            for k, v in rows.items():
+                els.append(Markdown("### GPU Utilization for device : %s" % k))
+                els.append(v["reading_duration"])
+                els.append(
+                    Table(
+                        data=[
+                            [Markdown("GPU Utilization"), v["gpu"]],
+                            [Markdown("Memory usage"), v["memory"]],
+                        ]
+                    )
+                )
+            return rows
+
+        _drivers()
+        _devices()
+        _interconnect()
+        self._card_comps["max_utilization"] = _utilization()
+        self._card_comps["charts"] = _plots()
+
+    def _read_versions(self):
+        def parse(r, s):
+            return r.search(s).group(1).strip().decode("utf-8")
+
+        try:
+            out = check_output(["nvidia-smi"])
+            return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
+        except FileNotFoundError:
+            return None, None, "nvidia-smi not found"
+        except AttributeError:
+            return None, None, "nvidia-smi output is unexpected"
+        except:
+            return None, None, "nvidia-smi error"
+
+    def _read_devices(self):
+        out = check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=name,pci.bus_id,memory.total",
+                "--format=csv,noheader",
+            ]
+        )
+        return [
+            dict(
+                zip(("name", "device_id", "memory"), (x.strip() for x in l.split(",")))
+            )
+            for l in out.decode("utf-8").splitlines()
+        ]
+
+    def _read_multi_gpu_interconnect(self):
+        """
+        parse output of `nvidia-smi topo -m`, such as this sample:
+
+                GPU0    GPU1    CPU Affinity    NUMA Affinity
+        GPU0     X      NV2     0-23            N/A
+        GPU1    NV2      X      0-23            N/A
+
+        returns two dictionaries describing multi-GPU topology:
+            data: {index: [GPU0, GPU1, ...], GPU0: [X, NV2, ...], GPU1: [NV2, X, ...], ...}
+            legend_items: {X: 'Same PCI', NV2: 'NVLink 2', ...}
+        """
+        try:
+            import re
+
+            ansi_escape = re.compile(r"(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]")
+
+            out = check_output(["nvidia-smi", "topo", "-m"])
+            rows = out.decode("utf-8").split("\n")
+
+            header = ansi_escape.sub("", rows[0]).split("\t")[1:]
+            data = {}
+            data["index"] = []
+            data |= {k: [] for k in header}
+
+            for i, row in enumerate(rows[1:]):
+                row = ansi_escape.sub("", row).split()
+                if len(row) == 0:
+                    continue
+                if row[0].startswith("GPU"):
+                    data["index"].append(row[0])
+                    for key, val in zip(header, row[1:]):
+                        data[key].append(val)
+                elif row[0].startswith("Legend"):
+                    break
+
+            legend_items = {}
+            for legend_row in rows[i:]:
+                if legend_row == "" or legend_row.startswith("Legend"):
+                    continue
+                res = legend_row.strip().split(" = ")
+                legend_items[res[0].strip()] = res[1].strip()
+
+            return data, legend_items
+
+        except:
+            return None, None
+
+
+class gpu_profile:
+    def __init__(
+        self,
+        include_artifacts=True,
+        artifact_prefix="gpu_profile_",
+        interval=1,
+    ):
+        self.include_artifacts = include_artifacts
+        self.artifact_prefix = artifact_prefix
+        self.interval = interval
+
+    def __call__(self, f):
+        @wraps(f)
+        def func(s):
+            prof = GPUProfiler(interval=self.interval)
+            if self.include_artifacts:
+                setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
+
+            current.card["gpu_profile"].append(
+                Markdown("# GPU profile for `%s`" % current.pathspec)
+            )
+            current.card["gpu_profile"].append(
+                Markdown(
+                    "_Started at: %s_"
+                    % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
+                )
+            )
+            prof._setup_card(self.artifact_prefix + "data")
+            current.card["gpu_profile"].refresh()
+            update_thread = threading.Thread(target=prof._update_card, daemon=True)
+            update_thread.start()
+
+            try:
+                f(s)
+            finally:
+                try:
+                    results = prof.finish()
+                except:
+                    results = {"error": "couldn't read profiler results"}
+                if self.include_artifacts:
+                    setattr(s, self.artifact_prefix + "data", results)
+
+        from metaflow import card
+
+        return card(type="blank", id="gpu_profile", refresh_interval=self.interval)(
+            func
+        )
+
+
+def translate_to_vegalite(
+    tstamps,
+    vals,
+    description,
+    y_label,
+    legend,
+    line_color=None,
+    percentage_format=False,
+):
+    # Preprocessing for Vega-Lite
+    # Assuming tstamps is a list of datetime objects and vals is a list of values
+    data = [{"tstamps": str(t), "vals": v} for t, v in zip(tstamps, vals)]
+
+    # Base Vega-Lite spec
+    vega_lite_spec = {
+        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
+        "description": description,
+        "data": {"values": data},
+        "width": 600,
+        "height": 400,
+        "encoding": {
+            "x": {"field": "tstamps", "type": "temporal", "axis": {"title": "Time"}},
+            "y": {
+                "field": "vals",
+                "type": "quantitative",
+                "axis": {
+                    "title": y_label,
+                    **({"format": "%"} if percentage_format else {}),
+                },
+            },
+        },
+        "layer": [
+            {
+                "mark": {
+                    "type": "line",
+                    "color": line_color if line_color else "blue",
+                    "tooltip": True,
+                    "description": legend,  # Adding legend as description
+                },
+                "encoding": {"tooltip": [{"field": "tstamps"}, {"field": "vals"}]},
+            }
+        ],
+    }
+
+    return vega_lite_spec
+
+
+def profile_plots(device_id, ts, gpu, mem_used, mem_total):
+    tstamps = [datetime.strptime(t, NVIDIA_TS_FORMAT) for t in ts]
+    gpu = [i / 100 for i in list(map(float, gpu))]
+    mem = [float(used) / float(total) for used, total in zip(mem_used, mem_total)]
+    time_stamp_range = ""
+    if len(tstamps) > 1:
+        max_time = max(tstamps).strftime(NVIDIA_TS_FORMAT)
+        min_time = min(tstamps).strftime(NVIDIA_TS_FORMAT)
+        time_stamp_range = "%s to %s" % (min_time, max_time)
+
+    gpu_plot = translate_to_vegalite(
+        tstamps,
+        gpu,
+        "GPU utilization",
+        "GPU utilization",
+        "device: %s" % device_id,
+        line_color=GPU_COLOR,
+        percentage_format=True,
+    )
+    mem_plot = translate_to_vegalite(
+        tstamps,
+        mem,
+        "Percentage Memory utilization",
+        "Percentage Memory utilization",
+        "device: %s" % device_id,
+        line_color=MEM_COLOR,
+        percentage_format=True,
+    )
+    return gpu_plot, mem_plot, time_stamp_range
+
+
+if __name__ == "__main__":
+    prof = GPUProfiler(monitor_batch_duration=10)
+
+    def _write_json_file(data, filename):
+        with open(filename, "w") as f:
+            json.dump(data, f, indent=4)
+
+    import time
+
+    for i in range(15):
+        time.sleep(1)
+        _write_json_file(prof._monitor.read(), "gpu_profile.json")
+
+    print(json.dumps(prof.finish()))
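A minimal usage sketch for the decorator defined above (the flow itself is hypothetical, and the decorator ordering follows the `card`-wrapping pattern in `__call__`):

```python
# Hypothetical flow -- not part of the package. gpu_profile samples
# nvidia-smi in a background thread while the step runs, publishes a live
# "gpu_profile" card, and stores gpu_profile_num_gpus / gpu_profile_data
# artifacts when the step finishes.
from metaflow import FlowSpec, step
from metaflow_extensions.outerbounds.profilers import gpu_profile


class TrainFlow(FlowSpec):
    @gpu_profile(interval=1)
    @step
    def start(self):
        # GPU work would happen here
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    TrainFlow()
```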
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/remote_config.py ADDED

@@ -0,0 +1,108 @@
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+import requests
+from metaflow.exception import MetaflowException
+from requests.models import HTTPError
+
+OBP_REMOTE_CONFIG_KEY = "OBP_METAFLOW_CONFIG_URL"
+HOSTNAME_KEY = "OBP_API_SERVER"
+AUTH_KEY = "METAFLOW_SERVICE_AUTH_KEY"
+PERIMETER_KEY = "OBP_PERIMETER"
+CONFIG_READ_ONCE_KEY = "__REMOTE_CONFIG_HAS_BEEN_RESOLVED__"
+
+
+def read_config_from_local() -> Optional[Path]:
+    default_path = Path.home() / ".metaflowconfig"
+    home = Path(os.environ.get("METAFLOW_HOME", default_path))
+
+    profile = os.environ.get("METAFLOW_PROFILE")
+    config_path = home / f"config_{profile}.json" if profile else home / "config.json"
+
+    if config_path.exists() and config_path.is_file():
+        _init_debug(f"using config from {config_path}")
+        return config_path
+
+    # we should error because the user wants a specific config
+    if profile:
+        raise MetaflowException(
+            f"Unable to locate METAFLOW_PROFILE {profile} in {config_path}"
+        )
+
+    # there's no config and that's ok. Metaflow uses environment variables as its primary way to set values
+    # and will fall back to local settings if no config is present
+    _init_debug(f"no config present at path {config_path}")
+    return None
+
+
+def resolve_config_from_remote(remote_url: str, auth_token: str) -> dict[str, str]:
+    _init_debug(f"retrieving config from {remote_url}")
+
+    headers = {"x-api-key": auth_token}
+    try:
+        response = requests.get(remote_url, headers=headers)
+        _init_debug(
+            f"response\nstatus code: {response.status_code}\nbody: {response.text}"
+        )
+
+        response.raise_for_status()
+        data = response.json()
+        return data["config"]
+    except HTTPError:
+        raise MetaflowException(
+            "Error fetching remote configuration. Make sure you have run \
+`outerbounds configure` with the correct value"
+        )
+
+
+def init_config() -> dict[str, str]:
+    """
+    OSS Metaflow reads the config file on every step initialization. This is because OSS assumes config files change
+    relatively infrequently. We want to avoid config values changing between flow steps. Our solution to prevent this
+    is to read a config once and cache it in an environment variable. Environment variables carry over between steps
+    because steps are executed in subprocesses (local) or environments which expect environment variables to be set.
+    """
+    _init_debug("starting initialization")
+
+    if config_json := os.environ.get(CONFIG_READ_ONCE_KEY):
+        _init_debug("reading config from environment")
+        return json.loads(config_json)
+
+    config_path = read_config_from_local()
+    if not config_path:
+        return {}
+
+    try:
+        remote_config = json.loads(config_path.read_text())
+    except ValueError:
+        raise MetaflowException(
+            "Error decoding your metaflow config. Please run the `outerbounds configure` \
+command with the string provided in the Outerbounds dashboard"
+        )
+
+    # users still have a legacy format and that's ok.
+    if OBP_REMOTE_CONFIG_KEY not in remote_config:
+        return remote_config
+
+    metaflow_config = resolve_config_from_remote(
+        remote_url=remote_config[OBP_REMOTE_CONFIG_KEY],
+        auth_token=remote_config[AUTH_KEY],
+    )
+    metaflow_config[AUTH_KEY] = remote_config[AUTH_KEY]
+
+    # set cache
+    os.environ[CONFIG_READ_ONCE_KEY] = json.dumps(metaflow_config)
+    return metaflow_config
+
+
+DEBUG_CONFIG = os.environ.get("METAFLOW_DEBUG_CONFIG")
+
+
+def _init_debug(*args, **kwargs):
+    if DEBUG_CONFIG:
+        init_str = "ob_extension_init:"
+        kwargs["file"] = sys.stderr
+        print(init_str, *args, **kwargs)
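To tie the new module together, a sketch of the resolution path (the local file contents are placeholders for what `outerbounds configure` would write; the assert assumes the remote key is present so the cache branch is taken):

```python
# Sketch, assuming ~/.metaflowconfig/config.json holds only the pointer:
#   {"OBP_METAFLOW_CONFIG_URL": "https://.../config",   # placeholder URL
#    "METAFLOW_SERVICE_AUTH_KEY": "<auth-key>"}          # placeholder key
import json
import os

from metaflow_extensions.outerbounds.remote_config import init_config

config = init_config()  # first call: reads the local pointer, fetches remotely
cached = os.environ["__REMOTE_CONFIG_HAS_BEEN_RESOLVED__"]
assert config == json.loads(cached)  # later steps hit this env-var cache
```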
ob-metaflow-extensions-1.1.43/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py ADDED

@@ -0,0 +1 @@
+__mf_promote_submodules__ = ["plugins.gcp.gs_storage_client_factory"]
{ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/ob_metaflow_extensions.egg-info/SOURCES.txt

@@ -1,15 +1,18 @@
 README.md
 setup.py
 metaflow_extensions/outerbounds/__init__.py
+metaflow_extensions/outerbounds/remote_config.py
 metaflow_extensions/outerbounds/config/__init__.py
 metaflow_extensions/outerbounds/plugins/__init__.py
 metaflow_extensions/outerbounds/plugins/auth_server.py
-metaflow_extensions/outerbounds/plugins/perimeters.py
 metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py
 metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py
+metaflow_extensions/outerbounds/profilers/__init__.py
+metaflow_extensions/outerbounds/profilers/gpu.py
 metaflow_extensions/outerbounds/toplevel/__init__.py
 metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py
 metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py
+metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py
 metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py
 ob_metaflow_extensions.egg-info/PKG-INFO
 ob_metaflow_extensions.egg-info/SOURCES.txt
{ob-metaflow-extensions-1.1.42rc0 → ob-metaflow-extensions-1.1.43}/setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
 from pathlib import Path
 
 
-version = "1.1.42rc0"
+version = "1.1.43"
 this_directory = Path(__file__).parent
 long_description = (this_directory / "README.md").read_text()
 

@@ -15,5 +15,5 @@ setup(
     packages=find_namespace_packages(include=["metaflow_extensions.*"]),
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=["boto3", "kubernetes", "ob-metaflow == 2.11.0.
+    install_requires=["boto3", "kubernetes", "ob-metaflow == 2.11.0.4"],
 )
ob-metaflow-extensions-1.1.42rc0/metaflow_extensions/outerbounds/plugins/perimeters.py REMOVED

@@ -1,27 +0,0 @@
-import os
-import fcntl
-from os import path
-import json
-
-
-def override_metaflow_profile_with_perimeter():
-    # If OBP_CONFIG_DIR is set, use that, otherwise use METAFLOW_HOME
-    # If neither are set, use ~/.metaflowconfig
-    obp_config_dir = path.expanduser(
-        os.environ.get(
-            "OBP_CONFIG_DIR", os.environ.get("METAFLOW_HOME", "~/.metaflowconfig")
-        )
-    )
-
-    file_path = os.path.join(obp_config_dir, "ob_config.json")
-
-    if os.path.exists(file_path):
-        # Acquire a shared read lock on the file
-        fd = os.open(file_path, os.O_RDONLY)
-        fcntl.flock(fd, fcntl.LOCK_SH)
-
-        with open(file_path, "r") as f:
-            ob_config = json.loads(f.read())
-
-        if "OB_CURRENT_PERIMETER" in ob_config:
-            os.environ["METAFLOW_PROFILE"] = ob_config["OB_CURRENT_PERIMETER"]