ob-metaflow-extensions 1.1.102__py2.py3-none-any.whl → 1.1.103__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/plugins/__init__.py +5 -0
- metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
- metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +173 -0
- metaflow_extensions/outerbounds/profilers/gpu.py +51 -39
- {ob_metaflow_extensions-1.1.102.dist-info → ob_metaflow_extensions-1.1.103.dist-info}/METADATA +1 -1
- {ob_metaflow_extensions-1.1.102.dist-info → ob_metaflow_extensions-1.1.103.dist-info}/RECORD +10 -6
- {ob_metaflow_extensions-1.1.102.dist-info → ob_metaflow_extensions-1.1.103.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.102.dist-info → ob_metaflow_extensions-1.1.103.dist-info}/top_level.txt +0 -0
|
@@ -318,6 +318,7 @@ STEP_DECORATORS_DESC = [
|
|
|
318
318
|
),
|
|
319
319
|
("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
|
|
320
320
|
("tensorboard", ".tensorboard.TensorboardDecorator"),
|
|
321
|
+
("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
|
|
321
322
|
]
|
|
322
323
|
FLOW_DECORATORS_DESC = [("nim", ".nim.NimDecorator")]
|
|
323
324
|
TOGGLE_STEP_DECORATOR = [
|
|
@@ -331,3 +332,7 @@ TOGGLE_CLI = ["-batch", "-step-functions", "-airflow"]
|
|
|
331
332
|
ENVIRONMENTS_DESC = [
|
|
332
333
|
("fast-bakery", ".fast_bakery.docker_environment.DockerEnvironment")
|
|
333
334
|
]
|
|
335
|
+
|
|
336
|
+
SECRETS_PROVIDERS_DESC = [
|
|
337
|
+
("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
|
|
338
|
+
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from metaflow.exception import MetaflowException
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class CardDecoratorInjector:
    """
    Mixin useful for injecting @card decorators from other first class Metaflow decorators.

    The outcome of the first injection attempt for every (step name, card id)
    pair is cached at class level, so calling `attach_card_decorator` again for
    the same pair does not attach a second card.
    """

    # {step_name: {card_id: bool}}. True: this mixin attached the card.
    # False: a card with that id was already present on the step.
    _first_time_init = defaultdict(dict)

    @classmethod
    def _get_first_time_init_cached_value(cls, step_name, card_id):
        """Return the cached outcome for the pair, or None if none was recorded."""
        # `.get` with a throwaway dict avoids creating an entry in the defaultdict.
        return cls._first_time_init.get(step_name, {}).get(card_id, None)

    @classmethod
    def _set_first_time_init_cached_value(cls, step_name, card_id, value):
        """Record the outcome of the injection attempt for the pair."""
        cls._first_time_init[step_name][card_id] = value

    def _card_deco_already_attached(self, step, card_id):
        """Return True when `step` carries a card decorator whose id equals `card_id`."""
        # Exact comparison: a substring test would treat an unrelated card such
        # as "gpu_profile_extra" as if it were the "gpu_profile" card.
        return any(
            decorator.name == "card" and decorator.attributes["id"] == card_id
            for decorator in step.decorators
        )

    def _get_step(self, flow, step_name):
        """Return the step of `flow` named `step_name`, or None when there is none."""
        for step in flow:
            if step.name == step_name:
                return step
        return None

    def _first_time_init_check(self, step_dag_node, card_id):
        """Return True when the card still has to be attached to `step_dag_node`."""
        return not self._card_deco_already_attached(step_dag_node, card_id)

    def attach_card_decorator(
        self,
        flow,
        step_name,
        card_id,
        card_type,
        refresh_interval=5,
    ):
        """
        This method is called in the `step_init` of your StepDecorator code since
        this class is used as a Mixin.

        Attaches a `card` decorator of type `card_type` and id `card_id` to the
        step named `step_name`, unless a card with that id is already attached
        or an earlier call already handled this (step, card id) pair.

        Raises
        ------
        MetaflowException
            If `card_id` or `card_type` is empty, or if the step cannot be
            found in `flow`.
        """
        from metaflow import decorators as _decorators

        if not all([card_id, card_type]):
            raise MetaflowException(
                "`card_id` and `card_type` must be set in the `CardDecoratorInjector` Mixin"
            )

        step_dag_node = self._get_step(flow, step_name)
        # First check class level setting: only the first call per pair does work.
        if self._get_first_time_init_cached_value(step_name, card_id) is not None:
            return

        if step_dag_node is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise MetaflowException(
                "Step `%s` was not found in the flow; cannot attach the `%s` card."
                % (step_name, card_id)
            )

        if self._first_time_init_check(step_dag_node, card_id):
            self._set_first_time_init_cached_value(step_name, card_id, True)
            _decorators._attach_decorators_to_step(
                step_dag_node,
                [
                    "card:type=%s,id=%s,refresh_interval=%s"
                    % (card_type, card_id, str(refresh_interval))
                ],
            )
        else:
            self._set_first_time_init_cached_value(step_name, card_id, False)
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from metaflow.decorators import StepDecorator
|
|
3
|
+
from ...profilers.gpu import GPUProfiler # Fix import
|
|
4
|
+
from .deco_injector import CardDecoratorInjector
|
|
5
|
+
import threading
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class GPUProfileDecorator(StepDecorator):
    """
    Step decorator that runs a `GPUProfiler` next to the user step and reports
    into a blank card with id `gpu_profile`.

    Attributes
    ----------
    include_artifacts : bool, default True
        When set, `<artifact_prefix>num_gpus` and `<artifact_prefix>data` are
        stored as attributes on the flow.
    artifact_prefix : str, default "gpu_profile_"
        Prefix of the attribute names written to the flow.
    interval : int, default 1
        Passed to the profiler as its interval and to the card as its
        refresh interval.
    """

    name = "gpu_profile"

    defaults = {
        "include_artifacts": True,
        "artifact_prefix": "gpu_profile_",
        "interval": 1,
    }

    def step_init(
        self, flow, graph, step_name, decorators, environment, flow_datastore, logger
    ):
        """Attach the blank `gpu_profile` card to the decorated step."""
        self.deco_injector = CardDecoratorInjector()
        self.deco_injector.attach_card_decorator(
            flow,
            step_name,
            "gpu_profile",
            "blank",
            refresh_interval=self.attributes["interval"],
        )

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_user_code_retries,
        ubf_context,
        inputs,
    ):
        """Create the profiler that `task_decorate` drives."""
        self._profiler = GPUProfiler(
            interval=self.attributes["interval"],
            artifact_name=self.attributes["artifact_prefix"] + "data",
        )

    def task_decorate(
        self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
    ):
        """
        Write the card header, start the card-update thread and return a wrapper
        around `step_func` that collects the profiler results once the step ends.
        """
        from metaflow import current
        from metaflow.cards import Markdown

        if self.attributes["include_artifacts"]:
            setattr(
                flow,
                self.attributes["artifact_prefix"] + "num_gpus",
                len(self._profiler.devices),
            )

        current.card["gpu_profile"].append(
            Markdown("# GPU profile for `%s`" % current.pathspec)
        )
        current.card["gpu_profile"].append(
            Markdown(
                "_Started at: %s_"
                % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
            )
        )
        self._profiler._setup_card()
        current.card["gpu_profile"].refresh()
        # Daemon thread: it must never keep the task process alive on its own.
        self._update_thread = threading.Thread(
            target=self._profiler._update_card, daemon=True
        )
        self._update_thread.start()

        def wrapped_step_func():
            try:
                step_func()
            finally:
                # Profiling is best effort: a failure while reading the results
                # must not mask the outcome of the user step. Only `Exception`
                # is caught so KeyboardInterrupt / SystemExit still propagate.
                try:
                    results = self._profiler.finish()
                except Exception:
                    results = {"error": "couldn't read profiler results"}
                if self.attributes["include_artifacts"]:
                    setattr(flow, self.attributes["artifact_prefix"] + "data", results)

        return wrapped_step_func
|
|
File without changes
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
from metaflow.plugins.secrets import SecretsProvider
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
import json
|
|
6
|
+
import requests
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class OuterboundsSecretsException(Exception):
    """Error type raised by the Outerbounds secrets provider code in this module."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class OuterboundsSecretsApiResponse:
    """Read-only accessor over the parsed body returned by the secrets metadata request."""

    def __init__(self, response):
        # Kept as-is; the properties below subscript it by key on every access.
        self.response = response

    @property
    def secret_resource_id(self):
        """Value stored under the `secret_resource_id` key."""
        payload = self.response
        return payload["secret_resource_id"]

    @property
    def secret_backend_type(self):
        """Value stored under the `secret_backend_type` key."""
        payload = self.response
        return payload["secret_backend_type"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class OuterboundsSecretsProvider(SecretsProvider):
    """Secrets provider registered under the `outerbounds` type."""

    TYPE = "outerbounds"

    def get_secret_as_dict(self, secret_id, options={}, role=None):
        """
        Supports a special way of specifying secrets sources in outerbounds using the format:
            @secrets(sources=["outerbounds.<integrations_name>"])

        When invoked it makes a request to the integrations secrets metadata endpoint on the
        keywest server to get the cloud resource id for a secret. It then uses that to invoke
        the secrets manager on the core oss and returns the secrets.

        `secret_id` is sent as the integration name. `options` is accepted for
        signature compatibility but is not forwarded: the backend provider is
        always called with empty options.

        Raises
        ------
        OuterboundsSecretsException
            If the configuration is incomplete, the metadata endpoint reports an
            error, the backend returns no secret, or the secret cannot be decoded.
        """
        headers = {"Content-Type": "application/json", "Connection": "keep-alive"}
        perimeter, integrations_secrets_metadata_url = self._get_secret_configs()
        integration_name = secret_id
        request_payload = {
            "perimeter_name": perimeter,
            "integration_name": integration_name,
        }
        response = self._make_request(
            integrations_secrets_metadata_url, headers, request_payload
        )
        secret_resource_id = response.secret_resource_id
        secret_backend_type = response.secret_backend_type

        from metaflow.plugins.secrets.secrets_decorator import (
            get_secrets_backend_provider,
        )

        secrets_provider = get_secrets_backend_provider(secret_backend_type)
        secret_dict = secrets_provider.get_secret_as_dict(
            secret_resource_id, options={}, role=role
        )

        # Outerbounds stores secrets as binaries. Hence we expect the returned secret to be
        # {<cloud-secret-name>: <base64 encoded full secret>}. We decode the secret here like:
        # 1. decode the base64 encoded full secret
        # 2. load the decoded secret as a json
        # 3. decode the base64 encoded values in the dict
        # 4. return the decoded dict
        if not secret_dict:
            # Without this guard `next()` would leak a bare StopIteration.
            raise OuterboundsSecretsException(
                f"Secret backend returned no data for integration: {integration_name}"
            )
        binary_secret = next(iter(secret_dict.values()))
        return self._decode_secret(binary_secret)

    def _is_base64_encoded(self, data):
        """Return True when `data` is a str that base64-decodes to valid UTF-8."""
        # NOTE(review): `b64decode` is called without `validate=True`, so
        # characters outside the base64 alphabet are discarded rather than
        # rejected - confirm that plain JSON payloads can never pass this check.
        try:
            if isinstance(data, str):
                # Check if the string can be base64 decoded
                base64.b64decode(data).decode("utf-8")
                return True
            return False
        except Exception:
            return False

    def _decode_secret(self, secret):
        """
        Turn the stored secret into a plain `{key: value}` dict.

        The outer payload is base64-decoded only when it looks base64 encoded;
        every value of the resulting JSON object is then base64-decoded.

        Raises
        ------
        OuterboundsSecretsException
            If any decoding or parsing step fails.
        """
        try:
            secret_str = secret
            if self._is_base64_encoded(secret):
                # we check if the secret string is base64 encoded because the returned secret from
                # AWS secret manager is base64 encoded while the secret from GCP is not
                secret_str = base64.b64decode(secret).decode("utf-8")

            secret_dict = json.loads(secret_str)
            return {
                key: base64.b64decode(value).decode("utf-8")
                for key, value in secret_dict.items()
            }
        except Exception as e:
            # Chain the cause so the original traceback stays available.
            raise OuterboundsSecretsException(f"Error decoding secret: {e}") from e

    def _get_secret_configs(self):
        """
        Return `(perimeter, integrations_secrets_metadata_url)`.

        Each value is read from the Metaflow config when the key is present
        there, otherwise from the process environment.

        Raises
        ------
        OuterboundsSecretsException
            If either value is missing or empty.
        """
        from metaflow_extensions.outerbounds.remote_config import init_config
        from os import environ

        conf = init_config()
        if "OBP_PERIMETER" in conf:
            perimeter = conf["OBP_PERIMETER"]
        else:
            # if the perimeter is not in metaflow config, try to get it from the environment
            perimeter = environ.get("OBP_PERIMETER", "")

        if "OBP_INTEGRATIONS_SECRETS_METADATA_URL" in conf:
            integrations_secrets_metadata_url = conf[
                "OBP_INTEGRATIONS_SECRETS_METADATA_URL"
            ]
        else:
            # if the integrations secrets metadata url is not in metaflow config, try to get it from the environment
            integrations_secrets_metadata_url = environ.get(
                "OBP_INTEGRATIONS_SECRETS_METADATA_URL", ""
            )

        if not perimeter:
            raise OuterboundsSecretsException(
                "No perimeter set. Please make sure to run `outerbounds configure <...>` command which can be found on the Outerbounds UI or reach out to your Outerbounds support team."
            )

        if not integrations_secrets_metadata_url:
            raise OuterboundsSecretsException(
                "No integrations secrets metadata url set. Please notify your Outerbounds support team about this issue."
            )

        return perimeter, integrations_secrets_metadata_url

    def _make_request(self, url, headers: Dict, payload: Dict):
        """
        GET `url` with `payload` as a JSON body and wrap the parsed answer.

        A 409 answer is retried up to two more times, sleeping 0.5s and then
        1.0s between attempts. The last response is handed to
        `_handle_error_response` before its body is returned.
        """
        try:
            from metaflow.metaflow_config import SERVICE_HEADERS

            request_headers = {**headers, **(SERVICE_HEADERS or {})}
        except ImportError:
            # No service headers available: send the caller's headers unchanged.
            request_headers = headers

        retryable_status_codes = [409]
        json_payload = json.dumps(payload)
        max_attempts = 3  # 0 = initial attempt, 1-2 = retries
        for attempt in range(max_attempts):
            response = requests.get(url, data=json_payload, headers=request_headers)
            if response.status_code not in retryable_status_codes:
                break

            if attempt < max_attempts - 1:  # Don't sleep after the last attempt
                time.sleep(0.5 * (attempt + 1))

        self._handle_error_response(response)
        return OuterboundsSecretsApiResponse(response.json())

    @staticmethod
    def _handle_error_response(response: requests.Response):
        """
        Raise `OuterboundsSecretsException` when `response` describes a failure.

        The status is taken from `error.statusCode` in the JSON body when
        present, otherwise from the HTTP status code. Returns None otherwise.
        """
        if response.status_code >= 500:
            raise OuterboundsSecretsException(
                f"Server error: {response.text}. Please reach out to your Outerbounds support team."
            )

        try:
            body = response.json()
        except ValueError:
            # Body is not JSON (e.g. an error page); fall back to the raw text.
            body = None

        if not isinstance(body, dict):
            if response.status_code >= 400:
                raise OuterboundsSecretsException(
                    f"status_code={response.status_code} Unexpected error: {response.text}"
                )
            return

        error = body.get("error", {})
        if isinstance(error, dict):
            status_code = error.get("statusCode", response.status_code)
        else:
            status_code = response.status_code

        if status_code == 404:
            raise OuterboundsSecretsException(f"Secret not found: {body}")

        if status_code >= 400:
            try:
                details = body["error"]["details"]
                message = f"status_code={status_code}\t*{details['kind']}*\n{details['message']}"
            except (KeyError, TypeError):
                # Error payload does not have the expected shape.
                message = f"status_code={status_code} Unexpected error: {body}"
            raise OuterboundsSecretsException(message)
|
|
@@ -187,6 +187,9 @@ class GPUMonitor:
|
|
|
187
187
|
all_readings = []
|
|
188
188
|
if self._current_file is None:
|
|
189
189
|
return None
|
|
190
|
+
|
|
191
|
+
if not os.path.exists(self._current_file):
|
|
192
|
+
return None
|
|
190
193
|
# Extract everything from the CSV file and store it in a list of dictionaries
|
|
191
194
|
all_fields = ["gpu_id"] + MONITOR_FIELDS
|
|
192
195
|
with open(self._current_file, "r") as _monitor_out:
|
|
@@ -317,11 +320,17 @@ def _update_charts(results, md_dict):
|
|
|
317
320
|
# This code is adapted from: https://github.com/outerbounds/monitorbench
|
|
318
321
|
class GPUProfiler:
|
|
319
322
|
def __init__(
|
|
320
|
-
self,
|
|
323
|
+
self,
|
|
324
|
+
interval=1,
|
|
325
|
+
monitor_batch_duration=200,
|
|
326
|
+
artifact_name="gpu_profile_data",
|
|
327
|
+
max_check_timeout=60,
|
|
321
328
|
):
|
|
322
329
|
self._interval = interval
|
|
330
|
+
self.max_check_timeout = max_check_timeout
|
|
323
331
|
self._monitor_batch_duration = monitor_batch_duration
|
|
324
332
|
self.artifact_name = artifact_name
|
|
333
|
+
self._started_at = datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
|
|
325
334
|
self._card_setup_finished = False
|
|
326
335
|
self._card_comps = {
|
|
327
336
|
"max_utilization": {},
|
|
@@ -334,14 +343,14 @@ class GPUProfiler:
|
|
|
334
343
|
|
|
335
344
|
def _start_monitor(self):
|
|
336
345
|
self.driver_ver, self.cuda_ver, self.error = self._read_versions()
|
|
337
|
-
(
|
|
338
|
-
self.interconnect_data,
|
|
339
|
-
self.interconnect_legend,
|
|
340
|
-
) = self._read_multi_gpu_interconnect()
|
|
341
346
|
if self.error:
|
|
342
347
|
self.devices = []
|
|
343
348
|
return
|
|
344
349
|
else:
|
|
350
|
+
(
|
|
351
|
+
self.interconnect_data,
|
|
352
|
+
self.interconnect_legend,
|
|
353
|
+
) = self._read_multi_gpu_interconnect()
|
|
345
354
|
self.devices = self._read_devices()
|
|
346
355
|
self._monitor = GPUMonitor(
|
|
347
356
|
interval=self._interval, duration=self._monitor_batch_duration
|
|
@@ -391,6 +400,12 @@ class GPUProfiler:
|
|
|
391
400
|
def _update_card(self):
|
|
392
401
|
if len(self.devices) == 0:
|
|
393
402
|
current.card["gpu_profile"].clear()
|
|
403
|
+
current.card["gpu_profile"].append(
|
|
404
|
+
Markdown("# GPU profile for `%s`" % current.pathspec)
|
|
405
|
+
)
|
|
406
|
+
current.card["gpu_profile"].append(
|
|
407
|
+
Markdown("_Started at: %s_" % self._started_at)
|
|
408
|
+
)
|
|
394
409
|
current.card["gpu_profile"].append(
|
|
395
410
|
Markdown("## GPU profile failed: %s" % self.error)
|
|
396
411
|
)
|
|
@@ -398,14 +413,36 @@ class GPUProfiler:
|
|
|
398
413
|
|
|
399
414
|
return
|
|
400
415
|
|
|
416
|
+
_check_time = 0
|
|
417
|
+
stop_checking = False
|
|
418
|
+
# Before writing anything to the card, we need to make sure that:
|
|
419
|
+
# 1. GPU Monitor has started.
|
|
420
|
+
# 2. Monitor can record readings
|
|
421
|
+
# 3. Card is setup
|
|
401
422
|
while True:
|
|
423
|
+
|
|
424
|
+
if stop_checking:
|
|
425
|
+
time.sleep(self._interval)
|
|
426
|
+
continue
|
|
427
|
+
|
|
402
428
|
# There is a possibility that the `monitor` thread is not started yet
|
|
403
429
|
# because it somehow crashed at the very start.
|
|
430
|
+
if not self._monitor_started and _check_time > self.max_check_timeout:
|
|
431
|
+
current.card["gpu_profile"].clear()
|
|
432
|
+
current.card["gpu_profile"].append(
|
|
433
|
+
Markdown("## GPU profile failed: %s" % self.error)
|
|
434
|
+
)
|
|
435
|
+
current.card["gpu_profile"].refresh()
|
|
436
|
+
stop_checking = True
|
|
437
|
+
|
|
438
|
+
# Try restarting monitor if it hasn't started yet
|
|
404
439
|
if not self._monitor_started:
|
|
405
440
|
self._start_monitor()
|
|
441
|
+
_check_time += self._interval
|
|
406
442
|
time.sleep(self._interval)
|
|
407
443
|
continue
|
|
408
444
|
|
|
445
|
+
# Ensure that we are getting well formatted readings
|
|
409
446
|
readings = self._make_reading()
|
|
410
447
|
|
|
411
448
|
if readings is None:
|
|
@@ -413,6 +450,7 @@ class GPUProfiler:
|
|
|
413
450
|
time.sleep(self._interval)
|
|
414
451
|
continue
|
|
415
452
|
|
|
453
|
+
# ensure that the card is setup
|
|
416
454
|
if not self._card_setup_finished:
|
|
417
455
|
self._setup_card()
|
|
418
456
|
time.sleep(self._interval)
|
|
@@ -642,41 +680,15 @@ class gpu_profile:
|
|
|
642
680
|
def __call__(self, f):
|
|
643
681
|
@wraps(f)
|
|
644
682
|
def func(s):
|
|
645
|
-
|
|
646
|
-
interval=self.interval, artifact_name=self.artifact_prefix + "data"
|
|
647
|
-
)
|
|
648
|
-
if self.include_artifacts:
|
|
649
|
-
setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
|
|
683
|
+
return f(s)
|
|
650
684
|
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
)
|
|
659
|
-
)
|
|
660
|
-
prof._setup_card()
|
|
661
|
-
current.card["gpu_profile"].refresh()
|
|
662
|
-
update_thread = threading.Thread(target=prof._update_card, daemon=True)
|
|
663
|
-
update_thread.start()
|
|
664
|
-
|
|
665
|
-
try:
|
|
666
|
-
f(s)
|
|
667
|
-
finally:
|
|
668
|
-
try:
|
|
669
|
-
results = prof.finish()
|
|
670
|
-
except:
|
|
671
|
-
results = {"error": "couldn't read profiler results"}
|
|
672
|
-
if self.include_artifacts:
|
|
673
|
-
setattr(s, self.artifact_prefix + "data", results)
|
|
674
|
-
|
|
675
|
-
from metaflow import card
|
|
676
|
-
|
|
677
|
-
return card(type="blank", id="gpu_profile", refresh_interval=self.interval)(
|
|
678
|
-
func
|
|
679
|
-
)
|
|
685
|
+
from metaflow import gpu_profile
|
|
686
|
+
|
|
687
|
+
return gpu_profile(
|
|
688
|
+
include_artifacts=self.include_artifacts,
|
|
689
|
+
artifact_prefix=self.artifact_prefix,
|
|
690
|
+
interval=self.interval,
|
|
691
|
+
)(func)
|
|
680
692
|
|
|
681
693
|
|
|
682
694
|
def translate_to_vegalite(
|
{ob_metaflow_extensions-1.1.102.dist-info → ob_metaflow_extensions-1.1.103.dist-info}/RECORD
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
|
|
2
2
|
metaflow_extensions/outerbounds/remote_config.py,sha256=Zpfpjgz68_ZgxlXezjzlsDLo4840rkWuZgwDB_5H57U,4059
|
|
3
3
|
metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
|
|
4
|
-
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=
|
|
4
|
+
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=lF8tJA_llp-VjJeNUFKnwOx9DNgJPTyNWAYzDIaORU8,12639
|
|
5
5
|
metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=1v2GBqoMBxp5E7Lejz139w-jxJtPnLDvvHXP0HhEIHI,2361
|
|
6
6
|
metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
|
|
7
7
|
metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -18,6 +18,10 @@ metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQ
|
|
|
18
18
|
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=1ayR5YLiEy3XRGVEnRUpsDCH_UyK7z-7A8L5GxI4qOE,9535
|
|
19
19
|
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=YZb5AvbVgUwUJVxRxQ4JqqP8e1RMJr6dZ9U4KkHE-M8,9134
|
|
20
20
|
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=TKWN3mG2twC40uChfWYVsNKqf3euQIM_YHLAuNf1pvA,6525
|
|
21
|
+
metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py,sha256=oI_C3c64XBm7n88FILqHwn-Nnc5DeT_68I67lM9rXaI,2434
|
|
22
|
+
metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py,sha256=gDHQ2sMIp4NuZSzUspbSd8RGdFAoO5mgZAyFcZ2a51Y,2619
|
|
23
|
+
metaflow_extensions/outerbounds/plugins/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
metaflow_extensions/outerbounds/plugins/secrets/secrets.py,sha256=OSKm0aMKGDimqPLC7qzsUnXCEMpRfRDpQJN9Z8b4jo8,6796
|
|
21
25
|
metaflow_extensions/outerbounds/plugins/snowpark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
26
|
metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py,sha256=0R8aFN9MpgWraqiaI6ZF82YpLdFJ1f-3z_-BPRpZfxM,10674
|
|
23
27
|
metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py,sha256=ErsVoCQLa33byiykOQzDEeEkRKk0mgffZme43f3jxn4,8747
|
|
@@ -28,13 +32,13 @@ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py,sha256=d_5UhXqZ
|
|
|
28
32
|
metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py,sha256=AI_kcm1hZV3JRxJkookcH6twiGnAYjk9Dx-MeoYz60Y,8511
|
|
29
33
|
metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py,sha256=9lUM4Cqi5RjrHBRfG6AQMRz8-R96eZC8Ih0KD2lv22Y,1858
|
|
30
34
|
metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
|
|
31
|
-
metaflow_extensions/outerbounds/profilers/gpu.py,sha256=
|
|
35
|
+
metaflow_extensions/outerbounds/profilers/gpu.py,sha256=3Er8uKQzfm_082uadg4yn_D4Y-iSCgzUfFmguYxZsz4,27485
|
|
32
36
|
metaflow_extensions/outerbounds/toplevel/__init__.py,sha256=qWUJSv_r5hXJ7jV_On4nEasKIfUCm6_UjkjXWA_A1Ts,90
|
|
33
37
|
metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py,sha256=Zq3OuL1bOod8KJra-Zk8B3gNhSHoWEGteM9T7g0pp6E,1881
|
|
34
38
|
metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2YQfI4fz7nIcipwwWq781eaoHEk7n4GAn1npDg,63
|
|
35
39
|
metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
|
|
36
40
|
metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
37
|
-
ob_metaflow_extensions-1.1.
|
|
38
|
-
ob_metaflow_extensions-1.1.
|
|
39
|
-
ob_metaflow_extensions-1.1.
|
|
40
|
-
ob_metaflow_extensions-1.1.
|
|
41
|
+
ob_metaflow_extensions-1.1.103.dist-info/METADATA,sha256=jydJTGUDvrFg4bhCVxtiF4IL584lR73yr2Iv0LvseMA,521
|
|
42
|
+
ob_metaflow_extensions-1.1.103.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
43
|
+
ob_metaflow_extensions-1.1.103.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
44
|
+
ob_metaflow_extensions-1.1.103.dist-info/RECORD,,
|
|
File without changes
|
{ob_metaflow_extensions-1.1.102.dist-info → ob_metaflow_extensions-1.1.103.dist-info}/top_level.txt
RENAMED
|
File without changes
|