ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -7
- metaflow_extensions/outerbounds/config/__init__.py +35 -0
- metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
- metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
- metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
- metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
- metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
- metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
- metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
- metaflow_extensions/outerbounds/remote_config.py +53 -16
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
from metaflow.cards import Markdown, Table, VegaChart
|
|
2
|
+
from metaflow.metaflow_current import current
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from ..card_utilities.async_cards import CardRefresher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class OllamaStatusCard(CardRefresher):
    """
    Real-time status card for Ollama system monitoring.
    Shows circuit breaker state, server health, model status, and recent events.

    State lives in the in-memory `status_data` dict. Writers (`update_status`,
    `add_event`) mutate it while holding `self._lock`; `on_update` takes a deep
    snapshot under the same lock and renders from that snapshot.
    """

    CARD_ID = "ollama_status"

    def __init__(self, refresh_interval=10):
        """
        Parameters
        ----------
        refresh_interval : int, default 10
            Stored on the instance as `self.refresh_interval`; not otherwise
            used inside this class.
        """
        self.refresh_interval = refresh_interval
        self.status_data = {
            "circuit_breaker": {
                "state": "CLOSED",
                "failure_count": 0,
                "last_failure_time": None,
                "last_open_time": None,
            },
            "server": {
                "status": "Starting",
                "uptime_start": None,
                "restart_attempts": 0,
                "last_health_check": None,
                "health_status": "Unknown",
            },
            "models": {},  # model_name -> {status, pull_time, load_time, etc}
            "performance": {
                "install_time": None,
                "server_startup_time": None,
                "total_initialization_time": None,
            },
            "versions": {
                "ollama_system": "Detecting...",
                "ollama_python": "Detecting...",
            },
            "cache": {
                "policy": "auto",
                "model_status": {},  # model_name -> cache status
            },
            "events": [],  # Recent events log
        }
        self._lock = threading.Lock()
        self._already_rendered = False

    def update_status(self, category, data):
        """
        Thread-safe method to update status data.

        Merges `data` into `self.status_data[category]` via `dict.update`.
        Unknown categories are ignored. Intended for the dict-valued
        categories; use `add_event` for the "events" list.
        """
        with self._lock:
            if category in self.status_data:
                self.status_data[category].update(data)

    def add_event(self, event_type, message, timestamp=None):
        """
        Add an event to the timeline (newest first, capped at 10 entries).

        Parameters
        ----------
        event_type : str
            One of 'info', 'warning', 'error', 'success'.
        message : str
            Text shown in the events table.
        timestamp : datetime, optional
            Defaults to `datetime.now()` at call time.
        """
        if timestamp is None:
            timestamp = datetime.now()

        with self._lock:
            self.status_data["events"].insert(
                0,
                {
                    "type": event_type,  # 'info', 'warning', 'error', 'success'
                    "message": message,
                    "timestamp": timestamp,
                },
            )
            # Keep only last 10 events
            self.status_data["events"] = self.status_data["events"][:10]

    def get_circuit_breaker_emoji(self, state):
        """Get status emoji for circuit breaker state (white circle if unknown)"""
        emoji_map = {"CLOSED": "🟢", "OPEN": "🔴", "HALF_OPEN": "🟡"}
        return emoji_map.get(state, "⚪")

    def get_uptime_string(self, start_time):
        """
        Format the time elapsed since `start_time` as "Xh Ym Zs", "Ym Zs" or "Zs".

        Returns "Not started" when `start_time` is falsy.
        """
        if not start_time:
            return "Not started"

        uptime = datetime.now() - start_time
        hours, remainder = divmod(int(uptime.total_seconds()), 3600)
        minutes, seconds = divmod(remainder, 60)

        if hours > 0:
            return f"{hours}h {minutes}m {seconds}s"
        elif minutes > 0:
            return f"{minutes}m {seconds}s"
        else:
            return f"{seconds}s"

    def on_startup(self, current_card):
        """Initialize the card when monitoring starts"""
        current_card.append(Markdown("# 🦙 `@ollama` Status Dashboard"))
        current_card.append(Markdown("_Initializing Ollama system..._"))
        current_card.refresh()

    def render_card_fresh(self, current_card, data):
        """
        Render the complete card with all status information.

        Clears `current_card`, rebuilds every section from `data` (a dict with
        the same layout as `self.status_data`) and calls `refresh()`.
        """
        self._already_rendered = True
        current_card.clear()

        # Header with version information
        current_card.append(Markdown("# 🦙 `@ollama` Status Dashboard"))

        # Version information in header
        versions = data.get("versions", {})
        system_version = versions.get("ollama_system", "Unknown")
        python_version = versions.get("ollama_python", "Unknown")
        current_card.append(
            Markdown(
                f"**System:** `{system_version}` | **Python Client:** `{python_version}`"
            )
        )

        # Cache policy information
        cache_info = data.get("cache", {})
        cache_policy = cache_info.get("policy", "auto")
        current_card.append(Markdown(f"**Cache Policy:** `{cache_policy}`"))

        current_card.append(
            Markdown(f"_Last updated: {datetime.now().strftime('%H:%M:%S')}_")
        )

        # Circuit Breaker Status
        cb_data = data["circuit_breaker"]
        cb_emoji = self.get_circuit_breaker_emoji(cb_data["state"])
        cb_status = f"{cb_emoji} **{cb_data['state']}**"
        if cb_data["failure_count"] > 0:
            cb_status += f" (failures: {cb_data['failure_count']})"

        # Server Status
        server_data = data["server"]
        uptime = self.get_uptime_string(server_data.get("uptime_start"))
        server_status = f"**{server_data['status']}**"
        if server_data["restart_attempts"] > 0:
            server_status += f" (restarts: {server_data['restart_attempts']})"

        # Status Overview Table
        status_table = [
            ["Circuit Breaker", Markdown(cb_status)],
            ["Server Status", Markdown(server_status)],
            ["Server Uptime", Markdown(uptime)],
            [
                "Last Health Check",
                Markdown(server_data.get("health_status", "Unknown")),
            ],
        ]

        current_card.append(Markdown("## System Status"))
        current_card.append(Table(status_table, headers=["Component", "Status"]))

        # Models Status
        if data["models"]:
            current_card.append(Markdown("## Models"))
            model_table = []
            cache_model_status = cache_info.get("model_status", {})

            for model_name, model_info in data["models"].items():
                status = model_info.get("status", "Unknown")
                pull_time = model_info.get("pull_time", "N/A")
                if isinstance(pull_time, (int, float)):
                    pull_time = f"{pull_time:.1f}s"

                # Add cache status indicator
                cache_status = cache_model_status.get(model_name, "unknown")
                cache_emoji = {
                    "exists": "💾",
                    "missing": "❌",
                    "error": "⚠️",
                    "unknown": "❓",
                }.get(cache_status, "❓")

                # Get model metadata
                size_formatted = model_info.get("size_formatted", "Unknown")
                blob_count = model_info.get("blob_count", "Unknown")
                if blob_count == 0:
                    blob_count = "Unknown"

                model_table.append(
                    [
                        f"{model_name} {cache_emoji}",
                        status,
                        pull_time,
                        size_formatted,
                        str(blob_count),
                    ]
                )

            current_card.append(
                Table(
                    model_table,
                    headers=["Model (Cache)", "Status", "Pull Time", "Size", "Blobs"],
                )
            )

        # Performance Metrics
        perf_data = data["performance"]
        if any(v is not None for v in perf_data.values()):
            current_card.append(Markdown("## Performance"))

            # Separate initialization and shutdown metrics
            init_metrics = []
            shutdown_metrics = []
            other_metrics = []

            for metric, value in perf_data.items():
                if value is not None:
                    display_value = value
                    if isinstance(value, (int, float)):
                        display_value = f"{value:.1f}s"

                    metric_display = metric.replace("_", " ").title()

                    if "shutdown" in metric.lower():
                        shutdown_metrics.append([metric_display, display_value])
                    elif metric in [
                        "install_time",
                        "server_startup_time",
                        "total_initialization_time",
                    ]:
                        init_metrics.append([metric_display, display_value])
                    else:
                        other_metrics.append([metric_display, display_value])

            # Display metrics in organized sections
            if init_metrics:
                current_card.append(Markdown("### Initialization"))
                current_card.append(Table(init_metrics, headers=["Metric", "Duration"]))

            if shutdown_metrics:
                current_card.append(Markdown("### Shutdown"))
                current_card.append(
                    Table(shutdown_metrics, headers=["Metric", "Value"])
                )

            if other_metrics:
                current_card.append(Markdown("### Other"))
                current_card.append(Table(other_metrics, headers=["Metric", "Value"]))

        # Recent Events
        if data["events"]:
            current_card.append(Markdown("## Recent Events"))
            events_table = []
            for event in data["events"][:5]:  # Show last 5 events
                timestamp = event["timestamp"].strftime("%H:%M:%S")
                event_type = event["type"]
                message = event["message"]

                # Add emoji based on event type
                type_emoji = {
                    "info": "ℹ️",
                    "success": "✅",
                    "warning": "⚠️",
                    "error": "❌",
                }.get(event_type, "ℹ️")

                events_table.append([timestamp, f"{type_emoji} {message}"])

            current_card.append(Table(events_table, headers=["Time", "Event"]))

        current_card.refresh()

    def on_error(self, current_card, error_message):
        """
        Handle errors in card rendering.

        The error banner is only drawn while nothing has been rendered yet, so
        an already-rendered dashboard is left untouched.
        """
        if not self._already_rendered:
            current_card.clear()
            current_card.append(Markdown("# 🦙 `@ollama` Status Dashboard"))
            current_card.append(Markdown(f"## ❌ Error: {str(error_message)}"))
            current_card.refresh()

    def on_update(self, current_card, data_object):
        """
        Update the card with new data.

        `data_object` is ignored; the card is rendered from the in-memory
        `status_data`. The whole card is re-rendered on every update.
        """
        import copy

        # A shallow copy would still share the nested dicts/lists with
        # `update_status` / `add_event`, which may mutate them from another
        # thread while the card is being rendered (e.g. "dictionary changed
        # size during iteration"). Take a deep snapshot while holding the lock.
        with self._lock:
            current_data = copy.deepcopy(self.status_data)

        self.render_card_fresh(current_card, current_data)

    def sqlite_fetch_func(self, conn):
        """Required by CardRefresher (which needs a refactor), but we use in-memory data instead"""
        with self._lock:
            return {"status": self.status_data}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
# NOTE(review): presumably read by Metaflow's extension loader to expose this
# package as `metaflow.plugins.optuna` (the import path shown in the
# `get_db_url` docstring example) — confirm against the loader.
__mf_promote_submodules__ = ["plugins.optuna"]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def auth():
    """
    Return the HTTP auth headers for the Metaflow service as a dict.

    When `init_config()` yields a non-empty config, the header is built from
    its `METAFLOW_SERVICE_AUTH_KEY` entry. Otherwise the JSON document stored
    in the `METAFLOW_SERVICE_HEADERS` environment variable is parsed and
    returned. A missing key in either source raises `KeyError`.
    """
    from metaflow.metaflow_config_funcs import init_config

    config = init_config()
    if not config:
        # No config available: fall back to the headers in the environment.
        return json.loads(os.environ["METAFLOW_SERVICE_HEADERS"])
    return {"x-api-key": config["METAFLOW_SERVICE_AUTH_KEY"]}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_deployment_db_access_endpoint(name: str):
    """
    Look up the in-cluster database endpoint of the app deployment `name`.

    The deployment is fetched through `CapsuleApi.get_by_name`. Its
    `status.accessInfo.extraAccessUrls` entries are scanned for the one named
    "in_cluster_db_access", and that entry's URL is returned with every
    "http://" occurrence removed.

    Raises
    ------
    Exception
        If no deployment with that name exists, or the deployment exposes no
        "in_cluster_db_access" URL.
    """
    from ..apps.core.perimeters import PerimeterExtractor
    from ..apps.core.capsule import CapsuleApi

    perimeter, cap_url = PerimeterExtractor.during_programmatic_access()
    deployment = CapsuleApi(cap_url, perimeter).get_by_name(name)
    if not deployment:
        raise Exception(f"No app deployment found with name `{name}`")

    # Only walk the nested structure when every level is present.
    has_extra_urls = (
        "status" in deployment
        and "accessInfo" in deployment["status"]
        and "extraAccessUrls" in deployment["status"]["accessInfo"]
    )
    candidates = (
        deployment["status"]["accessInfo"]["extraAccessUrls"]
        if has_extra_urls
        else []
    )

    for entry in candidates:
        if entry["name"] != "in_cluster_db_access":
            continue
        return entry["url"].replace("http://", "")

    raise Exception(f"No db access endpoint found for deployment `{name}`")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_db_url(app_name: str):
    """
    Build the PostgreSQL connection URL for the database behind an app deployment.

    Example usage:
    >>> from metaflow.plugins.optuna import get_db_url
    >>> s = optuna.create_study(..., storage=get_db_url("optuna-dashboard"))

    The "x-api-key" value returned by `auth()` is used as the password and the
    host part comes from `get_deployment_db_access_endpoint(app_name)`.

    Raises
    ------
    KeyError
        If the headers returned by `auth()` have no "x-api-key" entry.
    Exception
        Propagated from `get_deployment_db_access_endpoint` when the
        deployment or its database endpoint cannot be found.
    """
    from urllib.parse import quote

    mf_token = auth()["x-api-key"]
    app_url = get_deployment_db_access_endpoint(app_name)
    # Percent-encode the token: reserved characters such as `@`, `/`, `:` or
    # `%` in the password position would otherwise change how the URL parses.
    password = quote(str(mf_token), safe="")
    return f"postgresql://userspace_default:{password}@{app_url}/userspace_default?sslmode=disable"
|
|
@@ -3,12 +3,16 @@ import fcntl
|
|
|
3
3
|
from os import path
|
|
4
4
|
import json
|
|
5
5
|
from metaflow.exception import MetaflowException
|
|
6
|
+
from typing import Union
|
|
6
7
|
|
|
7
8
|
CURRENT_PERIMETER_KEY = "OB_CURRENT_PERIMETER"
|
|
8
9
|
CURRENT_PERIMETER_URL = "OB_CURRENT_PERIMETER_MF_CONFIG_URL"
|
|
10
|
+
CURRENT_PERIMETER_URL_LEGACY_KEY = (
|
|
11
|
+
"OB_CURRENT_PERIMETER_URL" # For backwards compatibility with workstations.
|
|
12
|
+
)
|
|
9
13
|
|
|
10
14
|
|
|
11
|
-
def
|
|
15
|
+
def get_perimeter_config_url_if_set_in_ob_config() -> Union[str, None]:
|
|
12
16
|
# If OBP_CONFIG_DIR is set, use that, otherwise use METAFLOW_HOME
|
|
13
17
|
# If neither are set, use ~/.metaflowconfig
|
|
14
18
|
obp_config_dir = path.expanduser(
|
|
@@ -36,18 +40,28 @@ def set_current_perimeter_config_url_in_environment():
|
|
|
36
40
|
with open(file_path, "r") as f:
|
|
37
41
|
ob_config = json.loads(f.read())
|
|
38
42
|
|
|
39
|
-
if CURRENT_PERIMETER_KEY in ob_config and
|
|
43
|
+
if CURRENT_PERIMETER_KEY in ob_config and (
|
|
44
|
+
CURRENT_PERIMETER_URL in ob_config
|
|
45
|
+
or CURRENT_PERIMETER_URL_LEGACY_KEY in ob_config
|
|
46
|
+
):
|
|
40
47
|
os.environ[CURRENT_PERIMETER_KEY] = ob_config[CURRENT_PERIMETER_KEY]
|
|
41
|
-
|
|
48
|
+
if CURRENT_PERIMETER_URL in ob_config:
|
|
49
|
+
os.environ[CURRENT_PERIMETER_URL] = ob_config[CURRENT_PERIMETER_URL]
|
|
50
|
+
elif CURRENT_PERIMETER_URL_LEGACY_KEY in ob_config:
|
|
51
|
+
os.environ[CURRENT_PERIMETER_URL] = ob_config[
|
|
52
|
+
CURRENT_PERIMETER_URL_LEGACY_KEY
|
|
53
|
+
]
|
|
54
|
+
return os.environ[CURRENT_PERIMETER_URL]
|
|
42
55
|
else:
|
|
43
56
|
raise MetaflowException(
|
|
44
|
-
"
|
|
57
|
+
"{} does not contain the key {}".format(
|
|
45
58
|
file_path, CURRENT_PERIMETER_KEY
|
|
46
59
|
)
|
|
47
60
|
)
|
|
48
61
|
elif "OBP_CONFIG_DIR" in os.environ:
|
|
49
62
|
raise MetaflowException(
|
|
50
|
-
"Environment variable OBP_CONFIG_DIR is set to
|
|
63
|
+
"Environment variable OBP_CONFIG_DIR is set to {} but this directory does not contain an ob_config.json file.".format(
|
|
51
64
|
os.environ["OBP_CONFIG_DIR"]
|
|
52
65
|
)
|
|
53
66
|
)
|
|
67
|
+
return None
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from metaflow.exception import MetaflowException
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class CardDecoratorInjector:
    """
    Mixin useful for injecting @card decorators from other first class Metaflow decorators.
    """

    # step_name -> {card_id -> bool}. Class-level, so the decision made for a
    # (step, card_id) pair is shared by every instance of the injector.
    _first_time_init = defaultdict(dict)

    @classmethod
    def _get_first_time_init_cached_value(cls, step_name, card_id):
        """Return the cached decision for (step_name, card_id), or None if unset."""
        return cls._first_time_init.get(step_name, {}).get(card_id, None)

    @classmethod
    def _set_first_time_init_cached_value(cls, step_name, card_id, value):
        """Record the decision for (step_name, card_id)."""
        cls._first_time_init[step_name][card_id] = value

    def _card_deco_already_attached(self, step, card_id):
        """
        Return True if `step` already carries a `card` decorator whose id is
        exactly `card_id`.
        """
        for decorator in step.decorators:
            # Compare ids for equality: a substring test would treat e.g.
            # card_id "gpu" as already attached when only "gpu_profile" exists.
            if decorator.name == "card" and decorator.attributes["id"] == card_id:
                return True
        return False

    def _get_step(self, flow, step_name):
        """Return the step of `flow` named `step_name`, or None if there is none."""
        for step in flow:
            if step.name == step_name:
                return step
        return None

    def _first_time_init_check(self, step_dag_node, card_id):
        """Return True when a card with `card_id` still has to be attached to the step."""
        return not self._card_deco_already_attached(step_dag_node, card_id)

    def attach_card_decorator(
        self,
        flow,
        step_name,
        card_id,
        card_type,
        refresh_interval=5,
    ):
        """
        Attach a `@card` decorator with the given id and type to a step.

        This method is meant to be called from `step_init` in your
        StepDecorator code since this class is used as a Mixin.

        The decorator is attached at most once per (step_name, card_id): the
        first call checks whether such a card is already on the step, attaches
        one if not, and caches the outcome at class level. Later calls for the
        same pair are no-ops.

        Raises
        ------
        MetaflowException
            If `card_id` or `card_type` is empty.
        """
        from metaflow import decorators as _decorators

        if not all([card_id, card_type]):
            raise MetaflowException(
                "`card_id` and `card_type` must be set in the `CardDecoratorInjector` Mixin"
            )

        step_dag_node = self._get_step(flow, step_name)
        if (
            self._get_first_time_init_cached_value(step_name, card_id) is None
        ):  # First check class level setting.
            if self._first_time_init_check(step_dag_node, card_id):
                self._set_first_time_init_cached_value(step_name, card_id, True)
                _decorators._attach_decorators_to_step(
                    step_dag_node,
                    [
                        "card:type=%s,id=%s,refresh_interval=%s"
                        % (card_type, card_id, str(refresh_interval))
                    ],
                )
            else:
                self._set_first_time_init_cached_value(step_name, card_id, False)
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from metaflow.decorators import StepDecorator
|
|
3
|
+
from ...profilers.gpu import GPUProfiler # Fix import
|
|
4
|
+
from .deco_injector import CardDecoratorInjector
|
|
5
|
+
import threading
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class GPUProfileDecorator(StepDecorator):
    """
    Step decorator that profiles GPUs while a step runs.

    It injects a `blank` card with id "gpu_profile" into the step, starts a
    `GPUProfiler` before the step, keeps the card updated from a daemon thread
    and, when `include_artifacts` is set, stores the profiler results on the
    flow as artifacts.

    Attributes (see `defaults`)
    ---------------------------
    include_artifacts : bool
        When true, `<artifact_prefix>num_gpus` and `<artifact_prefix>data` are
        set on the flow.
    artifact_prefix : str
        Prefix for the artifact names.
    interval : int
        Passed to `GPUProfiler` and used as the card refresh interval.
    """

    name = "gpu_profile"

    defaults = {
        "include_artifacts": True,
        "artifact_prefix": "gpu_profile_",
        "interval": 1,
    }

    def step_init(
        self, flow, graph, step_name, decorators, environment, flow_datastore, logger
    ):
        """Attach the "gpu_profile" card to the decorated step."""
        self.deco_injector = CardDecoratorInjector()
        self.deco_injector.attach_card_decorator(
            flow,
            step_name,
            "gpu_profile",
            "blank",
            refresh_interval=self.attributes["interval"],
        )

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_user_code_retries,
        ubf_context,
        inputs,
    ):
        """Create the profiler that `task_decorate` uses."""
        self._profiler = GPUProfiler(
            interval=self.attributes["interval"],
            artifact_name=self.attributes["artifact_prefix"] + "data",
        )

    def task_decorate(
        self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
    ):
        """
        Set up the card, start the card-update thread and wrap `step_func`.

        Returns a zero-argument callable that runs `step_func` and then always
        collects the profiler results, even when the step raises.
        """
        from metaflow import current
        from metaflow.cards import Markdown

        if self.attributes["include_artifacts"]:
            setattr(
                flow,
                self.attributes["artifact_prefix"] + "num_gpus",
                len(self._profiler.devices),
            )

        current.card["gpu_profile"].append(
            Markdown("# GPU profile for `%s`" % current.pathspec)
        )
        current.card["gpu_profile"].append(
            Markdown(
                "_Started at: %s_"
                % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
            )
        )
        self._profiler._setup_card()
        current.card["gpu_profile"].refresh()
        # Daemon thread: it must not keep the process alive after the step ends.
        self._update_thread = threading.Thread(
            target=self._profiler._update_card, daemon=True
        )
        self._update_thread.start()

        def wrapped_step_func():
            try:
                step_func()
            finally:
                try:
                    results = self._profiler.finish()
                except Exception:
                    # Best effort: a profiler failure must not fail the step.
                    # Only `Exception` is caught so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    results = {"error": "couldn't read profiler results"}
                if self.attributes["include_artifacts"]:
                    setattr(flow, self.attributes["artifact_prefix"] + "data", results)

        return wrapped_step_func
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from metaflow.decorators import StepDecorator
|
|
3
|
+
from ..card_utilities.injector import CardDecoratorInjector
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DynamicCardAppendDecorator(StepDecorator):
    """
    Minimal example decorator built on CardDecoratorInjector.

    It injects a `blank` card into the decorated step and fills it with a few
    markdown lines: a title, a message, an optional timestamp, the task
    pathspec and the execution outcome of the step.
    """

    name = "test_append_card"

    defaults = {
        "title": "Simple Card",
        "message": "Hello from DynamicCardAppendDecorator!",
        "show_timestamp": True,
        "refresh_interval": 5,
    }

    CARD_ID = "simple_card"

    def step_init(
        self, flow, graph, step_name, decorators, environment, flow_datastore, logger
    ):
        """Inject the card for this decorator into the step."""
        injector = CardDecoratorInjector()
        self.deco_injector = injector
        injector.attach_card_decorator(
            flow,
            step_name,
            self.CARD_ID,
            "blank",
            refresh_interval=self.attributes["refresh_interval"],
        )

    def task_decorate(
        self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
    ):
        """Write the static card content and return the wrapped step function."""
        from metaflow import current
        from metaflow.cards import Markdown

        def _push(markdown_text):
            # Look the card up on every call, exactly as each append does.
            current.card[self.CARD_ID].append(Markdown(markdown_text))

        def _flush():
            current.card[self.CARD_ID].refresh()

        heading = self.attributes["title"]
        body_text = self.attributes["message"]
        with_timestamp = self.attributes["show_timestamp"]

        # Static header section of the card.
        _push(f"# {heading}")
        _push(f"**Message:** {body_text}")

        if with_timestamp:
            created_at = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
            _push(f"**Created at:** {created_at}")

        _push(f"**Step:** `{current.pathspec}`")
        _push("---")
        _push("**Status:** Card successfully injected! 🎉")

        def wrapped_step_func():
            """Run the user's step, reporting start, success or failure on the card."""
            try:
                _push("**Execution:** Step started...")
                _flush()

                step_func()

                _push("**Execution:** Step completed successfully! ✅")
                _flush()
            except Exception as err:
                # Record the failure on the card, then let it propagate.
                _push(f"**Error:** Step failed with error: `{str(err)}` ❌")
                _flush()
                raise

        return wrapped_step_func
|