ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (128) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -7
  2. metaflow_extensions/outerbounds/config/__init__.py +35 -0
  3. metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
  4. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  6. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  7. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  35. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  36. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  37. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  38. metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
  39. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  40. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  41. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  42. metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  43. metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
  44. metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
  45. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
  46. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  47. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  48. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  49. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  50. metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  51. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  52. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
  53. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
  54. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
  55. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
  56. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
  57. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  58. metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
  59. metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
  60. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
  61. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  62. metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  63. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
  64. metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
  65. metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
  66. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
  67. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
  68. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
  69. metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
  70. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  71. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  72. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  73. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  74. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  75. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  76. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  77. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  78. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  79. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  80. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  81. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  82. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  83. metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
  84. metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
  85. metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
  86. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  87. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  88. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  89. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  90. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  91. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  92. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  93. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  94. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  95. metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  96. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
  97. metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
  98. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
  99. metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  100. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
  101. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
  102. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
  103. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
  104. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
  105. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
  106. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
  107. metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
  108. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  109. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  110. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  111. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  112. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  113. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  114. metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
  115. metaflow_extensions/outerbounds/remote_config.py +53 -16
  116. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
  117. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  118. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  119. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  120. metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
  121. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  122. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  123. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  124. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  125. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  126. ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
  127. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  128. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,292 @@
1
+ from metaflow.cards import Markdown, Table, VegaChart
2
+ from metaflow.metaflow_current import current
3
+ from datetime import datetime
4
+ import threading
5
+ import time
6
+
7
+ from ..card_utilities.async_cards import CardRefresher
8
+
9
+
10
class OllamaStatusCard(CardRefresher):
    """
    Real-time status card for Ollama system monitoring.
    Shows circuit breaker state, server health, model status, and recent events.

    All state lives in the in-memory ``status_data`` dict. Writers mutate it
    through ``update_status`` / ``add_event`` while holding ``_lock``;
    ``on_update`` renders from a deep-copied snapshot so rendering never
    iterates a structure that another thread is mutating.
    """

    CARD_ID = "ollama_status"

    def __init__(self, refresh_interval=10):
        """
        Set up the default status structure.

        Parameters
        ----------
        refresh_interval : int
            Stored on the instance as ``refresh_interval``; not otherwise
            used inside this class.
        """
        self.refresh_interval = refresh_interval
        self.status_data = {
            "circuit_breaker": {
                "state": "CLOSED",
                "failure_count": 0,
                "last_failure_time": None,
                "last_open_time": None,
            },
            "server": {
                "status": "Starting",
                "uptime_start": None,
                "restart_attempts": 0,
                "last_health_check": None,
                "health_status": "Unknown",
            },
            "models": {},  # model_name -> {status, pull_time, load_time, etc}
            "performance": {
                "install_time": None,
                "server_startup_time": None,
                "total_initialization_time": None,
            },
            "versions": {
                "ollama_system": "Detecting...",
                "ollama_python": "Detecting...",
            },
            "cache": {
                "policy": "auto",
                "model_status": {},  # model_name -> cache status
            },
            "events": [],  # Recent events log
        }
        self._lock = threading.Lock()
        self._already_rendered = False

    def update_status(self, category, data):
        """
        Thread-safe method to update status data.

        Merges ``data`` into ``status_data[category]`` with ``dict.update``.
        Unknown categories are ignored. Only the dict-valued categories are
        supported here; the ``"events"`` list is written via ``add_event``.
        """
        with self._lock:
            if category in self.status_data:
                self.status_data[category].update(data)

    def add_event(self, event_type, message, timestamp=None):
        """
        Add an event to the timeline.

        Parameters
        ----------
        event_type : str
            One of 'info', 'warning', 'error', 'success' (other values are
            rendered with the 'info' emoji).
        message : str
            Text shown in the events table.
        timestamp : datetime, optional
            Defaults to ``datetime.now()``. The renderer calls ``strftime``
            on it.
        """
        if timestamp is None:
            timestamp = datetime.now()

        with self._lock:
            self.status_data["events"].insert(
                0,
                {
                    "type": event_type,  # 'info', 'warning', 'error', 'success'
                    "message": message,
                    "timestamp": timestamp,
                },
            )
            # Keep only last 10 events
            self.status_data["events"] = self.status_data["events"][:10]

    def get_circuit_breaker_emoji(self, state):
        """Get status emoji for circuit breaker state (white circle if unknown)."""
        emoji_map = {"CLOSED": "🟢", "OPEN": "🔴", "HALF_OPEN": "🟡"}
        return emoji_map.get(state, "⚪")

    def get_uptime_string(self, start_time):
        """
        Format the time elapsed since ``start_time`` as ``Xh Ym Zs``.

        Returns "Not started" when ``start_time`` is falsy. Leading zero
        units are omitted (e.g. "5m 3s", "42s").
        """
        if not start_time:
            return "Not started"

        uptime = datetime.now() - start_time
        hours, remainder = divmod(int(uptime.total_seconds()), 3600)
        minutes, seconds = divmod(remainder, 60)

        if hours > 0:
            return f"{hours}h {minutes}m {seconds}s"
        elif minutes > 0:
            return f"{minutes}m {seconds}s"
        else:
            return f"{seconds}s"

    def on_startup(self, current_card):
        """Initialize the card when monitoring starts"""
        current_card.append(Markdown("# 🦙 `@ollama` Status Dashboard"))
        current_card.append(Markdown("_Initializing Ollama system..._"))
        current_card.refresh()

    def render_card_fresh(self, current_card, data):
        """
        Render the complete card with all status information.

        Clears the card, appends every section in a fixed order, then
        refreshes. ``data`` is expected to have the same shape as
        ``status_data``.
        """
        self._already_rendered = True
        current_card.clear()

        self._render_header(current_card, data)
        self._render_system_status(current_card, data)
        self._render_models(current_card, data)
        self._render_performance(current_card, data)
        self._render_events(current_card, data)

        current_card.refresh()

    def _render_header(self, current_card, data):
        """Append the title, version line, cache policy and last-updated stamp."""
        current_card.append(Markdown("# 🦙 `@ollama` Status Dashboard"))

        # Version information in header
        versions = data.get("versions", {})
        system_version = versions.get("ollama_system", "Unknown")
        python_version = versions.get("ollama_python", "Unknown")
        current_card.append(
            Markdown(
                f"**System:** `{system_version}` | **Python Client:** `{python_version}`"
            )
        )

        # Cache policy information
        cache_policy = data.get("cache", {}).get("policy", "auto")
        current_card.append(Markdown(f"**Cache Policy:** `{cache_policy}`"))

        current_card.append(
            Markdown(f"_Last updated: {datetime.now().strftime('%H:%M:%S')}_")
        )

    def _render_system_status(self, current_card, data):
        """Append the circuit-breaker / server overview table."""
        # Circuit Breaker Status
        cb_data = data["circuit_breaker"]
        cb_emoji = self.get_circuit_breaker_emoji(cb_data["state"])
        cb_status = f"{cb_emoji} **{cb_data['state']}**"
        if cb_data["failure_count"] > 0:
            cb_status += f" (failures: {cb_data['failure_count']})"

        # Server Status
        server_data = data["server"]
        uptime = self.get_uptime_string(server_data.get("uptime_start"))
        server_status = f"**{server_data['status']}**"
        if server_data["restart_attempts"] > 0:
            server_status += f" (restarts: {server_data['restart_attempts']})"

        # Status Overview Table
        status_table = [
            ["Circuit Breaker", Markdown(cb_status)],
            ["Server Status", Markdown(server_status)],
            ["Server Uptime", Markdown(uptime)],
            [
                "Last Health Check",
                Markdown(server_data.get("health_status", "Unknown")),
            ],
        ]

        current_card.append(Markdown("## System Status"))
        current_card.append(Table(status_table, headers=["Component", "Status"]))

    def _render_models(self, current_card, data):
        """Append the per-model table; renders nothing when no models are known."""
        if not data["models"]:
            return

        current_card.append(Markdown("## Models"))
        cache_model_status = data.get("cache", {}).get("model_status", {})
        cache_emojis = {
            "exists": "💾",
            "missing": "❌",
            "error": "⚠️",
            "unknown": "❓",
        }

        model_table = []
        for model_name, model_info in data["models"].items():
            status = model_info.get("status", "Unknown")
            pull_time = model_info.get("pull_time", "N/A")
            if isinstance(pull_time, (int, float)):
                pull_time = f"{pull_time:.1f}s"

            # Add cache status indicator
            cache_status = cache_model_status.get(model_name, "unknown")
            cache_emoji = cache_emojis.get(cache_status, "❓")

            # Get model metadata; a blob count of 0 is displayed as "Unknown"
            size_formatted = model_info.get("size_formatted", "Unknown")
            blob_count = model_info.get("blob_count", "Unknown")
            if blob_count == 0:
                blob_count = "Unknown"

            model_table.append(
                [
                    f"{model_name} {cache_emoji}",
                    status,
                    pull_time,
                    size_formatted,
                    str(blob_count),
                ]
            )

        current_card.append(
            Table(
                model_table,
                headers=["Model (Cache)", "Status", "Pull Time", "Size", "Blobs"],
            )
        )

    def _render_performance(self, current_card, data):
        """Append performance tables, grouped into init / shutdown / other."""
        perf_data = data["performance"]
        if not any(v is not None for v in perf_data.values()):
            return

        current_card.append(Markdown("## Performance"))

        # Separate initialization and shutdown metrics
        init_metrics = []
        shutdown_metrics = []
        other_metrics = []

        for metric, value in perf_data.items():
            if value is None:
                continue

            display_value = value
            if isinstance(value, (int, float)):
                display_value = f"{value:.1f}s"

            metric_display = metric.replace("_", " ").title()

            if "shutdown" in metric.lower():
                shutdown_metrics.append([metric_display, display_value])
            elif metric in [
                "install_time",
                "server_startup_time",
                "total_initialization_time",
            ]:
                init_metrics.append([metric_display, display_value])
            else:
                other_metrics.append([metric_display, display_value])

        # Display metrics in organized sections
        if init_metrics:
            current_card.append(Markdown("### Initialization"))
            current_card.append(Table(init_metrics, headers=["Metric", "Duration"]))

        if shutdown_metrics:
            current_card.append(Markdown("### Shutdown"))
            current_card.append(Table(shutdown_metrics, headers=["Metric", "Value"]))

        if other_metrics:
            current_card.append(Markdown("### Other"))
            current_card.append(Table(other_metrics, headers=["Metric", "Value"]))

    def _render_events(self, current_card, data):
        """Append a table with the five most recent events, newest first."""
        if not data["events"]:
            return

        current_card.append(Markdown("## Recent Events"))
        type_emojis = {
            "info": "ℹ️",
            "success": "✅",
            "warning": "⚠️",
            "error": "❌",
        }

        events_table = []
        for event in data["events"][:5]:  # Show last 5 events
            timestamp = event["timestamp"].strftime("%H:%M:%S")
            type_emoji = type_emojis.get(event["type"], "ℹ️")
            events_table.append([timestamp, f"{type_emoji} {event['message']}"])

        current_card.append(Table(events_table, headers=["Time", "Event"]))

    def on_error(self, current_card, error_message):
        """
        Handle errors in card rendering.

        The error is only shown when nothing has been rendered yet, so an
        already-rendered dashboard is left in place.
        """
        if not self._already_rendered:
            current_card.clear()
            current_card.append(Markdown("# 🦙 `@ollama` Status Dashboard"))
            current_card.append(Markdown(f"## ❌ Error: {str(error_message)}"))
            current_card.refresh()

    def on_update(self, current_card, data_object):
        """
        Update the card with new data.

        ``data_object`` is ignored; the card is always re-rendered in full
        from a snapshot of the in-memory ``status_data``.
        """
        import copy

        # A shallow copy would still share the nested dicts / event list with
        # writer threads, and rendering happens outside the lock. Deep-copy so
        # the renderer works on a consistent, private snapshot.
        # NOTE(review): assumes every value stored in status_data is
        # deep-copyable (str / number / datetime / dict / list) -- confirm
        # against the callers of update_status.
        with self._lock:
            current_data = copy.deepcopy(self.status_data)

        # Incremental updates are not implemented; always re-render everything.
        self.render_card_fresh(current_card, current_data)

    def sqlite_fetch_func(self, conn):
        """Required by CardRefresher (which needs a refactor), but we use in-memory data instead"""
        with self._lock:
            return {"status": self.status_data}
@@ -0,0 +1,48 @@
1
+ import os
2
+ import json
3
+
4
+ __mf_promote_submodules__ = ["plugins.optuna"]
5
+
6
+
7
def auth():
    """
    Return a headers dict for service calls.

    When ``init_config()`` yields a non-empty config, the result is
    ``{"x-api-key": <METAFLOW_SERVICE_AUTH_KEY from that config>}``.
    Otherwise the JSON document in the ``METAFLOW_SERVICE_HEADERS``
    environment variable is decoded and returned as-is.
    """
    from metaflow.metaflow_config_funcs import init_config

    config = init_config()
    if not config:
        # No config available: the environment must carry the headers.
        return json.loads(os.environ["METAFLOW_SERVICE_HEADERS"])
    return {"x-api-key": config["METAFLOW_SERVICE_AUTH_KEY"]}
16
+
17
+
18
def get_deployment_db_access_endpoint(name: str):
    """
    Look up the ``in_cluster_db_access`` URL of the app deployment ``name``.

    The deployment is fetched by name through ``CapsuleApi``; its
    ``status.accessInfo.extraAccessUrls`` entries are scanned and the URL of
    the first entry named ``in_cluster_db_access`` is returned with every
    ``http://`` occurrence removed.

    Raises
    ------
    Exception
        If no deployment with that name exists, or it exposes no
        ``in_cluster_db_access`` URL.
    """
    from ..apps.core.perimeters import PerimeterExtractor
    from ..apps.core.capsule import CapsuleApi

    perimeter, cap_url = PerimeterExtractor.during_programmatic_access()
    deployment = CapsuleApi(cap_url, perimeter).get_by_name(name)
    if not deployment:
        raise Exception(f"No app deployment found with name `{name}`")

    # Walk status -> accessInfo -> extraAccessUrls, stopping at the first
    # level that is absent.
    extra_urls = deployment
    for key in ("status", "accessInfo", "extraAccessUrls"):
        if key not in extra_urls:
            extra_urls = None
            break
        extra_urls = extra_urls[key]

    if extra_urls is not None:
        for entry in extra_urls:
            if entry["name"] != "in_cluster_db_access":
                continue
            return entry["url"].replace("http://", "")

    raise Exception(f"No db access endpoint found for deployment `{name}`")
38
+
39
+
40
def get_db_url(app_name: str):
    """
    Build a PostgreSQL connection URL for the app deployment ``app_name``.

    The ``x-api-key`` header value from ``auth()`` is used as the password
    and the deployment's in-cluster DB endpoint as the host part.

    Example usage:
    >>> from metaflow.plugins.optuna import get_db_url
    >>> s = optuna.create_study(..., storage=get_db_url("optuna-dashboard"))

    Raises
    ------
    KeyError
        If the headers returned by ``auth()`` carry no ``x-api-key`` entry.
    Exception
        Propagated from ``get_deployment_db_access_endpoint`` when the
        deployment or its DB endpoint cannot be found.
    """
    from urllib.parse import quote

    mf_token = auth()["x-api-key"]
    app_url = get_deployment_db_access_endpoint(app_name)
    # The token sits in the password field of a URI: percent-encode it so
    # reserved characters (@ : / # % ...) cannot be mistaken for URI
    # delimiters. Tokens made only of unreserved characters are unchanged.
    password = quote(str(mf_token), safe="")
    return f"postgresql://userspace_default:{password}@{app_url}/userspace_default?sslmode=disable"
@@ -3,12 +3,16 @@ import fcntl
3
3
  from os import path
4
4
  import json
5
5
  from metaflow.exception import MetaflowException
6
+ from typing import Union
6
7
 
7
8
  CURRENT_PERIMETER_KEY = "OB_CURRENT_PERIMETER"
8
9
  CURRENT_PERIMETER_URL = "OB_CURRENT_PERIMETER_MF_CONFIG_URL"
10
+ CURRENT_PERIMETER_URL_LEGACY_KEY = (
11
+ "OB_CURRENT_PERIMETER_URL" # For backwards compatibility with workstations.
12
+ )
9
13
 
10
14
 
11
- def set_current_perimeter_config_url_in_environment():
15
+ def get_perimeter_config_url_if_set_in_ob_config() -> Union[str, None]:
12
16
  # If OBP_CONFIG_DIR is set, use that, otherwise use METAFLOW_HOME
13
17
  # If neither are set, use ~/.metaflowconfig
14
18
  obp_config_dir = path.expanduser(
@@ -36,18 +40,28 @@ def set_current_perimeter_config_url_in_environment():
36
40
  with open(file_path, "r") as f:
37
41
  ob_config = json.loads(f.read())
38
42
 
39
- if CURRENT_PERIMETER_KEY in ob_config and CURRENT_PERIMETER_URL in ob_config:
43
+ if CURRENT_PERIMETER_KEY in ob_config and (
44
+ CURRENT_PERIMETER_URL in ob_config
45
+ or CURRENT_PERIMETER_URL_LEGACY_KEY in ob_config
46
+ ):
40
47
  os.environ[CURRENT_PERIMETER_KEY] = ob_config[CURRENT_PERIMETER_KEY]
41
- os.environ[CURRENT_PERIMETER_URL] = ob_config[CURRENT_PERIMETER_URL]
48
+ if CURRENT_PERIMETER_URL in ob_config:
49
+ os.environ[CURRENT_PERIMETER_URL] = ob_config[CURRENT_PERIMETER_URL]
50
+ elif CURRENT_PERIMETER_URL_LEGACY_KEY in ob_config:
51
+ os.environ[CURRENT_PERIMETER_URL] = ob_config[
52
+ CURRENT_PERIMETER_URL_LEGACY_KEY
53
+ ]
54
+ return os.environ[CURRENT_PERIMETER_URL]
42
55
  else:
43
56
  raise MetaflowException(
44
- "%s does not contain the key %s".format(
57
+ "{} does not contain the key {}".format(
45
58
  file_path, CURRENT_PERIMETER_KEY
46
59
  )
47
60
  )
48
61
  elif "OBP_CONFIG_DIR" in os.environ:
49
62
  raise MetaflowException(
50
- "Environment variable OBP_CONFIG_DIR is set to %s but this directory does not contain an ob_config.json file.".format(
63
+ "Environment variable OBP_CONFIG_DIR is set to {} but this directory does not contain an ob_config.json file.".format(
51
64
  os.environ["OBP_CONFIG_DIR"]
52
65
  )
53
66
  )
67
+ return None
@@ -0,0 +1,70 @@
1
+ from metaflow.exception import MetaflowException
2
+ from collections import defaultdict
3
+
4
+
5
class CardDecoratorInjector:
    """
    Mixin Useful for injecting @card decorators from other first class Metaflow decorators.

    A class-level cache (``_first_time_init``) remembers, per step name and
    card id, whether this injector attached the card (``True``) or found one
    already present (``False``), so the attach logic runs at most once per
    (step name, card id) pair in a process.
    """

    # step_name -> {card_id -> bool}; shared by every instance of the class.
    _first_time_init = defaultdict(dict)

    @classmethod
    def _get_first_time_init_cached_value(cls, step_name, card_id):
        """Return the cached flag for (step_name, card_id), or None if unset."""
        return cls._first_time_init.get(step_name, {}).get(card_id, None)

    @classmethod
    def _set_first_time_init_cached_value(cls, step_name, card_id, value):
        """Record the flag for (step_name, card_id) in the class-level cache."""
        cls._first_time_init[step_name][card_id] = value

    def _card_deco_already_attached(self, step, card_id):
        """
        Return True when ``step`` already carries a ``card`` decorator whose
        ``id`` attribute equals ``card_id``.
        """
        for decorator in step.decorators:
            if decorator.name != "card":
                continue
            # Exact comparison: a substring test would let e.g. "gpu" match an
            # unrelated card whose id is "gpu_profile".
            if decorator.attributes["id"] == card_id:
                return True
        return False

    def _get_step(self, flow, step_name):
        """Return the step of ``flow`` named ``step_name``, or None if absent."""
        for step in flow:
            if step.name == step_name:
                return step
        return None

    def _first_time_init_check(self, step_dag_node, card_id):
        """Return True when the card still has to be attached to the step."""
        return not self._card_deco_already_attached(step_dag_node, card_id)

    def attach_card_decorator(
        self,
        flow,
        step_name,
        card_id,
        card_type,
        refresh_interval=5,
    ):
        """
        This method is called `step_init` in your StepDecorator code since
        this class is used as a Mixin

        Attaches ``card:type=<card_type>,id=<card_id>,refresh_interval=<n>``
        to the step unless a card with that id is already present or the
        (step name, card id) pair was handled before.

        Raises
        ------
        MetaflowException
            If ``card_id`` or ``card_type`` is empty, or ``step_name`` does
            not name a step of ``flow``.
        """
        from metaflow import decorators as _decorators

        if not all([card_id, card_type]):
            raise MetaflowException(
                "`card_id` and `card_type` must be set in the `CardDecoratorInjector` Mixin"
            )

        step_dag_node = self._get_step(flow, step_name)
        if (
            self._get_first_time_init_cached_value(step_name, card_id) is None
        ):  # First check class level setting.
            if step_dag_node is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise MetaflowException(
                    "Step `%s` was not found in the flow; cannot attach card `%s`"
                    % (step_name, card_id)
                )
            if self._first_time_init_check(step_dag_node, card_id):
                self._set_first_time_init_cached_value(step_name, card_id, True)
                _decorators._attach_decorators_to_step(
                    step_dag_node,
                    [
                        "card:type=%s,id=%s,refresh_interval=%s"
                        % (card_type, card_id, str(refresh_interval))
                    ],
                )
            else:
                self._set_first_time_init_cached_value(step_name, card_id, False)
@@ -0,0 +1,88 @@
1
+ from datetime import datetime
2
+ from metaflow.decorators import StepDecorator
3
+ from ...profilers.gpu import GPUProfiler # Fix import
4
+ from .deco_injector import CardDecoratorInjector
5
+ import threading
6
+
7
+
8
class GPUProfileDecorator(StepDecorator):
    """
    Step decorator that profiles GPUs while a step runs.

    It injects a blank card with id ``gpu_profile`` into the step, drives a
    ``GPUProfiler`` that updates that card from a background thread, and can
    store the profiler output as flow artifacts.

    Attributes (``defaults``)
    -------------------------
    include_artifacts : bool
        When true, ``<artifact_prefix>num_gpus`` and ``<artifact_prefix>data``
        are set on the flow.
    artifact_prefix : str
        Prefix for the artifact names above.
    interval : int
        Passed to ``GPUProfiler`` as ``interval`` and to the injected card as
        its ``refresh_interval``.
    """

    name = "gpu_profile"

    defaults = {
        "include_artifacts": True,
        "artifact_prefix": "gpu_profile_",
        "interval": 1,
    }

    def step_init(
        self, flow, graph, step_name, decorators, environment, flow_datastore, logger
    ):
        """Attach the ``gpu_profile`` blank card to the decorated step."""
        self.deco_injector = CardDecoratorInjector()
        self.deco_injector.attach_card_decorator(
            flow,
            step_name,
            "gpu_profile",
            "blank",
            refresh_interval=self.attributes["interval"],
        )

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_user_code_retries,
        ubf_context,
        inputs,
    ):
        """Create the profiler that ``task_decorate`` will start and finish."""
        self._profiler = GPUProfiler(
            interval=self.attributes["interval"],
            artifact_name=self.attributes["artifact_prefix"] + "data",
        )

    def task_decorate(
        self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
    ):
        """
        Prepare the card, start the card-update thread and wrap ``step_func``.

        Returns a zero-argument callable that runs ``step_func`` and, whether
        or not it raises, collects the profiler results afterwards.
        """
        from metaflow import current
        from metaflow.cards import Markdown

        if self.attributes["include_artifacts"]:
            setattr(
                flow,
                self.attributes["artifact_prefix"] + "num_gpus",
                len(self._profiler.devices),
            )

        current.card["gpu_profile"].append(
            Markdown("# GPU profile for `%s`" % current.pathspec)
        )
        current.card["gpu_profile"].append(
            Markdown(
                "_Started at: %s_"
                % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
            )
        )
        self._profiler._setup_card()
        current.card["gpu_profile"].refresh()
        # Daemon thread: it must never keep the task process alive on exit.
        self._update_thread = threading.Thread(
            target=self._profiler._update_card, daemon=True
        )
        self._update_thread.start()

        def wrapped_step_func():
            try:
                step_func()
            finally:
                try:
                    results = self._profiler.finish()
                except Exception:
                    # Best effort: a profiler failure must not mask the step's
                    # own outcome. KeyboardInterrupt / SystemExit are no longer
                    # swallowed here.
                    results = {"error": "couldn't read profiler results"}
                if self.attributes["include_artifacts"]:
                    setattr(flow, self.attributes["artifact_prefix"] + "data", results)

        return wrapped_step_func
@@ -0,0 +1,96 @@
1
+ from datetime import datetime
2
+ from metaflow.decorators import StepDecorator
3
+ from ..card_utilities.injector import CardDecoratorInjector
4
+
5
+
6
class DynamicCardAppendDecorator(StepDecorator):
    """
    Demonstration decorator: injects a blank card via CardDecoratorInjector
    and writes a handful of markdown lines to it before, during and after
    the step runs.
    """

    name = "test_append_card"

    defaults = {
        "title": "Simple Card",
        "message": "Hello from DynamicCardAppendDecorator!",
        "show_timestamp": True,
        "refresh_interval": 5,
    }

    CARD_ID = "simple_card"

    def step_init(
        self, flow, graph, step_name, decorators, environment, flow_datastore, logger
    ):
        """Attach the blank card identified by ``CARD_ID`` to the step."""
        injector = CardDecoratorInjector()
        self.deco_injector = injector
        injector.attach_card_decorator(
            flow,
            step_name,
            self.CARD_ID,
            "blank",
            refresh_interval=self.attributes["refresh_interval"],
        )

    def task_decorate(
        self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
    ):
        """
        Write the static card content and return a wrapper around
        ``step_func`` that reports start, success or failure on the card.
        """
        from metaflow import current
        from metaflow.cards import Markdown

        def write(text):
            # The card is looked up on every call, never cached.
            current.card[self.CARD_ID].append(Markdown(text))

        def flush():
            current.card[self.CARD_ID].refresh()

        # Read every attribute up front, before anything is written.
        title = self.attributes["title"]
        message = self.attributes["message"]
        show_timestamp = self.attributes["show_timestamp"]

        write(f"# {title}")
        write(f"**Message:** {message}")

        if show_timestamp:
            timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
            write(f"**Created at:** {timestamp}")

        write(f"**Step:** `{current.pathspec}`")
        write("---")
        write("**Status:** Card successfully injected! 🎉")

        def wrapped_step_func():
            """Run the step, recording progress on the card."""
            try:
                write("**Execution:** Step started...")
                flush()

                step_func()

                write("**Execution:** Step completed successfully! ✅")
                flush()
            except Exception as e:
                # Surface the failure on the card, then let it propagate.
                write(f"**Error:** Step failed with error: `{str(e)}` ❌")
                flush()
                raise

        return wrapped_step_func
@@ -0,0 +1,7 @@
1
+ from .s3_proxy_decorator import (
2
+ S3ProxyDecorator,
3
+ NebiusS3ProxyDecorator,
4
+ CoreWeaveS3ProxyDecorator,
5
+ )
6
+
7
+ __all__ = ["S3ProxyDecorator", "NebiusS3ProxyDecorator", "CoreWeaveS3ProxyDecorator"]