ob-metaflow-extensions 1.1.171rc1__py2.py3-none-any.whl → 1.4.35__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic.
- metaflow_extensions/outerbounds/plugins/__init__.py +6 -3
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -29
- metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +146 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +10 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1200 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +146 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +12 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +161 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +868 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +288 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +139 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +398 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1088 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +303 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +25 -12
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +6 -2
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +8 -8
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +173 -95
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +9 -9
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +159 -9
- metaflow_extensions/outerbounds/remote_config.py +8 -3
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +62 -1
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +2 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.171rc1.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/METADATA +2 -2
- {ob_metaflow_extensions-1.1.171rc1.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/RECORD +64 -22
- {ob_metaflow_extensions-1.1.171rc1.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.171rc1.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/top_level.txt +0 -0
metaflow_extensions/outerbounds/plugins/vllm/__init__.py

@@ -1,17 +1,30 @@
 from metaflow.decorators import StepDecorator
 from metaflow import current
 import functools
-import
+from enum import Enum
 import threading
 from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 from metaflow.metaflow_config import from_conf

-from .vllm_manager import
+from .vllm_manager import VLLMOpenAIManager, VLLMPyManager
 from .status_card import VLLMStatusCard, CardDecoratorInjector

 __mf_promote_submodules__ = ["plugins.vllm"]


+### The following classes are used to store the vLLM information in the current environment.
+# Then, Metaflow users can access the vLLM information through the current environment.
+class OpenAIAPIInfo:
+    def __init__(self, local_endpoint, local_api_key):
+        self.local_endpoint = local_endpoint
+        self.local_api_key = local_api_key
+
+
+class VLLM:
+    def __init__(self, llm):
+        self.llm = llm
+
+
 class VLLMDecorator(StepDecorator, CardDecoratorInjector):
     """
     This decorator is used to run vllm APIs as Metaflow task sidecars.
@@ -40,11 +53,23 @@ class VLLMDecorator(StepDecorator, CardDecoratorInjector):
         HuggingFace model identifier to be served by vLLM.
     backend: str
         Determines where and how to run the vLLM process.
+    openai_api_server: bool
+        Whether to use OpenAI-compatible API server mode (subprocess) instead of native engine.
+        Default is False (uses native engine).
+        Set to True for backward compatibility with existing code.
     debug: bool
        Whether to turn on verbose debugging logs.
-
-
-
+    card_refresh_interval: int
+        Interval in seconds for refreshing the vLLM status card.
+        Only used when openai_api_server=True.
+    max_retries: int
+        Maximum number of retries checking for vLLM server startup.
+        Only used when openai_api_server=True.
+    retry_alert_frequency: int
+        Frequency of alert logs for vLLM server startup retries.
+        Only used when openai_api_server=True.
+    engine_args : dict
+        Additional keyword arguments to pass to the vLLM engine.
         For example, `tensor_parallel_size=2`.
     """

@@ -52,9 +77,12 @@ class VLLMDecorator(StepDecorator, CardDecoratorInjector):
     defaults = {
         "model": None,
         "backend": "local",
+        "openai_api_server": False,  # Default to native engine
         "debug": False,
         "stream_logs_to_card": False,
         "card_refresh_interval": 10,
+        "max_retries": 60,
+        "retry_alert_frequency": 5,
         "engine_args": {},
     }

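The docstring and `defaults` hunks above define the decorator's new surface: `openai_api_server` selects between the in-process engine and the subprocess API server, while `max_retries` and `retry_alert_frequency` tune server-startup polling. A minimal sketch of a flow opting into each mode, assuming the decorator is importable from `metaflow` once this extension is installed (the flow, step names, and `engine_args` values are illustrative, not part of this package):

    from metaflow import FlowSpec, step, vllm  # import path assumed; provided by this extension


    class VLLMSketchFlow(FlowSpec):
        # Native engine mode (the new default): a vllm.LLM is built in-process.
        @vllm(
            model="meta-llama/Llama-3.2-1B",
            engine_args={"gpu_memory_utilization": 0.9},
        )
        @step
        def start(self):
            self.next(self.serve)

        # API server mode (previous behavior): an OpenAI-compatible server runs as a sidecar subprocess.
        @vllm(
            model="meta-llama/Llama-3.2-1B",
            openai_api_server=True,
            max_retries=60,
            retry_alert_frequency=5,
        )
        @step
        def serve(self):
            self.next(self.end)

        @step
        def end(self):
            pass


    if __name__ == "__main__":
        VLLMSketchFlow()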
@@ -72,106 +100,156 @@ class VLLMDecorator(StepDecorator, CardDecoratorInjector):
                 f"Example: @vllm(model='meta-llama/Llama-3.2-1B')"
             )

-        # Attach the vllm status card
-        self.
-
-
-
-
-
-
+        # Attach the vllm status card only for API server mode
+        if self.attributes["openai_api_server"]:
+            self.attach_card_decorator(
+                flow,
+                step_name,
+                "vllm_status",
+                "blank",
+                refresh_interval=self.attributes["card_refresh_interval"],
+            )

     def task_decorate(
         self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
     ):
         @functools.wraps(step_func)
         def vllm_wrapper():
-
-
-
+            # FIXME: Kind of ugly branch. Causing branching elsewhere.
+            # Other possibile code paths:
+            # - OpenAI batch API
+            # - Embedding
+            # - Special types of models
+            if self.attributes["openai_api_server"]:
+                # API Server mode (existing functionality)
+                self._run_api_server_mode(step_func)
+            else:
+                # Native engine mode (new functionality)
+                self._run_native_engine_mode(step_func)

-
-            self.status_card = VLLMStatusCard(
-                refresh_interval=self.attributes["card_refresh_interval"]
-            )
+        return vllm_wrapper

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _run_api_server_mode(self, step_func):
+        """Run vLLM in API server mode (subprocess, existing functionality)"""
+        self.vllm_manager = None
+        self.status_card = None
+        self.card_monitor_thread = None
+
+        try:
+            self.status_card = VLLMStatusCard(
+                refresh_interval=self.attributes["card_refresh_interval"]
+            )
+
+            def monitor_card():
+                try:
+                    self.status_card.on_startup(current.card["vllm_status"])
+
+                    while not getattr(self.card_monitor_thread, "_stop_event", False):
+                        try:
+                            self.status_card.on_update(
+                                current.card["vllm_status"], None
+                            )
+                            import time
+
+                            time.sleep(self.attributes["card_refresh_interval"])
+                        except Exception as e:
+                            if self.attributes["debug"]:
+                                print(f"[@vllm] Card monitoring error: {e}")
+                            break
+                except Exception as e:
+                    if self.attributes["debug"]:
+                        print(f"[@vllm] Card monitor thread error: {e}")
+                    self.status_card.on_error(current.card["vllm_status"], str(e))
+
+            self.card_monitor_thread = threading.Thread(
+                target=monitor_card, daemon=True
+            )
+            self.card_monitor_thread._stop_event = False
+            self.card_monitor_thread.start()
+            self.vllm_manager = VLLMOpenAIManager(
+                model=self.attributes["model"],
+                backend=self.attributes["backend"],
+                debug=self.attributes["debug"],
+                status_card=self.status_card,
+                max_retries=self.attributes["max_retries"],
+                retry_alert_frequency=self.attributes["retry_alert_frequency"],
+                stream_logs_to_card=self.attributes["stream_logs_to_card"],
+                **self.attributes["engine_args"],
+            )
+            current._update_env(
+                dict(
+                    vllm=OpenAIAPIInfo(
+                        local_endpoint=f"http://127.0.0.1:{self.vllm_manager.port}/v1",
+                        local_api_key="token123",
+                    )
                 )
-
-
-
-
-
-
-
-
-
+            )
+
+            if self.attributes["debug"]:
+                print("[@vllm] API server mode initialized.")
+
+        except Exception as e:
+            if self.status_card:
+                self.status_card.add_event("error", f"Initialization failed: {str(e)}")
+                try:
+                    self.status_card.on_error(current.card["vllm_status"], str(e))
+                except:
+                    pass
+            print(f"[@vllm] Error initializing API server mode: {e}")
+            raise
+
+        try:
+            if self.status_card:
+                self.status_card.add_event("info", "Starting user step function")
+            step_func()
+            if self.status_card:
+                self.status_card.add_event(
+                    "success", "User step function completed successfully"
                 )
-
-
+        finally:
+            if self.vllm_manager:
+                self.vllm_manager.terminate_models()

-
-
-
-
-                )
-
-                    self.status_card.on_error(current.card["vllm_status"], str(e))
-                except:
-                    pass
-            print(f"[@vllm] Error initializing VLLMManager: {e}")
-            raise
-
-            try:
-                if self.status_card:
-                    self.status_card.add_event("info", "Starting user step function")
-                step_func()
-                if self.status_card:
-                    self.status_card.add_event(
-                        "success", "User step function completed successfully"
-                    )
-            finally:
-                if self.vllm_manager:
-                    self.vllm_manager.terminate_models()
-
-                if self.card_monitor_thread and self.status_card:
-                    import time
-
-                    try:
-                        self.status_card.on_update(current.card["vllm_status"], None)
-                    except Exception as e:
-                        if self.attributes["debug"]:
-                            print(f"[@vllm] Final card update error: {e}")
-                    time.sleep(2)
-
-                if self.card_monitor_thread:
-                    self.card_monitor_thread._stop_event = True
-                    self.card_monitor_thread.join(timeout=5)
+        if self.card_monitor_thread and self.status_card:
+            import time
+
+            try:
+                self.status_card.on_update(current.card["vllm_status"], None)
+            except Exception as e:
                 if self.attributes["debug"]:
-                    print("[@vllm]
+                    print(f"[@vllm] Final card update error: {e}")
+            time.sleep(2)

-
+        if self.card_monitor_thread:
+            self.card_monitor_thread._stop_event = True
+            self.card_monitor_thread.join(timeout=5)
+            if self.attributes["debug"]:
+                print("[@vllm] Card monitoring thread stopped.")
+
+    def _run_native_engine_mode(self, step_func):
+        """Run vLLM in native engine mode (direct LLM API access)"""
+        self.vllm = None
+
+        try:
+            if self.attributes["debug"]:
+                print("[@vllm] Initializing native engine mode")
+
+            self.vllm = VLLMPyManager(
+                model=self.attributes["model"],
+                debug=self.attributes["debug"],
+                **self.attributes["engine_args"],
+            )
+            current._update_env(dict(vllm=VLLM(llm=self.vllm.engine)))
+
+            if self.attributes["debug"]:
+                print("[@vllm] Native engine mode initialized.")
+
+        except Exception as e:
+            print(f"[@vllm] Error initializing native engine mode: {e}")
+            raise
+
+        try:
+            step_func()
+        finally:
+            if self.vllm:
+                self.vllm.terminate_engine()
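At run time the wrapper above sets `current.vllm` to one of the two holder classes from the first hunk: an `OpenAIAPIInfo` (local endpoint plus API key) in API-server mode, or a `VLLM` wrapper whose `.llm` attribute is the engine in native mode. A hedged sketch of step bodies against both shapes; the `openai` client is the user's own dependency, not something this package installs. Note that the `VLLMPyManager` docstring later in this diff refers to `current.vllm.engine`, while the wrapper here exposes the engine as `current.vllm.llm`.

    from metaflow import current

    # Inside a step decorated with @vllm(..., openai_api_server=True):
    from openai import OpenAI  # user-provided dependency (assumption)

    client = OpenAI(
        base_url=current.vllm.local_endpoint,  # e.g. http://127.0.0.1:8000/v1
        api_key=current.vllm.local_api_key,    # "token123" per the hunk above
    )
    reply = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B",
        messages=[{"role": "user", "content": "Hello!"}],
    )

    # Inside a step decorated with @vllm(...) in native engine mode:
    from vllm import SamplingParams

    outputs = current.vllm.llm.generate(
        ["Hello, world!"],
        SamplingParams(temperature=0.7, max_tokens=64),
    )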
metaflow_extensions/outerbounds/plugins/vllm/status_card.py

@@ -160,10 +160,10 @@ class VLLMStatusCard(CardRefresher):
         # Keep only last 10 events
         self.status_data["events"] = self.status_data["events"][:10]

-    def get_circuit_breaker_emoji(self, state):
-
-
-
+    # def get_circuit_breaker_emoji(self, state):
+    #     """Get status emoji for circuit breaker state"""
+    #     emoji_map = {"CLOSED": "🟢", "OPEN": "🔴", "HALF_OPEN": "🟡"}
+    #     return emoji_map.get(state, "⚪")

     def get_uptime_string(self, start_time):
         """Calculate uptime string"""
@@ -252,11 +252,11 @@ class VLLMStatusCard(CardRefresher):
         )

         # Simplified monitoring note
-        current_card.append(
-
-
-
-        )
+        # current_card.append(
+        #     Markdown(
+        #         "## 🔧 Monitoring\n**Advanced Features:** Disabled (Circuit Breaker, Request Interception)"
+        #     )
+        # )

         # Performance metrics
         perf_data = data["performance"]
metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py

@@ -4,10 +4,7 @@ import time
 import socket
 import sys
 import os
-import functools
-import json
 import requests
-from enum import Enum
 import threading
 from datetime import datetime

@@ -20,7 +17,147 @@ class ProcessStatus:
     SUCCESSFUL = "SUCCESSFUL"


-class VLLMManager:
+class VLLMPyManager:
+    """
+    A native vLLM engine manager that provides direct access to the vLLM LLM class.
+    This replaces the subprocess-based API server approach with direct Python API access.
+
+    Example usage:
+        from vllm.sampling_params import SamplingParams, GuidedDecodingParams
+
+        engine = current.vllm.engine
+        sampling_params = SamplingParams(temperature=0.7, max_tokens=150)
+        outputs = engine.generate(["Hello, world!"], sampling_params)
+
+        # Structured outputs
+        guided_params = GuidedDecodingParams(json=my_schema)
+        sampling_params = SamplingParams(guided_decoding=guided_params)
+        outputs = engine.generate(prompts, sampling_params)
+    """
+
+    def __init__(
+        self,
+        model,
+        debug=False,
+        **engine_args,
+    ):
+        if isinstance(model, list):
+            if len(model) != 1:
+                raise ValueError(
+                    f"vLLM native engine can only serve one model per instance. "
+                    f"Got {len(model)} models: {model}. "
+                    f"Please specify a single model or create multiple @vllm decorators."
+                )
+            self.model = model[0]
+        else:
+            self.model = model
+
+        self.debug = debug
+        self.engine_args = engine_args
+        self.engine = None
+        self.initialization_start = time.time()
+
+        if self.debug:
+            print(
+                f"[@vllm-native] Initializing native vLLM engine for model: {self.model}"
+            )
+
+        self._validate_vllm_installation()
+        self._initialize_engine()
+
+        total_init_time = time.time() - self.initialization_start
+        if self.debug:
+            print(
+                f"[@vllm-native] Native engine initialization completed in {total_init_time:.1f}s"
+            )
+
+    def _validate_vllm_installation(self):
+        """Validate that vLLM is properly installed"""
+        try:
+            import vllm
+
+            if self.debug:
+                print(f"[@vllm-native] vLLM {vllm.__version__} is available")
+        except ImportError as e:
+            raise ImportError(
+                "vLLM not installed. Please add vLLM to your environment."
+            ) from e
+
+    def _map_engine_args(self, engine_args):
+        """
+        Map CLI-style engine_args to LLM constructor parameters.
+        Most parameters map directly from the API server CLI args to LLM constructor.
+        """
+        llm_params = {}
+
+        # Direct mappings (parameter names are the same)
+        direct_mapping = [
+            "tensor_parallel_size",
+            "max_model_len",
+            "gpu_memory_utilization",
+            "swap_space",
+            "dtype",
+            "quantization",
+            "seed",
+            "trust_remote_code",
+            "revision",
+            "tokenizer_revision",
+            "enforce_eager",
+            "max_seq_len_to_capture",
+            "disable_custom_all_reduce",
+        ]
+
+        for param in direct_mapping:
+            if param in engine_args:
+                llm_params[param] = engine_args[param]
+
+        # Handle special mappings if needed
+        # (Most/all vLLM CLI args map directly to LLM constructor args)
+
+        return llm_params
+
+    def _initialize_engine(self):
+        """Initialize the native vLLM LLM engine"""
+        try:
+            from vllm import LLM
+
+            # Map engine args to LLM constructor parameters
+            llm_params = self._map_engine_args(self.engine_args)
+
+            if self.debug:
+                print(f"[@vllm] Initializing LLM with params: {llm_params}")
+
+            # Initialize the native vLLM engine
+            self.engine = LLM(model=self.model, **llm_params)
+
+            if self.debug:
+                print(f"[@vllm] LLM engine initialized successfully")
+
+        except Exception as e:
+            error_msg = f"Failed to initialize vLLM engine: {str(e)}"
+            if self.debug:
+                print(f"[@vllm-native] ERROR: {error_msg}")
+            raise RuntimeError(error_msg) from e
+
+    def terminate_engine(self):
+        """
+        Clean up the native engine.
+        The LLM class handles cleanup automatically when the object is destroyed.
+        """
+        if self.debug:
+            print("[@vllm-] Cleaning up vLLM engine")
+
+        # The vLLM LLM class handles cleanup automatically
+        # We just need to clear our reference
+        if self.engine:
+            del self.engine
+            self.engine = None
+
+        if self.debug:
+            print("[@vllm] Engine cleanup completed")
+
+
+class VLLMOpenAIManager:
     """
     A process manager for vLLM runtimes.
     Implements interface @vllm(model=..., ...) to provide a local backend.
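The `_map_engine_args` whitelist above decides which decorator-level `engine_args` reach the `vllm.LLM` constructor in native mode; anything outside the list is silently ignored. A small sketch of that flow with illustrative values (actually instantiating the manager would download and load the model):

    # Values are illustrative, not defaults.
    engine_args = {
        "tensor_parallel_size": 2,       # in direct_mapping -> forwarded to LLM(...)
        "gpu_memory_utilization": 0.85,  # in direct_mapping -> forwarded to LLM(...)
        "uvicorn_log_level": "debug",    # server-only option -> dropped in native mode
    }

    manager = VLLMPyManager(model="meta-llama/Llama-3.2-1B", debug=True, **engine_args)
    # Roughly equivalent to:
    #   LLM(model="meta-llama/Llama-3.2-1B", tensor_parallel_size=2, gpu_memory_utilization=0.85)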
@@ -55,6 +192,8 @@ class VLLMManager:
         port=8000,
         host="127.0.0.1",
         stream_logs_to_card=False,
+        max_retries=60,
+        retry_alert_frequency=5,
         **vllm_args,
     ):
         # Validate that only a single model is provided
@@ -79,6 +218,8 @@ class VLLMManager:
         self.status_card = status_card
         self.initialization_start = time.time()
         self.server_process = None
+        self.max_retries = max_retries
+        self.retry_alert_frequency = retry_alert_frequency
         self.vllm_args = vllm_args

         if backend != "local":
@@ -211,6 +352,13 @@ class VLLMManager:
             f"[@vllm] Starting vLLM OpenAI-compatible server for model: {self.model}"
         )

+        ### NOTE: This is not the only way to start the vLLM server.
+        # https://docs.vllm.ai/en/v0.9.0/api/vllm/entrypoints/openai/api_server.html
+
+        # There are other APIs we should consider using in a future extension:
+        # https://docs.vllm.ai/en/stable/api/vllm/entrypoints/openai/run_batch.html#vllm.entrypoints.openai.run_batch
+        # https://docs.vllm.ai/en/v0.9.0/api/vllm/entrypoints/openai/serving_embedding.html
+        # MANY MORE!!! Wait for some feedback and we can add more.
         cmd = [
             sys.executable,
             "-m",
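The hunk above is cut off inside the `cmd` list, but together with the linked `vllm.entrypoints.openai.api_server` docs it shows the server is spawned as a subprocess of the current interpreter. A hedged sketch of what such a launch command typically looks like; the exact flags this manager assembles are not visible in the hunk:

    import sys

    # Illustrative only: flag names follow vLLM's standard OpenAI server entrypoint,
    # not necessarily the exact argument list VLLMOpenAIManager builds.
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--model", "meta-llama/Llama-3.2-1B",
        "--host", "127.0.0.1",
        "--port", "8000",
    ]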
@@ -226,6 +374,8 @@ class VLLMManager:
         vllm_args_copy = self.vllm_args.copy()
         if self.debug or self.stream_logs_to_card:
             # Note: This is an undocumented argument for the vLLM OpenAI server entrypoint.
+            # It was useful for debugging the vLLM server startup,
+            # likely more confusion potential than its worth for end user.
             vllm_args_copy.setdefault("uvicorn_log_level", "debug")

         for key, value in vllm_args_copy.items():
@@ -281,16 +431,15 @@ class VLLMManager:
         print(f"[@vllm] Started vLLM server process with PID {process.pid}")

         retries = 0
-        max_retries = 240
         while (
             not self._is_port_open(self.host, self.port, timeout=2)
-            and retries < max_retries
+            and retries < self.max_retries
         ):
             if retries == 0:
                 print("[@vllm] Waiting for server to be ready...")
-            elif retries %
+            elif retries % self.retry_alert_frequency == 0:
                 print(
-                    f"[@vllm] Still waiting for server... ({retries}/{max_retries})"
+                    f"[@vllm] Still waiting for server... ({retries}/{self.max_retries})"
                )

             returncode = process.poll()
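Each iteration of the loop above waits on a 2-second port check, so the startup budget is roughly `max_retries * 2` seconds (which is how the error message later in this file reports it), with a progress line every `retry_alert_frequency` attempts. A hedged sketch of raising that budget for a slow-loading model through the decorator attributes documented earlier; attach it to a `@step` inside a `FlowSpec` as in the earlier flow sketch:

    from metaflow import step, vllm  # import path assumed; provided by this extension

    # ~240 s budget: 120 retries x ~2 s port checks, progress message every 10th retry.
    @vllm(
        model="meta-llama/Llama-3.2-1B",
        openai_api_server=True,
        max_retries=120,
        retry_alert_frequency=10,
    )
    @step
    def generate(self):
        ...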
@@ -322,7 +471,7 @@ class VLLMManager:
             retries += 1

         if not self._is_port_open(self.host, self.port, timeout=2):
-            error_details = f"vLLM server did not start listening on {self.host}:{self.port} after {max_retries*2}s"
+            error_details = f"vLLM server did not start listening on {self.host}:{self.port} after {self.max_retries*2}s"
             self.processes[process.pid]["properties"][
                 "error_details"
             ] = error_details
@@ -342,6 +491,7 @@ class VLLMManager:
             "Running", uptime_start=datetime.now(), model=self.model
         )
         self._log_event("success", "vLLM server is ready and listening")
+        print(f"[@vllm] Server ready!")

         self._update_model_status(self.model, status="Ready")

metaflow_extensions/outerbounds/remote_config.py

@@ -11,6 +11,11 @@ from metaflow_extensions.outerbounds.plugins.perimeters import (
     get_perimeter_config_url_if_set_in_ob_config,
 )

+
+class OuterboundsConfigException(MetaflowException):
+    _OB_CONFIG_EXCEPTION = True
+
+
 OBP_REMOTE_CONFIG_KEY = "OBP_METAFLOW_CONFIG_URL"
 HOSTNAME_KEY = "OBP_API_SERVER"
 AUTH_KEY = "METAFLOW_SERVICE_AUTH_KEY"
@@ -31,7 +36,7 @@ def read_config_from_local() -> Optional[Path]:

         # we should error because the user wants a specific config
         if profile:
-            raise
+            raise OuterboundsConfigException(
                 f"Unable to locate METAFLOW_PROFILE {profile} in {config_path}"
             )

@@ -55,7 +60,7 @@ def resolve_config_from_remote(remote_url: str, auth_token: str) -> Dict[str, str]:
         data = response.json()
         return data["config"]
     except HTTPError:
-        raise
+        raise OuterboundsConfigException(
             "Error fetching resolving configuration. Make sure you have run \
             `outerbounds configure` with the correct value"
         )
@@ -81,7 +86,7 @@ def init_config() -> Dict[str, str]:
     try:
         remote_config = json.loads(config_path.read_text())
     except ValueError:
-        raise
+        raise OuterboundsConfigException(
             "Error decoding your metaflow config. Please run the `outerbounds configure` \
             command with the string provided in the Outerbounds dashboard"
         )
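The three `raise` sites above now use a dedicated `OuterboundsConfigException`, which subclasses `MetaflowException` and carries an `_OB_CONFIG_EXCEPTION` marker attribute. A hedged sketch of how calling code could single out configuration errors without importing the class directly; the handler itself is an assumption, not part of this diff:

    from metaflow.exception import MetaflowException

    try:
        config = init_config()  # function shown in the hunks above
    except MetaflowException as exc:
        if getattr(exc, "_OB_CONFIG_EXCEPTION", False):
            # Outerbounds configuration problem: point the user at `outerbounds configure`.
            raise SystemExit(f"Outerbounds config error: {exc}") from exc
        raise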