ob-metaflow-extensions 1.1.170__py2.py3-none-any.whl → 1.4.35__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of ob-metaflow-extensions has been flagged as a potentially problematic release.
Files changed (65)
  1. metaflow_extensions/outerbounds/plugins/__init__.py +6 -2
  2. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  3. metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +146 -0
  4. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +10 -0
  5. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  6. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1200 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +146 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +12 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +161 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +868 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +288 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +139 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +398 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1088 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +303 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  32. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +25 -12
  33. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
  34. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  35. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
  36. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  37. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +6 -2
  38. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
  39. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +8 -8
  40. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  41. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  42. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  43. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  44. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  45. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  46. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  47. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  48. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  49. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  50. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
  51. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
  52. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
  53. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +4 -0
  54. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +173 -95
  55. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +9 -9
  56. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +159 -9
  57. metaflow_extensions/outerbounds/remote_config.py +8 -3
  58. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +63 -1
  59. metaflow_extensions/outerbounds/toplevel/ob_internal.py +3 -0
  60. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  61. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  62. {ob_metaflow_extensions-1.1.170.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/METADATA +2 -2
  63. {ob_metaflow_extensions-1.1.170.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/RECORD +65 -21
  64. {ob_metaflow_extensions-1.1.170.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/WHEEL +0 -0
  65. {ob_metaflow_extensions-1.1.170.dist-info → ob_metaflow_extensions-1.4.35.dist-info}/top_level.txt +0 -0
metaflow_extensions/outerbounds/plugins/vllm/__init__.py
@@ -1,17 +1,30 @@
  from metaflow.decorators import StepDecorator
  from metaflow import current
  import functools
- import os
+ from enum import Enum
  import threading
  from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
  from metaflow.metaflow_config import from_conf

- from .vllm_manager import VLLMManager
+ from .vllm_manager import VLLMOpenAIManager, VLLMPyManager
  from .status_card import VLLMStatusCard, CardDecoratorInjector

  __mf_promote_submodules__ = ["plugins.vllm"]


+ ### The following classes are used to store the vLLM information in the current environment.
+ # Then, Metaflow users can access the vLLM information through the current environment.
+ class OpenAIAPIInfo:
+     def __init__(self, local_endpoint, local_api_key):
+         self.local_endpoint = local_endpoint
+         self.local_api_key = local_api_key
+
+
+ class VLLM:
+     def __init__(self, llm):
+         self.llm = llm
+
+
  class VLLMDecorator(StepDecorator, CardDecoratorInjector):
      """
      This decorator is used to run vllm APIs as Metaflow task sidecars.
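
The two holder classes added in this hunk are what user code ultimately sees: depending on the mode, the decorator publishes either an `OpenAIAPIInfo` or a `VLLM` wrapper on `current.vllm`. A minimal sketch of consuming either one from inside a step, assuming only the attribute names defined above (`local_endpoint`, `local_api_key`, `llm`); the `hasattr` dispatch and the prompt are illustrative:

    from metaflow import current

    def use_vllm_inside_a_step():
        if hasattr(current.vllm, "llm"):
            # Native engine mode: current.vllm.llm is the vllm.LLM instance.
            from vllm import SamplingParams

            outputs = current.vllm.llm.generate(
                ["Hello, world!"], SamplingParams(max_tokens=32)
            )
            print(outputs[0].outputs[0].text)
        else:
            # API server mode: an OpenAI-compatible server runs as a task sidecar.
            print(current.vllm.local_endpoint, current.vllm.local_api_key)
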
@@ -40,11 +53,23 @@ class VLLMDecorator(StepDecorator, CardDecoratorInjector):
          HuggingFace model identifier to be served by vLLM.
      backend: str
          Determines where and how to run the vLLM process.
+     openai_api_server: bool
+         Whether to use OpenAI-compatible API server mode (subprocess) instead of native engine.
+         Default is False (uses native engine).
+         Set to True for backward compatibility with existing code.
      debug: bool
          Whether to turn on verbose debugging logs.
-     kwargs : Any
-         Any other keyword arguments are passed directly to the vLLM engine.
-         This allows for flexible configuration of vLLM server settings.
+     card_refresh_interval: int
+         Interval in seconds for refreshing the vLLM status card.
+         Only used when openai_api_server=True.
+     max_retries: int
+         Maximum number of retries checking for vLLM server startup.
+         Only used when openai_api_server=True.
+     retry_alert_frequency: int
+         Frequency of alert logs for vLLM server startup retries.
+         Only used when openai_api_server=True.
+     engine_args : dict
+         Additional keyword arguments to pass to the vLLM engine.
          For example, `tensor_parallel_size=2`.
      """

@@ -52,9 +77,12 @@ class VLLMDecorator(StepDecorator, CardDecoratorInjector):
      defaults = {
          "model": None,
          "backend": "local",
+         "openai_api_server": False, # Default to native engine
          "debug": False,
          "stream_logs_to_card": False,
          "card_refresh_interval": 10,
+         "max_retries": 60,
+         "retry_alert_frequency": 5,
          "engine_args": {},
      }

@@ -72,106 +100,156 @@ class VLLMDecorator(StepDecorator, CardDecoratorInjector):
                  f"Example: @vllm(model='meta-llama/Llama-3.2-1B')"
              )

-         # Attach the vllm status card
-         self.attach_card_decorator(
-             flow,
-             step_name,
-             "vllm_status",
-             "blank",
-             refresh_interval=self.attributes["card_refresh_interval"],
-         )
+         # Attach the vllm status card only for API server mode
+         if self.attributes["openai_api_server"]:
+             self.attach_card_decorator(
+                 flow,
+                 step_name,
+                 "vllm_status",
+                 "blank",
+                 refresh_interval=self.attributes["card_refresh_interval"],
+             )

      def task_decorate(
          self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
      ):
          @functools.wraps(step_func)
          def vllm_wrapper():
-             self.vllm_manager = None
-             self.status_card = None
-             self.card_monitor_thread = None
+             # FIXME: Kind of ugly branch. Causing branching elsewhere.
+             # Other possibile code paths:
+             # - OpenAI batch API
+             # - Embedding
+             # - Special types of models
+             if self.attributes["openai_api_server"]:
+                 # API Server mode (existing functionality)
+                 self._run_api_server_mode(step_func)
+             else:
+                 # Native engine mode (new functionality)
+                 self._run_native_engine_mode(step_func)

-             try:
-                 self.status_card = VLLMStatusCard(
-                     refresh_interval=self.attributes["card_refresh_interval"]
-                 )
+         return vllm_wrapper

-                 def monitor_card():
-                     try:
-                         self.status_card.on_startup(current.card["vllm_status"])
-
-                         while not getattr(
-                             self.card_monitor_thread, "_stop_event", False
-                         ):
-                             try:
-                                 self.status_card.on_update(
-                                     current.card["vllm_status"], None
-                                 )
-                                 import time
-
-                                 time.sleep(self.attributes["card_refresh_interval"])
-                             except Exception as e:
-                                 if self.attributes["debug"]:
-                                     print(f"[@vllm] Card monitoring error: {e}")
-                                 break
-                     except Exception as e:
-                         if self.attributes["debug"]:
-                             print(f"[@vllm] Card monitor thread error: {e}")
-                         self.status_card.on_error(current.card["vllm_status"], str(e))
-
-                 self.card_monitor_thread = threading.Thread(
-                     target=monitor_card, daemon=True
+     def _run_api_server_mode(self, step_func):
+         """Run vLLM in API server mode (subprocess, existing functionality)"""
+         self.vllm_manager = None
+         self.status_card = None
+         self.card_monitor_thread = None
+
+         try:
+             self.status_card = VLLMStatusCard(
+                 refresh_interval=self.attributes["card_refresh_interval"]
+             )
+
+             def monitor_card():
+                 try:
+                     self.status_card.on_startup(current.card["vllm_status"])
+
+                     while not getattr(self.card_monitor_thread, "_stop_event", False):
+                         try:
+                             self.status_card.on_update(
+                                 current.card["vllm_status"], None
+                             )
+                             import time
+
+                             time.sleep(self.attributes["card_refresh_interval"])
+                         except Exception as e:
+                             if self.attributes["debug"]:
+                                 print(f"[@vllm] Card monitoring error: {e}")
+                             break
+                 except Exception as e:
+                     if self.attributes["debug"]:
+                         print(f"[@vllm] Card monitor thread error: {e}")
+                     self.status_card.on_error(current.card["vllm_status"], str(e))
+
+             self.card_monitor_thread = threading.Thread(
+                 target=monitor_card, daemon=True
+             )
+             self.card_monitor_thread._stop_event = False
+             self.card_monitor_thread.start()
+             self.vllm_manager = VLLMOpenAIManager(
+                 model=self.attributes["model"],
+                 backend=self.attributes["backend"],
+                 debug=self.attributes["debug"],
+                 status_card=self.status_card,
+                 max_retries=self.attributes["max_retries"],
+                 retry_alert_frequency=self.attributes["retry_alert_frequency"],
+                 stream_logs_to_card=self.attributes["stream_logs_to_card"],
+                 **self.attributes["engine_args"],
+             )
+             current._update_env(
+                 dict(
+                     vllm=OpenAIAPIInfo(
+                         local_endpoint=f"http://127.0.0.1:{self.vllm_manager.port}/v1",
+                         local_api_key="token123",
+                     )
                  )
-                 self.card_monitor_thread._stop_event = False
-                 self.card_monitor_thread.start()
-                 self.vllm_manager = VLLMManager(
-                     model=self.attributes["model"],
-                     backend=self.attributes["backend"],
-                     debug=self.attributes["debug"],
-                     status_card=self.status_card,
-                     stream_logs_to_card=self.attributes["stream_logs_to_card"],
-                     **self.attributes["engine_args"],
+             )
+
+             if self.attributes["debug"]:
+                 print("[@vllm] API server mode initialized.")
+
+         except Exception as e:
+             if self.status_card:
+                 self.status_card.add_event("error", f"Initialization failed: {str(e)}")
+             try:
+                 self.status_card.on_error(current.card["vllm_status"], str(e))
+             except:
+                 pass
+             print(f"[@vllm] Error initializing API server mode: {e}")
+             raise
+
+         try:
+             if self.status_card:
+                 self.status_card.add_event("info", "Starting user step function")
+             step_func()
+             if self.status_card:
+                 self.status_card.add_event(
+                     "success", "User step function completed successfully"
                  )
-                 if self.attributes["debug"]:
-                     print("[@vllm] VLLMManager initialized.")
+         finally:
+             if self.vllm_manager:
+                 self.vllm_manager.terminate_models()

-             except Exception as e:
-                 if self.status_card:
-                     self.status_card.add_event(
-                         "error", f"Initialization failed: {str(e)}"
-                     )
-                 try:
-                     self.status_card.on_error(current.card["vllm_status"], str(e))
-                 except:
-                     pass
-                 print(f"[@vllm] Error initializing VLLMManager: {e}")
-                 raise
-
-             try:
-                 if self.status_card:
-                     self.status_card.add_event("info", "Starting user step function")
-                 step_func()
-                 if self.status_card:
-                     self.status_card.add_event(
-                         "success", "User step function completed successfully"
-                     )
-             finally:
-                 if self.vllm_manager:
-                     self.vllm_manager.terminate_models()
-
-             if self.card_monitor_thread and self.status_card:
-                 import time
-
-                 try:
-                     self.status_card.on_update(current.card["vllm_status"], None)
-                 except Exception as e:
-                     if self.attributes["debug"]:
-                         print(f"[@vllm] Final card update error: {e}")
-                 time.sleep(2)
-
-             if self.card_monitor_thread:
-                 self.card_monitor_thread._stop_event = True
-                 self.card_monitor_thread.join(timeout=5)
+         if self.card_monitor_thread and self.status_card:
+             import time
+
+             try:
+                 self.status_card.on_update(current.card["vllm_status"], None)
+             except Exception as e:
                  if self.attributes["debug"]:
-                     print("[@vllm] Card monitoring thread stopped.")
+                     print(f"[@vllm] Final card update error: {e}")
+             time.sleep(2)

-         return vllm_wrapper
+         if self.card_monitor_thread:
+             self.card_monitor_thread._stop_event = True
+             self.card_monitor_thread.join(timeout=5)
+             if self.attributes["debug"]:
+                 print("[@vllm] Card monitoring thread stopped.")
+
+     def _run_native_engine_mode(self, step_func):
+         """Run vLLM in native engine mode (direct LLM API access)"""
+         self.vllm = None
+
+         try:
+             if self.attributes["debug"]:
+                 print("[@vllm] Initializing native engine mode")
+
+             self.vllm = VLLMPyManager(
+                 model=self.attributes["model"],
+                 debug=self.attributes["debug"],
+                 **self.attributes["engine_args"],
+             )
+             current._update_env(dict(vllm=VLLM(llm=self.vllm.engine)))
+
+             if self.attributes["debug"]:
+                 print("[@vllm] Native engine mode initialized.")
+
+         except Exception as e:
+             print(f"[@vllm] Error initializing native engine mode: {e}")
+             raise
+
+         try:
+             step_func()
+         finally:
+             if self.vllm:
+                 self.vllm.terminate_engine()
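
Putting the two code paths together: in API server mode the wrapper above exposes the sidecar's address and token through `current.vllm` (the `OpenAIAPIInfo` instance), so a step can talk to it with any OpenAI-compatible client. An end-to-end sketch, assuming the decorator is importable from `metaflow` as `@vllm` and that the `openai` package is present in the step's environment; the model name and option values are placeholders:

    from metaflow import FlowSpec, current, step, vllm

    class ServeFlow(FlowSpec):
        @vllm(
            model="meta-llama/Llama-3.2-1B",
            openai_api_server=True,          # subprocess / OpenAI-compatible mode
            max_retries=120,                 # poll longer for slow model loads
            retry_alert_frequency=10,        # log progress every 10 polls
            engine_args={"tensor_parallel_size": 2},
        )
        @step
        def start(self):
            from openai import OpenAI

            client = OpenAI(
                base_url=current.vllm.local_endpoint,  # http://127.0.0.1:<port>/v1
                api_key=current.vllm.local_api_key,    # placeholder token set above
            )
            resp = client.completions.create(
                model="meta-llama/Llama-3.2-1B",  # must match the served model
                prompt="Hello, world!",
                max_tokens=32,
            )
            print(resp.choices[0].text)
            self.next(self.end)

        @step
        def end(self):
            pass

    if __name__ == "__main__":
        ServeFlow()
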
metaflow_extensions/outerbounds/plugins/vllm/status_card.py
@@ -160,10 +160,10 @@ class VLLMStatusCard(CardRefresher):
          # Keep only last 10 events
          self.status_data["events"] = self.status_data["events"][:10]

-     def get_circuit_breaker_emoji(self, state):
-         """Get status emoji for circuit breaker state"""
-         emoji_map = {"CLOSED": "🟢", "OPEN": "🔴", "HALF_OPEN": "🟡"}
-         return emoji_map.get(state, "⚪")
+     # def get_circuit_breaker_emoji(self, state):
+         # """Get status emoji for circuit breaker state"""
+         # emoji_map = {"CLOSED": "🟢", "OPEN": "🔴", "HALF_OPEN": "🟡"}
+         # return emoji_map.get(state, "⚪")

      def get_uptime_string(self, start_time):
          """Calculate uptime string"""
@@ -252,11 +252,11 @@ class VLLMStatusCard(CardRefresher):
          )

          # Simplified monitoring note
-         current_card.append(
-             Markdown(
-                 "## 🔧 Monitoring\n**Advanced Features:** Disabled (Circuit Breaker, Request Interception)"
-             )
-         )
+         # current_card.append(
+         #     Markdown(
+         #         "## 🔧 Monitoring\n**Advanced Features:** Disabled (Circuit Breaker, Request Interception)"
+         #     )
+         # )

          # Performance metrics
          perf_data = data["performance"]
metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py
@@ -4,10 +4,7 @@ import time
  import socket
  import sys
  import os
- import functools
- import json
  import requests
- from enum import Enum
  import threading
  from datetime import datetime

@@ -20,7 +17,147 @@ class ProcessStatus:
      SUCCESSFUL = "SUCCESSFUL"


- class VLLMManager:
+ class VLLMPyManager:
+     """
+     A native vLLM engine manager that provides direct access to the vLLM LLM class.
+     This replaces the subprocess-based API server approach with direct Python API access.
+
+     Example usage:
+         from vllm.sampling_params import SamplingParams, GuidedDecodingParams
+
+         engine = current.vllm.engine
+         sampling_params = SamplingParams(temperature=0.7, max_tokens=150)
+         outputs = engine.generate(["Hello, world!"], sampling_params)
+
+         # Structured outputs
+         guided_params = GuidedDecodingParams(json=my_schema)
+         sampling_params = SamplingParams(guided_decoding=guided_params)
+         outputs = engine.generate(prompts, sampling_params)
+     """
+
+     def __init__(
+         self,
+         model,
+         debug=False,
+         **engine_args,
+     ):
+         if isinstance(model, list):
+             if len(model) != 1:
+                 raise ValueError(
+                     f"vLLM native engine can only serve one model per instance. "
+                     f"Got {len(model)} models: {model}. "
+                     f"Please specify a single model or create multiple @vllm decorators."
+                 )
+             self.model = model[0]
+         else:
+             self.model = model
+
+         self.debug = debug
+         self.engine_args = engine_args
+         self.engine = None
+         self.initialization_start = time.time()
+
+         if self.debug:
+             print(
+                 f"[@vllm-native] Initializing native vLLM engine for model: {self.model}"
+             )
+
+         self._validate_vllm_installation()
+         self._initialize_engine()
+
+         total_init_time = time.time() - self.initialization_start
+         if self.debug:
+             print(
+                 f"[@vllm-native] Native engine initialization completed in {total_init_time:.1f}s"
+             )
+
+     def _validate_vllm_installation(self):
+         """Validate that vLLM is properly installed"""
+         try:
+             import vllm
+
+             if self.debug:
+                 print(f"[@vllm-native] vLLM {vllm.__version__} is available")
+         except ImportError as e:
+             raise ImportError(
+                 "vLLM not installed. Please add vLLM to your environment."
+             ) from e
+
+     def _map_engine_args(self, engine_args):
+         """
+         Map CLI-style engine_args to LLM constructor parameters.
+         Most parameters map directly from the API server CLI args to LLM constructor.
+         """
+         llm_params = {}
+
+         # Direct mappings (parameter names are the same)
+         direct_mapping = [
+             "tensor_parallel_size",
+             "max_model_len",
+             "gpu_memory_utilization",
+             "swap_space",
+             "dtype",
+             "quantization",
+             "seed",
+             "trust_remote_code",
+             "revision",
+             "tokenizer_revision",
+             "enforce_eager",
+             "max_seq_len_to_capture",
+             "disable_custom_all_reduce",
+         ]
+
+         for param in direct_mapping:
+             if param in engine_args:
+                 llm_params[param] = engine_args[param]
+
+         # Handle special mappings if needed
+         # (Most/all vLLM CLI args map directly to LLM constructor args)
+
+         return llm_params
+
+     def _initialize_engine(self):
+         """Initialize the native vLLM LLM engine"""
+         try:
+             from vllm import LLM
+
+             # Map engine args to LLM constructor parameters
+             llm_params = self._map_engine_args(self.engine_args)
+
+             if self.debug:
+                 print(f"[@vllm] Initializing LLM with params: {llm_params}")
+
+             # Initialize the native vLLM engine
+             self.engine = LLM(model=self.model, **llm_params)
+
+             if self.debug:
+                 print(f"[@vllm] LLM engine initialized successfully")
+
+         except Exception as e:
+             error_msg = f"Failed to initialize vLLM engine: {str(e)}"
+             if self.debug:
+                 print(f"[@vllm-native] ERROR: {error_msg}")
+             raise RuntimeError(error_msg) from e
+
+     def terminate_engine(self):
+         """
+         Clean up the native engine.
+         The LLM class handles cleanup automatically when the object is destroyed.
+         """
+         if self.debug:
+             print("[@vllm-] Cleaning up vLLM engine")
+
+         # The vLLM LLM class handles cleanup automatically
+         # We just need to clear our reference
+         if self.engine:
+             del self.engine
+             self.engine = None
+
+         if self.debug:
+             print("[@vllm] Engine cleanup completed")
+
+
+ class VLLMOpenAIManager:
      """
      A process manager for vLLM runtimes.
      Implements interface @vllm(model=..., ...) to provide a local backend.
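
The `_map_engine_args` whitelist above is what decides which decorator-level `engine_args` reach the native `LLM` constructor. A small sketch of the effective mapping, with placeholder values (keys not in `direct_mapping` are silently dropped):

    # What @vllm(engine_args={...}) might carry into VLLMPyManager:
    engine_args = {
        "gpu_memory_utilization": 0.85,
        "max_model_len": 4096,
        "uvicorn_log_level": "debug",  # not in direct_mapping, so it is dropped here
    }

    # What _map_engine_args effectively returns for the call above:
    llm_params = {
        "gpu_memory_utilization": 0.85,
        "max_model_len": 4096,
    }

    # ...which _initialize_engine then expands into:
    #     self.engine = LLM(model=self.model, **llm_params)
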
@@ -55,6 +192,8 @@ class VLLMManager:
          port=8000,
          host="127.0.0.1",
          stream_logs_to_card=False,
+         max_retries=60,
+         retry_alert_frequency=5,
          **vllm_args,
      ):
          # Validate that only a single model is provided
@@ -79,6 +218,8 @@ class VLLMManager:
          self.status_card = status_card
          self.initialization_start = time.time()
          self.server_process = None
+         self.max_retries = max_retries
+         self.retry_alert_frequency = retry_alert_frequency
          self.vllm_args = vllm_args

          if backend != "local":
@@ -211,6 +352,13 @@ class VLLMManager:
              f"[@vllm] Starting vLLM OpenAI-compatible server for model: {self.model}"
          )

+         ### NOTE: This is not the only way to start the vLLM server.
+         # https://docs.vllm.ai/en/v0.9.0/api/vllm/entrypoints/openai/api_server.html
+
+         # There are other APIs we should consider using in a future extension:
+         # https://docs.vllm.ai/en/stable/api/vllm/entrypoints/openai/run_batch.html#vllm.entrypoints.openai.run_batch
+         # https://docs.vllm.ai/en/v0.9.0/api/vllm/entrypoints/openai/serving_embedding.html
+         # MANY MORE!!! Wait for some feedback and we can add more.
          cmd = [
              sys.executable,
              "-m",
@@ -226,6 +374,8 @@ class VLLMManager:
          vllm_args_copy = self.vllm_args.copy()
          if self.debug or self.stream_logs_to_card:
              # Note: This is an undocumented argument for the vLLM OpenAI server entrypoint.
+             # It was useful for debugging the vLLM server startup,
+             # likely more confusion potential than its worth for end user.
              vllm_args_copy.setdefault("uvicorn_log_level", "debug")

          for key, value in vllm_args_copy.items():
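
The `cmd` list started in the earlier `@@ -211` hunk, plus the `vllm_args_copy` loop shown here, assemble the argv for the sidecar subprocess. Roughly, it has the shape below; this is a sketch, and the spelling of the extra flags is assumed from vLLM's standard CLI rather than shown in the diff:

    import sys

    # Approximate argv built for the OpenAI-compatible server subprocess:
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--model", "meta-llama/Llama-3.2-1B",  # self.model
        "--host", "127.0.0.1",                 # self.host
        "--port", "8000",                      # str(self.port)
    ]
    # ...plus one flag per remaining entry in vllm_args_copy
    # (e.g. uvicorn_log_level -> --uvicorn-log-level debug when debug logging is on).
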
@@ -281,16 +431,15 @@ class VLLMManager:
          print(f"[@vllm] Started vLLM server process with PID {process.pid}")

          retries = 0
-         max_retries = 240
          while (
              not self._is_port_open(self.host, self.port, timeout=2)
-             and retries < max_retries
+             and retries < self.max_retries
          ):
              if retries == 0:
                  print("[@vllm] Waiting for server to be ready...")
-             elif retries % 10 == 0:
+             elif retries % self.retry_alert_frequency == 0:
                  print(
-                     f"[@vllm] Still waiting for server... ({retries}/{max_retries})"
+                     f"[@vllm] Still waiting for server... ({retries}/{self.max_retries})"
                  )

              returncode = process.poll()
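
Since each port check uses a 2-second timeout, the new default of `max_retries=60` gives roughly a two-minute startup window (the old hard-coded `max_retries = 240` allowed about eight minutes); the `self.max_retries*2` in the timeout message in the next hunk reflects the same arithmetic. Steps that load large models can widen the budget from the decorator; a sketch with placeholder values:

    # ~240 * 2s = about 8 minutes of polling, with a progress line every 20 polls.
    @vllm(
        model="meta-llama/Llama-3.2-1B",
        openai_api_server=True,
        max_retries=240,
        retry_alert_frequency=20,
    )
    @step
    def serve(self):
        ...
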
@@ -322,7 +471,7 @@ class VLLMManager:
              retries += 1

          if not self._is_port_open(self.host, self.port, timeout=2):
-             error_details = f"vLLM server did not start listening on {self.host}:{self.port} after {max_retries*2}s"
+             error_details = f"vLLM server did not start listening on {self.host}:{self.port} after {self.max_retries*2}s"
              self.processes[process.pid]["properties"][
                  "error_details"
              ] = error_details
@@ -342,6 +491,7 @@ class VLLMManager:
              "Running", uptime_start=datetime.now(), model=self.model
          )
          self._log_event("success", "vLLM server is ready and listening")
+         print(f"[@vllm] Server ready!")

          self._update_model_status(self.model, status="Ready")

metaflow_extensions/outerbounds/remote_config.py
@@ -11,6 +11,11 @@ from metaflow_extensions.outerbounds.plugins.perimeters import (
      get_perimeter_config_url_if_set_in_ob_config,
  )

+
+ class OuterboundsConfigException(MetaflowException):
+     _OB_CONFIG_EXCEPTION = True
+
+
  OBP_REMOTE_CONFIG_KEY = "OBP_METAFLOW_CONFIG_URL"
  HOSTNAME_KEY = "OBP_API_SERVER"
  AUTH_KEY = "METAFLOW_SERVICE_AUTH_KEY"
@@ -31,7 +36,7 @@ def read_config_from_local() -> Optional[Path]:

      # we should error because the user wants a specific config
      if profile:
-         raise MetaflowException(
+         raise OuterboundsConfigException(
              f"Unable to locate METAFLOW_PROFILE {profile} in {config_path}"
          )

@@ -55,7 +60,7 @@ def resolve_config_from_remote(remote_url: str, auth_token: str) -> Dict[str, st
          data = response.json()
          return data["config"]
      except HTTPError:
-         raise MetaflowException(
+         raise OuterboundsConfigException(
              "Error fetching resolving configuration. Make sure you have run \
              `outerbounds configure` with the correct value"
          )
@@ -81,7 +86,7 @@ def init_config() -> Dict[str, str]:
      try:
          remote_config = json.loads(config_path.read_text())
      except ValueError:
-         raise MetaflowException(
+         raise OuterboundsConfigException(
              "Error decoding your metaflow config. Please run the `outerbounds configure` \
              command with the string provided in the Outerbounds dashboard"
          )
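
The `OuterboundsConfigException` introduced at the top of this file carries an `_OB_CONFIG_EXCEPTION` marker attribute, so callers can recognize configuration failures without importing this module. A sketch of that calling pattern; the calling code here is illustrative, not part of the package:

    from metaflow.exception import MetaflowException

    try:
        config = init_config()
    except MetaflowException as e:
        if getattr(type(e), "_OB_CONFIG_EXCEPTION", False):
            print("Outerbounds configuration problem; re-run `outerbounds configure`.")
        else:
            raise
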