ob-metaflow-extensions 1.1.151__py2.py3-none-any.whl → 1.6.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +24 -3
  3. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +16 -0
  5. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  6. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +333 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +1029 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +1300 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/exceptions.py +341 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +123 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  33. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  34. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  35. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  36. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
  37. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  38. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
  39. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +119 -0
  40. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
  41. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
  42. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  43. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  44. metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
  45. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  46. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  47. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  48. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  49. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +32 -8
  50. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +1 -1
  51. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
  52. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  53. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  54. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
  55. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  56. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +49 -0
  57. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  58. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  59. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  60. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  61. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  62. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  63. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  64. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  65. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  66. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +37 -7
  67. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
  68. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
  69. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
  70. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
  71. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
  72. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  73. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  74. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  75. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  76. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  77. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  78. metaflow_extensions/outerbounds/remote_config.py +46 -9
  79. metaflow_extensions/outerbounds/toplevel/apps/__init__.py +9 -0
  80. metaflow_extensions/outerbounds/toplevel/apps/exceptions.py +11 -0
  81. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +86 -2
  82. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  83. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  84. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  85. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  86. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  87. {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/METADATA +2 -2
  88. ob_metaflow_extensions-1.6.2.dist-info/RECORD +136 -0
  89. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  90. ob_metaflow_extensions-1.1.151.dist-info/RECORD +0 -74
  91. {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/WHEEL +0 -0
  92. {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/top_level.txt +0 -0
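Note: the single hunk below is rendered without a file header; judging by its size (+621 lines) and contents, it appears to be the new metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py listed as item 77 above. As a rough, hypothetical sketch of how a flow might use the decorator built on these managers (the @vllm decorator, current.vllm, the default host/port, and the model name are taken from the docstrings and defaults in the hunk; the flow structure and the top-level import are assumptions, not code from this package):

    from metaflow import FlowSpec, step, current, vllm  # assumes @vllm is re-exported at the metaflow top level

    class HelloVLLM(FlowSpec):
        @vllm(model="meta-llama/Llama-3.2-1B")  # model name reused from the docstring example below
        @step
        def start(self):
            import openai

            # VLLMOpenAIManager launches an OpenAI-compatible server on 127.0.0.1:8000 (its defaults).
            client = openai.OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="token-abc123")
            resp = client.chat.completions.create(
                model="meta-llama/Llama-3.2-1B",
                messages=[{"role": "user", "content": "Hello"}],
            )
            print(resp.choices[0].message.content)
            self.next(self.end)

        @step
        def end(self):
            pass

    if __name__ == "__main__":
        HelloVLLM()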
@@ -0,0 +1,621 @@
+ import subprocess
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import time
+ import socket
+ import sys
+ import os
+ import requests
+ import threading
+ from datetime import datetime
+
+ from .constants import VLLM_SUFFIX
+
+
+ class ProcessStatus:
+     RUNNING = "RUNNING"
+     FAILED = "FAILED"
+     SUCCESSFUL = "SUCCESSFUL"
+
+
+ class VLLMPyManager:
+     """
+     A native vLLM engine manager that provides direct access to the vLLM LLM class.
+     This replaces the subprocess-based API server approach with direct Python API access.
+
+     Example usage:
+         from vllm.sampling_params import SamplingParams, GuidedDecodingParams
+
+         engine = current.vllm.engine
+         sampling_params = SamplingParams(temperature=0.7, max_tokens=150)
+         outputs = engine.generate(["Hello, world!"], sampling_params)
+
+         # Structured outputs
+         guided_params = GuidedDecodingParams(json=my_schema)
+         sampling_params = SamplingParams(guided_decoding=guided_params)
+         outputs = engine.generate(prompts, sampling_params)
+     """
+
+     def __init__(
+         self,
+         model,
+         debug=False,
+         **engine_args,
+     ):
+         if isinstance(model, list):
+             if len(model) != 1:
+                 raise ValueError(
+                     f"vLLM native engine can only serve one model per instance. "
+                     f"Got {len(model)} models: {model}. "
+                     f"Please specify a single model or create multiple @vllm decorators."
+                 )
+             self.model = model[0]
+         else:
+             self.model = model
+
+         self.debug = debug
+         self.engine_args = engine_args
+         self.engine = None
+         self.initialization_start = time.time()
+
+         if self.debug:
+             print(
+                 f"[@vllm-native] Initializing native vLLM engine for model: {self.model}"
+             )
+
+         self._validate_vllm_installation()
+         self._initialize_engine()
+
+         total_init_time = time.time() - self.initialization_start
+         if self.debug:
+             print(
+                 f"[@vllm-native] Native engine initialization completed in {total_init_time:.1f}s"
+             )
+
+     def _validate_vllm_installation(self):
+         """Validate that vLLM is properly installed"""
+         try:
+             import vllm
+
+             if self.debug:
+                 print(f"[@vllm-native] vLLM {vllm.__version__} is available")
+         except ImportError as e:
+             raise ImportError(
+                 "vLLM not installed. Please add vLLM to your environment."
+             ) from e
+
+     def _map_engine_args(self, engine_args):
+         """
+         Map CLI-style engine_args to LLM constructor parameters.
+         Most parameters map directly from the API server CLI args to the LLM constructor.
+         """
+         llm_params = {}
+
+         # Direct mappings (parameter names are the same)
+         direct_mapping = [
+             "tensor_parallel_size",
+             "max_model_len",
+             "gpu_memory_utilization",
+             "swap_space",
+             "dtype",
+             "quantization",
+             "seed",
+             "trust_remote_code",
+             "revision",
+             "tokenizer_revision",
+             "enforce_eager",
+             "max_seq_len_to_capture",
+             "disable_custom_all_reduce",
+         ]
+
+         for param in direct_mapping:
+             if param in engine_args:
+                 llm_params[param] = engine_args[param]
+
+         # Handle special mappings if needed
+         # (Most/all vLLM CLI args map directly to LLM constructor args)
+
+         return llm_params
+
+     def _initialize_engine(self):
+         """Initialize the native vLLM LLM engine"""
+         try:
+             from vllm import LLM
+
+             # Map engine args to LLM constructor parameters
+             llm_params = self._map_engine_args(self.engine_args)
+
+             if self.debug:
+                 print(f"[@vllm] Initializing LLM with params: {llm_params}")
+
+             # Initialize the native vLLM engine
+             self.engine = LLM(model=self.model, **llm_params)
+
+             if self.debug:
+                 print(f"[@vllm] LLM engine initialized successfully")
+
+         except Exception as e:
+             error_msg = f"Failed to initialize vLLM engine: {str(e)}"
+             if self.debug:
+                 print(f"[@vllm-native] ERROR: {error_msg}")
+             raise RuntimeError(error_msg) from e
+
+     def terminate_engine(self):
+         """
+         Clean up the native engine.
+         The LLM class handles cleanup automatically when the object is destroyed.
+         """
+         if self.debug:
+             print("[@vllm-native] Cleaning up vLLM engine")
+
+         # The vLLM LLM class handles cleanup automatically;
+         # we just need to clear our reference.
+         if self.engine:
+             del self.engine
+             self.engine = None
+
+         if self.debug:
+             print("[@vllm] Engine cleanup completed")
+
+
+ class VLLMOpenAIManager:
+     """
+     A process manager for vLLM runtimes.
+     Implements the @vllm(model=..., ...) interface to provide a local backend.
+     It wraps the vLLM OpenAI-compatible API server to make it easier to profile vLLM use on Outerbounds.
+
+     NOTE: vLLM's OpenAI-compatible server serves ONE model per server instance.
+     If you need multiple models, you must create multiple server instances.
+
+     Example usage:
+         from vllm import LLM
+         llm = LLM(model="meta-llama/Llama-3.2-1B")
+         llm.generate("Hello, world!")
+
+     Or via OpenAI-compatible API:
+         import openai
+         client = openai.OpenAI(
+             base_url="http://localhost:8000/v1",
+             api_key="token-abc123"
+         )
+         response = client.chat.completions.create(
+             model="meta-llama/Llama-3.2-1B",
+             messages=[{"role": "user", "content": "Hello"}]
+         )
+     """
+
+     def __init__(
+         self,
+         model,
+         backend="local",
+         debug=False,
+         status_card=None,
+         port=8000,
+         host="127.0.0.1",
+         stream_logs_to_card=False,
+         max_retries=60,
+         retry_alert_frequency=5,
+         **vllm_args,
+     ):
+         # Validate that only a single model is provided
+         if isinstance(model, list):
+             if len(model) != 1:
+                 raise ValueError(
+                     f"vLLM server can only serve one model per instance. "
+                     f"Got {len(model)} models: {model}. "
+                     f"Please specify a single model or create multiple @vllm decorators."
+                 )
+             self.model = model[0]
+         else:
+             self.model = model
+
+         self.processes = {}
+         self.debug = debug
+         self.stream_logs_to_card = stream_logs_to_card
+         self.stats = {}
+         self.port = port
+         self.host = host
+         self.vllm_url = f"http://{host}:{port}"
+         self.status_card = status_card
+         self.initialization_start = time.time()
+         self.server_process = None
+         self.max_retries = max_retries
+         self.retry_alert_frequency = retry_alert_frequency
+         self.vllm_args = vllm_args
+
+         if backend != "local":
+             raise ValueError(
+                 "VLLMManager only supports the 'local' backend at this time."
+             )
+
+         self._log_event("info", "Starting vLLM initialization")
+         self._update_server_status("Initializing")
+
+         self._timeit(self._install_vllm, "install_vllm")
+         self._timeit(self._launch_vllm_server, "launch_server")
+         self._collect_version_info()
+
+         total_init_time = time.time() - self.initialization_start
+         self._update_performance("total_initialization_time", total_init_time)
+         self._log_event(
+             "success", f"vLLM initialization completed in {total_init_time:.1f}s"
+         )
+
+     def _log_event(self, event_type, message):
+         if self.status_card:
+             self.status_card.add_event(event_type, message)
+         if self.debug:
+             print(f"[@vllm] {event_type.upper()}: {message}")
+
+     def _update_server_status(self, status, **kwargs):
+         if self.status_card:
+             update_data = {"status": status}
+             update_data.update(kwargs)
+             self.status_card.update_status("server", update_data)
+
+     def _update_model_status(self, model_name, **kwargs):
+         if self.status_card:
+             current_models = self.status_card.status_data.get("models", {})
+             if model_name not in current_models:
+                 current_models[model_name] = {}
+             current_models[model_name].update(kwargs)
+             self.status_card.update_status("models", current_models)
+
+     def _update_performance(self, metric, value):
+         if self.status_card:
+             self.status_card.update_status("performance", {metric: value})
+
+     def _timeit(self, f, name):
+         t0 = time.time()
+         f()
+         tf = time.time()
+         duration = tf - t0
+         self.stats[name] = {"process_runtime": duration}
+
+         if name == "install_vllm":
+             self._update_performance("install_time", duration)
+         elif name == "launch_server":
+             self._update_performance("server_startup_time", duration)
+
+     def _stream_output(self, stream, prefix):
+         """Reads and logs output from a stream."""
+         for line in iter(stream.readline, ""):
+             if line:
+                 line = line.strip()
+                 if self.stream_logs_to_card and self.status_card:
+                     self.status_card.add_log_line(f"[{prefix}] {line}")
+                 elif self.debug:
+                     print(f"[{prefix}] {line}")
+         stream.close()
+
+     def _is_port_open(self, host, port, timeout=1):
+         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+             sock.settimeout(timeout)
+             try:
+                 sock.connect((host, port))
+                 return True
+             except socket.error:
+                 return False
+
+     def _install_vllm(self):
+         self._log_event("info", "Checking for existing vLLM installation")
+         try:
+             import vllm
+
+             self._log_event("success", f"vLLM {vllm.__version__} is already installed")
+             if self.debug:
+                 print(f"[@vllm] vLLM {vllm.__version__} is already installed.")
+             return
+         except ImportError as e:
+             self._log_event(
+                 "error", "vLLM not installed. Please add it to your environment."
+             )
+             if self.debug:
+                 print(
+                     "[@vllm] vLLM not found. The user is responsible for installation."
+                 )
+             raise e
+         # We are not installing it automatically to respect the user's environment management.
+
+     def _launch_vllm_server(self):
+         self._update_server_status("Starting")
+         self._log_event("info", f"Starting vLLM server with model: {self.model}")
+
+         # Check if the model is cached
+         hf_home = os.environ.get("HF_HOME")
+         if hf_home:
+             # Construct the expected cache path for the model
+             model_path_id = f"models--{self.model.replace('/', '--')}"
+             model_cache_path = os.path.join(hf_home, model_path_id)
+             if os.path.exists(model_cache_path):
+                 self._log_event("info", f"Found cached model at: {model_cache_path}")
+                 self._update_model_status(
+                     self.model, status="Found in cache", location=model_cache_path
+                 )
+             else:
+                 self._log_event(
+                     "warning",
+                     f"Cached model not found at {model_cache_path}. vLLM will attempt to download it.",
+                 )
+                 self._update_model_status(self.model, status="Downloading")
+         else:
+             self._log_event(
+                 "warning",
+                 "HF_HOME environment variable not set. vLLM will use default cache location and may re-download.",
+             )
+
+         if not self.model:
+             raise ValueError("At least one model must be specified for @vllm.")
+
+         try:
+             if self.debug:
+                 print(
+                     f"[@vllm] Starting vLLM OpenAI-compatible server for model: {self.model}"
+                 )
+
+             ### NOTE: This is not the only way to start the vLLM server.
+             # https://docs.vllm.ai/en/v0.9.0/api/vllm/entrypoints/openai/api_server.html
+
+             # There are other APIs we should consider using in a future extension:
+             # https://docs.vllm.ai/en/stable/api/vllm/entrypoints/openai/run_batch.html#vllm.entrypoints.openai.run_batch
+             # https://docs.vllm.ai/en/v0.9.0/api/vllm/entrypoints/openai/serving_embedding.html
+             # MANY MORE!!! Wait for some feedback and we can add more.
+             cmd = [
+                 sys.executable,
+                 "-m",
+                 "vllm.entrypoints.openai.api_server",
+                 "--model",
+                 self.model,
+                 "--host",
+                 self.host,
+                 "--port",
+                 str(self.port),
+             ]
+
+             vllm_args_copy = self.vllm_args.copy()
+             if self.debug or self.stream_logs_to_card:
+                 # Note: This is an undocumented argument for the vLLM OpenAI server entrypoint.
+                 # It was useful for debugging the vLLM server startup, but is
+                 # likely more confusing than it is worth for end users.
+                 vllm_args_copy.setdefault("uvicorn_log_level", "debug")
+
+             for key, value in vllm_args_copy.items():
+                 arg_name = f"--{key.replace('_', '-')}"
+                 if isinstance(value, bool):
+                     if value:
+                         cmd.append(arg_name)
+                 elif value is not None:
+                     cmd.append(arg_name)
+                     cmd.append(str(value))
+
+             # For debugging, log the exact command being run to the status card
+             command_str = " ".join(cmd)
+             self._log_event("info", f"Launch Command: `{command_str}`")
+             if self.debug:
+                 print(f"[@vllm] Launching vLLM with command: {command_str}")
+
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 bufsize=1,  # Line-buffered
+             )
+
+             # Threads to stream subprocess output
+             if self.debug or self.stream_logs_to_card:
+                 stdout_thread = threading.Thread(
+                     target=self._stream_output,
+                     args=(process.stdout, "@vllm-server-out"),
+                 )
+                 stderr_thread = threading.Thread(
+                     target=self._stream_output,
+                     args=(process.stderr, "@vllm-server-err"),
+                 )
+                 stdout_thread.daemon = True
+                 stderr_thread.daemon = True
+                 stdout_thread.start()
+                 stderr_thread.start()
+
+             self.server_process = process
+             self.processes[process.pid] = {
+                 "p": process,
+                 "properties": {
+                     "type": "vllm-server",
+                     "model": self.model,
+                     "error_details": None,
+                 },
+                 "status": ProcessStatus.RUNNING,
+             }
+
+             if self.debug:
+                 print(f"[@vllm] Started vLLM server process with PID {process.pid}")
+
+             retries = 0
+             while (
+                 not self._is_port_open(self.host, self.port, timeout=2)
+                 and retries < self.max_retries
+             ):
+                 if retries == 0:
+                     print("[@vllm] Waiting for server to be ready...")
+                 elif retries % self.retry_alert_frequency == 0:
+                     print(
+                         f"[@vllm] Still waiting for server... ({retries}/{self.max_retries})"
+                     )
+
+                 returncode = process.poll()
+                 if returncode is not None:
+                     if self.debug or self.stream_logs_to_card:
+                         # Threads are handling output, so we can't use communicate();
+                         # the error has already been printed to the log by the thread.
+                         if self.stream_logs_to_card:
+                             details_msg = "See card for logs."
+                         else:
+                             details_msg = "See logs from @vllm-server-err for details."
+                         error_details = f"Return code: {returncode}. {details_msg}"
+                     else:
+                         # No threads, so we can and should use communicate() to get stderr.
+                         stdout, stderr = process.communicate()
+                         error_details = f"Return code: {returncode}, stderr: {stderr}"
+
+                     self.processes[process.pid]["properties"][
+                         "error_details"
+                     ] = error_details
+                     self.processes[process.pid]["status"] = ProcessStatus.FAILED
+                     self._update_server_status("Failed", error_details=error_details)
+                     self._log_event(
+                         "error", f"vLLM server failed to start: {error_details}"
+                     )
+                     raise RuntimeError(f"vLLM server failed to start: {error_details}")
+
+                 time.sleep(2)
+                 retries += 1
+
+             if not self._is_port_open(self.host, self.port, timeout=2):
+                 error_details = f"vLLM server did not start listening on {self.host}:{self.port} after {self.max_retries*2}s"
+                 self.processes[process.pid]["properties"][
+                     "error_details"
+                 ] = error_details
+                 self.processes[process.pid]["status"] = ProcessStatus.FAILED
+                 self._update_server_status("Failed", error_details=error_details)
+                 self._log_event("error", f"Server startup timeout: {error_details}")
+                 raise RuntimeError(f"vLLM server failed to start: {error_details}")
+
+             if not self._verify_server_health():
+                 error_details = "vLLM server started but failed health check"
+                 self.processes[process.pid]["status"] = ProcessStatus.FAILED
+                 self._update_server_status("Failed", error_details=error_details)
+                 self._log_event("error", error_details)
+                 raise RuntimeError(error_details)
+
+             self._update_server_status(
+                 "Running", uptime_start=datetime.now(), model=self.model
+             )
+             self._log_event("success", "vLLM server is ready and listening")
+             print(f"[@vllm] Server ready!")
+
+             self._update_model_status(self.model, status="Ready")
+
+             if self.debug:
+                 print("[@vllm] Server is ready.")
+
+         except Exception as e:
+             if process and process.pid in self.processes:
+                 self.processes[process.pid]["status"] = ProcessStatus.FAILED
+                 self.processes[process.pid]["properties"]["error_details"] = str(e)
+             self._update_server_status("Failed", error_details=str(e))
+             self._log_event("error", f"Error starting vLLM server: {str(e)}")
+             raise RuntimeError(f"Error starting vLLM server: {e}") from e
+
+     def _verify_server_health(self):
+         try:
+             response = requests.get(f"{self.vllm_url}/v1/models", timeout=10)
+             if response.status_code == 200:
+                 if self.debug:
+                     models_data = response.json()
+                     available_models = [
+                         m.get("id", "unknown") for m in models_data.get("data", [])
+                     ]
+                     print(
+                         f"[@vllm] Health check OK. Available models: {available_models}"
+                     )
+                 return True
+             else:
+                 if self.debug:
+                     print(
+                         f"[@vllm] Health check failed with status {response.status_code}"
+                     )
+                 return False
+         except Exception as e:
+             if self.debug:
+                 print(f"[@vllm] Health check exception: {e}")
+             return False
+
+     def _collect_version_info(self):
+         version_info = {}
+         try:
+             import vllm
+
+             version_info["vllm"] = getattr(vllm, "__version__", "Unknown")
+         except ImportError:
+             version_info["vllm"] = "Not installed"
+         except Exception as e:
+             version_info["vllm"] = "Error detecting"
+             if self.debug:
+                 print(f"[@vllm] Error getting vLLM version: {e}")
+
+         if self.status_card:
+             self.status_card.update_status("versions", version_info)
+         self._log_event(
+             "info", f"vLLM version: {version_info.get('vllm', 'Unknown')}"
+         )
+
+     def terminate_models(self):
+         shutdown_start_time = time.time()
+         self._log_event("info", "Starting vLLM shutdown sequence")
+         if self.debug:
+             print("[@vllm] Shutting down vLLM server...")
+
+         server_shutdown_cause = "graceful"
+
+         if self.server_process:
+             try:
+                 self._update_server_status("Stopping")
+                 self._log_event("info", "Stopping vLLM server")
+
+                 # Clear model status since server is shutting down
+                 self._update_model_status(self.model, status="Stopping")
+
+                 self.server_process.terminate()
+                 try:
+                     self.server_process.wait(timeout=10)
+                     if self.debug:
+                         print("[@vllm] Server terminated gracefully")
+                 except subprocess.TimeoutExpired:
+                     server_shutdown_cause = "force_kill"
+                     self._log_event(
+                         "warning",
+                         "vLLM server did not terminate gracefully, killing...",
+                     )
+                     if self.debug:
+                         print("[@vllm] Server did not terminate, killing...")
+                     self.server_process.kill()
+                     self.server_process.wait()
+
+                 if self.server_process.pid in self.processes:
+                     self.processes[self.server_process.pid][
+                         "status"
+                     ] = ProcessStatus.SUCCESSFUL
+
+                 self._update_server_status("Stopped")
+                 if self.status_card:
+                     self.status_card.update_status("models", {})
+
+                 self._log_event(
+                     "success", f"vLLM server stopped ({server_shutdown_cause})"
+                 )
+
+             except Exception as e:
+                 server_shutdown_cause = "failed"
+                 if self.server_process.pid in self.processes:
+                     self.processes[self.server_process.pid][
+                         "status"
+                     ] = ProcessStatus.FAILED
+                     self.processes[self.server_process.pid]["properties"][
+                         "error_details"
+                     ] = str(e)
+                 self._update_server_status("Failed to stop")
+                 if self.status_card:
+                     self.status_card.update_status("models", {})
+                 self._log_event("error", f"vLLM server shutdown error: {str(e)}")
+                 if self.debug:
+                     print(f"[@vllm] Warning: Error terminating vLLM server: {e}")
+
+         total_shutdown_time = time.time() - shutdown_start_time
+         self._update_performance("total_shutdown_time", total_shutdown_time)
+         self._update_performance("shutdown_cause", server_shutdown_cause)
+
+         self._log_event(
+             "success", f"vLLM shutdown completed in {total_shutdown_time:.1f}s"
+         )
+         if self.debug:
+             print("[@vllm] vLLM server shutdown complete.")