ob-metaflow-extensions 1.1.166rc6__tar.gz → 1.1.168rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (97) hide show
  1. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/PKG-INFO +1 -1
  2. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/__init__.py +1 -0
  3. ob_metaflow_extensions-1.1.168rc0/metaflow_extensions/outerbounds/plugins/vllm/__init__.py +177 -0
  4. ob_metaflow_extensions-1.1.168rc0/metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  5. ob_metaflow_extensions-1.1.168rc0/metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  6. ob_metaflow_extensions-1.1.168rc0/metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  7. ob_metaflow_extensions-1.1.168rc0/metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +471 -0
  8. ob_metaflow_extensions-1.1.168rc0/metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  9. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  10. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/ob_metaflow_extensions.egg-info/SOURCES.txt +6 -0
  11. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/setup.py +1 -1
  12. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/README.md +0 -0
  13. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/__init__.py +0 -0
  14. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  15. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  16. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  17. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/apps/app_utils.py +0 -0
  18. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/apps/consts.py +0 -0
  19. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +0 -0
  20. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +0 -0
  21. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  22. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/aws/__init__.py +0 -0
  23. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/aws/assume_role.py +0 -0
  24. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +0 -0
  25. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  26. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +0 -0
  27. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +0 -0
  28. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +0 -0
  29. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +0 -0
  30. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +0 -0
  31. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +0 -0
  32. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  33. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +0 -0
  34. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
  35. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  36. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  37. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  38. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  39. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
  40. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +0 -0
  41. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nim/card.py +0 -0
  42. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +0 -0
  43. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  44. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nim/utils.py +0 -0
  45. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  46. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +0 -0
  47. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +0 -0
  48. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
  49. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
  50. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
  51. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
  52. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvcf/utils.py +0 -0
  53. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  54. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +0 -0
  55. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct.py +0 -0
  56. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +0 -0
  57. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +0 -0
  58. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +0 -0
  59. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/nvct/utils.py +0 -0
  60. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/ollama/__init__.py +0 -0
  61. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/ollama/constants.py +0 -0
  62. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +0 -0
  63. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/ollama/ollama.py +0 -0
  64. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/ollama/status_card.py +0 -0
  65. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  66. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +0 -0
  67. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +0 -0
  68. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  69. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/secrets/secrets.py +0 -0
  70. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +0 -0
  71. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +0 -0
  72. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  73. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
  74. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  75. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  76. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  77. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  78. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  79. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  80. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
  81. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +0 -0
  82. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  83. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
  84. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  85. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  86. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  87. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/ob_internal.py +0 -0
  88. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  89. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  90. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  91. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +0 -0
  92. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +0 -0
  93. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +0 -0
  94. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  95. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
  96. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  97. {ob_metaflow_extensions-1.1.166rc6 → ob_metaflow_extensions-1.1.168rc0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob_metaflow_extensions
3
- Version: 1.1.166rc6
3
+ Version: 1.1.168rc0
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -345,6 +345,7 @@ STEP_DECORATORS_DESC = [
345
345
  ("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
346
346
  ("nim", ".nim.nim_decorator.NimDecorator"),
347
347
  ("ollama", ".ollama.OllamaDecorator"),
348
+ ("vllm", ".vllm.VLLMDecorator"),
348
349
  ("app_deploy", ".apps.deploy_decorator.WorkstationAppDeployDecorator"),
349
350
  ]
350
351
 
@@ -0,0 +1,177 @@
1
+ from metaflow.decorators import StepDecorator
2
+ from metaflow import current
3
+ import functools
4
+ import os
5
+ import threading
6
+ from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
7
+ from metaflow.metaflow_config import from_conf
8
+
9
+ from .vllm_manager import VLLMManager
10
+ from .status_card import VLLMStatusCard, CardDecoratorInjector
11
+
12
+ __mf_promote_submodules__ = ["plugins.vllm"]
13
+
14
+
15
class VLLMDecorator(StepDecorator, CardDecoratorInjector):
    """
    This decorator is used to run vllm APIs as Metaflow task sidecars.

    User code call
    --------------
    @vllm(
        model="...",
        ...
    )

    Valid backend options
    ---------------------
    - 'local': Run as a separate process on the local task machine.

    Valid model options
    -------------------
    Any HuggingFace model identifier, e.g. 'meta-llama/Llama-3.2-1B'

    NOTE: vLLM's OpenAI-compatible server serves ONE model per server instance.
    If you need multiple models, you must create multiple @vllm decorators.

    Parameters
    ----------
    model: str
        HuggingFace model identifier to be served by vLLM. Required.
    backend: str
        Determines where and how to run the vLLM process.
    debug: bool
        Whether to turn on verbose debugging logs.
    stream_logs_to_card: bool
        Forwarded to `VLLMManager` as `stream_logs_to_card`.
    card_refresh_interval: int
        Seconds between refreshes of the `vllm_status` card.
    engine_args: dict
        Extra settings, unpacked as keyword arguments into `VLLMManager`
        (presumably forwarded to the vLLM engine; confirm in VLLMManager).
        For example, `engine_args={"tensor_parallel_size": 2}`.
    """

    name = "vllm"
    defaults = {
        "model": None,
        "backend": "local",
        "debug": False,
        "stream_logs_to_card": False,
        "card_refresh_interval": 10,
        "engine_args": {},
    }

    def step_init(
        self, flow, graph, step_name, decorators, environment, flow_datastore, logger
    ):
        """
        Validate the decorator attributes and attach the status card to the step.

        Raises
        ------
        ValueError
            If no `model` attribute was given.
        """
        super().step_init(
            flow, graph, step_name, decorators, environment, flow_datastore, logger
        )

        # Validate that a model is specified
        if not self.attributes["model"]:
            raise ValueError(
                f"@vllm decorator on step '{step_name}' requires a 'model' parameter. "
                f"Example: @vllm(model='meta-llama/Llama-3.2-1B')"
            )

        # Attach the vllm status card
        self.attach_card_decorator(
            flow,
            step_name,
            "vllm_status",
            "blank",
            refresh_interval=self.attributes["card_refresh_interval"],
        )

    def task_decorate(
        self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
    ):
        """
        Wrap the step function so that it runs with a VLLMManager and a
        background thread that keeps the `vllm_status` card up to date.

        Returns
        -------
        callable
            Zero-argument wrapper that sets up the manager, runs `step_func`,
            and always tears the manager and the card monitor down afterwards.
        """

        @functools.wraps(step_func)
        def vllm_wrapper():
            self.vllm_manager = None
            self.status_card = None
            self.card_monitor_thread = None

            debug = self.attributes["debug"]
            refresh_interval = self.attributes["card_refresh_interval"]
            # An Event (rather than a flag + time.sleep) lets the monitor wake
            # up immediately when asked to stop, so join() does not time out
            # while the thread is still sleeping through a refresh interval.
            stop_monitor = threading.Event()

            def monitor_card():
                try:
                    self.status_card.on_startup(current.card["vllm_status"])

                    while not stop_monitor.is_set():
                        try:
                            self.status_card.on_update(
                                current.card["vllm_status"], None
                            )
                            stop_monitor.wait(refresh_interval)
                        except Exception as e:
                            if debug:
                                print(f"[@vllm] Card monitoring error: {e}")
                            break
                except Exception as e:
                    if debug:
                        print(f"[@vllm] Card monitor thread error: {e}")
                    self.status_card.on_error(current.card["vllm_status"], str(e))

            def stop_card_monitor():
                # Signal the monitor thread and wait briefly for it to exit.
                if self.card_monitor_thread is None:
                    return
                stop_monitor.set()
                self.card_monitor_thread.join(timeout=5)
                if debug:
                    print("[@vllm] Card monitoring thread stopped.")

            try:
                self.status_card = VLLMStatusCard(refresh_interval=refresh_interval)

                self.card_monitor_thread = threading.Thread(
                    target=monitor_card, daemon=True
                )
                self.card_monitor_thread.start()
                self.vllm_manager = VLLMManager(
                    model=self.attributes["model"],
                    backend=self.attributes["backend"],
                    debug=debug,
                    status_card=self.status_card,
                    stream_logs_to_card=self.attributes["stream_logs_to_card"],
                    **(self.attributes["engine_args"] or {}),
                )
                if debug:
                    print("[@vllm] VLLMManager initialized.")

            except Exception as e:
                # Stop the monitor first so a periodic re-render cannot wipe
                # the error message written to the card below.
                stop_card_monitor()
                if self.status_card:
                    self.status_card.add_event(
                        "error", f"Initialization failed: {str(e)}"
                    )
                    try:
                        self.status_card.on_error(current.card["vllm_status"], str(e))
                    except Exception as card_error:
                        # Best effort: the original failure is what matters.
                        if debug:
                            print(f"[@vllm] Error card update failed: {card_error}")
                print(f"[@vllm] Error initializing VLLMManager: {e}")
                raise

            try:
                if self.status_card:
                    self.status_card.add_event("info", "Starting user step function")
                step_func()
                if self.status_card:
                    self.status_card.add_event(
                        "success", "User step function completed successfully"
                    )
            finally:
                try:
                    if self.vllm_manager:
                        self.vllm_manager.terminate_models()
                finally:
                    # Stop the monitor before the last render so two threads
                    # never rebuild the same card at once.
                    stop_card_monitor()

                    if self.card_monitor_thread and self.status_card:
                        import time

                        try:
                            self.status_card.on_update(
                                current.card["vllm_status"], None
                            )
                        except Exception as e:
                            if debug:
                                print(f"[@vllm] Final card update error: {e}")
                        # NOTE(review): presumably gives the final refresh time
                        # to be picked up before the task exits; confirm.
                        time.sleep(2)

        return vllm_wrapper
@@ -0,0 +1 @@
1
+ from metaflow.exception import MetaflowException
@@ -0,0 +1,352 @@
1
+ from metaflow.cards import Markdown, Table, VegaChart
2
+ from metaflow.metaflow_current import current
3
+ from datetime import datetime
4
+ import threading
5
+ import time
6
+
7
+
8
+ from metaflow.exception import MetaflowException
9
+ from collections import defaultdict
10
+
11
+
12
class CardDecoratorInjector:
    """
    Mixin useful for injecting @card decorators from other first class Metaflow decorators.

    A class-level cache keyed by step name and card id records the outcome of
    the first `attach_card_decorator` call for that pair: True when this mixin
    attached the card, False when a card with that id was already present.
    Later calls for the same pair do nothing.
    """

    # step_name -> {card_id: bool}. Defined once on the mixin, so it is shared
    # by all users of the mixin unless a subclass overrides it.
    _first_time_init = defaultdict(dict)

    @classmethod
    def _get_first_time_init_cached_value(cls, step_name, card_id):
        """Return the cached outcome for (step_name, card_id), or None if unseen."""
        return cls._first_time_init.get(step_name, {}).get(card_id, None)

    @classmethod
    def _set_first_time_init_cached_value(cls, step_name, card_id, value):
        """Record the outcome for (step_name, card_id)."""
        cls._first_time_init[step_name][card_id] = value

    def _card_deco_already_attached(self, step, card_id):
        """Return True if `step` already carries a `card` decorator with this id."""
        return any(
            decorator.name == "card"
            and decorator.attributes["id"]
            and card_id == decorator.attributes["id"]
            for decorator in step.decorators
        )

    def _get_step(self, flow, step_name):
        """Return the step of `flow` named `step_name`, or None if there is none."""
        for step in flow:
            if step.name == step_name:
                return step
        return None

    def _first_time_init_check(self, step_dag_node, card_id):
        """Return True when the card still has to be attached to the step."""
        return not self._card_deco_already_attached(step_dag_node, card_id)

    def attach_card_decorator(
        self,
        flow,
        step_name,
        card_id,
        card_type,
        refresh_interval=5,
    ):
        """
        Attach a `@card` decorator with the given id and type to a step.

        This method is meant to be called from `step_init` in your
        StepDecorator code, since this class is used as a Mixin.

        Raises
        ------
        MetaflowException
            If `card_id` or `card_type` is empty, or if `step_name` does not
            name a step of `flow`.
        """
        if not all([card_id, card_type]):
            raise MetaflowException(
                "`card_id` and `card_type` must both be set when calling "
                "`attach_card_decorator` on the `CardDecoratorInjector` Mixin"
            )

        # First check class level setting: this pair was already handled.
        if self._get_first_time_init_cached_value(step_name, card_id) is not None:
            return

        step_dag_node = self._get_step(flow, step_name)
        if step_dag_node is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise MetaflowException(
                "Cannot attach card `%s`: step `%s` was not found in the flow"
                % (card_id, step_name)
            )

        if self._first_time_init_check(step_dag_node, card_id):
            # Imported here because only this branch needs it.
            from metaflow import decorators as _decorators

            self._set_first_time_init_cached_value(step_name, card_id, True)
            _decorators._attach_decorators_to_step(
                step_dag_node,
                [
                    "card:type=%s,id=%s,refresh_interval=%s"
                    % (card_type, card_id, str(refresh_interval))
                ],
            )
        else:
            self._set_first_time_init_cached_value(step_name, card_id, False)
78
+
79
+
80
class CardRefresher:
    """
    Base class listing the hooks a refreshable card implementation provides.

    Every hook raises NotImplementedError here; subclasses override them.
    """

    # Identifier of the card; None in this base class.
    CARD_ID = None

    def on_startup(self, current_card):
        """Hook for the first render of the card."""
        raise NotImplementedError("on_startup method must be implemented")

    def on_error(self, current_card, error_message):
        """Hook for rendering an error message on the card."""
        raise NotImplementedError("on_error method must be implemented")

    def on_update(self, current_card, data_object):
        """Hook for re-rendering the card with new data."""
        raise NotImplementedError("on_update method must be implemented")

    def sqlite_fetch_func(self, conn):
        """Hook for fetching the card's data; `conn` is supplied by the caller."""
        raise NotImplementedError("sqlite_fetch_func must be implemented")
95
+
96
+
97
class VLLMStatusCard(CardRefresher):
    """
    Real-time status card for vLLM system monitoring.
    Shows server health, model status, and recent events.

    An instance holds the status data in memory. Writers call
    `update_status`, `add_event` and `add_log_line`; a renderer calls
    `on_startup`, `on_update` and `on_error` with the card to draw on.
    """

    CARD_ID = "vllm_status"

    def __init__(self, refresh_interval=10):
        self.refresh_interval = refresh_interval
        self.status_data = {
            "server": {
                "status": "Starting",
                "uptime_start": None,
                "last_health_check": None,
                "health_status": "Unknown",
                "models": [],
            },
            "models": {},  # model_name -> {status, load_time, etc}
            "performance": {
                "install_time": None,
                "server_startup_time": None,
                "total_initialization_time": None,
            },
            "versions": {
                "vllm": "Detecting...",
            },
            "events": [],  # Recent events log, newest first
            "logs": [],
        }
        # Guards every read and write of self.status_data.
        self._lock = threading.Lock()
        self._already_rendered = False

    def update_status(self, category, data):
        """Merge `data` into the `category` section; unknown categories are ignored."""
        with self._lock:
            if category in self.status_data:
                self.status_data[category].update(data)

    def add_log_line(self, log_line):
        """Add a log line to the logs."""
        with self._lock:
            self.status_data["logs"].append(log_line)
            # Keep only last 20 lines
            self.status_data["logs"] = self.status_data["logs"][-20:]

    def add_event(self, event_type, message, timestamp=None):
        """
        Add an event to the timeline.

        `event_type` is one of 'info', 'warning', 'error', 'success';
        `timestamp` defaults to the current time.
        """
        if timestamp is None:
            timestamp = datetime.now()

        with self._lock:
            self.status_data["events"].insert(
                0,
                {
                    "type": event_type,
                    "message": message,
                    "timestamp": timestamp,
                },
            )
            # Keep only last 10 events
            self.status_data["events"] = self.status_data["events"][:10]

    def get_circuit_breaker_emoji(self, state):
        """Get status emoji for circuit breaker state"""
        emoji_map = {"CLOSED": "🟢", "OPEN": "🔴", "HALF_OPEN": "🟡"}
        return emoji_map.get(state, "⚪")

    def get_uptime_string(self, start_time):
        """Return the time elapsed since `start_time` as e.g. '1h 2m 3s'."""
        if not start_time:
            return "Not started"

        uptime = datetime.now() - start_time
        hours, remainder = divmod(int(uptime.total_seconds()), 3600)
        minutes, seconds = divmod(remainder, 60)

        if hours > 0:
            return f"{hours}h {minutes}m {seconds}s"
        elif minutes > 0:
            return f"{minutes}m {seconds}s"
        else:
            return f"{seconds}s"

    def on_startup(self, current_card):
        """Initialize the card when monitoring starts"""
        current_card.append(Markdown("# 🚀 `@vllm` Status Dashboard"))
        current_card.append(Markdown("_Initializing vLLM system..._"))
        current_card.refresh()

    def render_card_fresh(self, current_card, data):
        """
        Clear the card and render it again from `data`.

        `data` has the same layout as `self.status_data`.
        """
        self._already_rendered = True
        current_card.clear()

        current_card.append(Markdown("# 🚀 `@vllm` Status Dashboard"))

        versions = data.get("versions", {})
        vllm_version = versions.get("vllm", "Unknown")
        current_card.append(Markdown(f"**vLLM Version:** `{vllm_version}`"))

        current_card.append(
            Markdown(f"_Last updated: {datetime.now().strftime('%H:%M:%S')}_")
        )

        server_data = data["server"]
        uptime = self.get_uptime_string(server_data.get("uptime_start"))
        server_status = server_data.get("status", "Unknown")
        model = server_data.get("model", "Unknown")

        # Determine status emoji
        if server_status == "Running":
            status_emoji = "🟢"
            model_emoji = "✅"
        elif server_status == "Failed":
            status_emoji = "🔴"
            model_emoji = "❌"
        elif server_status == "Starting":
            status_emoji = "🟡"
            model_emoji = "⏳"
        else:  # Stopped, etc.
            status_emoji = "⚫"
            model_emoji = "⏹️"

        # Main status section
        current_card.append(
            Markdown(f"## {status_emoji} Server Status: {server_status}")
        )

        if server_status == "Running" and uptime:
            current_card.append(Markdown(f"**Uptime:** {uptime}"))

        # Model information - only show detailed status if server is running
        if server_status == "Running":
            current_card.append(Markdown(f"## {model_emoji} Model: `{model}`"))

            # Show model-specific status if available
            models_data = data.get("models", {})
            if models_data and model in models_data:
                model_info = models_data[model]
                model_status = model_info.get("status", "Unknown")
                load_time = model_info.get("load_time")
                location = model_info.get("location")

                current_card.append(Markdown(f"**Status:** {model_status}"))
                if location:
                    current_card.append(Markdown(f"**Location:** `{location}`"))
                if load_time and isinstance(load_time, (int, float)):
                    current_card.append(Markdown(f"**Load Time:** {load_time:.1f}s"))
        elif model != "Unknown":
            # Report the actual server state; a fixed "Stopped" label was
            # wrong while the server is still Starting or has Failed.
            current_card.append(
                Markdown(
                    f"## {model_emoji} Model: `{model}` (Server {server_status})"
                )
            )

        # Simplified monitoring note
        current_card.append(
            Markdown(
                "## 🔧 Monitoring\n**Advanced Features:** Disabled (Circuit Breaker, Request Interception)"
            )
        )

        # Performance metrics
        perf_data = data["performance"]
        if any(v is not None for v in perf_data.values()):
            current_card.append(Markdown("## ⚡ Performance"))

            init_metrics = []
            shutdown_metrics = []

            for metric, value in perf_data.items():
                if value is None:
                    continue
                display_value = (
                    f"{value:.1f}s" if isinstance(value, (int, float)) else value
                )
                metric_display = metric.replace("_", " ").title()

                if "shutdown" in metric.lower():
                    shutdown_metrics.append([metric_display, display_value])
                elif metric in (
                    "install_time",
                    "server_startup_time",
                    "total_initialization_time",
                ):
                    init_metrics.append([metric_display, display_value])

            if init_metrics:
                current_card.append(Markdown("### Initialization"))
                current_card.append(Table(init_metrics, headers=["Metric", "Duration"]))

            if shutdown_metrics:
                current_card.append(Markdown("### Shutdown"))
                current_card.append(
                    Table(shutdown_metrics, headers=["Metric", "Value"])
                )

        # Recent events
        events = data.get("events", [])
        if events:
            current_card.append(Markdown("## 📝 Recent Events"))
            emoji_map = {
                "info": "ℹ️",
                "success": "✅",
                "warning": "⚠️",
                "error": "❌",
            }
            for event in events[:5]:  # Events are newest first: show the 5 latest
                event_type = event.get("type", "info")
                message = event.get("message", "")
                timestamp = event.get("timestamp", datetime.now())

                emoji = emoji_map.get(event_type, "ℹ️")

                time_str = (
                    timestamp.strftime("%H:%M:%S")
                    if isinstance(timestamp, datetime)
                    else str(timestamp)
                )
                current_card.append(Markdown(f"- {emoji} `{time_str}` {message}"))

        # Server Logs
        logs = data.get("logs", [])
        if logs:
            current_card.append(Markdown("## 📜 Server Logs"))
            # The logs are appended, so they are in chronological order.
            log_content = "\n".join(logs)
            current_card.append(Markdown(f"```\n{log_content}\n```"))

        current_card.refresh()

    def on_error(self, current_card, error_message):
        """Append an error section, adding the title first if nothing was rendered yet."""
        if not self._already_rendered:
            current_card.clear()
            current_card.append(Markdown("# 🚀 `@vllm` Status Dashboard"))
        current_card.append(Markdown(f"## ❌ Error: {str(error_message)}"))
        current_card.refresh()

    def _snapshot(self):
        """Return an independent deep copy of the status data, taken under the lock."""
        import copy

        with self._lock:
            # A shallow copy would still share the nested dicts and lists with
            # writer threads, which may change them while the card is rendered.
            return copy.deepcopy(self.status_data)

    def on_update(self, current_card, data_object):
        """
        Re-render the whole card from a snapshot of the in-memory status data.

        `data_object` is accepted for interface compatibility and not used.
        """
        self.render_card_fresh(current_card, self._snapshot())

    def sqlite_fetch_func(self, conn):
        """Required by CardRefresher (which needs a refactor), but we use in-memory data instead"""
        with self._lock:
            return {"status": self.status_data}