ob-metaflow-extensions 1.1.151__py2.py3-none-any.whl → 1.4.33__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +17 -3
  3. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +146 -0
  5. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +10 -0
  6. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1200 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +146 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +12 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +161 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +868 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +288 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +139 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +398 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1088 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +303 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  33. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  34. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  35. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +78 -0
  36. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
  37. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  38. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
  39. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  40. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
  41. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
  42. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  43. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  44. metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
  45. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  46. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  47. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  48. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  49. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +32 -8
  50. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +1 -1
  51. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
  52. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  53. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  54. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
  55. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  56. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  57. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  58. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  59. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  60. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  61. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  62. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  63. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  64. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  65. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  66. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
  67. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
  68. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
  69. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  70. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  71. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  72. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  73. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  74. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  75. metaflow_extensions/outerbounds/remote_config.py +27 -3
  76. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +86 -2
  77. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  78. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  79. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  80. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  81. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  82. {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/METADATA +2 -2
  83. ob_metaflow_extensions-1.4.33.dist-info/RECORD +134 -0
  84. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  85. ob_metaflow_extensions-1.1.151.dist-info/RECORD +0 -74
  86. {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/WHEEL +0 -0
  87. {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/top_level.txt +0 -0
@@ -46,23 +46,45 @@ SUPPORTABLE_GPU_TYPES = {
46
46
  "H100": [
47
47
  {
48
48
  "n_gpus": 1,
49
- "instance_type": "GCP.GPU.H100_1x",
50
- "backend": "gcp-asia-se-1a",
49
+ "instance_type": "OCI.GPU.H100_1x",
50
+ "backend": "nvcf-dgxc-k8s-oci-nrt-prd8",
51
51
  },
52
52
  {
53
53
  "n_gpus": 2,
54
- "instance_type": "GCP.GPU.H100_2x",
55
- "backend": "gcp-asia-se-1a",
54
+ "instance_type": "OCI.GPU.H100_2x",
55
+ "backend": "nvcf-dgxc-k8s-oci-nrt-prd8",
56
56
  },
57
57
  {
58
58
  "n_gpus": 4,
59
- "instance_type": "GCP.GPU.H100_4x",
60
- "backend": "gcp-asia-se-1a",
59
+ "instance_type": "OCI.GPU.H100_4x",
60
+ "backend": "nvcf-dgxc-k8s-oci-nrt-prd8",
61
61
  },
62
62
  {
63
63
  "n_gpus": 8,
64
- "instance_type": "GCP.GPU.H100_8x",
65
- "backend": "gcp-asia-se-1a",
64
+ "instance_type": "OCI.GPU.H100_8x",
65
+ "backend": "nvcf-dgxc-k8s-oci-nrt-prd8",
66
+ },
67
+ ],
68
+ "NEBIUS_H100": [
69
+ {
70
+ "n_gpus": 1,
71
+ "instance_type": "ON-PREM.GPU.H100_1x",
72
+ "backend": "default-project-eu-north1",
73
+ },
74
+ {
75
+ "n_gpus": 2,
76
+ "instance_type": "ON-PREM.GPU.H100_2x",
77
+ "backend": "default-project-eu-north1",
78
+ },
79
+ {
80
+ "n_gpus": 4,
81
+ "instance_type": "ON-PREM.GPU.H100_4x",
82
+ "backend": "default-project-eu-north1",
83
+ },
84
+ {
85
+ "n_gpus": 8,
86
+ "instance_type": "ON-PREM.GPU.H100_8x",
87
+ "backend": "default-project-eu-north1",
66
88
  },
67
89
  ],
68
90
  }
@@ -154,6 +176,8 @@ class NvctDecorator(StepDecorator):
154
176
 
155
177
  self.attributes["instance_type"] = valid_config["instance_type"]
156
178
  self.attributes["gpu_type"] = requested_gpu_type
179
+ if self.attributes["gpu_type"] == "NEBIUS_H100":
180
+ self.attributes["gpu_type"] = "H100"
157
181
  self.attributes["backend"] = valid_config["backend"]
158
182
 
159
183
  def runtime_init(self, flow, graph, package, run_id):
@@ -124,7 +124,7 @@ class NvctRunner:
124
124
 
125
125
  request = (
126
126
  NVCTRequest(task_name)
127
- .container_image("nvcr.io/zhxkmsaasxhw/nvct-base:1.0-jovyan")
127
+ .container_image("nvcr.io/zhxkmsaasxhw/nvct-base:2.0-jovyan")
128
128
  .container_args(nvct_cmd)
129
129
  .gpu(
130
130
  gpu=self.gpu_type,
@@ -1,8 +1,11 @@
1
1
  from metaflow.decorators import StepDecorator
2
2
  from metaflow import current
3
3
  import functools
4
+ import os
5
+ import threading
4
6
 
5
- from .ollama import OllamaManager
7
+ from .ollama import OllamaManager, OllamaRequestInterceptor
8
+ from .status_card import OllamaStatusCard
6
9
  from ..card_utilities.injector import CardDecoratorInjector
7
10
 
8
11
  __mf_promote_submodules__ = ["plugins.ollama"]
@@ -13,10 +16,10 @@ class OllamaDecorator(StepDecorator, CardDecoratorInjector):
13
16
  This decorator is used to run Ollama APIs as Metaflow task sidecars.
14
17
 
15
18
  User code call
16
- -----------
19
+ --------------
17
20
  @ollama(
18
- models=['meta/llama3-8b-instruct', 'meta/llama3-70b-instruct'],
19
- backend='local'
21
+ models=[...],
22
+ ...
20
23
  )
21
24
 
22
25
  Valid backend options
@@ -26,45 +29,197 @@ class OllamaDecorator(StepDecorator, CardDecoratorInjector):
26
29
  - (TODO) 'remote': Spin up separate instance to serve Ollama models.
27
30
 
28
31
  Valid model options
29
- ----------------
30
- - 'llama3.2'
31
- - 'llama3.3'
32
- - any model here https://ollama.com/search
32
+ -------------------
33
+ Any model here https://ollama.com/search, e.g. 'llama3.2', 'llama3.3'
33
34
 
34
35
  Parameters
35
36
  ----------
36
- models: list[Ollama]
37
+ models: list[str]
37
38
  List of Ollama containers running models in sidecars.
38
39
  backend: str
39
40
  Determines where and how to run the Ollama process.
41
+ force_pull: bool
42
+ Whether to run `ollama pull` no matter what, or first check the remote cache in Metaflow datastore for this model key.
43
+ cache_update_policy: str
44
+ Cache update policy: "auto", "force", or "never".
45
+ force_cache_update: bool
46
+ Simple override for "force" cache update policy.
47
+ debug: bool
48
+ Whether to turn on verbose debugging logs.
49
+ circuit_breaker_config: dict
50
+ Configuration for circuit breaker protection. Keys: failure_threshold, recovery_timeout, reset_timeout.
51
+ timeout_config: dict
52
+ Configuration for various operation timeouts. Keys: pull, stop, health_check, install, server_startup.
40
53
  """
41
54
 
42
55
  name = "ollama"
43
- defaults = {"models": [], "backend": "local", "debug": False}
56
+ defaults = {
57
+ "models": [],
58
+ "backend": "local",
59
+ "force_pull": False,
60
+ "cache_update_policy": "auto", # "auto", "force", "never"
61
+ "force_cache_update": False, # Simple override for "force"
62
+ "debug": False,
63
+ "circuit_breaker_config": {
64
+ "failure_threshold": 3,
65
+ "recovery_timeout": 60,
66
+ "reset_timeout": 30,
67
+ },
68
+ "timeout_config": {
69
+ "pull": 600, # 10 minutes for model pulls
70
+ "stop": 30, # 30 seconds for model stops
71
+ "health_check": 5, # 5 seconds for health checks
72
+ "install": 60, # 1 minute for Ollama installation
73
+ "server_startup": 300, # 5 minutes for server startup
74
+ },
75
+ "card_refresh_interval": 10, # seconds - how often to update the status card
76
+ }
77
+
78
+ def step_init(
79
+ self, flow, graph, step_name, decorators, environment, flow_datastore, logger
80
+ ):
81
+ super().step_init(
82
+ flow, graph, step_name, decorators, environment, flow_datastore, logger
83
+ )
84
+ self.flow_datastore_backend = flow_datastore._storage_impl
85
+
86
+ # Attach the ollama status card
87
+ self.attach_card_decorator(
88
+ flow,
89
+ step_name,
90
+ "ollama_status",
91
+ "blank",
92
+ refresh_interval=self.attributes["card_refresh_interval"],
93
+ )
44
94
 
45
95
  def task_decorate(
46
96
  self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
47
97
  ):
48
98
  @functools.wraps(step_func)
49
99
  def ollama_wrapper():
100
+ self.ollama_manager = None
101
+ self.request_interceptor = None
102
+ self.status_card = None
103
+ self.card_monitor_thread = None
104
+
50
105
  try:
106
+ # Initialize status card and monitoring
107
+ self.status_card = OllamaStatusCard(
108
+ refresh_interval=self.attributes["card_refresh_interval"]
109
+ )
110
+
111
+ # Start card monitoring in background
112
+ def monitor_card():
113
+ try:
114
+ self.status_card.on_startup(current.card["ollama_status"])
115
+
116
+ while not getattr(
117
+ self.card_monitor_thread, "_stop_event", False
118
+ ):
119
+ try:
120
+ # Trigger card update with current data
121
+ self.status_card.on_update(
122
+ current.card["ollama_status"], None
123
+ )
124
+ import time
125
+
126
+ time.sleep(self.attributes["card_refresh_interval"])
127
+ except Exception as e:
128
+ if self.attributes["debug"]:
129
+ print(f"[@ollama] Card monitoring error: {e}")
130
+ break
131
+ except Exception as e:
132
+ if self.attributes["debug"]:
133
+ print(f"[@ollama] Card monitor thread error: {e}")
134
+ self.status_card.on_error(current.card["ollama_status"], str(e))
135
+
136
+ self.card_monitor_thread = threading.Thread(
137
+ target=monitor_card, daemon=True
138
+ )
139
+ self.card_monitor_thread._stop_event = False
140
+ self.card_monitor_thread.start()
141
+
142
+ # Initialize OllamaManager with status card
51
143
  self.ollama_manager = OllamaManager(
52
144
  models=self.attributes["models"],
53
145
  backend=self.attributes["backend"],
146
+ flow_datastore_backend=self.flow_datastore_backend,
147
+ force_pull=self.attributes["force_pull"],
148
+ cache_update_policy=self.attributes["cache_update_policy"],
149
+ force_cache_update=self.attributes["force_cache_update"],
54
150
  debug=self.attributes["debug"],
151
+ circuit_breaker_config=self.attributes["circuit_breaker_config"],
152
+ timeout_config=self.attributes["timeout_config"],
153
+ status_card=self.status_card,
55
154
  )
155
+
156
+ # Install request protection by monkey-patching ollama package
157
+ self.request_interceptor = OllamaRequestInterceptor(
158
+ self.ollama_manager.circuit_breaker, self.attributes["debug"]
159
+ )
160
+ self.request_interceptor.install_protection()
161
+
162
+ if self.attributes["debug"]:
163
+ print(
164
+ "[@ollama] OllamaManager initialized and request protection installed"
165
+ )
166
+
56
167
  except Exception as e:
168
+ if self.status_card:
169
+ self.status_card.add_event(
170
+ "error", f"Initialization failed: {str(e)}"
171
+ )
172
+ try:
173
+ self.status_card.on_error(current.card["ollama_status"], str(e))
174
+ except:
175
+ pass
57
176
  print(f"[@ollama] Error initializing OllamaManager: {e}")
58
177
  raise
178
+
59
179
  try:
180
+ if self.status_card:
181
+ self.status_card.add_event("info", "Starting user step function")
60
182
  step_func()
183
+ if self.status_card:
184
+ self.status_card.add_event(
185
+ "success", "User step function completed successfully"
186
+ )
61
187
  finally:
62
- try:
188
+ # Remove request protection first (before terminating models)
189
+ if self.request_interceptor:
190
+ self.request_interceptor.remove_protection()
191
+ if self.attributes["debug"]:
192
+ print("[@ollama] Request protection removed")
193
+
194
+ # Then cleanup ollama manager (while card monitoring is still active)
195
+ if self.ollama_manager:
63
196
  self.ollama_manager.terminate_models()
64
- except Exception as term_e:
65
- print(f"[@ollama] Error during sidecar termination: {term_e}")
66
- if self.attributes["debug"]:
67
- print(f"[@ollama] process statuses: {self.ollama_manager.processes}")
68
- print(f"[@ollama] process runtime stats: {self.ollama_manager.stats}")
197
+
198
+ # Give the card a moment to render the final shutdown events
199
+ if self.card_monitor_thread and self.status_card:
200
+ import time
201
+
202
+ # Trigger one final card update to capture all shutdown events
203
+ try:
204
+ self.status_card.on_update(current.card["ollama_status"], None)
205
+ except Exception as e:
206
+ if self.attributes["debug"]:
207
+ print(f"[@ollama] Final card update error: {e}")
208
+ time.sleep(2) # Allow final events to be rendered
209
+
210
+ # Now stop card monitoring
211
+ if self.card_monitor_thread:
212
+ self.card_monitor_thread._stop_event = True
213
+
214
+ if self.ollama_manager and self.attributes["debug"]:
215
+ print(
216
+ f"[@ollama] process statuses: {self.ollama_manager.processes}"
217
+ )
218
+ print(
219
+ f"[@ollama] process runtime stats: {self.ollama_manager.stats}"
220
+ )
221
+ print(
222
+ f"[@ollama] Circuit Breaker status: {self.ollama_manager.circuit_breaker.get_status()}"
223
+ )
69
224
 
70
225
  return ollama_wrapper
@@ -0,0 +1 @@
1
+ OLLAMA_SUFFIX = "mf.ollama"
@@ -0,0 +1,22 @@
1
+ from metaflow.exception import MetaflowException
2
+
3
+
4
+ class UnspecifiedRemoteStorageRootException(MetaflowException):
5
+ headline = "Storage root not specified."
6
+
7
+ def __init__(self, message):
8
+ super(UnspecifiedRemoteStorageRootException, self).__init__(message)
9
+
10
+
11
+ class EmptyOllamaManifestCacheException(MetaflowException):
12
+ headline = "Model not found."
13
+
14
+ def __init__(self, message):
15
+ super(EmptyOllamaManifestCacheException, self).__init__(message)
16
+
17
+
18
+ class EmptyOllamaBlobCacheException(MetaflowException):
19
+ headline = "Blob not found."
20
+
21
+ def __init__(self, message):
22
+ super(EmptyOllamaBlobCacheException, self).__init__(message)