ob-metaflow-extensions 1.1.158__tar.gz → 1.1.160rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (86)
  1. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/__init__.py +1 -1
  3. ob-metaflow-extensions-1.1.160rc0/metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  4. ob-metaflow-extensions-1.1.160rc0/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
  5. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +24 -0
  6. ob-metaflow-extensions-1.1.160rc0/metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +159 -0
  7. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +1 -0
  8. ob-metaflow-extensions-1.1.160rc0/metaflow_extensions/outerbounds/toplevel/ob_internal.py +1 -0
  9. ob-metaflow-extensions-1.1.160rc0/metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  10. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  11. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/ob_metaflow_extensions.egg-info/SOURCES.txt +4 -0
  12. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/setup.py +1 -1
  13. ob-metaflow-extensions-1.1.158/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +0 -3
  14. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/README.md +0 -0
  15. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/__init__.py +0 -0
  16. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  17. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  18. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/apps/app_utils.py +0 -0
  19. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/apps/consts.py +0 -0
  20. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +0 -0
  21. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +0 -0
  22. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  23. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  24. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +0 -0
  25. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +0 -0
  26. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +0 -0
  27. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +0 -0
  28. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +0 -0
  29. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +0 -0
  30. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  31. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
  32. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  33. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  34. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  35. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  36. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
  37. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nim/card.py +0 -0
  38. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +0 -0
  39. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  40. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nim/utils.py +0 -0
  41. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  42. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +0 -0
  43. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
  44. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
  45. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
  46. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
  47. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvcf/utils.py +0 -0
  48. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  49. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +0 -0
  50. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct.py +0 -0
  51. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +0 -0
  52. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +0 -0
  53. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/nvct/utils.py +0 -0
  54. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/ollama/__init__.py +0 -0
  55. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/ollama/constants.py +0 -0
  56. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +0 -0
  57. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/ollama/ollama.py +0 -0
  58. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  59. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +0 -0
  60. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +0 -0
  61. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  62. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/secrets/secrets.py +0 -0
  63. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +0 -0
  64. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +0 -0
  65. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  66. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
  67. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  68. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  69. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  70. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  71. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  72. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  73. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
  74. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  75. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
  76. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  77. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  78. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  79. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  80. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  81. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +0 -0
  82. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +0 -0
  83. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  84. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
  85. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  86. {ob-metaflow-extensions-1.1.158 → ob-metaflow-extensions-1.1.160rc0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.158
3
+ Version: 1.1.160rc0
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -347,4 +347,4 @@ SECRETS_PROVIDERS_DESC = [
347
347
  ("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
348
348
  ]
349
349
  # Adding an override here so the library can be imported at the metaflow.plugins level
350
- __mf_promote_submodules__ = ["snowflake", "ollama"]
350
+ __mf_promote_submodules__ = ["snowflake", "ollama", "torchtune"]
@@ -0,0 +1,110 @@
1
+ import threading
2
+ import time
3
+ import sys
4
+ from typing import Dict, Optional, Any, Callable
5
+ from functools import partial
6
+ from metaflow.exception import MetaflowException
7
+ from metaflow.metaflow_config import FAST_BAKERY_URL
8
+
9
+ from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
10
+ from .docker_environment import cache_request
11
+
12
+ BAKERY_METAFILE = ".imagebakery-cache"
13
+
14
+
15
+ class BakerException(MetaflowException):
16
+ headline = "Ran into an error while baking image"
17
+
18
+ def __init__(self, msg):
19
+ super(BakerException, self).__init__(msg)
20
+
21
+
22
+ def bake_image(
23
+ cache_file_path: str,
24
+ ref: Optional[str] = None,
25
+ python: Optional[str] = None,
26
+ pypi_packages: Optional[Dict[str, str]] = None,
27
+ conda_packages: Optional[Dict[str, str]] = None,
28
+ base_image: Optional[str] = None,
29
+ logger: Optional[Callable[[str], Any]] = None,
30
+ ) -> FastBakeryApiResponse:
31
+ """
32
+ Bakes a Docker image with the specified dependencies.
33
+
34
+ Args:
35
+ cache_file_path: Path to the cache file
36
+ ref: Reference identifier for this bake (for logging purposes)
37
+ python: Python version to use
38
+ pypi_packages: Dictionary of PyPI packages and versions
39
+ conda_packages: Dictionary of Conda packages and versions
40
+ base_image: Base Docker image to use
41
+ logger: Optional logger function to output progress
42
+
43
+ Returns:
44
+ FastBakeryApiResponse: The response from the bakery service
45
+
46
+ Raises:
47
+ BakerException: If the baking process fails
48
+ """
49
+ # Default logger if none provided
50
+ if logger is None:
51
+ logger = partial(print, file=sys.stderr)
52
+
53
+ # Thread lock for logging
54
+ logger_lock = threading.Lock()
55
+ images_baked = 0
56
+
57
+ @cache_request(cache_file_path)
58
+ def _cached_bake(
59
+ ref=None,
60
+ python=None,
61
+ pypi_packages=None,
62
+ conda_packages=None,
63
+ base_image=None,
64
+ ):
65
+ try:
66
+ bakery = FastBakery(url=FAST_BAKERY_URL)
67
+ bakery._reset_payload()
68
+ bakery.python_version(python)
69
+ bakery.pypi_packages(pypi_packages)
70
+ bakery.conda_packages(conda_packages)
71
+ bakery.base_image(base_image)
72
+ # bakery.ignore_cache()
73
+
74
+ with logger_lock:
75
+ logger(f"🍳 Baking [{ref}] ...")
76
+ logger(f" 🐍 Python: {python}")
77
+
78
+ if pypi_packages:
79
+ logger(f" 📦 PyPI packages:")
80
+ for package, version in pypi_packages.items():
81
+ logger(f" 🔧 {package}: {version}")
82
+
83
+ if conda_packages:
84
+ logger(f" 📦 Conda packages:")
85
+ for package, version in conda_packages.items():
86
+ logger(f" 🔧 {package}: {version}")
87
+
88
+ logger(f" 🏗️ Base image: {base_image}")
89
+
90
+ start_time = time.time()
91
+ res = bakery.bake()
92
+ # TODO: Get actual bake time from bakery
93
+ bake_time = time.time() - start_time
94
+
95
+ with logger_lock:
96
+ logger(f"🏁 Baked [{ref}] in {bake_time:.2f} seconds!")
97
+ nonlocal images_baked
98
+ images_baked += 1
99
+ return res
100
+ except FastBakeryException as ex:
101
+ raise BakerException(f"Bake [{ref}] failed: {str(ex)}")
102
+
103
+ # Call the cached bake function with the provided parameters
104
+ return _cached_bake(
105
+ ref=ref,
106
+ python=python,
107
+ pypi_packages=pypi_packages,
108
+ conda_packages=conda_packages,
109
+ base_image=base_image,
110
+ )
@@ -0,0 +1,3 @@
1
+ SUPPORTABLE_GPU_TYPES = ["L40", "L40S", "L40G", "H100", "NEBIUS_H100"]
2
+ DEFAULT_GPU_TYPE = "H100"
3
+ MAX_N_GPU_BY_TYPE = {"L40": 1, "L40S": 1, "L40G": 1, "H100": 4, "NEBIUS_H100": 8}
@@ -65,6 +65,28 @@ SUPPORTABLE_GPU_TYPES = {
65
65
  "backend": "gcp-asia-se-1a",
66
66
  },
67
67
  ],
68
+ "NEBIUS_H100": [
69
+ {
70
+ "n_gpus": 1,
71
+ "instance_type": "ON-PREM.GPU.H100_1x",
72
+ "backend": "default-project-eu-north1",
73
+ },
74
+ {
75
+ "n_gpus": 2,
76
+ "instance_type": "ON-PREM.GPU.H100_2x",
77
+ "backend": "default-project-eu-north1",
78
+ },
79
+ {
80
+ "n_gpus": 4,
81
+ "instance_type": "ON-PREM.GPU.H100_4x",
82
+ "backend": "default-project-eu-north1",
83
+ },
84
+ {
85
+ "n_gpus": 8,
86
+ "instance_type": "ON-PREM.GPU.H100_8x",
87
+ "backend": "default-project-eu-north1",
88
+ },
89
+ ],
68
90
  }
69
91
 
70
92
 
@@ -154,6 +176,8 @@ class NvctDecorator(StepDecorator):
154
176
 
155
177
  self.attributes["instance_type"] = valid_config["instance_type"]
156
178
  self.attributes["gpu_type"] = requested_gpu_type
179
+ if self.attributes["gpu_type"] == "NEBIUS_H100":
180
+ self.attributes["gpu_type"] = "H100"
157
181
  self.attributes["backend"] = valid_config["backend"]
158
182
 
159
183
  def runtime_init(self, flow, graph, package, run_id):
@@ -0,0 +1,159 @@
1
+ from queue import Queue, Empty
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from typing import Optional, List, Dict
4
+ import subprocess
5
+ import shutil
6
+ import sys
7
+ from metaflow import current
8
+
9
+ __mf_promote_submodules__ = ["plugins.torchtune"]
10
+
11
+
12
+ class TorchTune:
13
+ def __init__(
14
+ self,
15
+ use_multi_node_config: bool = False,
16
+ ):
17
+ """
18
+ Initialize the Tune launcher.
19
+
20
+ :param use_multi_node_config: If True, attempt to build a distributed configuration
21
+ from current.torch.torchrun_args.
22
+ """
23
+ self.multi_node_config = {}
24
+ if use_multi_node_config:
25
+ if getattr(current, "torch", None):
26
+ print(
27
+ "[Metaflow Tune] Since @torchrun is used, multi-node config can be used to launch the job."
28
+ )
29
+ # For distributed torchtune launches, we use similar parameters as torchrun.
30
+ # (You might need to adjust the keys according to your environment.)
31
+ self.multi_node_config = {
32
+ "nnodes": current.torch.torchrun_args["nnodes"],
33
+ "master_addr": current.torch.torchrun_args["master_addr"],
34
+ "master_port": int(current.torch.torchrun_args["master_port"]),
35
+ "node_rank": current.torch.torchrun_args["node_rank"],
36
+ "nproc_per_node": current.torch.torchrun_args["nproc_per_node"],
37
+ "num_processes": current.torch.torchrun_args["nproc_per_node"]
38
+ * current.torch.torchrun_args["nnodes"],
39
+ }
40
+ print(
41
+ f"[Metaflow Tune] Discovered multi-node config for torchrun: {self.multi_node_config}"
42
+ )
43
+ else:
44
+ print(
45
+ "[Metaflow Tune] Since @torchrun is not used, default multi-node config cannot be used to launch the job."
46
+ )
47
+
48
+ def run(
49
+ self,
50
+ recipe: str,
51
+ config_dict: Dict,
52
+ additional_cli_options: Optional[List[str]] = None,
53
+ ):
54
+ """
55
+ Launch the torchtune job via its CLI.
56
+
57
+ :param recipe: The path to the recipe (or name of the recipe) to run.
58
+ :param config_dict: Optional dictionary that will be dumped to a YAML file and passed via --config.
59
+ :param additional_cli_options: Optional list of additional CLI options.
60
+ :raises: subprocess.CalledProcessError if the subprocess returns a nonzero exit code.
61
+ """
62
+ import yaml
63
+ import tempfile
64
+ import os
65
+
66
+ _temp_dir = tempfile.mkdtemp()
67
+ try:
68
+ config_path = os.path.join(_temp_dir, "config.yaml")
69
+ with open(config_path, "w") as f:
70
+ yaml.dump(config_dict, f)
71
+
72
+ additional_options = (
73
+ additional_cli_options if additional_cli_options else []
74
+ )
75
+
76
+ # Build the command. Here we use "tune run" as the base command.
77
+ cmd = ["tune", "run"]
78
+
79
+ # If distributed configuration is present, add torchrun–style flags.
80
+ if self.multi_node_config:
81
+ cmd.extend(
82
+ [
83
+ "--nnodes",
84
+ str(self.multi_node_config.get("nnodes")),
85
+ "--nproc-per-node",
86
+ str(self.multi_node_config.get("nproc_per_node")),
87
+ # "--rdzv_conf", f"rdzv_endpoint={self.multi_node_config.get('master_addr')}:{self.multi_node_config.get('master_port')}"
88
+ "--rdzv-backend",
89
+ "c10d",
90
+ "--rdzv-endpoint",
91
+ f"{self.multi_node_config.get('master_addr')}:{self.multi_node_config.get('master_port')}",
92
+ "--rdzv-id",
93
+ "1234567890",
94
+ "--node-rank",
95
+ str(self.multi_node_config.get("node_rank")),
96
+ # TODO: should there be a masterip/port here ?
97
+ ]
98
+ )
99
+
100
+ cmd.extend(additional_options)
101
+
102
+ cmd.append(recipe)
103
+ # If a recipe configuration was provided, pass it via the --config flag.
104
+ cmd.extend(["--config", config_path])
105
+
106
+ # Append any additional CLI options.
107
+
108
+ # Launch the subprocess.
109
+ print(f"[Metaflow tune] {' '.join(cmd)}")
110
+ process = subprocess.Popen(
111
+ cmd,
112
+ stdout=subprocess.PIPE,
113
+ stderr=subprocess.PIPE,
114
+ universal_newlines=True,
115
+ )
116
+
117
+ # Stream the output in real-time.
118
+ for out_line, err_line in read_popen_pipes(process):
119
+ print(out_line, end="", flush=True)
120
+ print(err_line, end="", file=sys.stderr, flush=True)
121
+
122
+ process.wait()
123
+ if process.returncode != 0:
124
+ raise subprocess.CalledProcessError(process.returncode, cmd)
125
+ finally:
126
+ shutil.rmtree(_temp_dir)
127
+
128
+
129
+ def enqueue_output(file, queue):
130
+ for line in iter(file.readline, ""):
131
+ queue.put(line)
132
+ file.close()
133
+
134
+
135
+ def read_popen_pipes(p):
136
+
137
+ with ThreadPoolExecutor(2) as pool:
138
+ q_stdout, q_stderr = Queue(), Queue()
139
+
140
+ pool.submit(enqueue_output, p.stdout, q_stdout)
141
+ pool.submit(enqueue_output, p.stderr, q_stderr)
142
+
143
+ while True:
144
+
145
+ if p.poll() is not None and q_stdout.empty() and q_stderr.empty():
146
+ break
147
+
148
+ out_line = err_line = ""
149
+
150
+ try:
151
+ out_line = q_stdout.get_nowait()
152
+ except Empty:
153
+ pass
154
+ try:
155
+ err_line = q_stderr.get_nowait()
156
+ except Empty:
157
+ pass
158
+
159
+ yield (out_line, err_line)
@@ -53,3 +53,4 @@ def S3(*args, **kwargs):
53
53
  from .. import profilers
54
54
  from ..plugins.snowflake import Snowflake
55
55
  from ..plugins.checkpoint_datastores import nebius_checkpoints, coreweave_checkpoints
56
+ from . import ob_internal
@@ -0,0 +1 @@
1
+ from ..plugins.fast_bakery.baker import bake_image
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.torchtune"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.158
3
+ Version: 1.1.160rc0
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -19,6 +19,7 @@ metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py
19
19
  metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py
20
20
  metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py
21
21
  metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py
22
+ metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py
22
23
  metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py
23
24
  metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py
24
25
  metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py
@@ -63,15 +64,18 @@ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py
63
64
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py
64
65
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py
65
66
  metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py
67
+ metaflow_extensions/outerbounds/plugins/torchtune/__init__.py
66
68
  metaflow_extensions/outerbounds/profilers/__init__.py
67
69
  metaflow_extensions/outerbounds/profilers/gpu.py
68
70
  metaflow_extensions/outerbounds/toplevel/__init__.py
69
71
  metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py
72
+ metaflow_extensions/outerbounds/toplevel/ob_internal.py
70
73
  metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py
71
74
  metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py
72
75
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py
73
76
  metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py
74
77
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py
78
+ metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py
75
79
  ob_metaflow_extensions.egg-info/PKG-INFO
76
80
  ob_metaflow_extensions.egg-info/SOURCES.txt
77
81
  ob_metaflow_extensions.egg-info/dependency_links.txt
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- version = "1.1.158"
5
+ version = "1.1.160rc0"
6
6
  this_directory = Path(__file__).parent
7
7
  long_description = (this_directory / "README.md").read_text()
8
8
 
@@ -1,3 +0,0 @@
1
- SUPPORTABLE_GPU_TYPES = ["L40", "L40S", "L40G", "H100"]
2
- DEFAULT_GPU_TYPE = "H100"
3
- MAX_N_GPU_BY_TYPE = {"L40": 1, "L40S": 1, "L40G": 1, "H100": 4}