ob-metaflow-extensions 1.1.158__py2.py3-none-any.whl → 1.1.159__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -347,4 +347,4 @@ SECRETS_PROVIDERS_DESC = [
347
347
  ("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
348
348
  ]
349
349
  # Adding an override here so the library can be imported at the metaflow.plugins level
350
- __mf_promote_submodules__ = ["snowflake", "ollama"]
350
+ __mf_promote_submodules__ = ["snowflake", "ollama", "torchtune"]
@@ -0,0 +1,159 @@
1
+ from queue import Queue, Empty
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from typing import Optional, List, Dict
4
+ import subprocess
5
+ import shutil
6
+ import sys
7
+ from metaflow import current
8
+
9
+ __mf_promote_submodules__ = ["plugins.torchtune"]
10
+
11
+
12
class TorchTune:
    """Launcher that runs a torchtune recipe through the ``tune run`` CLI.

    The instance optionally captures the distributed launch parameters that
    ``current.torch.torchrun_args`` exposes and forwards them to ``tune run``
    as torchrun-style flags.
    """

    def __init__(
        self,
        use_multi_node_config: bool = False,
    ):
        """
        Initialize the Tune launcher.

        :param use_multi_node_config: If True, attempt to build a distributed configuration
            from current.torch.torchrun_args. When ``current`` has no truthy ``torch``
            attribute, a notice is printed and the configuration stays empty.
        """
        self.multi_node_config = {}
        if not use_multi_node_config:
            return

        if not getattr(current, "torch", None):
            print(
                "[Metaflow Tune] Since @torchrun is not used, default multi-node config cannot be used to launch the job."
            )
            return

        print(
            "[Metaflow Tune] Since @torchrun is used, multi-node config can be used to launch the job."
        )
        # For distributed torchtune launches, we use similar parameters as torchrun.
        # (You might need to adjust the keys according to your environment.)
        torchrun_args = current.torch.torchrun_args
        self.multi_node_config = {
            "nnodes": torchrun_args["nnodes"],
            "master_addr": torchrun_args["master_addr"],
            "master_port": int(torchrun_args["master_port"]),
            "node_rank": torchrun_args["node_rank"],
            "nproc_per_node": torchrun_args["nproc_per_node"],
            # NOTE(review): assumes nproc_per_node and nnodes are numeric here;
            # confirm against what @torchrun actually provides.
            "num_processes": torchrun_args["nproc_per_node"]
            * torchrun_args["nnodes"],
        }
        print(
            f"[Metaflow Tune] Discovered multi-node config for torchrun: {self.multi_node_config}"
        )

    def run(
        self,
        recipe: str,
        config_dict: Dict,
        additional_cli_options: Optional[List[str]] = None,
    ):
        """
        Launch the torchtune job via its CLI.

        The child's stdout and stderr are streamed to this process's stdout and
        stderr while it runs. The temporary config file is removed afterwards,
        and the child is stopped if streaming is interrupted by an exception.

        :param recipe: The path to the recipe (or name of the recipe) to run.
        :param config_dict: Dictionary that is dumped to a temporary YAML file and passed via --config.
        :param additional_cli_options: Optional list of additional CLI options; they are placed
            after the distributed flags (if any) and before the recipe.
        :raises: subprocess.CalledProcessError if the subprocess returns a nonzero exit code.
        """
        import yaml
        import tempfile
        import os

        # The context manager removes the directory on every exit path.
        with tempfile.TemporaryDirectory() as temp_dir:
            config_path = os.path.join(temp_dir, "config.yaml")
            with open(config_path, "w") as f:
                yaml.dump(config_dict, f)

            cmd = self._build_command(recipe, config_path, additional_cli_options)

            # Launch the subprocess.
            print(f"[Metaflow tune] {' '.join(cmd)}")
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )

            # Keep a handle on the generator so it can be closed explicitly
            # after the child has been stopped.
            output = read_popen_pipes(process)
            try:
                # Stream the output in real-time.
                for out_line, err_line in output:
                    print(out_line, end="", flush=True)
                    print(err_line, end="", file=sys.stderr, flush=True)
                process.wait()
            finally:
                # No-op on the normal path (the child has already exited).
                self._ensure_stopped(process)
                output.close()

            if process.returncode != 0:
                raise subprocess.CalledProcessError(process.returncode, cmd)

    def _build_command(self, recipe, config_path, additional_cli_options=None):
        """Return the ``tune run`` argument list for ``recipe`` and ``config_path``."""
        # Build the command. Here we use "tune run" as the base command.
        cmd = ["tune", "run"]

        # If distributed configuration is present, add torchrun-style flags.
        if self.multi_node_config:
            config = self.multi_node_config
            cmd.extend(
                [
                    "--nnodes",
                    str(config.get("nnodes")),
                    "--nproc-per-node",
                    str(config.get("nproc_per_node")),
                    "--rdzv-backend",
                    "c10d",
                    "--rdzv-endpoint",
                    f"{config.get('master_addr')}:{config.get('master_port')}",
                    "--rdzv-id",
                    "1234567890",
                    "--node-rank",
                    str(config.get("node_rank")),
                    # TODO: should there be a masterip/port here ?
                ]
            )

        cmd.extend(additional_cli_options if additional_cli_options else [])
        cmd.append(recipe)
        # The recipe configuration is always passed via the --config flag.
        cmd.extend(["--config", config_path])
        return cmd

    @staticmethod
    def _ensure_stopped(process, grace_seconds: float = 10.0):
        """Terminate ``process`` if it is still running, killing it after ``grace_seconds``."""
        if process.poll() is not None:
            return
        process.terminate()
        try:
            process.wait(timeout=grace_seconds)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
127
+
128
+
129
def enqueue_output(file, queue):
    """Put every line read from ``file`` onto ``queue``, then close ``file``.

    Lines are read with ``file.readline`` until it returns ``""``. The file is
    closed even when reading raises, so the pipe is not left open by a failed
    reader.
    """
    try:
        for line in iter(file.readline, ""):
            queue.put(line)
    finally:
        file.close()


def read_popen_pipes(p):
    """Yield ``(out_line, err_line)`` pairs from the pipes of the Popen object ``p``.

    ``p.stdout`` and ``p.stderr`` are each read on a worker thread. In every
    yielded pair, an element is ``""`` when that stream had no line ready.
    Pairs where both elements are empty are not yielded; the loop sleeps
    briefly instead of spinning. Iteration ends only once both readers have
    finished, ``p`` has exited, and every queued line has been yielded, so
    output written just before the process exits is not dropped.
    """
    import time

    with ThreadPoolExecutor(2) as pool:
        q_stdout, q_stderr = Queue(), Queue()

        readers = (
            pool.submit(enqueue_output, p.stdout, q_stdout),
            pool.submit(enqueue_output, p.stderr, q_stderr),
        )

        while True:
            # Sample completion BEFORE draining: if the readers were already
            # done here, every line they produced is already in the queues.
            finished = all(reader.done() for reader in readers) and (
                p.poll() is not None
            )

            out_line = err_line = ""

            try:
                out_line = q_stdout.get_nowait()
            except Empty:
                pass
            try:
                err_line = q_stderr.get_nowait()
            except Empty:
                pass

            if out_line or err_line:
                yield (out_line, err_line)
            elif finished:
                break
            else:
                # Nothing available yet; back off instead of busy-waiting.
                time.sleep(0.01)
@@ -0,0 +1 @@
1
+ __mf_promote_submodules__ = ["plugins.torchtune"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.158
3
+ Version: 1.1.159
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -1,7 +1,7 @@
1
1
  metaflow_extensions/outerbounds/__init__.py,sha256=Gb8u06s9ClQsA_vzxmkCzuMnigPy7kKcDnLfb7eB-64,514
2
2
  metaflow_extensions/outerbounds/remote_config.py,sha256=pEFJuKDYs98eoB_-ryPjVi9b_c4gpHMdBHE14ltoxIU,4672
3
3
  metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
4
- metaflow_extensions/outerbounds/plugins/__init__.py,sha256=gytuNt3lNabirHLEYzrmHFMp-JWh8dA2AZPK11HmaNw,13242
4
+ metaflow_extensions/outerbounds/plugins/__init__.py,sha256=GxYKjrMJCGVKoxhfdPAlVF9kYrEb3-xn9fgUTb_H9VY,13255
5
5
  metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphkwT1aSu8gQXRDOH-Z-RxTUO8N4,2202
6
6
  metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
7
7
  metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,6 +61,7 @@ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py,sha256=F
61
61
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py,sha256=aQphxX6jqYgfa83w387pEWl0keuLm38V53I8P8UL2ck,6887
62
62
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py,sha256=AI_kcm1hZV3JRxJkookcH6twiGnAYjk9Dx-MeoYz60Y,8511
63
63
  metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py,sha256=9lUM4Cqi5RjrHBRfG6AQMRz8-R96eZC8Ih0KD2lv22Y,1858
64
+ metaflow_extensions/outerbounds/plugins/torchtune/__init__.py,sha256=TOXNeyhcgd8VxplXO_oEuryFEsbk0tikn5GL0-44SU8,5853
64
65
  metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
65
66
  metaflow_extensions/outerbounds/profilers/gpu.py,sha256=3Er8uKQzfm_082uadg4yn_D4Y-iSCgzUfFmguYxZsz4,27485
66
67
  metaflow_extensions/outerbounds/toplevel/__init__.py,sha256=qWUJSv_r5hXJ7jV_On4nEasKIfUCm6_UjkjXWA_A1Ts,90
@@ -70,7 +71,8 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
70
71
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
71
72
  metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
72
73
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
73
- ob_metaflow_extensions-1.1.158.dist-info/METADATA,sha256=0t_P8-Uhi3I39xyeSGv2BpRQO5Upe1eIjs04e6Stjd8,521
74
- ob_metaflow_extensions-1.1.158.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
75
- ob_metaflow_extensions-1.1.158.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
76
- ob_metaflow_extensions-1.1.158.dist-info/RECORD,,
74
+ metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py,sha256=uTVkdSk3xZ7hEKYfdlyVteWj5KeDwaM1hU9WT-_YKfI,50
75
+ ob_metaflow_extensions-1.1.159.dist-info/METADATA,sha256=B88tVfHuAktlRjsOBy8-pwquOYyBJUuOctDcL2KfN9I,521
76
+ ob_metaflow_extensions-1.1.159.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
77
+ ob_metaflow_extensions-1.1.159.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
78
+ ob_metaflow_extensions-1.1.159.dist-info/RECORD,,