ob-metaflow 2.12.30.2__py2.py3-none-any.whl → 2.13.6.1__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of ob-metaflow might be problematic.

Files changed (96)
  1. metaflow/__init__.py +3 -0
  2. metaflow/cards.py +1 -0
  3. metaflow/cli.py +185 -717
  4. metaflow/cli_args.py +17 -0
  5. metaflow/cli_components/__init__.py +0 -0
  6. metaflow/cli_components/dump_cmd.py +96 -0
  7. metaflow/cli_components/init_cmd.py +51 -0
  8. metaflow/cli_components/run_cmds.py +362 -0
  9. metaflow/cli_components/step_cmd.py +176 -0
  10. metaflow/cli_components/utils.py +140 -0
  11. metaflow/cmd/develop/stub_generator.py +9 -2
  12. metaflow/datastore/flow_datastore.py +2 -2
  13. metaflow/decorators.py +63 -2
  14. metaflow/exception.py +8 -2
  15. metaflow/extension_support/plugins.py +42 -27
  16. metaflow/flowspec.py +176 -23
  17. metaflow/graph.py +28 -27
  18. metaflow/includefile.py +50 -22
  19. metaflow/lint.py +35 -20
  20. metaflow/metadata_provider/heartbeat.py +23 -8
  21. metaflow/metaflow_config.py +10 -1
  22. metaflow/multicore_utils.py +31 -14
  23. metaflow/package.py +17 -3
  24. metaflow/parameters.py +97 -25
  25. metaflow/plugins/__init__.py +22 -0
  26. metaflow/plugins/airflow/airflow.py +18 -17
  27. metaflow/plugins/airflow/airflow_cli.py +1 -0
  28. metaflow/plugins/argo/argo_client.py +0 -2
  29. metaflow/plugins/argo/argo_workflows.py +195 -132
  30. metaflow/plugins/argo/argo_workflows_cli.py +1 -1
  31. metaflow/plugins/argo/argo_workflows_decorator.py +2 -4
  32. metaflow/plugins/argo/argo_workflows_deployer_objects.py +51 -9
  33. metaflow/plugins/argo/jobset_input_paths.py +0 -1
  34. metaflow/plugins/aws/aws_utils.py +6 -1
  35. metaflow/plugins/aws/batch/batch_client.py +1 -3
  36. metaflow/plugins/aws/batch/batch_decorator.py +13 -13
  37. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  38. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  39. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  40. metaflow/plugins/aws/step_functions/step_functions.py +33 -1
  41. metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -1
  42. metaflow/plugins/aws/step_functions/step_functions_decorator.py +0 -1
  43. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +7 -9
  44. metaflow/plugins/cards/card_cli.py +7 -2
  45. metaflow/plugins/cards/card_creator.py +1 -0
  46. metaflow/plugins/cards/card_decorator.py +79 -8
  47. metaflow/plugins/cards/card_modules/basic.py +56 -5
  48. metaflow/plugins/cards/card_modules/card.py +16 -1
  49. metaflow/plugins/cards/card_modules/components.py +64 -16
  50. metaflow/plugins/cards/card_modules/main.js +27 -25
  51. metaflow/plugins/cards/card_modules/test_cards.py +4 -4
  52. metaflow/plugins/cards/component_serializer.py +1 -1
  53. metaflow/plugins/datatools/s3/s3.py +12 -4
  54. metaflow/plugins/datatools/s3/s3op.py +3 -3
  55. metaflow/plugins/events_decorator.py +338 -186
  56. metaflow/plugins/kubernetes/kube_utils.py +84 -1
  57. metaflow/plugins/kubernetes/kubernetes.py +40 -92
  58. metaflow/plugins/kubernetes/kubernetes_cli.py +32 -7
  59. metaflow/plugins/kubernetes/kubernetes_decorator.py +76 -4
  60. metaflow/plugins/kubernetes/kubernetes_job.py +23 -20
  61. metaflow/plugins/kubernetes/kubernetes_jobsets.py +41 -20
  62. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  63. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  64. metaflow/plugins/parallel_decorator.py +4 -1
  65. metaflow/plugins/project_decorator.py +33 -5
  66. metaflow/plugins/pypi/bootstrap.py +249 -81
  67. metaflow/plugins/pypi/conda_decorator.py +20 -10
  68. metaflow/plugins/pypi/conda_environment.py +83 -27
  69. metaflow/plugins/pypi/micromamba.py +82 -37
  70. metaflow/plugins/pypi/pip.py +9 -6
  71. metaflow/plugins/pypi/pypi_decorator.py +11 -9
  72. metaflow/plugins/pypi/utils.py +4 -2
  73. metaflow/plugins/timeout_decorator.py +2 -2
  74. metaflow/runner/click_api.py +240 -50
  75. metaflow/runner/deployer.py +1 -1
  76. metaflow/runner/deployer_impl.py +12 -11
  77. metaflow/runner/metaflow_runner.py +68 -34
  78. metaflow/runner/nbdeploy.py +2 -0
  79. metaflow/runner/nbrun.py +1 -1
  80. metaflow/runner/subprocess_manager.py +61 -10
  81. metaflow/runner/utils.py +208 -44
  82. metaflow/runtime.py +216 -112
  83. metaflow/sidecar/sidecar_worker.py +1 -1
  84. metaflow/tracing/tracing_modules.py +4 -1
  85. metaflow/user_configs/__init__.py +0 -0
  86. metaflow/user_configs/config_decorators.py +563 -0
  87. metaflow/user_configs/config_options.py +548 -0
  88. metaflow/user_configs/config_parameters.py +436 -0
  89. metaflow/util.py +22 -0
  90. metaflow/version.py +1 -1
  91. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/METADATA +12 -3
  92. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/RECORD +96 -84
  93. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/WHEEL +1 -1
  94. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/LICENSE +0 -0
  95. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/entry_points.txt +0 -0
  96. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/top_level.txt +0 -0

metaflow/plugins/pypi/bootstrap.py
@@ -1,4 +1,5 @@
 import bz2
+import concurrent.futures
 import io
 import json
 import os
@@ -6,21 +7,33 @@ import shutil
 import subprocess
 import sys
 import tarfile
-
+import time
+from urllib.error import URLError
+from urllib.request import urlopen
 from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
 from metaflow.plugins import DATASTORES
+from metaflow.plugins.pypi.utils import MICROMAMBA_MIRROR_URL, MICROMAMBA_URL
 from metaflow.util import which
+from urllib.request import Request
+import warnings

 from . import MAGIC_FILE, _datastore_packageroot

 # Bootstraps a valid conda virtual environment composed of conda and pypi packages

-if __name__ == "__main__":
-    if len(sys.argv) != 5:
-        print("Usage: bootstrap.py <flow_name> <id> <datastore_type> <architecture>")
-        sys.exit(1)
-    _, flow_name, id_, datastore_type, architecture = sys.argv

+def timer(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        duration = time.time() - start_time
+        # print(f"Time taken for {func.__name__}: {duration:.2f} seconds")
+        return result
+
+    return wrapper
+
+
+if __name__ == "__main__":
     # TODO: Detect architecture on the fly when dealing with arm architectures.
     # ARCH=$(uname -m)
     # OS=$(uname)
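
The new timer decorator wraps each bootstrap phase so its wall-clock duration can be measured (the reporting line ships commented out above). A minimal sketch of how it composes, with the report uncommented and a hypothetical slow_step standing in for real work:

    import time

    def timer(func):
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            duration = time.time() - start_time
            print(f"Time taken for {func.__name__}: {duration:.2f} seconds")
            return result

        return wrapper

    @timer
    def slow_step():
        time.sleep(0.1)  # stand-in for a download or install phase

    slow_step()  # prints: Time taken for slow_step: 0.10 seconds
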
@@ -45,96 +58,251 @@ if __name__ == "__main__":
     # fi
     # fi

-    prefix = os.path.join(os.getcwd(), architecture, id_)
-    pkgs_dir = os.path.join(os.getcwd(), ".pkgs")
-    manifest_dir = os.path.join(os.getcwd(), DATASTORE_LOCAL_DIR, flow_name)
+    def run_cmd(cmd):
+        result = subprocess.run(
+            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+        )
+        if result.returncode != 0:
+            print(f"Bootstrap failed while executing: {cmd}")
+            print("Stdout:", result.stdout)
+            print("Stderr:", result.stderr)
+            sys.exit(1)

-    datastores = [d for d in DATASTORES if d.TYPE == datastore_type]
-    if not datastores:
-        print(f"No datastore found for type: {datastore_type}")
-        sys.exit(1)
+    @timer
+    def install_micromamba(architecture):
+        micromamba_dir = os.path.join(os.getcwd(), "micromamba")
+        micromamba_path = os.path.join(micromamba_dir, "bin", "micromamba")
+
+        if which("micromamba"):
+            return which("micromamba")
+        if os.path.exists(micromamba_path):
+            os.environ["PATH"] += os.pathsep + os.path.dirname(micromamba_path)
+            return micromamba_path
+
+        # Download and extract in one go
+        url = MICROMAMBA_URL.format(platform=architecture, version="2.0.4")
+        mirror_url = MICROMAMBA_MIRROR_URL.format(
+            platform=architecture, version="2.0.4"
+        )
+
+        # Prepare directory once
+        os.makedirs(os.path.dirname(micromamba_path), exist_ok=True)
+
+        # Download and decompress in one go
+        def _download_and_extract(url):
+            headers = {
+                "Accept-Encoding": "gzip, deflate, br",
+                "Connection": "keep-alive",
+                "User-Agent": "python-urllib",
+            }
+
+            max_retries = 3
+            for attempt in range(max_retries):
+                try:
+                    req = Request(url, headers=headers)

-    storage = datastores[0](
-        _datastore_packageroot(datastores[0], lambda *args, **kwargs: None)
-    )
+                    with urlopen(req) as response:
+                        decompressor = bz2.BZ2Decompressor()
+                        with warnings.catch_warnings():
+                            warnings.filterwarnings(
+                                "ignore", category=DeprecationWarning
+                            )
+                            with tarfile.open(
+                                fileobj=io.BytesIO(
+                                    decompressor.decompress(response.read())
+                                ),
+                                mode="r:",
+                            ) as tar:
+                                member = tar.getmember("bin/micromamba")
+                                tar.extract(member, micromamba_dir)
+                    break
+                except (URLError, IOError) as e:
+                    if attempt == max_retries - 1:
+                        raise Exception(
+                            f"Failed to download micromamba after {max_retries} attempts: {e}"
+                        )
+                    time.sleep(2**attempt)

-    # Move MAGIC_FILE inside local datastore.
-    os.makedirs(manifest_dir, exist_ok=True)
-    shutil.move(
-        os.path.join(os.getcwd(), MAGIC_FILE),
-        os.path.join(manifest_dir, MAGIC_FILE),
-    )
+        try:
+            # first try from mirror
+            _download_and_extract(mirror_url)
+        except Exception:
+            # download from mirror failed, try official source before failing.
+            _download_and_extract(url)

-    with open(os.path.join(manifest_dir, MAGIC_FILE)) as f:
-        env = json.load(f)[id_][architecture]
+        # Set executable permission
+        os.chmod(micromamba_path, 0o755)

-    # Download Conda packages.
-    conda_pkgs_dir = os.path.join(pkgs_dir, "conda")
-    with storage.load_bytes([package["path"] for package in env["conda"]]) as results:
-        for key, tmpfile, _ in results:
+        # Update PATH only once at the end
+        os.environ["PATH"] += os.pathsep + os.path.dirname(micromamba_path)
+        return micromamba_path
+
+    @timer
+    def download_conda_packages(storage, packages, dest_dir):
+        def process_conda_package(args):
             # Ensure that conda packages go into architecture specific folders.
             # The path looks like REPO/CHANNEL/CONDA_SUBDIR/PACKAGE. We trick
             # Micromamba into believing that all packages are coming from a local
             # channel - the only hurdle is ensuring that packages are organised
             # properly.
-
-            # TODO: consider RAM disk
-            dest = os.path.join(conda_pkgs_dir, "/".join(key.split("/")[-2:]))
+            key, tmpfile, dest_dir = args
+            dest = os.path.join(dest_dir, "/".join(key.split("/")[-2:]))
             os.makedirs(os.path.dirname(dest), exist_ok=True)
             shutil.move(tmpfile, dest)

-    # Create Conda environment.
-    cmds = [
-        # TODO: check if mamba or conda are already available on the image
-        # TODO: micromamba installation can be pawned off to micromamba.py
-        f"""set -e;
-            if ! command -v micromamba >/dev/null 2>&1; then
-                mkdir -p micromamba;
-                python -c "import requests, bz2, sys; data = requests.get('https://micro.mamba.pm/api/micromamba/{architecture}/1.5.7').content; sys.stdout.buffer.write(bz2.decompress(data))" | tar -xv -C $(pwd)/micromamba bin/micromamba --strip-components 1;
+        os.makedirs(dest_dir, exist_ok=True)
+        with storage.load_bytes([package["path"] for package in packages]) as results:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                executor.map(
+                    process_conda_package,
+                    [(key, tmpfile, dest_dir) for key, tmpfile, _ in results],
+                )
+        # for key, tmpfile, _ in results:
+
+        #     # TODO: consider RAM disk
+        #     dest = os.path.join(dest_dir, "/".join(key.split("/")[-2:]))
+        #     os.makedirs(os.path.dirname(dest), exist_ok=True)
+        #     shutil.move(tmpfile, dest)
+        return dest_dir
+
+    @timer
+    def download_pypi_packages(storage, packages, dest_dir):
+        def process_pypi_package(args):
+            key, tmpfile, dest_dir = args
+            dest = os.path.join(dest_dir, os.path.basename(key))
+            shutil.move(tmpfile, dest)
+
+        os.makedirs(dest_dir, exist_ok=True)
+        with storage.load_bytes([package["path"] for package in packages]) as results:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                executor.map(
+                    process_pypi_package,
+                    [(key, tmpfile, dest_dir) for key, tmpfile, _ in results],
+                )
+        # for key, tmpfile, _ in results:
+        #     dest = os.path.join(dest_dir, os.path.basename(key))
+        #     shutil.move(tmpfile, dest)
+        return dest_dir
+
+    @timer
+    def create_conda_environment(prefix, conda_pkgs_dir):
+        cmd = f'''set -e;
+            tmpfile=$(mktemp);
+            echo "@EXPLICIT" > "$tmpfile";
+            ls -d {conda_pkgs_dir}/*/* >> "$tmpfile";
+            export PATH=$PATH:$(pwd)/micromamba;
+            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
+            export MAMBA_NO_LOW_SPEED_LIMIT=1;
+            export MAMBA_USE_INDEX_CACHE=1;
+            export MAMBA_NO_PROGRESS_BARS=1;
+            export CONDA_FETCH_THREADS=1;
+            micromamba create --yes --offline --no-deps \
+                --safety-checks=disabled --no-extra-safety-checks \
+                --prefix {prefix} --file "$tmpfile" \
+                --no-pyc --no-rc --always-copy;
+            rm "$tmpfile"'''
+        run_cmd(cmd)
+
+    @timer
+    def install_pypi_packages(prefix, pypi_pkgs_dir):
+        cmd = f"""set -e;
             export PATH=$PATH:$(pwd)/micromamba;
-            if ! command -v micromamba >/dev/null 2>&1; then
-                echo "Failed to install Micromamba!";
-                exit 1;
-            fi;
-        fi""",
-        # Create a conda environment through Micromamba.
-        f'''set -e;
-            tmpfile=$(mktemp);
-            echo "@EXPLICIT" > "$tmpfile";
-            ls -d {conda_pkgs_dir}/*/* >> "$tmpfile";
-            export PATH=$PATH:$(pwd)/micromamba;
-            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
-            micromamba create --yes --offline --no-deps --safety-checks=disabled --no-extra-safety-checks --prefix {prefix} --file "$tmpfile";
-            rm "$tmpfile"''',
-    ]
-
-    # Download PyPI packages.
-    if "pypi" in env:
+            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
+            micromamba run --prefix {prefix} python -m pip --disable-pip-version-check \
+                install --root-user-action=ignore --no-compile --no-index \
+                --no-cache-dir --no-deps --prefer-binary \
+                --find-links={pypi_pkgs_dir} --no-user \
+                --no-warn-script-location --no-input \
+                {pypi_pkgs_dir}/*.whl
+            """
+        run_cmd(cmd)
+
+    @timer
+    def setup_environment(
+        architecture, storage, env, prefix, conda_pkgs_dir, pypi_pkgs_dir
+    ):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            # install micromamba, download conda and pypi packages in parallel
+            futures = {
+                "micromamba": executor.submit(install_micromamba, architecture),
+                "conda_pkgs": executor.submit(
+                    download_conda_packages, storage, env["conda"], conda_pkgs_dir
+                ),
+            }
+            if "pypi" in env:
+                futures["pypi_pkgs"] = executor.submit(
+                    download_pypi_packages, storage, env["pypi"], pypi_pkgs_dir
+                )
+
+            # create conda environment after micromamba is installed and conda packages are downloaded
+            done, _ = concurrent.futures.wait(
+                [futures["micromamba"], futures["conda_pkgs"]],
+                return_when=concurrent.futures.ALL_COMPLETED,
+            )
+
+            for future in done:
+                future.result()
+
+            # start conda environment creation
+            futures["conda_env"] = executor.submit(
+                create_conda_environment, prefix, conda_pkgs_dir
+            )
+
+            if "pypi" in env:
+                # install pypi packages after conda environment is created and pypi packages are downloaded
+                done, _ = concurrent.futures.wait(
+                    [futures["conda_env"], futures["pypi_pkgs"]],
+                    return_when=concurrent.futures.ALL_COMPLETED,
+                )
+
+                for future in done:
+                    future.result()
+
+                # install pypi packages
+                futures["pypi_install"] = executor.submit(
+                    install_pypi_packages, prefix, pypi_pkgs_dir
+                )
+                # wait for pypi packages to be installed
+                futures["pypi_install"].result()
+            else:
+                # wait for conda environment to be created
+                futures["conda_env"].result()
+
+    if len(sys.argv) != 5:
+        print("Usage: bootstrap.py <flow_name> <id> <datastore_type> <architecture>")
+        sys.exit(1)
+
+    try:
+        _, flow_name, id_, datastore_type, architecture = sys.argv
+
+        prefix = os.path.join(os.getcwd(), architecture, id_)
+        pkgs_dir = os.path.join(os.getcwd(), ".pkgs")
+        conda_pkgs_dir = os.path.join(pkgs_dir, "conda")
         pypi_pkgs_dir = os.path.join(pkgs_dir, "pypi")
-        with storage.load_bytes(
-            [package["path"] for package in env["pypi"]]
-        ) as results:
-            for key, tmpfile, _ in results:
-                dest = os.path.join(pypi_pkgs_dir, os.path.basename(key))
-                os.makedirs(os.path.dirname(dest), exist_ok=True)
-                shutil.move(tmpfile, dest)
-
-        # Install PyPI packages.
-        cmds.extend(
-            [
-                f"""set -e;
-            export PATH=$PATH:$(pwd)/micromamba;
-            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
-            micromamba run --prefix {prefix} python -m pip --disable-pip-version-check install --root-user-action=ignore --no-compile {pypi_pkgs_dir}/*.whl --no-user"""
-            ]
+        manifest_dir = os.path.join(os.getcwd(), DATASTORE_LOCAL_DIR, flow_name)
+
+        datastores = [d for d in DATASTORES if d.TYPE == datastore_type]
+        if not datastores:
+            print(f"No datastore found for type: {datastore_type}")
+            sys.exit(1)
+
+        storage = datastores[0](
+            _datastore_packageroot(datastores[0], lambda *args, **kwargs: None)
         )

-    for cmd in cmds:
-        result = subprocess.run(
-            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        # Move MAGIC_FILE inside local datastore.
+        os.makedirs(manifest_dir, exist_ok=True)
+        shutil.move(
+            os.path.join(os.getcwd(), MAGIC_FILE),
+            os.path.join(manifest_dir, MAGIC_FILE),
         )
-        if result.returncode != 0:
-            print(f"Bootstrap failed while executing: {cmd}")
-            print("Stdout:", result.stdout.decode())
-            print("Stderr:", result.stderr.decode())
-            sys.exit(1)
+        with open(os.path.join(manifest_dir, MAGIC_FILE)) as f:
+            env = json.load(f)[id_][architecture]
+
+        setup_environment(
+            architecture, storage, env, prefix, conda_pkgs_dir, pypi_pkgs_dir
+        )
+
+    except Exception as e:
+        print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
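
setup_environment above is effectively a small dependency graph driven by futures: micromamba installation and the conda downloads run in parallel, environment creation is gated on both, and the pip install is gated on the environment plus the PyPI downloads. A minimal sketch of the same wait-then-submit pattern, with placeholder task names and sleeps standing in for downloads and installs:

    import concurrent.futures
    import time

    def task(name, seconds):
        time.sleep(seconds)  # stand-in for a download or an install
        return name

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        f_mamba = executor.submit(task, "micromamba", 0.2)
        f_conda = executor.submit(task, "conda_pkgs", 0.1)
        f_pypi = executor.submit(task, "pypi_pkgs", 0.1)

        # Gate 1: environment creation needs micromamba and the conda packages.
        for f in concurrent.futures.wait([f_mamba, f_conda]).done:
            f.result()  # surface exceptions before proceeding
        f_env = executor.submit(task, "conda_env", 0.1)

        # Gate 2: the pip install needs the environment and the PyPI packages.
        for f in concurrent.futures.wait([f_env, f_pypi]).done:
            f.result()
        print(executor.submit(task, "pypi_install", 0.1).result())
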

metaflow/plugins/pypi/conda_decorator.py
@@ -50,20 +50,26 @@ class CondaStepDecorator(StepDecorator):
     # conda channels, users can specify channel::package as the package name.

     def __init__(self, attributes=None, statically_defined=False):
-        self._user_defined_attributes = (
-            attributes.copy() if attributes is not None else {}
+        self._attributes_with_user_values = (
+            set(attributes.keys()) if attributes is not None else set()
         )
+
         super(CondaStepDecorator, self).__init__(attributes, statically_defined)

+    def init(self):
+        super(CondaStepDecorator, self).init()
+
         # Support legacy 'libraries=' attribute for the decorator.
         self.attributes["packages"] = {
             **self.attributes["libraries"],
             **self.attributes["packages"],
         }
         del self.attributes["libraries"]
+        if self.attributes["packages"]:
+            self._attributes_with_user_values.add("packages")

     def is_attribute_user_defined(self, name):
-        return name in self._user_defined_attributes
+        return name in self._attributes_with_user_values

     def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
         # The init_environment hook for Environment creates the relevant virtual
@@ -83,10 +89,10 @@ class CondaStepDecorator(StepDecorator):
                 **super_attributes["packages"],
                 **self.attributes["packages"],
             }
-            self._user_defined_attributes = {
-                **self._user_defined_attributes,
-                **conda_base._user_defined_attributes,
-            }
+            self._attributes_with_user_values.update(
+                conda_base._attributes_with_user_values
+            )
+
             self.attributes["python"] = (
                 self.attributes["python"] or super_attributes["python"]
             )
@@ -333,11 +339,15 @@ class CondaFlowDecorator(FlowDecorator):
     }

     def __init__(self, attributes=None, statically_defined=False):
-        self._user_defined_attributes = (
-            attributes.copy() if attributes is not None else {}
+        self._attributes_with_user_values = (
+            set(attributes.keys()) if attributes is not None else set()
         )
+
         super(CondaFlowDecorator, self).__init__(attributes, statically_defined)

+    def init(self):
+        super(CondaFlowDecorator, self).init()
+
         # Support legacy 'libraries=' attribute for the decorator.
         self.attributes["packages"] = {
             **self.attributes["libraries"],
@@ -348,7 +358,7 @@ class CondaFlowDecorator(FlowDecorator):
             self.attributes["python"] = str(self.attributes["python"])

     def is_attribute_user_defined(self, name):
-        return name in self._user_defined_attributes
+        return name in self._attributes_with_user_values

     def flow_init(
         self, flow, graph, environment, flow_datastore, metadata, logger, echo, options
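
Both decorators now record only which attribute names the user supplied rather than copying the whole attributes dict, so is_attribute_user_defined is a set lookup and merging a flow-level decorator into a step-level one (as in step_init above) is a set union. A minimal sketch of the idea with a hypothetical ExampleDecorator:

    class ExampleDecorator:
        defaults = {"packages": {}, "python": None}

        def __init__(self, attributes=None):
            # Remember only the keys the user passed, not their values.
            self._attributes_with_user_values = (
                set(attributes.keys()) if attributes is not None else set()
            )
            self.attributes = {**self.defaults, **(attributes or {})}

        def is_attribute_user_defined(self, name):
            return name in self._attributes_with_user_values

    d = ExampleDecorator(attributes={"python": "3.11"})
    assert d.is_attribute_user_defined("python")
    assert not d.is_attribute_user_defined("packages")
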

metaflow/plugins/pypi/conda_environment.py
@@ -5,21 +5,17 @@ import functools
 import io
 import json
 import os
-import sys
 import tarfile
-import time
-from concurrent.futures import ThreadPoolExecutor
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from functools import wraps
 from hashlib import sha256
 from io import BufferedIOBase, BytesIO
-from itertools import chain
 from urllib.parse import unquote, urlparse

-import requests
-
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import get_pinned_conda_libs
 from metaflow.metaflow_environment import MetaflowEnvironment
-from metaflow.metaflow_profile import profile

 from . import MAGIC_FILE, _datastore_packageroot
 from .utils import conda_platform
@@ -50,7 +46,6 @@ class CondaEnvironment(MetaflowEnvironment):

     def validate_environment(self, logger, datastore_type):
         self.datastore_type = datastore_type
-        self.logger = logger

         # Avoiding circular imports.
         from metaflow.plugins import DATASTORES
@@ -62,8 +57,21 @@
         from .micromamba import Micromamba
         from .pip import Pip

-        micromamba = Micromamba()
-        self.solvers = {"conda": micromamba, "pypi": Pip(micromamba)}
+        print_lock = threading.Lock()
+
+        def make_thread_safe(func):
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                with print_lock:
+                    return func(*args, **kwargs)
+
+            return wrapper
+
+        self.logger = make_thread_safe(logger)
+
+        # TODO: Wire up logging
+        micromamba = Micromamba(self.logger)
+        self.solvers = {"conda": micromamba, "pypi": Pip(micromamba, self.logger)}

     def init_environment(self, echo, only_steps=None):
         # The implementation optimizes for latency to ensure as many operations can
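
Because environments are now bootstrapped from multiple threads, the logger is wrapped so that only one thread emits output at a time and lines do not interleave. The same wrapper in isolation, with print standing in for the Metaflow logger:

    import threading
    from functools import wraps

    print_lock = threading.Lock()

    def make_thread_safe(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with print_lock:  # serialize concurrent callers
                return func(*args, **kwargs)

        return wrapper

    safe_log = make_thread_safe(print)
    threads = [
        threading.Thread(target=safe_log, args=(f"solving environment {i} ...",))
        for i in range(4)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
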
@@ -150,6 +158,9 @@ class CondaEnvironment(MetaflowEnvironment):
                 (
                     package["path"],
                     # Lazily fetch package from the interweb if needed.
+                    # TODO: Depending on the len_hint, the package might be downloaded from
+                    # the interweb prematurely. save_bytes needs to be adjusted to handle
+                    # this scenario.
                     LazyOpen(
                         package["local_path"],
                         "rb",
@@ -166,22 +177,60 @@
         if id_ in dirty:
             self.write_to_environment_manifest([id_, platform, type_], packages)

-        # First resolve environments through Conda, before PyPI.
+        storage = None
+        if self.datastore_type not in ["local"]:
+            # Initialize storage for caching if using a remote datastore
+            storage = self.datastore(_datastore_packageroot(self.datastore, echo))
+
         self.logger("Bootstrapping virtual environment(s) ...")
-        for solver in ["conda", "pypi"]:
-            with ThreadPoolExecutor() as executor:
-                results = list(
-                    executor.map(lambda x: solve(*x, solver), environments(solver))
-                )
-            _ = list(map(lambda x: self.solvers[solver].download(*x), results))
-            with ThreadPoolExecutor() as executor:
-                _ = list(
-                    executor.map(lambda x: self.solvers[solver].create(*x), results)
-                )
-            if self.datastore_type not in ["local"]:
-                # Cache packages only when a remote datastore is in play.
-                storage = self.datastore(_datastore_packageroot(self.datastore, echo))
-                cache(storage, results, solver)
+        # Sequence of operations:
+        # 1. Start all conda solves in parallel
+        # 2. Download conda packages sequentially
+        # 3. Create and cache conda environments in parallel
+        # 4. Start PyPI solves in parallel after each conda environment is created
+        # 5. Download PyPI packages sequentially
+        # 6. Create and cache PyPI environments in parallel
+
+        with ThreadPoolExecutor() as executor:
+            # Start all conda solves in parallel
+            conda_futures = [
+                executor.submit(lambda x: solve(*x, "conda"), env)
+                for env in environments("conda")
+            ]
+
+            pypi_envs = {env[0]: env for env in environments("pypi")}
+            pypi_futures = []
+
+            # Process conda results sequentially for downloads
+            for future in as_completed(conda_futures):
+                result = future.result()
+                # Sequential conda download
+                self.solvers["conda"].download(*result)
+                # Parallel conda create and cache
+                create_future = executor.submit(self.solvers["conda"].create, *result)
+                if storage:
+                    executor.submit(cache, storage, [result], "conda")
+
+                # Queue PyPI solve to start after conda create
+                if result[0] in pypi_envs:
+
+                    def pypi_solve(env):
+                        create_future.result()  # Wait for conda create
+                        return solve(*env, "pypi")
+
+                    pypi_futures.append(
+                        executor.submit(pypi_solve, pypi_envs[result[0]])
+                    )
+
+            # Process PyPI results sequentially for downloads
+            for solve_future in pypi_futures:
+                result = solve_future.result()
+                # Sequential PyPI download
+                self.solvers["pypi"].download(*result)
+                # Parallel PyPI create and cache
+                executor.submit(self.solvers["pypi"].create, *result)
+                if storage:
+                    executor.submit(cache, storage, [result], "pypi")
         self.logger("Virtual environment(s) bootstrapped!")

     def executable(self, step_name, default=None):
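
The rewritten loop overlaps work by consuming conda solves with as_completed, downloading sequentially, and pushing create/cache back onto the pool; each PyPI solve is then chained behind its step's conda-create future. A minimal sketch of chaining dependent work onto a future from inside the pool, with solve and create as stand-ins:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def solve(env):
        return f"{env}:solved"  # stand-in for a conda solve

    def create(resolved):
        return f"{resolved}:created"  # stand-in for environment creation

    with ThreadPoolExecutor(max_workers=8) as executor:
        solves = [executor.submit(solve, env) for env in ("start", "train", "end")]
        chained = []
        for future in as_completed(solves):
            result = future.result()  # downloads would happen here, sequentially
            create_future = executor.submit(create, result)

            def dependent(res=result, cf=create_future):
                cf.result()  # wait for create before the next stage
                return f"{res}:pypi-solved"

            chained.append(executor.submit(dependent))
        print(sorted(f.result() for f in chained))

Binding res and cf as default arguments pins each submitted closure to the iteration that created it, sidestepping Python's late-binding closure pitfall.
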
@@ -385,7 +434,8 @@
                 'DISABLE_TRACING=True python -m metaflow.plugins.pypi.bootstrap "%s" %s "%s" linux-64'
                 % (self.flow.name, id_, self.datastore_type),
                 "echo 'Environment bootstrapped.'",
-                "export PATH=$PATH:$(pwd)/micromamba",
+                # To avoid having to install micromamba in the PATH in micromamba.py, we add it to the PATH here.
+                "export PATH=$PATH:$(pwd)/micromamba/bin",
             ]
         else:
             # for @conda/@pypi(disabled=True).
@@ -446,6 +496,7 @@ class LazyOpen(BufferedIOBase):
         self._file = None
         self._buffer = None
         self._position = 0
+        self.requests = None

     def _ensure_file(self):
         if not self._file:
@@ -462,8 +513,13 @@
             raise ValueError("Both filename and url are missing")

     def _download_to_buffer(self):
+        if self.requests is None:
+            # TODO: Remove dependency on requests
+            import requests
+
+            self.requests = requests
         # TODO: Stream it in chunks?
-        response = requests.get(self.url, stream=True)
+        response = self.requests.get(self.url, stream=True)
         response.raise_for_status()
         return response.content
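
LazyOpen now defers importing requests until a download actually happens, keeping module import cheap on the common path where the package is already local (with a TODO to drop the dependency altogether). The deferred-import pattern in isolation, with a hypothetical Fetcher class:

    class Fetcher:
        def __init__(self, url):
            self.url = url
            self.requests = None  # heavy import deferred until first use

        def _download_to_buffer(self):
            if self.requests is None:
                import requests  # paid only if a download actually happens

                self.requests = requests
            response = self.requests.get(self.url, stream=True)
            response.raise_for_status()
            return response.content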