ob-metaflow 2.12.36.3__py2.py3-none-any.whl → 2.12.39.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ob-metaflow might be problematic.

Files changed (58)
  1. metaflow/__init__.py +3 -0
  2. metaflow/cli.py +84 -697
  3. metaflow/cli_args.py +17 -0
  4. metaflow/cli_components/__init__.py +0 -0
  5. metaflow/cli_components/dump_cmd.py +96 -0
  6. metaflow/cli_components/init_cmd.py +51 -0
  7. metaflow/cli_components/run_cmds.py +358 -0
  8. metaflow/cli_components/step_cmd.py +189 -0
  9. metaflow/cli_components/utils.py +140 -0
  10. metaflow/cmd/develop/stub_generator.py +9 -2
  11. metaflow/decorators.py +63 -2
  12. metaflow/extension_support/plugins.py +41 -27
  13. metaflow/flowspec.py +156 -16
  14. metaflow/includefile.py +50 -22
  15. metaflow/metaflow_config.py +1 -1
  16. metaflow/package.py +17 -3
  17. metaflow/parameters.py +80 -23
  18. metaflow/plugins/__init__.py +4 -0
  19. metaflow/plugins/airflow/airflow_cli.py +1 -0
  20. metaflow/plugins/argo/argo_workflows.py +41 -1
  21. metaflow/plugins/argo/argo_workflows_cli.py +1 -0
  22. metaflow/plugins/argo/argo_workflows_deployer_objects.py +47 -1
  23. metaflow/plugins/aws/batch/batch_decorator.py +2 -2
  24. metaflow/plugins/aws/step_functions/step_functions.py +32 -0
  25. metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -0
  26. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +3 -0
  27. metaflow/plugins/datatools/s3/s3op.py +3 -3
  28. metaflow/plugins/kubernetes/kubernetes_cli.py +1 -1
  29. metaflow/plugins/kubernetes/kubernetes_decorator.py +2 -2
  30. metaflow/plugins/pypi/bootstrap.py +196 -61
  31. metaflow/plugins/pypi/conda_decorator.py +20 -10
  32. metaflow/plugins/pypi/conda_environment.py +76 -21
  33. metaflow/plugins/pypi/micromamba.py +42 -15
  34. metaflow/plugins/pypi/pip.py +8 -3
  35. metaflow/plugins/pypi/pypi_decorator.py +11 -9
  36. metaflow/plugins/timeout_decorator.py +2 -2
  37. metaflow/runner/click_api.py +73 -19
  38. metaflow/runner/deployer.py +1 -1
  39. metaflow/runner/deployer_impl.py +2 -2
  40. metaflow/runner/metaflow_runner.py +4 -1
  41. metaflow/runner/nbdeploy.py +2 -0
  42. metaflow/runner/nbrun.py +1 -1
  43. metaflow/runner/subprocess_manager.py +3 -1
  44. metaflow/runner/utils.py +41 -19
  45. metaflow/runtime.py +111 -73
  46. metaflow/sidecar/sidecar_worker.py +1 -1
  47. metaflow/user_configs/__init__.py +0 -0
  48. metaflow/user_configs/config_decorators.py +563 -0
  49. metaflow/user_configs/config_options.py +495 -0
  50. metaflow/user_configs/config_parameters.py +386 -0
  51. metaflow/util.py +17 -0
  52. metaflow/version.py +1 -1
  53. {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/METADATA +3 -2
  54. {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/RECORD +58 -48
  55. {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/LICENSE +0 -0
  56. {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/WHEEL +0 -0
  57. {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/entry_points.txt +0 -0
  58. {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/top_level.txt +0 -0
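
Most of the churn in this release comes from the new user_configs machinery (config_decorators.py, config_options.py, config_parameters.py) together with the CLI being split into cli_components/. As rough orientation only, a flow using this machinery could look like the sketch below; the Config class name, its export from metaflow (possibly via the three lines added to metaflow/__init__.py), and the attribute-style access are assumptions inferred from the file names above, not something this diff confirms.

# Hedged sketch: assumes `Config` behaves like a Parameter whose value is read
# from a configuration file at deploy/run time. The exact API may differ in 2.12.39.1.
from metaflow import FlowSpec, step, Config

class ConfiguredFlow(FlowSpec):
    # "app_config" and the default file name are illustrative only.
    app_config = Config("app_config", default="config.json")

    @step
    def start(self):
        print(self.app_config)  # resolved config value
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    ConfiguredFlow()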
metaflow/plugins/argo/argo_workflows.py

@@ -61,6 +61,7 @@ from metaflow.plugins.kubernetes.kubernetes import (
  )
  from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet
  from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
+ from metaflow.user_configs.config_options import ConfigInput
  from metaflow.util import (
  compress_list,
  dict_to_cli_options,
@@ -169,6 +170,7 @@ class ArgoWorkflows(object):
  self.enable_heartbeat_daemon = enable_heartbeat_daemon
  self.enable_error_msg_capture = enable_error_msg_capture
  self.parameters = self._process_parameters()
+ self.config_parameters = self._process_config_parameters()
  self.triggers, self.trigger_options = self._process_triggers()
  self._schedule, self._timezone = self._get_schedule()

@@ -456,6 +458,10 @@ class ArgoWorkflows(object):
  "case-insensitive." % param.name
  )
  seen.add(norm)
+ # NOTE: We skip config parameters as these do not have dynamic values,
+ # and need to be treated differently.
+ if param.IS_CONFIG_PARAMETER:
+ continue

  extra_attrs = {}
  if param.kwargs.get("type") == JSONType:
@@ -489,6 +495,7 @@ class ArgoWorkflows(object):
  # execution - which needs to be fixed imminently.
  if not is_required or default_value is not None:
  default_value = json.dumps(default_value)
+
  parameters[param.name] = dict(
  name=param.name,
  value=default_value,
@@ -499,6 +506,27 @@ class ArgoWorkflows(object):
  )
  return parameters

+ def _process_config_parameters(self):
+ parameters = []
+ seen = set()
+ for var, param in self.flow._get_parameters():
+ if not param.IS_CONFIG_PARAMETER:
+ continue
+ # Throw an exception if the parameter is specified twice.
+ norm = param.name.lower()
+ if norm in seen:
+ raise MetaflowException(
+ "Parameter *%s* is specified twice. "
+ "Note that parameter names are "
+ "case-insensitive." % param.name
+ )
+ seen.add(norm)
+
+ parameters.append(
+ dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name))
+ )
+ return parameters
+
  def _process_triggers(self):
  # Impute triggers for Argo Workflow Template specified through @trigger and
  # @trigger_on_finish decorators
@@ -521,8 +549,13 @@ class ArgoWorkflows(object):
  # convert them to lower case since Metaflow parameters are case
  # insensitive.
  seen = set()
+ # NOTE: We skip config parameters as their values can not be set through event payloads
  params = set(
- [param.name.lower() for var, param in self.flow._get_parameters()]
+ [
+ param.name.lower()
+ for var, param in self.flow._get_parameters()
+ if not param.IS_CONFIG_PARAMETER
+ ]
  )
  trigger_deco = self.flow._flow_decorators.get("trigger")[0]
  trigger_deco.format_deploytime_value()
@@ -1721,6 +1754,13 @@ class ArgoWorkflows(object):
  metaflow_version["production_token"] = self.production_token
  env["METAFLOW_VERSION"] = json.dumps(metaflow_version)

+ # map config values
+ cfg_env = {
+ param["name"]: param["kv_name"] for param in self.config_parameters
+ }
+ if cfg_env:
+ env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env)
+
  # Set the template inputs and outputs for passing state. Very simply,
  # the container template takes in input-paths as input and outputs
  # the task-id (which feeds in as input-paths to the subsequent task).
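
For orientation, the new METAFLOW_FLOW_CONFIG_VALUE entry added above carries a JSON mapping from each config parameter's name to the key name produced by ConfigInput.make_key_name. A small self-contained restatement of that shape (the "kv.app_config" string is hypothetical; the real format comes from make_key_name, which is not shown in this diff):

import json

# One assumed config parameter named "app_config"; the kv_name value is made up.
config_parameters = [{"name": "app_config", "kv_name": "kv.app_config"}]
cfg_env = {p["name"]: p["kv_name"] for p in config_parameters}
env = {"METAFLOW_FLOW_CONFIG_VALUE": json.dumps(cfg_env)}
print(env["METAFLOW_FLOW_CONFIG_VALUE"])  # {"app_config": "kv.app_config"}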
metaflow/plugins/argo/argo_workflows_cli.py

@@ -470,6 +470,7 @@ def make_flow(
  decorators._attach_decorators(
  obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name]
  )
+ decorators._init(obj.flow)

  decorators._init_step_decorators(
  obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger
metaflow/plugins/argo/argo_workflows_deployer_objects.py

@@ -1,5 +1,6 @@
  import sys
  import json
+ import time
  import tempfile
  from typing import ClassVar, Optional

@@ -97,6 +98,7 @@ class ArgoWorkflowsTriggeredRun(TriggeredRun):
  )

  command_obj = self.deployer.spm.get(pid)
+ command_obj.sync_wait()
  return command_obj.process.returncode == 0

  def unsuspend(self, **kwargs) -> bool:
@@ -131,6 +133,7 @@ class ArgoWorkflowsTriggeredRun(TriggeredRun):
  )

  command_obj = self.deployer.spm.get(pid)
+ command_obj.sync_wait()
  return command_obj.process.returncode == 0

  def terminate(self, **kwargs) -> bool:
@@ -165,8 +168,50 @@ class ArgoWorkflowsTriggeredRun(TriggeredRun):
  )

  command_obj = self.deployer.spm.get(pid)
+ command_obj.sync_wait()
  return command_obj.process.returncode == 0

+ def wait_for_completion(self, timeout: Optional[int] = None):
+ """
+ Wait for the workflow to complete or timeout.
+
+ Parameters
+ ----------
+ timeout : int, optional, default None
+ Maximum time in seconds to wait for workflow completion.
+ If None, waits indefinitely.
+
+ Raises
+ ------
+ TimeoutError
+ If the workflow does not complete within the specified timeout period.
+ """
+ start_time = time.time()
+ check_interval = 5
+ while self.is_running:
+ if timeout is not None and (time.time() - start_time) > timeout:
+ raise TimeoutError(
+ "Workflow did not complete within specified timeout."
+ )
+ time.sleep(check_interval)
+
+ @property
+ def is_running(self):
+ """
+ Check if the workflow is currently running.
+
+ Returns
+ -------
+ bool
+ True if the workflow status is either 'Pending' or 'Running',
+ False otherwise.
+ """
+ workflow_status = self.status
+ # full list of all states present here:
+ # https://github.com/argoproj/argo-workflows/blob/main/pkg/apis/workflow/v1alpha1/workflow_types.go#L54
+ # we only consider non-terminal states to determine if the workflow has not finished
+ return workflow_status is not None and workflow_status in ["Pending", "Running"]
+
  @property
  def status(self) -> Optional[str]:
  """
@@ -319,6 +364,7 @@ class ArgoWorkflowsDeployedFlow(DeployedFlow):
  )

  command_obj = self.deployer.spm.get(pid)
+ command_obj.sync_wait()
  return command_obj.process.returncode == 0

  def trigger(self, **kwargs) -> ArgoWorkflowsTriggeredRun:
@@ -361,7 +407,7 @@ class ArgoWorkflowsDeployedFlow(DeployedFlow):
  content = handle_timeout(
  attribute_file_fd, command_obj, self.deployer.file_read_timeout
  )
-
+ command_obj.sync_wait()
  if command_obj.process.returncode == 0:
  return ArgoWorkflowsTriggeredRun(
  deployer=self.deployer, content=content
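
The wait_for_completion and is_running additions above turn the status-polling loop into part of the public deployer object. A hedged usage sketch, assuming the documented Deployer entry point ("flow.py" and the timeout value are placeholders):

from metaflow import Deployer

deployed = Deployer("flow.py").argo_workflows().create()
triggered = deployed.trigger()
try:
    # Polls self.status every 5 seconds (per the implementation above) until the
    # workflow leaves the Pending/Running states, or gives up after an hour.
    triggered.wait_for_completion(timeout=3600)
except TimeoutError:
    triggered.terminate()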
metaflow/plugins/aws/batch/batch_decorator.py

@@ -138,8 +138,8 @@ class BatchDecorator(StepDecorator):
  supports_conda_environment = True
  target_platform = "linux-64"

- def __init__(self, attributes=None, statically_defined=False):
- super(BatchDecorator, self).__init__(attributes, statically_defined)
+ def init(self):
+ super(BatchDecorator, self).init()

  # If no docker image is explicitly specified, impute a default image.
  if not self.attributes["image"]:
metaflow/plugins/aws/step_functions/step_functions.py

@@ -18,6 +18,7 @@ from metaflow.metaflow_config import (
  SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH,
  )
  from metaflow.parameters import deploy_time_eval
+ from metaflow.user_configs.config_options import ConfigInput
  from metaflow.util import dict_to_cli_options, to_pascalcase

  from ..batch.batch import Batch
@@ -71,6 +72,7 @@ class StepFunctions(object):
  self.username = username
  self.max_workers = max_workers
  self.workflow_timeout = workflow_timeout
+ self.config_parameters = self._process_config_parameters()

  # https://aws.amazon.com/blogs/aws/step-functions-distributed-map-a-serverless-solution-for-large-scale-parallel-data-processing/
  self.use_distributed_map = use_distributed_map
@@ -485,6 +487,10 @@ class StepFunctions(object):
  "case-insensitive." % param.name
  )
  seen.add(norm)
+ # NOTE: We skip config parameters as these do not have dynamic values,
+ # and need to be treated differently.
+ if param.IS_CONFIG_PARAMETER:
+ continue

  is_required = param.kwargs.get("required", False)
  # Throw an exception if a schedule is set for a flow with required
@@ -501,6 +507,27 @@ class StepFunctions(object):
  parameters.append(dict(name=param.name, value=value))
  return parameters

+ def _process_config_parameters(self):
+ parameters = []
+ seen = set()
+ for var, param in self.flow._get_parameters():
+ if not param.IS_CONFIG_PARAMETER:
+ continue
+ # Throw an exception if the parameter is specified twice.
+ norm = param.name.lower()
+ if norm in seen:
+ raise MetaflowException(
+ "Parameter *%s* is specified twice. "
+ "Note that parameter names are "
+ "case-insensitive." % param.name
+ )
+ seen.add(norm)
+
+ parameters.append(
+ dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name))
+ )
+ return parameters
+
  def _batch(self, node):
  attrs = {
  # metaflow.user is only used for setting the AWS Job Name.
@@ -747,6 +774,11 @@ class StepFunctions(object):
  metaflow_version["production_token"] = self.production_token
  env["METAFLOW_VERSION"] = json.dumps(metaflow_version)

+ # map config values
+ cfg_env = {param["name"]: param["kv_name"] for param in self.config_parameters}
+ if cfg_env:
+ env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env)
+
  # Set AWS DynamoDb Table Name for state tracking for for-eaches.
  # There are three instances when metaflow runtime directly interacts
  # with AWS DynamoDB.
metaflow/plugins/aws/step_functions/step_functions_cli.py

@@ -326,6 +326,7 @@ def make_flow(

  # Attach AWS Batch decorator to the flow
  decorators._attach_decorators(obj.flow, [BatchDecorator.name])
+ decorators._init(obj.flow)
  decorators._init_step_decorators(
  obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger
  )
metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py

@@ -46,6 +46,7 @@ class StepFunctionsTriggeredRun(TriggeredRun):
  )

  command_obj = self.deployer.spm.get(pid)
+ command_obj.sync_wait()
  return command_obj.process.returncode == 0


@@ -174,6 +175,7 @@ class StepFunctionsDeployedFlow(DeployedFlow):
  )

  command_obj = self.deployer.spm.get(pid)
+ command_obj.sync_wait()
  return command_obj.process.returncode == 0

  def trigger(self, **kwargs) -> StepFunctionsTriggeredRun:
@@ -217,6 +219,7 @@ class StepFunctionsDeployedFlow(DeployedFlow):
  attribute_file_fd, command_obj, self.deployer.file_read_timeout
  )

+ command_obj.sync_wait()
  if command_obj.process.returncode == 0:
  return StepFunctionsTriggeredRun(
  deployer=self.deployer, content=content
metaflow/plugins/datatools/s3/s3op.py

@@ -722,8 +722,8 @@ def cli():
  pass


- @tracing.cli_entrypoint("s3op/list")
  @cli.command("list", help="List S3 objects")
+ @tracing.cli_entrypoint("s3op/list")
  @click.option(
  "--recursive/--no-recursive",
  default=False,
@@ -782,8 +782,8 @@ def lst(
  print(format_result_line(idx, url.prefix, url.url, str(size)))


- @tracing.cli_entrypoint("s3op/put")
  @cli.command(help="Upload files to S3")
+ @tracing.cli_entrypoint("s3op/put")
  @click.option(
  "--file",
  "files",
@@ -977,8 +977,8 @@ def _populate_prefixes(prefixes, inputs):
  return prefixes, is_transient_retry


- @tracing.cli_entrypoint("s3op/get")
  @cli.command(help="Download files from S3")
+ @tracing.cli_entrypoint("s3op/get")
  @click.option(
  "--recursive/--no-recursive",
  default=False,
metaflow/plugins/kubernetes/kubernetes_cli.py

@@ -33,12 +33,12 @@ def kubernetes():
  pass


- @tracing.cli_entrypoint("kubernetes/step")
  @kubernetes.command(
  help="Execute a single task on Kubernetes. This command calls the top-level step "
  "command inside a Kubernetes pod with the given options. Typically you do not call "
  "this command directly; it is used internally by Metaflow."
  )
+ @tracing.cli_entrypoint("kubernetes/step")
  @click.argument("step-name")
  @click.argument("code-package-sha")
  @click.argument("code-package-url")
metaflow/plugins/kubernetes/kubernetes_decorator.py

@@ -153,8 +153,8 @@ class KubernetesDecorator(StepDecorator):
  supports_conda_environment = True
  target_platform = "linux-64"

- def __init__(self, attributes=None, statically_defined=False):
- super(KubernetesDecorator, self).__init__(attributes, statically_defined)
+ def init(self):
+ super(KubernetesDecorator, self).init()

  if not self.attributes["namespace"]:
  self.attributes["namespace"] = KUBERNETES_NAMESPACE
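
This hunk and the BatchDecorator hunk above reflect the same lifecycle change: decorator subclasses now override init(), presumably invoked via the new decorators._init(obj.flow) calls seen in the CLI hunks once attributes are resolved, instead of doing attribute fixups in __init__. A minimal sketch of a custom step decorator following that pattern; MyDefaultImageDecorator and its fallback image are illustrative, not part of this release:

from metaflow.decorators import StepDecorator

class MyDefaultImageDecorator(StepDecorator):
    # Hypothetical decorator, shown only to illustrate the init() hook.
    name = "my_default_image"
    defaults = {"image": None}

    def init(self):
        # Runs after attributes are resolved (e.g. from config values), which is
        # why the fixups in BatchDecorator/KubernetesDecorator moved out of __init__.
        super().init()
        if not self.attributes["image"]:
            self.attributes["image"] = "python:3.11"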
metaflow/plugins/pypi/bootstrap.py

@@ -1,4 +1,5 @@
  import bz2
+ import concurrent.futures
  import io
  import json
  import os
@@ -6,6 +7,9 @@ import shutil
  import subprocess
  import sys
  import tarfile
+ import time
+
+ import requests

  from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
  from metaflow.plugins import DATASTORES
@@ -15,6 +19,18 @@ from . import MAGIC_FILE, _datastore_packageroot

  # Bootstraps a valid conda virtual environment composed of conda and pypi packages

+
+ def timer(func):
+ def wrapper(*args, **kwargs):
+ start_time = time.time()
+ result = func(*args, **kwargs)
+ duration = time.time() - start_time
+ # print(f"Time taken for {func.__name__}: {duration:.2f} seconds")
+ return result
+
+ return wrapper
+
+
  if __name__ == "__main__":
  if len(sys.argv) != 5:
  print("Usage: bootstrap.py <flow_name> <id> <datastore_type> <architecture>")
@@ -47,6 +63,8 @@ if __name__ == "__main__":

  prefix = os.path.join(os.getcwd(), architecture, id_)
  pkgs_dir = os.path.join(os.getcwd(), ".pkgs")
+ conda_pkgs_dir = os.path.join(pkgs_dir, "conda")
+ pypi_pkgs_dir = os.path.join(pkgs_dir, "pypi")
  manifest_dir = os.path.join(os.getcwd(), DATASTORE_LOCAL_DIR, flow_name)

  datastores = [d for d in DATASTORES if d.TYPE == datastore_type]
@@ -64,77 +82,194 @@ if __name__ == "__main__":
  os.path.join(os.getcwd(), MAGIC_FILE),
  os.path.join(manifest_dir, MAGIC_FILE),
  )
-
  with open(os.path.join(manifest_dir, MAGIC_FILE)) as f:
  env = json.load(f)[id_][architecture]

- # Download Conda packages.
- conda_pkgs_dir = os.path.join(pkgs_dir, "conda")
- with storage.load_bytes([package["path"] for package in env["conda"]]) as results:
- for key, tmpfile, _ in results:
+ def run_cmd(cmd):
+ result = subprocess.run(
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+ )
+ if result.returncode != 0:
+ print(f"Bootstrap failed while executing: {cmd}")
+ print("Stdout:", result.stdout)
+ print("Stderr:", result.stderr)
+ sys.exit(1)
+
+ @timer
+ def install_micromamba(architecture):
+ micromamba_dir = os.path.join(os.getcwd(), "micromamba")
+ micromamba_path = os.path.join(micromamba_dir, "bin", "micromamba")
+
+ if which("micromamba"):
+ return which("micromamba")
+ if os.path.exists(micromamba_path):
+ os.environ["PATH"] += os.pathsep + os.path.dirname(micromamba_path)
+ return micromamba_path
+
+ # Download and extract in one go
+ # TODO: Serve from cloudflare
+ url = f"https://micro.mamba.pm/api/micromamba/{architecture}/2.0.4"
+
+ # Prepare directory once
+ os.makedirs(os.path.dirname(micromamba_path), exist_ok=True)
+
+ # Stream and process directly to file
+ with requests.get(url, stream=True, timeout=30) as response:
+ if response.status_code != 200:
+ raise Exception(
+ f"Failed to download micromamba: HTTP {response.status_code}"
+ )
+
+ decompressor = bz2.BZ2Decompressor()
+
+ # Process in memory without temporary files
+ tar_content = decompressor.decompress(response.raw.read())
+
+ with tarfile.open(fileobj=io.BytesIO(tar_content), mode="r:") as tar:
+ member = tar.getmember("bin/micromamba")
+ # Extract directly to final location
+ with open(micromamba_path, "wb") as f:
+ f.write(tar.extractfile(member).read())
+
+ # Set executable permission
+ os.chmod(micromamba_path, 0o755)
+
+ # Update PATH only once at the end
+ os.environ["PATH"] += os.pathsep + os.path.dirname(micromamba_path)
+ return micromamba_path
+
+ @timer
+ def download_conda_packages(storage, packages, dest_dir):
+
+ def process_conda_package(args):
  # Ensure that conda packages go into architecture specific folders.
  # The path looks like REPO/CHANNEL/CONDA_SUBDIR/PACKAGE. We trick
  # Micromamba into believing that all packages are coming from a local
  # channel - the only hurdle is ensuring that packages are organised
  # properly.
-
- # TODO: consider RAM disk
- dest = os.path.join(conda_pkgs_dir, "/".join(key.split("/")[-2:]))
+ key, tmpfile, dest_dir = args
+ dest = os.path.join(dest_dir, "/".join(key.split("/")[-2:]))
  os.makedirs(os.path.dirname(dest), exist_ok=True)
  shutil.move(tmpfile, dest)

- # Create Conda environment.
- cmds = [
- # TODO: check if mamba or conda are already available on the image
- # TODO: micromamba installation can be pawned off to micromamba.py
- f"""set -e;
- if ! command -v micromamba >/dev/null 2>&1; then
- mkdir -p micromamba;
- python -c "import requests, bz2, sys; data = requests.get('https://micro.mamba.pm/api/micromamba/{architecture}/1.5.7').content; sys.stdout.buffer.write(bz2.decompress(data))" | tar -xv -C $(pwd)/micromamba bin/micromamba --strip-components 1;
+ os.makedirs(dest_dir, exist_ok=True)
+ with storage.load_bytes([package["path"] for package in packages]) as results:
+ with concurrent.futures.ThreadPoolExecutor() as executor:
+ executor.map(
+ process_conda_package,
+ [(key, tmpfile, dest_dir) for key, tmpfile, _ in results],
+ )
+ # for key, tmpfile, _ in results:
+
+ # # TODO: consider RAM disk
+ # dest = os.path.join(dest_dir, "/".join(key.split("/")[-2:]))
+ # os.makedirs(os.path.dirname(dest), exist_ok=True)
+ # shutil.move(tmpfile, dest)
+ return dest_dir
+
+ @timer
+ def download_pypi_packages(storage, packages, dest_dir):
+
+ def process_pypi_package(args):
+ key, tmpfile, dest_dir = args
+ dest = os.path.join(dest_dir, os.path.basename(key))
+ shutil.move(tmpfile, dest)
+
+ os.makedirs(dest_dir, exist_ok=True)
+ with storage.load_bytes([package["path"] for package in packages]) as results:
+ with concurrent.futures.ThreadPoolExecutor() as executor:
+ executor.map(
+ process_pypi_package,
+ [(key, tmpfile, dest_dir) for key, tmpfile, _ in results],
+ )
+ # for key, tmpfile, _ in results:
+ # dest = os.path.join(dest_dir, os.path.basename(key))
+ # shutil.move(tmpfile, dest)
+ return dest_dir
+
+ @timer
+ def create_conda_environment(prefix, conda_pkgs_dir):
+ cmd = f'''set -e;
+ tmpfile=$(mktemp);
+ echo "@EXPLICIT" > "$tmpfile";
+ ls -d {conda_pkgs_dir}/*/* >> "$tmpfile";
  export PATH=$PATH:$(pwd)/micromamba;
- if ! command -v micromamba >/dev/null 2>&1; then
- echo "Failed to install Micromamba!";
- exit 1;
- fi;
- fi""",
- # Create a conda environment through Micromamba.
- f'''set -e;
- tmpfile=$(mktemp);
- echo "@EXPLICIT" > "$tmpfile";
- ls -d {conda_pkgs_dir}/*/* >> "$tmpfile";
- export PATH=$PATH:$(pwd)/micromamba;
- export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
- micromamba create --yes --offline --no-deps --safety-checks=disabled --no-extra-safety-checks --prefix {prefix} --file "$tmpfile";
- rm "$tmpfile"''',
- ]
-
- # Download PyPI packages.
- if "pypi" in env:
- pypi_pkgs_dir = os.path.join(pkgs_dir, "pypi")
- with storage.load_bytes(
- [package["path"] for package in env["pypi"]]
- ) as results:
- for key, tmpfile, _ in results:
- dest = os.path.join(pypi_pkgs_dir, os.path.basename(key))
- os.makedirs(os.path.dirname(dest), exist_ok=True)
- shutil.move(tmpfile, dest)
-
- # Install PyPI packages.
- cmds.extend(
- [
- f"""set -e;
- export PATH=$PATH:$(pwd)/micromamba;
- export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
- micromamba run --prefix {prefix} python -m pip --disable-pip-version-check install --root-user-action=ignore --no-compile {pypi_pkgs_dir}/*.whl --no-user"""
- ]
- )
+ export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
+ export MAMBA_NO_LOW_SPEED_LIMIT=1;
+ export MAMBA_USE_INDEX_CACHE=1;
+ export MAMBA_NO_PROGRESS_BARS=1;
+ export CONDA_FETCH_THREADS=1;
+ micromamba create --yes --offline --no-deps \
+ --safety-checks=disabled --no-extra-safety-checks \
+ --prefix {prefix} --file "$tmpfile" \
+ --no-pyc --no-rc --always-copy;
+ rm "$tmpfile"'''
+ run_cmd(cmd)

- for cmd in cmds:
- result = subprocess.run(
- cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
- )
- if result.returncode != 0:
- print(f"Bootstrap failed while executing: {cmd}")
- print("Stdout:", result.stdout.decode())
- print("Stderr:", result.stderr.decode())
- sys.exit(1)
+ @timer
+ def install_pypi_packages(prefix, pypi_pkgs_dir):
+
+ cmd = f"""set -e;
+ export PATH=$PATH:$(pwd)/micromamba;
+ export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
+ micromamba run --prefix {prefix} python -m pip --disable-pip-version-check \
+ install --root-user-action=ignore --no-compile --no-index \
+ --no-cache-dir --no-deps --prefer-binary \
+ --find-links={pypi_pkgs_dir} --no-user \
+ --no-warn-script-location --no-input \
+ {pypi_pkgs_dir}/*.whl
+ """
+ run_cmd(cmd)
+
+ @timer
+ def setup_environment(
+ architecture, storage, env, prefix, conda_pkgs_dir, pypi_pkgs_dir
+ ):
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+ # install micromamba, download conda and pypi packages in parallel
+ futures = {
+ "micromamba": executor.submit(install_micromamba, architecture),
+ "conda_pkgs": executor.submit(
+ download_conda_packages, storage, env["conda"], conda_pkgs_dir
+ ),
+ }
+ if "pypi" in env:
+ futures["pypi_pkgs"] = executor.submit(
+ download_pypi_packages, storage, env["pypi"], pypi_pkgs_dir
+ )
+
+ # create conda environment after micromamba is installed and conda packages are downloaded
+ done, _ = concurrent.futures.wait(
+ [futures["micromamba"], futures["conda_pkgs"]],
+ return_when=concurrent.futures.ALL_COMPLETED,
+ )
+
+ for future in done:
+ future.result()
+
+ # start conda environment creation
+ futures["conda_env"] = executor.submit(
+ create_conda_environment, prefix, conda_pkgs_dir
+ )
+
+ if "pypi" in env:
+ # install pypi packages after conda environment is created and pypi packages are downloaded
+ done, _ = concurrent.futures.wait(
+ [futures["conda_env"], futures["pypi_pkgs"]],
+ return_when=concurrent.futures.ALL_COMPLETED,
+ )
+
+ for future in done:
+ future.result()
+
+ # install pypi packages
+ futures["pypi_install"] = executor.submit(
+ install_pypi_packages, prefix, pypi_pkgs_dir
+ )
+ # wait for pypi packages to be installed
+ futures["pypi_install"].result()
+ else:
+ # wait for conda environment to be created
+ futures["conda_env"].result()
+
+ setup_environment(architecture, storage, env, prefix, conda_pkgs_dir, pypi_pkgs_dir)
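
To summarize the scheduling introduced above: the micromamba install and the two package downloads run concurrently on one thread pool, environment creation waits on the first two, and the pip install waits on environment creation plus the PyPI download. A tiny standalone restatement of that ordering (names mirror the bootstrap functions; this adds nothing beyond setup_environment itself):

from graphlib import TopologicalSorter

# Dependency graph implemented by setup_environment above.
graph = {
    "create_conda_environment": {"install_micromamba", "download_conda_packages"},
    "install_pypi_packages": {"create_conda_environment", "download_pypi_packages"},
}
# Prints one valid execution order, e.g.:
# ['install_micromamba', 'download_conda_packages', 'download_pypi_packages',
#  'create_conda_environment', 'install_pypi_packages']
print(list(TopologicalSorter(graph).static_order()))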