ob-metaflow 2.12.36.3__py2.py3-none-any.whl → 2.12.39.1__py2.py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of ob-metaflow might be problematic.
- metaflow/__init__.py +3 -0
- metaflow/cli.py +84 -697
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +51 -0
- metaflow/cli_components/run_cmds.py +358 -0
- metaflow/cli_components/step_cmd.py +189 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/cmd/develop/stub_generator.py +9 -2
- metaflow/decorators.py +63 -2
- metaflow/extension_support/plugins.py +41 -27
- metaflow/flowspec.py +156 -16
- metaflow/includefile.py +50 -22
- metaflow/metaflow_config.py +1 -1
- metaflow/package.py +17 -3
- metaflow/parameters.py +80 -23
- metaflow/plugins/__init__.py +4 -0
- metaflow/plugins/airflow/airflow_cli.py +1 -0
- metaflow/plugins/argo/argo_workflows.py +41 -1
- metaflow/plugins/argo/argo_workflows_cli.py +1 -0
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +47 -1
- metaflow/plugins/aws/batch/batch_decorator.py +2 -2
- metaflow/plugins/aws/step_functions/step_functions.py +32 -0
- metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -0
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +3 -0
- metaflow/plugins/datatools/s3/s3op.py +3 -3
- metaflow/plugins/kubernetes/kubernetes_cli.py +1 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +2 -2
- metaflow/plugins/pypi/bootstrap.py +196 -61
- metaflow/plugins/pypi/conda_decorator.py +20 -10
- metaflow/plugins/pypi/conda_environment.py +76 -21
- metaflow/plugins/pypi/micromamba.py +42 -15
- metaflow/plugins/pypi/pip.py +8 -3
- metaflow/plugins/pypi/pypi_decorator.py +11 -9
- metaflow/plugins/timeout_decorator.py +2 -2
- metaflow/runner/click_api.py +73 -19
- metaflow/runner/deployer.py +1 -1
- metaflow/runner/deployer_impl.py +2 -2
- metaflow/runner/metaflow_runner.py +4 -1
- metaflow/runner/nbdeploy.py +2 -0
- metaflow/runner/nbrun.py +1 -1
- metaflow/runner/subprocess_manager.py +3 -1
- metaflow/runner/utils.py +41 -19
- metaflow/runtime.py +111 -73
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_decorators.py +563 -0
- metaflow/user_configs/config_options.py +495 -0
- metaflow/user_configs/config_parameters.py +386 -0
- metaflow/util.py +17 -0
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/METADATA +3 -2
- {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/RECORD +58 -48
- {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.36.3.dist-info → ob_metaflow-2.12.39.1.dist-info}/top_level.txt +0 -0
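The most visible addition in this release is the new metaflow/user_configs package (config_decorators.py, config_options.py, config_parameters.py), which the Argo Workflows and Step Functions changes below build on. A minimal sketch of how a flow might declare a config-backed value follows; it assumes the Config object in config_parameters.py behaves like a deploy-time-resolved Parameter, and the flow, file name, and keys are hypothetical, not taken from this diff.

# Hypothetical flow illustrating the new user_configs machinery; the import
# path exists per the file list above, but the exact Config signature and
# access pattern are assumptions.
from metaflow import FlowSpec, step
from metaflow.user_configs.config_parameters import Config


class TrainFlow(FlowSpec):
    # Resolved from a JSON file at deploy time rather than per run, which is
    # why the orchestrator code below skips IS_CONFIG_PARAMETER entries when
    # building run-time parameters.
    cfg = Config("cfg", default="config.json")

    @step
    def start(self):
        print("learning rate:", self.cfg["learning_rate"])  # hypothetical key
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    TrainFlow()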

metaflow/plugins/argo/argo_workflows.py

@@ -61,6 +61,7 @@ from metaflow.plugins.kubernetes.kubernetes import (
 )
 from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet
 from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
+from metaflow.user_configs.config_options import ConfigInput
 from metaflow.util import (
     compress_list,
     dict_to_cli_options,
@@ -169,6 +170,7 @@ class ArgoWorkflows(object):
         self.enable_heartbeat_daemon = enable_heartbeat_daemon
         self.enable_error_msg_capture = enable_error_msg_capture
         self.parameters = self._process_parameters()
+        self.config_parameters = self._process_config_parameters()
         self.triggers, self.trigger_options = self._process_triggers()
         self._schedule, self._timezone = self._get_schedule()

@@ -456,6 +458,10 @@ class ArgoWorkflows(object):
                     "case-insensitive." % param.name
                 )
             seen.add(norm)
+            # NOTE: We skip config parameters as these do not have dynamic values,
+            # and need to be treated differently.
+            if param.IS_CONFIG_PARAMETER:
+                continue

             extra_attrs = {}
             if param.kwargs.get("type") == JSONType:
@@ -489,6 +495,7 @@ class ArgoWorkflows(object):
             # execution - which needs to be fixed imminently.
             if not is_required or default_value is not None:
                 default_value = json.dumps(default_value)
+
             parameters[param.name] = dict(
                 name=param.name,
                 value=default_value,
@@ -499,6 +506,27 @@ class ArgoWorkflows(object):
             )
         return parameters

+    def _process_config_parameters(self):
+        parameters = []
+        seen = set()
+        for var, param in self.flow._get_parameters():
+            if not param.IS_CONFIG_PARAMETER:
+                continue
+            # Throw an exception if the parameter is specified twice.
+            norm = param.name.lower()
+            if norm in seen:
+                raise MetaflowException(
+                    "Parameter *%s* is specified twice. "
+                    "Note that parameter names are "
+                    "case-insensitive." % param.name
+                )
+            seen.add(norm)
+
+            parameters.append(
+                dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name))
+            )
+        return parameters
+
     def _process_triggers(self):
         # Impute triggers for Argo Workflow Template specified through @trigger and
         # @trigger_on_finish decorators
@@ -521,8 +549,13 @@ class ArgoWorkflows(object):
         # convert them to lower case since Metaflow parameters are case
         # insensitive.
         seen = set()
+        # NOTE: We skip config parameters as their values can not be set through event payloads
         params = set(
-            [param.name.lower() for var, param in self.flow._get_parameters()]
+            [
+                param.name.lower()
+                for var, param in self.flow._get_parameters()
+                if not param.IS_CONFIG_PARAMETER
+            ]
         )
         trigger_deco = self.flow._flow_decorators.get("trigger")[0]
         trigger_deco.format_deploytime_value()
@@ -1721,6 +1754,13 @@ class ArgoWorkflows(object):
             metaflow_version["production_token"] = self.production_token
             env["METAFLOW_VERSION"] = json.dumps(metaflow_version)

+            # map config values
+            cfg_env = {
+                param["name"]: param["kv_name"] for param in self.config_parameters
+            }
+            if cfg_env:
+                env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env)
+
             # Set the template inputs and outputs for passing state. Very simply,
             # the container template takes in input-paths as input and outputs
             # the task-id (which feeds in as input-paths to the subsequent task).
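For orientation, the METAFLOW_FLOW_CONFIG_VALUE environment variable added above carries a JSON mapping from each config parameter's name to the key under which its resolved value is stored, as produced by ConfigInput.make_key_name. A hedged illustration of how that mapping is assembled; the parameter name and key string are made-up placeholders, only the dict comprehension mirrors the code above.

import json

# Shape of self.config_parameters as built by _process_config_parameters();
# "cfg" and its kv_name are placeholders, not real make_key_name output.
config_parameters = [{"name": "cfg", "kv_name": "<key-from-make_key_name>"}]

cfg_env = {param["name"]: param["kv_name"] for param in config_parameters}
if cfg_env:
    env = {"METAFLOW_FLOW_CONFIG_VALUE": json.dumps(cfg_env)}
    print(env)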

metaflow/plugins/argo/argo_workflows_cli.py

@@ -470,6 +470,7 @@ def make_flow(
     decorators._attach_decorators(
         obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name]
     )
+    decorators._init(obj.flow)

     decorators._init_step_decorators(
         obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger

metaflow/plugins/argo/argo_workflows_deployer_objects.py

@@ -1,5 +1,6 @@
 import sys
 import json
+import time
 import tempfile
 from typing import ClassVar, Optional

@@ -97,6 +98,7 @@ class ArgoWorkflowsTriggeredRun(TriggeredRun):
         )

         command_obj = self.deployer.spm.get(pid)
+        command_obj.sync_wait()
         return command_obj.process.returncode == 0

     def unsuspend(self, **kwargs) -> bool:
@@ -131,6 +133,7 @@ class ArgoWorkflowsTriggeredRun(TriggeredRun):
         )

         command_obj = self.deployer.spm.get(pid)
+        command_obj.sync_wait()
         return command_obj.process.returncode == 0

     def terminate(self, **kwargs) -> bool:
@@ -165,8 +168,50 @@ class ArgoWorkflowsTriggeredRun(TriggeredRun):
         )

         command_obj = self.deployer.spm.get(pid)
+        command_obj.sync_wait()
         return command_obj.process.returncode == 0

+    def wait_for_completion(self, timeout: Optional[int] = None):
+        """
+        Wait for the workflow to complete or timeout.
+
+        Parameters
+        ----------
+        timeout : int, optional, default None
+            Maximum time in seconds to wait for workflow completion.
+            If None, waits indefinitely.
+
+        Raises
+        ------
+        TimeoutError
+            If the workflow does not complete within the specified timeout period.
+        """
+        start_time = time.time()
+        check_interval = 5
+        while self.is_running:
+            if timeout is not None and (time.time() - start_time) > timeout:
+                raise TimeoutError(
+                    "Workflow did not complete within specified timeout."
+                )
+            time.sleep(check_interval)
+
+    @property
+    def is_running(self):
+        """
+        Check if the workflow is currently running.
+
+        Returns
+        -------
+        bool
+            True if the workflow status is either 'Pending' or 'Running',
+            False otherwise.
+        """
+        workflow_status = self.status
+        # full list of all states present here:
+        # https://github.com/argoproj/argo-workflows/blob/main/pkg/apis/workflow/v1alpha1/workflow_types.go#L54
+        # we only consider non-terminal states to determine if the workflow has not finished
+        return workflow_status is not None and workflow_status in ["Pending", "Running"]
+
     @property
     def status(self) -> Optional[str]:
         """
@@ -319,6 +364,7 @@ class ArgoWorkflowsDeployedFlow(DeployedFlow):
         )

         command_obj = self.deployer.spm.get(pid)
+        command_obj.sync_wait()
         return command_obj.process.returncode == 0

     def trigger(self, **kwargs) -> ArgoWorkflowsTriggeredRun:
@@ -361,7 +407,7 @@ class ArgoWorkflowsDeployedFlow(DeployedFlow):
             content = handle_timeout(
                 attribute_file_fd, command_obj, self.deployer.file_read_timeout
             )
-
+            command_obj.sync_wait()
             if command_obj.process.returncode == 0:
                 return ArgoWorkflowsTriggeredRun(
                     deployer=self.deployer, content=content
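The new wait_for_completion and is_running additions, together with the sync_wait() calls, make the programmatic deployer easier to drive end to end. A sketch of the intended usage, assuming the Deployer API from metaflow.runner.deployer; the flow file name and timeout are placeholders.

from metaflow import Deployer

# "flow.py" is a placeholder for a real flow file.
deployed = Deployer("flow.py").argo_workflows().create()
run = deployed.trigger()

# New in this release: block until the Argo workflow leaves the
# Pending/Running states, or raise TimeoutError after an hour.
run.wait_for_completion(timeout=3600)
print("final status:", run.status)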

metaflow/plugins/aws/batch/batch_decorator.py

@@ -138,8 +138,8 @@ class BatchDecorator(StepDecorator):
     supports_conda_environment = True
     target_platform = "linux-64"

-    def __init__(self, attributes=None, statically_defined=False):
-        super(BatchDecorator, self).__init__(attributes, statically_defined)
+    def init(self):
+        super(BatchDecorator, self).init()

         # If no docker image is explicitly specified, impute a default image.
         if not self.attributes["image"]:

metaflow/plugins/aws/step_functions/step_functions.py

@@ -18,6 +18,7 @@ from metaflow.metaflow_config import (
     SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH,
 )
 from metaflow.parameters import deploy_time_eval
+from metaflow.user_configs.config_options import ConfigInput
 from metaflow.util import dict_to_cli_options, to_pascalcase

 from ..batch.batch import Batch
@@ -71,6 +72,7 @@ class StepFunctions(object):
         self.username = username
         self.max_workers = max_workers
         self.workflow_timeout = workflow_timeout
+        self.config_parameters = self._process_config_parameters()

         # https://aws.amazon.com/blogs/aws/step-functions-distributed-map-a-serverless-solution-for-large-scale-parallel-data-processing/
         self.use_distributed_map = use_distributed_map
@@ -485,6 +487,10 @@ class StepFunctions(object):
                     "case-insensitive." % param.name
                 )
             seen.add(norm)
+            # NOTE: We skip config parameters as these do not have dynamic values,
+            # and need to be treated differently.
+            if param.IS_CONFIG_PARAMETER:
+                continue

             is_required = param.kwargs.get("required", False)
             # Throw an exception if a schedule is set for a flow with required
@@ -501,6 +507,27 @@ class StepFunctions(object):
             parameters.append(dict(name=param.name, value=value))
         return parameters

+    def _process_config_parameters(self):
+        parameters = []
+        seen = set()
+        for var, param in self.flow._get_parameters():
+            if not param.IS_CONFIG_PARAMETER:
+                continue
+            # Throw an exception if the parameter is specified twice.
+            norm = param.name.lower()
+            if norm in seen:
+                raise MetaflowException(
+                    "Parameter *%s* is specified twice. "
+                    "Note that parameter names are "
+                    "case-insensitive." % param.name
+                )
+            seen.add(norm)
+
+            parameters.append(
+                dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name))
+            )
+        return parameters
+
     def _batch(self, node):
         attrs = {
             # metaflow.user is only used for setting the AWS Job Name.
@@ -747,6 +774,11 @@ class StepFunctions(object):
             metaflow_version["production_token"] = self.production_token
             env["METAFLOW_VERSION"] = json.dumps(metaflow_version)

+            # map config values
+            cfg_env = {param["name"]: param["kv_name"] for param in self.config_parameters}
+            if cfg_env:
+                env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env)
+
             # Set AWS DynamoDb Table Name for state tracking for for-eaches.
             # There are three instances when metaflow runtime directly interacts
             # with AWS DynamoDB.

metaflow/plugins/aws/step_functions/step_functions_cli.py

@@ -326,6 +326,7 @@ def make_flow(

     # Attach AWS Batch decorator to the flow
     decorators._attach_decorators(obj.flow, [BatchDecorator.name])
+    decorators._init(obj.flow)
     decorators._init_step_decorators(
         obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger
     )

metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py

@@ -46,6 +46,7 @@ class StepFunctionsTriggeredRun(TriggeredRun):
         )

         command_obj = self.deployer.spm.get(pid)
+        command_obj.sync_wait()
         return command_obj.process.returncode == 0


@@ -174,6 +175,7 @@ class StepFunctionsDeployedFlow(DeployedFlow):
         )

         command_obj = self.deployer.spm.get(pid)
+        command_obj.sync_wait()
         return command_obj.process.returncode == 0

     def trigger(self, **kwargs) -> StepFunctionsTriggeredRun:
@@ -217,6 +219,7 @@ class StepFunctionsDeployedFlow(DeployedFlow):
                 attribute_file_fd, command_obj, self.deployer.file_read_timeout
             )

+            command_obj.sync_wait()
             if command_obj.process.returncode == 0:
                 return StepFunctionsTriggeredRun(
                     deployer=self.deployer, content=content

metaflow/plugins/datatools/s3/s3op.py

@@ -722,8 +722,8 @@ def cli():
     pass


-@tracing.cli_entrypoint("s3op/list")
 @cli.command("list", help="List S3 objects")
+@tracing.cli_entrypoint("s3op/list")
 @click.option(
     "--recursive/--no-recursive",
     default=False,
@@ -782,8 +782,8 @@ def lst(
         print(format_result_line(idx, url.prefix, url.url, str(size)))


-@tracing.cli_entrypoint("s3op/put")
 @cli.command(help="Upload files to S3")
+@tracing.cli_entrypoint("s3op/put")
 @click.option(
     "--file",
     "files",
@@ -977,8 +977,8 @@ def _populate_prefixes(prefixes, inputs):
     return prefixes, is_transient_retry


-@tracing.cli_entrypoint("s3op/get")
 @cli.command(help="Download files from S3")
+@tracing.cli_entrypoint("s3op/get")
 @click.option(
     "--recursive/--no-recursive",
     default=False,

metaflow/plugins/kubernetes/kubernetes_cli.py

@@ -33,12 +33,12 @@ def kubernetes():
     pass


-@tracing.cli_entrypoint("kubernetes/step")
 @kubernetes.command(
     help="Execute a single task on Kubernetes. This command calls the top-level step "
     "command inside a Kubernetes pod with the given options. Typically you do not call "
     "this command directly; it is used internally by Metaflow."
 )
+@tracing.cli_entrypoint("kubernetes/step")
 @click.argument("step-name")
 @click.argument("code-package-sha")
 @click.argument("code-package-url")

metaflow/plugins/kubernetes/kubernetes_decorator.py

@@ -153,8 +153,8 @@ class KubernetesDecorator(StepDecorator):
     supports_conda_environment = True
     target_platform = "linux-64"

-    def __init__(self, attributes=None, statically_defined=False):
-        super(KubernetesDecorator, self).__init__(attributes, statically_defined)
+    def init(self):
+        super(KubernetesDecorator, self).init()

         if not self.attributes["namespace"]:
             self.attributes["namespace"] = KUBERNETES_NAMESPACE
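Both decorator changes follow the same refactor: attribute fix-ups move out of __init__ and into an init() hook, which the new decorators._init(obj.flow) calls added to the CLI make_flow functions invoke once config values are known. A rough sketch of the pattern for a custom step decorator; the decorator name and attribute are invented, and the exact hook contract is an assumption drawn from this diff.

from metaflow.decorators import StepDecorator


class MyImageDecorator(StepDecorator):
    # Hypothetical decorator shown only to illustrate the init() pattern.
    name = "my_image"
    defaults = {"image": None}

    def init(self):
        # Runs via decorators._init(flow), i.e. after config values are
        # resolved, instead of imputing defaults eagerly in __init__.
        super().init()
        if not self.attributes["image"]:
            self.attributes["image"] = "registry.example.com/default:latest"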

metaflow/plugins/pypi/bootstrap.py

@@ -1,4 +1,5 @@
 import bz2
+import concurrent.futures
 import io
 import json
 import os
@@ -6,6 +7,9 @@ import shutil
 import subprocess
 import sys
 import tarfile
+import time
+
+import requests

 from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
 from metaflow.plugins import DATASTORES
@@ -15,6 +19,18 @@ from . import MAGIC_FILE, _datastore_packageroot

 # Bootstraps a valid conda virtual environment composed of conda and pypi packages

+
+def timer(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        duration = time.time() - start_time
+        # print(f"Time taken for {func.__name__}: {duration:.2f} seconds")
+        return result
+
+    return wrapper
+
+
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print("Usage: bootstrap.py <flow_name> <id> <datastore_type> <architecture>")
@@ -47,6 +63,8 @@ if __name__ == "__main__":

     prefix = os.path.join(os.getcwd(), architecture, id_)
     pkgs_dir = os.path.join(os.getcwd(), ".pkgs")
+    conda_pkgs_dir = os.path.join(pkgs_dir, "conda")
+    pypi_pkgs_dir = os.path.join(pkgs_dir, "pypi")
     manifest_dir = os.path.join(os.getcwd(), DATASTORE_LOCAL_DIR, flow_name)

     datastores = [d for d in DATASTORES if d.TYPE == datastore_type]
@@ -64,77 +82,194 @@ if __name__ == "__main__":
         os.path.join(os.getcwd(), MAGIC_FILE),
         os.path.join(manifest_dir, MAGIC_FILE),
     )
-
     with open(os.path.join(manifest_dir, MAGIC_FILE)) as f:
         env = json.load(f)[id_][architecture]

-
-
-
-
+    def run_cmd(cmd):
+        result = subprocess.run(
+            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+        )
+        if result.returncode != 0:
+            print(f"Bootstrap failed while executing: {cmd}")
+            print("Stdout:", result.stdout)
+            print("Stderr:", result.stderr)
+            sys.exit(1)
+
+    @timer
+    def install_micromamba(architecture):
+        micromamba_dir = os.path.join(os.getcwd(), "micromamba")
+        micromamba_path = os.path.join(micromamba_dir, "bin", "micromamba")
+
+        if which("micromamba"):
+            return which("micromamba")
+        if os.path.exists(micromamba_path):
+            os.environ["PATH"] += os.pathsep + os.path.dirname(micromamba_path)
+            return micromamba_path
+
+        # Download and extract in one go
+        # TODO: Serve from cloudflare
+        url = f"https://micro.mamba.pm/api/micromamba/{architecture}/2.0.4"
+
+        # Prepare directory once
+        os.makedirs(os.path.dirname(micromamba_path), exist_ok=True)
+
+        # Stream and process directly to file
+        with requests.get(url, stream=True, timeout=30) as response:
+            if response.status_code != 200:
+                raise Exception(
+                    f"Failed to download micromamba: HTTP {response.status_code}"
+                )
+
+            decompressor = bz2.BZ2Decompressor()
+
+            # Process in memory without temporary files
+            tar_content = decompressor.decompress(response.raw.read())
+
+            with tarfile.open(fileobj=io.BytesIO(tar_content), mode="r:") as tar:
+                member = tar.getmember("bin/micromamba")
+                # Extract directly to final location
+                with open(micromamba_path, "wb") as f:
+                    f.write(tar.extractfile(member).read())
+
+        # Set executable permission
+        os.chmod(micromamba_path, 0o755)
+
+        # Update PATH only once at the end
+        os.environ["PATH"] += os.pathsep + os.path.dirname(micromamba_path)
+        return micromamba_path
+
+    @timer
+    def download_conda_packages(storage, packages, dest_dir):
+
+        def process_conda_package(args):
             # Ensure that conda packages go into architecture specific folders.
             # The path looks like REPO/CHANNEL/CONDA_SUBDIR/PACKAGE. We trick
             # Micromamba into believing that all packages are coming from a local
             # channel - the only hurdle is ensuring that packages are organised
             # properly.
-
-
-            dest = os.path.join(conda_pkgs_dir, "/".join(key.split("/")[-2:]))
+            key, tmpfile, dest_dir = args
+            dest = os.path.join(dest_dir, "/".join(key.split("/")[-2:]))
             os.makedirs(os.path.dirname(dest), exist_ok=True)
             shutil.move(tmpfile, dest)

-
-
-
-
-
-
-
-
+        os.makedirs(dest_dir, exist_ok=True)
+        with storage.load_bytes([package["path"] for package in packages]) as results:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                executor.map(
+                    process_conda_package,
+                    [(key, tmpfile, dest_dir) for key, tmpfile, _ in results],
+                )
+        # for key, tmpfile, _ in results:
+
+        #     # TODO: consider RAM disk
+        #     dest = os.path.join(dest_dir, "/".join(key.split("/")[-2:]))
+        #     os.makedirs(os.path.dirname(dest), exist_ok=True)
+        #     shutil.move(tmpfile, dest)
+        return dest_dir
+
+    @timer
+    def download_pypi_packages(storage, packages, dest_dir):
+
+        def process_pypi_package(args):
+            key, tmpfile, dest_dir = args
+            dest = os.path.join(dest_dir, os.path.basename(key))
+            shutil.move(tmpfile, dest)
+
+        os.makedirs(dest_dir, exist_ok=True)
+        with storage.load_bytes([package["path"] for package in packages]) as results:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                executor.map(
+                    process_pypi_package,
+                    [(key, tmpfile, dest_dir) for key, tmpfile, _ in results],
+                )
+        # for key, tmpfile, _ in results:
+        #     dest = os.path.join(dest_dir, os.path.basename(key))
+        #     shutil.move(tmpfile, dest)
+        return dest_dir
+
+    @timer
+    def create_conda_environment(prefix, conda_pkgs_dir):
+        cmd = f'''set -e;
+            tmpfile=$(mktemp);
+            echo "@EXPLICIT" > "$tmpfile";
+            ls -d {conda_pkgs_dir}/*/* >> "$tmpfile";
            export PATH=$PATH:$(pwd)/micromamba;
-
-
-
-
-
-
-
-
-
-
-
-            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
-            micromamba create --yes --offline --no-deps --safety-checks=disabled --no-extra-safety-checks --prefix {prefix} --file "$tmpfile";
-            rm "$tmpfile"''',
-    ]
-
-    # Download PyPI packages.
-    if "pypi" in env:
-        pypi_pkgs_dir = os.path.join(pkgs_dir, "pypi")
-        with storage.load_bytes(
-            [package["path"] for package in env["pypi"]]
-        ) as results:
-            for key, tmpfile, _ in results:
-                dest = os.path.join(pypi_pkgs_dir, os.path.basename(key))
-                os.makedirs(os.path.dirname(dest), exist_ok=True)
-                shutil.move(tmpfile, dest)
-
-        # Install PyPI packages.
-        cmds.extend(
-            [
-                f"""set -e;
-                export PATH=$PATH:$(pwd)/micromamba;
-                export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
-                micromamba run --prefix {prefix} python -m pip --disable-pip-version-check install --root-user-action=ignore --no-compile {pypi_pkgs_dir}/*.whl --no-user"""
-            ]
-        )
+            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
+            export MAMBA_NO_LOW_SPEED_LIMIT=1;
+            export MAMBA_USE_INDEX_CACHE=1;
+            export MAMBA_NO_PROGRESS_BARS=1;
+            export CONDA_FETCH_THREADS=1;
+            micromamba create --yes --offline --no-deps \
+                --safety-checks=disabled --no-extra-safety-checks \
+                --prefix {prefix} --file "$tmpfile" \
+                --no-pyc --no-rc --always-copy;
+            rm "$tmpfile"'''
+        run_cmd(cmd)

-
-
-
-
-
-
-
-
-
+    @timer
+    def install_pypi_packages(prefix, pypi_pkgs_dir):
+
+        cmd = f"""set -e;
+            export PATH=$PATH:$(pwd)/micromamba;
+            export CONDA_PKGS_DIRS=$(pwd)/micromamba/pkgs;
+            micromamba run --prefix {prefix} python -m pip --disable-pip-version-check \
+                install --root-user-action=ignore --no-compile --no-index \
+                --no-cache-dir --no-deps --prefer-binary \
+                --find-links={pypi_pkgs_dir} --no-user \
+                --no-warn-script-location --no-input \
+                {pypi_pkgs_dir}/*.whl
+        """
+        run_cmd(cmd)
+
+    @timer
+    def setup_environment(
+        architecture, storage, env, prefix, conda_pkgs_dir, pypi_pkgs_dir
+    ):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            # install micromamba, download conda and pypi packages in parallel
+            futures = {
+                "micromamba": executor.submit(install_micromamba, architecture),
+                "conda_pkgs": executor.submit(
+                    download_conda_packages, storage, env["conda"], conda_pkgs_dir
+                ),
+            }
+            if "pypi" in env:
+                futures["pypi_pkgs"] = executor.submit(
+                    download_pypi_packages, storage, env["pypi"], pypi_pkgs_dir
+                )
+
+            # create conda environment after micromamba is installed and conda packages are downloaded
+            done, _ = concurrent.futures.wait(
+                [futures["micromamba"], futures["conda_pkgs"]],
+                return_when=concurrent.futures.ALL_COMPLETED,
+            )
+
+            for future in done:
+                future.result()
+
+            # start conda environment creation
+            futures["conda_env"] = executor.submit(
+                create_conda_environment, prefix, conda_pkgs_dir
+            )
+
+            if "pypi" in env:
+                # install pypi packages after conda environment is created and pypi packages are downloaded
+                done, _ = concurrent.futures.wait(
+                    [futures["conda_env"], futures["pypi_pkgs"]],
+                    return_when=concurrent.futures.ALL_COMPLETED,
+                )
+
+                for future in done:
+                    future.result()
+
+                # install pypi packages
+                futures["pypi_install"] = executor.submit(
+                    install_pypi_packages, prefix, pypi_pkgs_dir
+                )
+                # wait for pypi packages to be installed
+                futures["pypi_install"].result()
+            else:
+                # wait for conda environment to be created
+                futures["conda_env"].result()
+
+    setup_environment(architecture, storage, env, prefix, conda_pkgs_dir, pypi_pkgs_dir)
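Stripped of the Metaflow specifics, the new setup_environment orchestration is a plain concurrent.futures dependency graph: start the independent downloads in parallel, then gate each build step on exactly the futures it needs. A self-contained sketch of the same scheduling pattern; the worker functions are stand-ins, not Metaflow code.

import concurrent.futures
import time


def fetch(name, seconds):
    # Stand-in for install_micromamba / download_*_packages.
    time.sleep(seconds)
    return name


def build(*inputs):
    # Stand-in for create_conda_environment / install_pypi_packages.
    return f"built from {inputs}"


with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = {
        "tool": executor.submit(fetch, "tool", 0.1),
        "pkgs": executor.submit(fetch, "pkgs", 0.2),
    }
    # Gate the build on its dependencies; .result() re-raises any exception
    # from a worker, mirroring run_cmd's fail-fast behavior above.
    deps = [futures["tool"].result(), futures["pkgs"].result()]
    futures["env"] = executor.submit(build, *deps)
    print(futures["env"].result())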