easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +24 -3
- easylink/configuration.py +43 -36
- easylink/devtools/implementation_creator.py +71 -22
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -29
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -5
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +11 -7
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.17.dist-info/RECORD +0 -55
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.
|
1
|
+
__version__ = "0.1.19"
|
easylink/cli.py
CHANGED
@@ -55,7 +55,7 @@ from easylink.utilities.general_utils import (
|
|
55
55
|
configure_logging_to_terminal,
|
56
56
|
handle_exceptions,
|
57
57
|
)
|
58
|
-
from easylink.utilities.paths import
|
58
|
+
from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR
|
59
59
|
|
60
60
|
SHARED_OPTIONS = [
|
61
61
|
click.option(
|
@@ -91,6 +91,11 @@ SHARED_OPTIONS = [
|
|
91
91
|
default=False,
|
92
92
|
help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
|
93
93
|
),
|
94
|
+
click.option(
|
95
|
+
"--schema",
|
96
|
+
hidden=True,
|
97
|
+
default="main",
|
98
|
+
),
|
94
99
|
]
|
95
100
|
|
96
101
|
VERBOSE_WITH_DEBUGGER_OPTIONS = [
|
@@ -149,6 +154,16 @@ def easylink():
|
|
149
154
|
|
150
155
|
@easylink.command()
|
151
156
|
@_pass_shared_options
|
157
|
+
@click.option(
|
158
|
+
"-I",
|
159
|
+
"--images",
|
160
|
+
hidden=True,
|
161
|
+
type=click.Path(exists=False, file_okay=False, resolve_path=True),
|
162
|
+
help=(
|
163
|
+
"The directory containing the images to run. If no value is passed, a new "
|
164
|
+
f"directory will be created at the home directory: {DEFAULT_IMAGES_DIR}."
|
165
|
+
),
|
166
|
+
)
|
152
167
|
@click.option(
|
153
168
|
"-e",
|
154
169
|
"--computing-environment",
|
@@ -165,6 +180,8 @@ def run(
|
|
165
180
|
input_data: str,
|
166
181
|
output_dir: str | None,
|
167
182
|
no_timestamp: bool,
|
183
|
+
schema: str,
|
184
|
+
images: str,
|
168
185
|
computing_environment: str | None,
|
169
186
|
verbose: int,
|
170
187
|
with_debugger: bool,
|
@@ -190,6 +207,8 @@ def run(
|
|
190
207
|
input_data=input_data,
|
191
208
|
computing_environment=computing_environment,
|
192
209
|
results_dir=results_dir,
|
210
|
+
images_dir=images,
|
211
|
+
schema_name=schema,
|
193
212
|
)
|
194
213
|
logger.info("*** FINISHED ***")
|
195
214
|
|
@@ -201,6 +220,7 @@ def generate_dag(
|
|
201
220
|
input_data: str,
|
202
221
|
output_dir: str | None,
|
203
222
|
no_timestamp: bool,
|
223
|
+
schema: str,
|
204
224
|
verbose: int,
|
205
225
|
with_debugger: bool,
|
206
226
|
) -> None:
|
@@ -223,6 +243,7 @@ def generate_dag(
|
|
223
243
|
input_data=input_data,
|
224
244
|
computing_environment=None,
|
225
245
|
results_dir=results_dir,
|
246
|
+
schema_name=schema,
|
226
247
|
)
|
227
248
|
logger.info("*** DAG saved to result directory ***")
|
228
249
|
|
@@ -254,7 +275,7 @@ easylink.add_command(devtools)
|
|
254
275
|
type=click.Path(exists=False, dir_okay=True, file_okay=False, resolve_path=True),
|
255
276
|
help=(
|
256
277
|
"The directory to move the container to. If no value is passed, it will "
|
257
|
-
f"be moved to {
|
278
|
+
f"be moved to {DEV_IMAGES_DIR} in a sub-directory named with the username."
|
258
279
|
),
|
259
280
|
)
|
260
281
|
def create_implementation(
|
@@ -291,7 +312,7 @@ def create_implementation(
|
|
291
312
|
if not scripts:
|
292
313
|
logger.error("No scripts provided.")
|
293
314
|
return
|
294
|
-
output_dir = Path(output_dir) if output_dir else Path(f"{
|
315
|
+
output_dir = Path(output_dir) if output_dir else Path(f"{DEV_IMAGES_DIR}/{os.getlogin()}")
|
295
316
|
if not output_dir.exists():
|
296
317
|
# make the directory with rwxrwxr-x permissions
|
297
318
|
output_dir.mkdir(parents=True, mode=0o775)
|
easylink/configuration.py
CHANGED
@@ -14,9 +14,10 @@ from typing import Any
|
|
14
14
|
|
15
15
|
from layered_config_tree import LayeredConfigTree
|
16
16
|
|
17
|
-
from easylink.pipeline_schema import
|
17
|
+
from easylink.pipeline_schema import PipelineSchema
|
18
18
|
from easylink.utilities.data_utils import load_yaml
|
19
19
|
from easylink.utilities.general_utils import exit_with_validation_error
|
20
|
+
from easylink.utilities.paths import DEFAULT_IMAGES_DIR
|
20
21
|
|
21
22
|
PIPELINE_ERRORS_KEY = "PIPELINE ERRORS"
|
22
23
|
INPUT_DATA_ERRORS_KEY = "INPUT DATA ERRORS"
|
@@ -66,10 +67,14 @@ class Config(LayeredConfigTree):
|
|
66
67
|
config_params
|
67
68
|
A dictionary of all specifications required to run the pipeline. This
|
68
69
|
includes the pipeline, input data, and computing environment specifications,
|
69
|
-
as well as the results directory.
|
70
|
-
|
71
|
-
|
72
|
-
|
70
|
+
as well as the results directory and images directory.
|
71
|
+
schema_name
|
72
|
+
The name of the schema to validate the pipeline configuration against.
|
73
|
+
images_dir
|
74
|
+
The directory containing the images or to download the images to if they
|
75
|
+
don't exist. If None, will default to the :data:`~easylink.utilities.paths.DEFAULT_IMAGES_DIR`.
|
76
|
+
command
|
77
|
+
The EasyLink command being run.
|
73
78
|
|
74
79
|
Attributes
|
75
80
|
----------
|
@@ -82,22 +87,21 @@ class Config(LayeredConfigTree):
|
|
82
87
|
input_data
|
83
88
|
The input data filepaths.
|
84
89
|
schema
|
85
|
-
The :class:`~easylink.pipeline_schema.PipelineSchema
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
to be the correct one and is attached to the ``Config`` object and its
|
93
|
-
:meth:`~easylink.pipeline_schema.PipelineSchema.configure_pipeline`
|
94
|
-
method is called.
|
90
|
+
The :class:`~easylink.pipeline_schema.PipelineSchema`.
|
91
|
+
images_dir
|
92
|
+
The directory containing the images or to download the images to if they
|
93
|
+
don't exist. If None, will default to ~/.easylink_images.
|
94
|
+
command
|
95
|
+
The EasyLink command being run.
|
96
|
+
|
95
97
|
"""
|
96
98
|
|
97
99
|
def __init__(
|
98
100
|
self,
|
99
101
|
config_params: dict[str, Any],
|
100
|
-
|
102
|
+
schema_name: str = "main",
|
103
|
+
images_dir: str | Path | None = None,
|
104
|
+
command: str = "run",
|
101
105
|
) -> None:
|
102
106
|
super().__init__(layers=["initial_data", "default", "user_configured"])
|
103
107
|
self.update(DEFAULT_ENVIRONMENT, layer="default")
|
@@ -108,10 +112,16 @@ class Config(LayeredConfigTree):
|
|
108
112
|
# Set slurm defaults to empty dict instead of None so that we don't get errors
|
109
113
|
# in slurm_resources property
|
110
114
|
self.update({"environment": {"slurm": {}}}, layer="default")
|
111
|
-
|
112
|
-
potential_schemas = [potential_schemas]
|
113
|
-
self.update({"schema": self._get_schema(potential_schemas)}, layer="initial_data")
|
115
|
+
self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
|
114
116
|
self.schema.configure_pipeline(self.pipeline, self.input_data)
|
117
|
+
# use the images_dir if provided, otherwise use default
|
118
|
+
self.update(
|
119
|
+
{
|
120
|
+
"images_dir": Path(images_dir) if images_dir else DEFAULT_IMAGES_DIR,
|
121
|
+
},
|
122
|
+
layer="user_configured",
|
123
|
+
)
|
124
|
+
self.update({"command": command}, layer="user_configured")
|
115
125
|
self._validate()
|
116
126
|
self.freeze()
|
117
127
|
|
@@ -173,22 +183,22 @@ class Config(LayeredConfigTree):
|
|
173
183
|
# Setup Methods #
|
174
184
|
#################
|
175
185
|
|
176
|
-
def _get_schema(self,
|
186
|
+
def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
|
177
187
|
"""Returns the first :class:`~easylink.pipeline_schema.PipelineSchema` that validates the requested pipeline.
|
178
188
|
|
179
189
|
Parameters
|
180
190
|
----------
|
181
|
-
|
182
|
-
``
|
191
|
+
schema_name
|
192
|
+
The name of the specific ``PipelineSchema`` to validate the pipeline configuration against.
|
183
193
|
|
184
194
|
Returns
|
185
195
|
-------
|
186
|
-
The
|
196
|
+
The requested ``PipelineSchema`` if it validates the requested pipeline configuration.
|
187
197
|
|
188
198
|
Raises
|
189
199
|
------
|
190
200
|
SystemExit
|
191
|
-
If the pipeline configuration is not valid for
|
201
|
+
If the pipeline configuration is not valid for the requested schema,
|
192
202
|
the program exits with a non-zero code and all validation errors found
|
193
203
|
are logged.
|
194
204
|
|
@@ -197,20 +207,15 @@ class Config(LayeredConfigTree):
|
|
197
207
|
This acts as the pipeline configuration file's validation method since
|
198
208
|
we can only find a matching ``PipelineSchema`` if that file is valid.
|
199
209
|
|
200
|
-
This method returns the *first* ``PipelineSchema`` that validates and does
|
201
|
-
not attempt to check additional ones.
|
202
210
|
"""
|
203
211
|
errors = defaultdict(dict)
|
204
212
|
# Validate the pipeline configuration against the requested schema
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
return schema
|
212
|
-
# No schemas were validated
|
213
|
-
exit_with_validation_error(dict(errors))
|
213
|
+
schema = PipelineSchema.get_schema(schema_name)
|
214
|
+
logs = schema.validate_step(self.pipeline, self.input_data)
|
215
|
+
if logs:
|
216
|
+
errors[PIPELINE_ERRORS_KEY][schema.name] = logs
|
217
|
+
exit_with_validation_error(dict(errors))
|
218
|
+
return schema
|
214
219
|
|
215
220
|
def _validate(self) -> None:
|
216
221
|
"""Validates the ``Config``.
|
@@ -319,7 +324,9 @@ def _load_input_data_paths(
|
|
319
324
|
f"Input was: '{input_data_paths}'"
|
320
325
|
)
|
321
326
|
filepath_dict = {
|
322
|
-
|
327
|
+
# Resolve paths relative to location of the YAML file
|
328
|
+
filename: (Path(input_data_specification_path).parent / Path(filepath)).resolve()
|
329
|
+
for filename, filepath in input_data_paths.items()
|
323
330
|
}
|
324
331
|
return filepath_dict
|
325
332
|
|
@@ -19,7 +19,7 @@ from typing import cast
|
|
19
19
|
import yaml
|
20
20
|
from loguru import logger
|
21
21
|
|
22
|
-
from easylink.pipeline_schema_constants import
|
22
|
+
from easylink.pipeline_schema_constants import SCHEMA_PARAMS
|
23
23
|
from easylink.step import (
|
24
24
|
ChoiceStep,
|
25
25
|
EmbarrassinglyParallelStep,
|
@@ -69,8 +69,6 @@ class ImplementationCreator:
|
|
69
69
|
for the container.
|
70
70
|
implementation_name
|
71
71
|
The name of the implementation. It is by definition the name of the script.
|
72
|
-
requirements
|
73
|
-
The install requirements for the implementation (if any).
|
74
72
|
step
|
75
73
|
The name of the step that this implementation implements.
|
76
74
|
output_slot
|
@@ -93,20 +91,30 @@ class ImplementationCreator:
|
|
93
91
|
for the container."""
|
94
92
|
self.implementation_name = script_path.stem
|
95
93
|
"""The name of the implementation. It is by definition the name of the script."""
|
96
|
-
self.requirements = self._extract_requirements(script_path)
|
97
|
-
"""The install requirements for the implementation (if any)."""
|
98
94
|
self.step = self._extract_implemented_step(script_path)
|
99
95
|
"""The name of the step that this implementation implements."""
|
96
|
+
self.has_custom_recipe = self._extract_has_custom_recipe(script_path)
|
97
|
+
"""Whether the user has already written the recipe for this implementation."""
|
98
|
+
self.script_base_command = self._extract_script_base_command(script_path)
|
99
|
+
"""The base command to use to run the script in this implementation."""
|
100
100
|
self.output_slot = self._extract_output_slot(script_path, self.step)
|
101
101
|
"""The name of the output slot that this implementation sends results to."""
|
102
102
|
|
103
103
|
def create_recipe(self) -> None:
|
104
104
|
"""Builds the singularity recipe and writes it to disk."""
|
105
|
-
|
106
|
-
|
105
|
+
if self.has_custom_recipe:
|
106
|
+
if not self.recipe_path.exists():
|
107
|
+
raise ValueError(f"Could not find a custom recipe at {self.recipe_path}.")
|
108
|
+
return
|
109
|
+
|
110
|
+
recipe = PythonRecipe(
|
111
|
+
self.script_path,
|
112
|
+
self.recipe_path,
|
113
|
+
ImplementationCreator._extract_requirements(self.script_path),
|
114
|
+
self.script_base_command,
|
115
|
+
)
|
107
116
|
recipe.build()
|
108
117
|
recipe.write()
|
109
|
-
pass
|
110
118
|
|
111
119
|
def build_container(self) -> None:
|
112
120
|
"""Builds the container from the recipe.
|
@@ -190,7 +198,7 @@ class ImplementationCreator:
|
|
190
198
|
info[self.implementation_name] = {
|
191
199
|
"steps": [self.step],
|
192
200
|
"image_path": str(self.hosted_container_path),
|
193
|
-
"script_cmd": f"
|
201
|
+
"script_cmd": f"{self.script_base_command} /{self.script_path.name}",
|
194
202
|
"outputs": {
|
195
203
|
self.output_slot: "result.parquet",
|
196
204
|
},
|
@@ -241,20 +249,36 @@ class ImplementationCreator:
|
|
241
249
|
)
|
242
250
|
return steps[0]
|
243
251
|
|
252
|
+
@staticmethod
|
253
|
+
def _extract_has_custom_recipe(script_path: Path) -> bool:
|
254
|
+
"""Extracts whether the user has already written the recipe for this implementation.
|
255
|
+
|
256
|
+
The expectation is that this flag is specified within the script
|
257
|
+
as a comment of the format:
|
258
|
+
|
259
|
+
.. code-block:: python
|
260
|
+
# HAS_CUSTOM_RECIPE: true
|
261
|
+
"""
|
262
|
+
has_custom_recipe = _extract_metadata("HAS_CUSTOM_RECIPE", script_path)
|
263
|
+
if len(has_custom_recipe) == 0:
|
264
|
+
return False
|
265
|
+
else:
|
266
|
+
return str(has_custom_recipe[0]).strip().lower() in ["true", "yes"]
|
267
|
+
|
244
268
|
@staticmethod
|
245
269
|
def _extract_output_slot(script_path: Path, step_name: str) -> str:
|
246
270
|
"""Extracts the name of the output slot that this script is implementing."""
|
247
|
-
|
248
|
-
implementable_steps = ImplementationCreator._extract_implementable_steps(
|
271
|
+
schema_name = ImplementationCreator._extract_pipeline_schema_name(script_path)
|
272
|
+
implementable_steps = ImplementationCreator._extract_implementable_steps(schema_name)
|
249
273
|
step_names = [step.name for step in implementable_steps]
|
250
274
|
if step_name not in step_names:
|
251
275
|
raise ValueError(
|
252
|
-
f"'{step_name}' does not exist as an implementable step in the '{
|
276
|
+
f"'{step_name}' does not exist as an implementable step in the '{schema_name}' pipeline schema. "
|
253
277
|
)
|
254
278
|
duplicates = list(set([step for step in step_names if step_names.count(step) > 1]))
|
255
279
|
if duplicates:
|
256
280
|
raise ValueError(
|
257
|
-
f"Multiple implementable steps with the same name found in the '{
|
281
|
+
f"Multiple implementable steps with the same name found in the '{schema_name}' "
|
258
282
|
f"pipeline schema: {duplicates}."
|
259
283
|
)
|
260
284
|
implemented_step = [step for step in implementable_steps if step.name == step_name][0]
|
@@ -266,7 +290,7 @@ class ImplementationCreator:
|
|
266
290
|
return list(implemented_step.output_slots)[0]
|
267
291
|
|
268
292
|
@staticmethod
|
269
|
-
def _extract_implementable_steps(
|
293
|
+
def _extract_implementable_steps(schema_name: str) -> list[Step]:
|
270
294
|
"""Extracts all implementable steps from the pipeline schema.
|
271
295
|
|
272
296
|
This method recursively traverses the pipeline schema specified in the script
|
@@ -296,8 +320,7 @@ class ImplementationCreator:
|
|
296
320
|
implementable_steps.append(node)
|
297
321
|
return
|
298
322
|
|
299
|
-
schema_steps =
|
300
|
-
|
323
|
+
schema_steps, _edges = SCHEMA_PARAMS[schema_name]
|
301
324
|
implementable_steps: list[Step] = []
|
302
325
|
for schema_step in schema_steps:
|
303
326
|
_process_step(schema_step)
|
@@ -305,10 +328,10 @@ class ImplementationCreator:
|
|
305
328
|
return implementable_steps
|
306
329
|
|
307
330
|
@staticmethod
|
308
|
-
def
|
331
|
+
def _extract_pipeline_schema_name(script_path: Path) -> str:
|
309
332
|
"""Extracts the relevant pipeline schema name.
|
310
333
|
|
311
|
-
The expectation is that the
|
334
|
+
The expectation is that the pipeline schema's name is specified within the script
|
312
335
|
as a comment of the format:
|
313
336
|
|
314
337
|
.. code-block:: python
|
@@ -316,8 +339,27 @@ class ImplementationCreator:
|
|
316
339
|
|
317
340
|
If no pipeline schema is specified, "main" will be used by default.
|
318
341
|
"""
|
319
|
-
|
320
|
-
|
342
|
+
schema_name_list: list[str] = _extract_metadata("PIPELINE_SCHEMA", script_path)
|
343
|
+
schema_name = "main" if len(schema_name_list) == 0 else schema_name_list[0]
|
344
|
+
if schema_name not in SCHEMA_PARAMS:
|
345
|
+
raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
|
346
|
+
return schema_name
|
347
|
+
|
348
|
+
@staticmethod
|
349
|
+
def _extract_script_base_command(script_path: Path) -> str:
|
350
|
+
"""Extracts the base command to be used to run the script.
|
351
|
+
|
352
|
+
The expectation is that the base command is specified within the script
|
353
|
+
as a comment of the format:
|
354
|
+
|
355
|
+
.. code-block:: python
|
356
|
+
# SCRIPT_BASE_COMMAND: python
|
357
|
+
|
358
|
+
If no base command is specified, "python" will be used by default.
|
359
|
+
"""
|
360
|
+
base_command_list: list[str] = _extract_metadata("SCRIPT_BASE_COMMAND", script_path)
|
361
|
+
base_command = base_command_list[0] if base_command_list else "python"
|
362
|
+
return base_command
|
321
363
|
|
322
364
|
@staticmethod
|
323
365
|
def _write_metadata(info: dict[str, dict[str, str]]) -> None:
|
@@ -339,10 +381,17 @@ class PythonRecipe:
|
|
339
381
|
"python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899"
|
340
382
|
)
|
341
383
|
|
342
|
-
def __init__(
|
384
|
+
def __init__(
|
385
|
+
self,
|
386
|
+
script_path: Path,
|
387
|
+
recipe_path: Path,
|
388
|
+
requirements: str,
|
389
|
+
script_base_command: str,
|
390
|
+
) -> None:
|
343
391
|
self.script_path = script_path
|
344
392
|
self.recipe_path = recipe_path
|
345
393
|
self.requirements = requirements
|
394
|
+
self.script_base_command = script_base_command
|
346
395
|
self.text: str | None = None
|
347
396
|
|
348
397
|
def build(self) -> None:
|
@@ -371,7 +420,7 @@ From: {self.BASE_IMAGE}
|
|
371
420
|
export LC_ALL=C
|
372
421
|
|
373
422
|
%runscript
|
374
|
-
|
423
|
+
{self.script_base_command} /{script_name} '$@'"""
|
375
424
|
|
376
425
|
def write(self) -> None:
|
377
426
|
"""Writes the recipe to disk.
|
easylink/implementation.py
CHANGED
@@ -16,9 +16,14 @@ from pathlib import Path
|
|
16
16
|
from typing import TYPE_CHECKING
|
17
17
|
|
18
18
|
from layered_config_tree import LayeredConfigTree
|
19
|
+
from loguru import logger
|
19
20
|
|
20
21
|
from easylink.utilities import paths
|
21
|
-
from easylink.utilities.data_utils import
|
22
|
+
from easylink.utilities.data_utils import (
|
23
|
+
calculate_md5_checksum,
|
24
|
+
download_image,
|
25
|
+
load_yaml,
|
26
|
+
)
|
22
27
|
|
23
28
|
if TYPE_CHECKING:
|
24
29
|
from easylink.graph_components import InputSlot, OutputSlot
|
@@ -74,14 +79,14 @@ class Implementation:
|
|
74
79
|
def __repr__(self) -> str:
|
75
80
|
return f"Implementation.{self.name}"
|
76
81
|
|
77
|
-
def validate(self) -> list[str]:
|
82
|
+
def validate(self, skip_image_validation: bool, images_dir: str | Path) -> list[str]:
|
78
83
|
"""Validates individual ``Implementation`` instances.
|
79
84
|
|
80
85
|
Returns
|
81
86
|
-------
|
82
87
|
A list of logs containing any validation errors. Each item in the list
|
83
88
|
is a distinct message about a particular validation error (e.g. if a
|
84
|
-
required
|
89
|
+
required image does not exist).
|
85
90
|
|
86
91
|
Notes
|
87
92
|
-----
|
@@ -89,7 +94,8 @@ class Implementation:
|
|
89
94
|
"""
|
90
95
|
logs = []
|
91
96
|
logs = self._validate_expected_steps(logs)
|
92
|
-
|
97
|
+
if not skip_image_validation:
|
98
|
+
logs = self._download_and_validate_image(logs, images_dir)
|
93
99
|
return logs
|
94
100
|
|
95
101
|
##################
|
@@ -110,11 +116,82 @@ class Implementation:
|
|
110
116
|
)
|
111
117
|
return logs
|
112
118
|
|
113
|
-
def
|
114
|
-
|
115
|
-
|
116
|
-
if
|
117
|
-
|
119
|
+
def _download_and_validate_image(
|
120
|
+
self, logs: list[str], images_dir: str | Path
|
121
|
+
) -> list[str]:
|
122
|
+
"""Downloads the image if required and validates it exists.
|
123
|
+
|
124
|
+
If the image does not exist in the specified images directory, it will
|
125
|
+
attempt to download it.
|
126
|
+
"""
|
127
|
+
# HACK: We manually create the image path here as well as later when writing
|
128
|
+
# each implementation's Snakefile rule.
|
129
|
+
image_path = Path(images_dir) / self.singularity_image_name
|
130
|
+
expected_md5_checksum = self._metadata.get("md5_checksum", None)
|
131
|
+
record_id = self._metadata.get("zenodo_record_id", None)
|
132
|
+
if image_path.exists():
|
133
|
+
self._handle_conflicting_checksums(
|
134
|
+
logs, image_path, expected_md5_checksum, record_id
|
135
|
+
)
|
136
|
+
else:
|
137
|
+
if not record_id:
|
138
|
+
logs.append(
|
139
|
+
f"Image '{str(image_path)}' does not exist and no Zenodo record ID "
|
140
|
+
"is provided to download it."
|
141
|
+
)
|
142
|
+
if not expected_md5_checksum:
|
143
|
+
logs.append(
|
144
|
+
f"Image '{str(image_path)}' does not exist and no MD5 checksum "
|
145
|
+
"is provided to verify from the host."
|
146
|
+
)
|
147
|
+
if not record_id or not expected_md5_checksum:
|
148
|
+
return logs
|
149
|
+
download_image(
|
150
|
+
images_dir=images_dir,
|
151
|
+
record_id=record_id,
|
152
|
+
filename=self.singularity_image_name,
|
153
|
+
md5_checksum=expected_md5_checksum,
|
154
|
+
)
|
155
|
+
if not image_path.exists():
|
156
|
+
logs.append(
|
157
|
+
f"Image '{str(image_path)}' does not exist and could not be downloaded."
|
158
|
+
)
|
159
|
+
return logs
|
160
|
+
|
161
|
+
@staticmethod
def _handle_conflicting_checksums(
    logs: list[str],
    image_path: Path,
    expected_md5_checksum: str | None,
    record_id: str | None,
) -> list[str]:
    """Re-downloads an existing image whose checksum conflicts with the metadata.

    A re-download is only attempted when the image lives directly in
    ``paths.DEFAULT_IMAGES_DIR``, an expected MD5 checksum is provided, and
    that checksum differs from the one calculated for the file on disk.

    Parameters
    ----------
    logs
        The validation error messages collected so far. This list is
        appended to in place.
    image_path
        The path to the image that already exists on disk.
    expected_md5_checksum
        The MD5 checksum the image is expected to have, if any.
    record_id
        The Zenodo record ID to re-download the image from, if any.

    Returns
    -------
        The same ``logs`` list, extended with an error message if the
        checksums conflict and no record ID is available to re-download.
    """
    # TODO: Strengthen the following logic to better handle image updates.
    calculated_md5_checksum = calculate_md5_checksum(image_path)
    has_conflict = (
        image_path.parent == paths.DEFAULT_IMAGES_DIR
        and expected_md5_checksum
        and calculated_md5_checksum != expected_md5_checksum
    )
    if not has_conflict:
        return logs

    if not record_id:
        logs.append(
            f"Image '{str(image_path)}' exists but has a different MD5 checksum "
            f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
            "No Zenodo record ID is provided to re-download the image."
        )
        # Without a record ID there is nothing to download from, so report the
        # error and stop instead of attempting a download with record_id=None.
        return logs

    logger.info(
        f"Image '{str(image_path)}' exists but has a different MD5 checksum "
        f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
        "Re-downloading the image."
    )
    download_image(
        images_dir=image_path.parent,
        record_id=record_id,
        filename=image_path.name,
        md5_checksum=expected_md5_checksum,
    )
    return logs
|
119
196
|
|
120
197
|
def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]:
|
@@ -124,9 +201,9 @@ class Implementation:
|
|
124
201
|
return env_vars
|
125
202
|
|
126
203
|
@property
def singularity_image_name(self) -> str:
    """The name of the required Singularity image.

    This is the ``image_name`` entry of the implementation metadata; it is a
    name rather than a full path.
    """
    return self._metadata["image_name"]
|
130
207
|
|
131
208
|
@property
|
132
209
|
def script_cmd(self) -> str:
|