easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +24 -3
  3. easylink/configuration.py +43 -36
  4. easylink/devtools/implementation_creator.py +71 -22
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -29
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema.py +12 -13
  9. easylink/pipeline_schema_constants/__init__.py +4 -5
  10. easylink/pipeline_schema_constants/main.py +489 -0
  11. easylink/runner.py +11 -7
  12. easylink/step.py +89 -0
  13. easylink/steps/cascading/exclude_clustered.def +22 -0
  14. easylink/steps/cascading/exclude_clustered.py +76 -0
  15. easylink/steps/cascading/exclude_none.def +22 -0
  16. easylink/steps/cascading/exclude_none.py +76 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  18. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  19. easylink/steps/default/default_clusters_to_links.def +22 -0
  20. easylink/steps/default/default_clusters_to_links.py +91 -0
  21. easylink/steps/default/default_determining_exclusions.def +22 -0
  22. easylink/steps/default/default_determining_exclusions.py +81 -0
  23. easylink/steps/default/default_removing_records.def +22 -0
  24. easylink/steps/default/default_removing_records.py +59 -0
  25. easylink/steps/default/default_schema_alignment.def +22 -0
  26. easylink/steps/default/default_schema_alignment.py +53 -0
  27. easylink/steps/default/default_updating_clusters.def +22 -0
  28. easylink/steps/default/default_updating_clusters.py +67 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  30. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  32. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  34. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  35. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  36. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  38. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  40. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  42. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  43. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  44. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  45. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  46. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  47. easylink/utilities/data_utils.py +72 -0
  48. easylink/utilities/paths.py +4 -3
  49. easylink/utilities/validation_utils.py +509 -11
  50. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  51. easylink-0.1.19.dist-info/RECORD +91 -0
  52. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  53. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  54. easylink-0.1.17.dist-info/RECORD +0 -55
  55. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  56. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/_version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.1.17"
+__version__ = "0.1.19"
easylink/cli.py CHANGED
@@ -55,7 +55,7 @@ from easylink.utilities.general_utils import (
     configure_logging_to_terminal,
     handle_exceptions,
 )
-from easylink.utilities.paths import CONTAINER_DIR
+from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR
 
 SHARED_OPTIONS = [
     click.option(
@@ -91,6 +91,11 @@ SHARED_OPTIONS = [
         default=False,
         help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
     ),
+    click.option(
+        "--schema",
+        hidden=True,
+        default="main",
+    ),
 ]
 
 VERBOSE_WITH_DEBUGGER_OPTIONS = [
@@ -149,6 +154,16 @@ def easylink():
 
 @easylink.command()
 @_pass_shared_options
+@click.option(
+    "-I",
+    "--images",
+    hidden=True,
+    type=click.Path(exists=False, file_okay=False, resolve_path=True),
+    help=(
+        "The directory containing the images to run. If no value is passed, a new "
+        f"directory will be created at the home directory: {DEFAULT_IMAGES_DIR}."
+    ),
+)
 @click.option(
     "-e",
     "--computing-environment",
@@ -165,6 +180,8 @@ def run(
     input_data: str,
     output_dir: str | None,
     no_timestamp: bool,
+    schema: str,
+    images: str,
     computing_environment: str | None,
     verbose: int,
     with_debugger: bool,
@@ -190,6 +207,8 @@ def run(
         input_data=input_data,
         computing_environment=computing_environment,
         results_dir=results_dir,
+        images_dir=images,
+        schema_name=schema,
     )
     logger.info("*** FINISHED ***")
 
@@ -201,6 +220,7 @@ def generate_dag(
     input_data: str,
     output_dir: str | None,
     no_timestamp: bool,
+    schema: str,
     verbose: int,
     with_debugger: bool,
 ) -> None:
@@ -223,6 +243,7 @@ def generate_dag(
         input_data=input_data,
         computing_environment=None,
         results_dir=results_dir,
+        schema_name=schema,
     )
     logger.info("*** DAG saved to result directory ***")
 
@@ -254,7 +275,7 @@ easylink.add_command(devtools)
     type=click.Path(exists=False, dir_okay=True, file_okay=False, resolve_path=True),
     help=(
         "The directory to move the container to. If no value is passed, it will "
-        f"be moved to {CONTAINER_DIR} in a sub-directory named with the username."
+        f"be moved to {DEV_IMAGES_DIR} in a sub-directory named with the username."
    ),
 )
 def create_implementation(
@@ -291,7 +312,7 @@ def create_implementation(
     if not scripts:
         logger.error("No scripts provided.")
         return
-    output_dir = Path(output_dir) if output_dir else Path(f"{CONTAINER_DIR}/{os.getlogin()}")
+    output_dir = Path(output_dir) if output_dir else Path(f"{DEV_IMAGES_DIR}/{os.getlogin()}")
     if not output_dir.exists():
         # make the directory with rwxrwxr-x permissions
         output_dir.mkdir(parents=True, mode=0o775)
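Note: both new options on `run` use click's `hidden=True`, so they are parsed but omitted from `--help`. A minimal, self-contained sketch of that mechanic (a toy command, not EasyLink's actual CLI):

import click

@click.command()
@click.option("--schema", hidden=True, default="main")  # accepted, but not listed in --help
@click.option("-I", "--images", type=click.Path(file_okay=False, resolve_path=True))
def run(schema: str, images: str | None) -> None:
    click.echo(f"schema={schema} images={images}")

if __name__ == "__main__":
    run()  # e.g. `python demo.py --schema main -I ./images` parses both flags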
easylink/configuration.py CHANGED
@@ -14,9 +14,10 @@ from typing import Any
 
 from layered_config_tree import LayeredConfigTree
 
-from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
+from easylink.pipeline_schema import PipelineSchema
 from easylink.utilities.data_utils import load_yaml
 from easylink.utilities.general_utils import exit_with_validation_error
+from easylink.utilities.paths import DEFAULT_IMAGES_DIR
 
 PIPELINE_ERRORS_KEY = "PIPELINE ERRORS"
 INPUT_DATA_ERRORS_KEY = "INPUT DATA ERRORS"
@@ -66,10 +67,14 @@ class Config(LayeredConfigTree):
     config_params
         A dictionary of all specifications required to run the pipeline. This
         includes the pipeline, input data, and computing environment specifications,
-        as well as the results directory.
-    potential_schemas
-        A list of potential schemas to validate the pipeline configuration against.
-        This is primarily used for testing purposes. Defaults to the supported schemas.
+        as well as the results directory and images directory.
+    schema_name
+        The name of the schema to validate the pipeline configuration against.
+    images_dir
+        The directory containing the images or to download the images to if they
+        don't exist. If None, will default to the :data:`~easylink.utilities.paths.DEFAULT_IMAGES_DIR`.
+    command
+        The EasyLink command being run.
 
     Attributes
     ----------
@@ -82,22 +87,21 @@ class Config(LayeredConfigTree):
     input_data
         The input data filepaths.
     schema
-        The :class:`~easylink.pipeline_schema.PipelineSchema` that successfully
-        validated the requested pipeline.
-
-    Notes
-    -----
-    The requested pipeline is checked against a set of supported
-    ``PipelineSchemas``. The first schema that successfully validates is assumed
-    to be the correct one and is attached to the ``Config`` object and its
-    :meth:`~easylink.pipeline_schema.PipelineSchema.configure_pipeline`
-    method is called.
+        The :class:`~easylink.pipeline_schema.PipelineSchema`.
+    images_dir
+        The directory containing the images or to download the images to if they
+        don't exist. If None, will default to ~/.easylink_images.
+    command
+        The EasyLink command being run.
+
     """
 
     def __init__(
         self,
         config_params: dict[str, Any],
-        potential_schemas: PipelineSchema | list[PipelineSchema] = PIPELINE_SCHEMAS,
+        schema_name: str = "main",
+        images_dir: str | Path | None = None,
+        command: str = "run",
     ) -> None:
         super().__init__(layers=["initial_data", "default", "user_configured"])
         self.update(DEFAULT_ENVIRONMENT, layer="default")
@@ -108,10 +112,16 @@ class Config(LayeredConfigTree):
         # Set slurm defaults to empty dict instead of None so that we don't get errors
         # in slurm_resources property
         self.update({"environment": {"slurm": {}}}, layer="default")
-        if not isinstance(potential_schemas, list):
-            potential_schemas = [potential_schemas]
-        self.update({"schema": self._get_schema(potential_schemas)}, layer="initial_data")
+        self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
         self.schema.configure_pipeline(self.pipeline, self.input_data)
+        # use the images_dir if provided, otherwise use default
+        self.update(
+            {
+                "images_dir": Path(images_dir) if images_dir else DEFAULT_IMAGES_DIR,
+            },
+            layer="user_configured",
+        )
+        self.update({"command": command}, layer="user_configured")
         self._validate()
         self.freeze()
 
@@ -173,22 +183,22 @@ class Config(LayeredConfigTree):
     # Setup Methods #
     #################
 
-    def _get_schema(self, potential_schemas: list[PipelineSchema]) -> PipelineSchema:
+    def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
         """Returns the first :class:`~easylink.pipeline_schema.PipelineSchema` that validates the requested pipeline.
 
         Parameters
         ----------
-        potential_schemas
-            ``PipelineSchemas`` to validate the pipeline configuration against.
+        schema_name
+            The name of the specific ``PipelineSchema`` to validate the pipeline configuration against.
 
         Returns
         -------
-        The first ``PipelineSchema`` that validates the requested pipeline configuration.
+        The requested ``PipelineSchema`` if it validates the requested pipeline configuration.
 
         Raises
         ------
         SystemExit
-            If the pipeline configuration is not valid for any of the ``potential_schemas``,
+            If the pipeline configuration is not valid for the requested schema,
             the program exits with a non-zero code and all validation errors found
             are logged.
@@ -197,20 +207,15 @@ class Config(LayeredConfigTree):
         This acts as the pipeline configuration file's validation method since
         we can only find a matching ``PipelineSchema`` if that file is valid.
 
-        This method returns the *first* ``PipelineSchema`` that validates and does
-        not attempt to check additional ones.
         """
         errors = defaultdict(dict)
         # Try each schema until one is validated
-        for schema in potential_schemas:
-            logs = schema.validate_step(self.pipeline, self.input_data)
-            if logs:
-                errors[PIPELINE_ERRORS_KEY][schema.name] = logs
-                pass  # try the next schema
-            else:  # schema was validated
-                return schema
-        # No schemas were validated
-        exit_with_validation_error(dict(errors))
+        schema = PipelineSchema.get_schema(schema_name)
+        logs = schema.validate_step(self.pipeline, self.input_data)
+        if logs:
+            errors[PIPELINE_ERRORS_KEY][schema.name] = logs
+            exit_with_validation_error(dict(errors))
+        return schema
 
     def _validate(self) -> None:
         """Validates the ``Config``.
@@ -319,7 +324,9 @@ def _load_input_data_paths(
             f"Input was: '{input_data_paths}'"
         )
     filepath_dict = {
-        filename: Path(filepath).resolve() for filename, filepath in input_data_paths.items()
+        # Resolve paths relative to location of the YAML file
+        filename: (Path(input_data_specification_path).parent / Path(filepath)).resolve()
+        for filename, filepath in input_data_paths.items()
    }
    return filepath_dict
 
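Note: the net effect on callers is that a schema is now requested by name instead of passed as a list of candidates. A minimal construction sketch; the `config_params` keys below are invented placeholders (the real values come from the YAML-loading helpers in this module):

from easylink.configuration import Config

# Hypothetical specification dict, for illustration only.
config_params = {
    "pipeline": {"steps": {}},
    "input_data": {},
    "environment": {},
    "results_dir": "results",
}
config = Config(
    config_params,
    schema_name="main",  # replaces the old potential_schemas argument
    images_dir=None,     # None falls back to DEFAULT_IMAGES_DIR (~/.easylink_images)
    command="run",
)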
easylink/devtools/implementation_creator.py CHANGED
@@ -19,7 +19,7 @@ from typing import cast
 import yaml
 from loguru import logger
 
-from easylink.pipeline_schema_constants import ALLOWED_SCHEMA_PARAMS
+from easylink.pipeline_schema_constants import SCHEMA_PARAMS
 from easylink.step import (
     ChoiceStep,
     EmbarrassinglyParallelStep,
@@ -69,8 +69,6 @@ class ImplementationCreator:
         for the container.
     implementation_name
         The name of the implementation. It is by definition the name of the script.
-    requirements
-        The install requirements for the implementation (if any).
     step
         The name of the step that this implementation implements.
     output_slot
@@ -93,20 +91,30 @@ class ImplementationCreator:
         for the container."""
         self.implementation_name = script_path.stem
         """The name of the implementation. It is by definition the name of the script."""
-        self.requirements = self._extract_requirements(script_path)
-        """The install requirements for the implementation (if any)."""
         self.step = self._extract_implemented_step(script_path)
         """The name of the step that this implementation implements."""
+        self.has_custom_recipe = self._extract_has_custom_recipe(script_path)
+        """Whether the user has already written the recipe for this implementation."""
+        self.script_base_command = self._extract_script_base_command(script_path)
+        """The base command to use to run the script in this implementation."""
         self.output_slot = self._extract_output_slot(script_path, self.step)
         """The name of the output slot that this implementation sends results to."""
 
     def create_recipe(self) -> None:
         """Builds the singularity recipe and writes it to disk."""
-
-        recipe = PythonRecipe(self.script_path, self.recipe_path, self.requirements)
+        if self.has_custom_recipe:
+            if not self.recipe_path.exists():
+                raise ValueError(f"Could not find a custom recipe at {self.recipe_path}.")
+            return
+
+        recipe = PythonRecipe(
+            self.script_path,
+            self.recipe_path,
+            ImplementationCreator._extract_requirements(self.script_path),
+            self.script_base_command,
+        )
         recipe.build()
         recipe.write()
-        pass
 
     def build_container(self) -> None:
         """Builds the container from the recipe.
@@ -190,7 +198,7 @@ class ImplementationCreator:
         info[self.implementation_name] = {
             "steps": [self.step],
             "image_path": str(self.hosted_container_path),
-            "script_cmd": f"python /{self.script_path.name}",
+            "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
             "outputs": {
                 self.output_slot: "result.parquet",
             },
@@ -241,20 +249,36 @@ class ImplementationCreator:
         )
         return steps[0]
 
+    @staticmethod
+    def _extract_has_custom_recipe(script_path: Path) -> bool:
+        """Extracts whether the user has already written the recipe for this implementation.
+
+        The expectation is that this flag is specified within the script
+        as a comment of the format:
+
+        .. code-block:: python
+            # HAS_CUSTOM_RECIPE: true
+        """
+        has_custom_recipe = _extract_metadata("HAS_CUSTOM_RECIPE", script_path)
+        if len(has_custom_recipe) == 0:
+            return False
+        else:
+            return str(has_custom_recipe[0]).strip().lower() in ["true", "yes"]
+
     @staticmethod
     def _extract_output_slot(script_path: Path, step_name: str) -> str:
         """Extracts the name of the output slot that this script is implementing."""
-        schema = ImplementationCreator._extract_pipeline_schema(script_path)
-        implementable_steps = ImplementationCreator._extract_implementable_steps(schema)
+        schema_name = ImplementationCreator._extract_pipeline_schema_name(script_path)
+        implementable_steps = ImplementationCreator._extract_implementable_steps(schema_name)
         step_names = [step.name for step in implementable_steps]
         if step_name not in step_names:
             raise ValueError(
-                f"'{step_name}' does not exist as an implementable step in the '{schema}' pipeline schema. "
+                f"'{step_name}' does not exist as an implementable step in the '{schema_name}' pipeline schema. "
             )
         duplicates = list(set([step for step in step_names if step_names.count(step) > 1]))
         if duplicates:
             raise ValueError(
-                f"Multiple implementable steps with the same name found in the '{schema}' "
+                f"Multiple implementable steps with the same name found in the '{schema_name}' "
                 f"pipeline schema: {duplicates}."
             )
         implemented_step = [step for step in implementable_steps if step.name == step_name][0]
@@ -266,7 +290,7 @@ class ImplementationCreator:
         return list(implemented_step.output_slots)[0]
 
     @staticmethod
-    def _extract_implementable_steps(schema: str) -> list[Step]:
+    def _extract_implementable_steps(schema_name: str) -> list[Step]:
         """Extracts all implementable steps from the pipeline schema.
 
         This method recursively traverses the pipeline schema specified in the script
@@ -296,8 +320,7 @@ class ImplementationCreator:
                 implementable_steps.append(node)
                 return
 
-        schema_steps = ALLOWED_SCHEMA_PARAMS[schema][0]
-
+        schema_steps, _edges = SCHEMA_PARAMS[schema_name]
         implementable_steps: list[Step] = []
         for schema_step in schema_steps:
             _process_step(schema_step)
@@ -305,10 +328,10 @@ class ImplementationCreator:
         return implementable_steps
 
     @staticmethod
-    def _extract_pipeline_schema(script_path: Path) -> str:
+    def _extract_pipeline_schema_name(script_path: Path) -> str:
         """Extracts the relevant pipeline schema name.
 
-        The expectation is that the output slot's name is specified within the script
+        The expectation is that the pipeline schema's name is specified within the script
         as a comment of the format:
 
         .. code-block:: python
@@ -316,8 +339,27 @@
 
         If no pipeline schema is specified, "main" will be used by default.
         """
-        schema = _extract_metadata("PIPELINE_SCHEMA", script_path)
-        return "main" if len(schema) == 0 else schema[0]
+        schema_name_list: list[str] = _extract_metadata("PIPELINE_SCHEMA", script_path)
+        schema_name = "main" if len(schema_name_list) == 0 else schema_name_list[0]
+        if schema_name not in SCHEMA_PARAMS:
+            raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
+        return schema_name
+
+    @staticmethod
+    def _extract_script_base_command(script_path: Path) -> str:
+        """Extracts the base command to be used to run the script.
+
+        The expectation is that the base command is specified within the script
+        as a comment of the format:
+
+        .. code-block:: python
+            # SCRIPT_BASE_COMMAND: python
+
+        If no base command is specified, "python" will be used by default.
+        """
+        base_command_list: list[str] = _extract_metadata("SCRIPT_BASE_COMMAND", script_path)
+        base_command = base_command_list[0] if base_command_list else "python"
+        return base_command
 
     @staticmethod
     def _write_metadata(info: dict[str, dict[str, str]]) -> None:
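Note: all of these `_extract_*` helpers read key/value comments out of the script being containerized. A minimal sketch of that convention, assuming a parser roughly equivalent to the module's `_extract_metadata` (whose implementation is not shown in this diff):

from pathlib import Path

def extract_metadata(key: str, script_path: Path) -> list[str]:
    # Collect every "# KEY: value" comment in the script for the given key.
    values = []
    for line in script_path.read_text().splitlines():
        stripped = line.strip()
        if stripped.startswith(f"# {key}:"):
            values.append(stripped.split(":", 1)[1].strip())
    return values

# A script being packaged might then declare, for example:
#   # PIPELINE_SCHEMA: main
#   # HAS_CUSTOM_RECIPE: false
#   # SCRIPT_BASE_COMMAND: python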
@@ -339,10 +381,17 @@ class PythonRecipe:
         "python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899"
     )
 
-    def __init__(self, script_path: Path, recipe_path: Path, requirements: str) -> None:
+    def __init__(
+        self,
+        script_path: Path,
+        recipe_path: Path,
+        requirements: str,
+        script_base_command: str,
+    ) -> None:
         self.script_path = script_path
         self.recipe_path = recipe_path
         self.requirements = requirements
+        self.script_base_command = script_base_command
         self.text: str | None = None
 
     def build(self) -> None:
@@ -371,7 +420,7 @@ From: {self.BASE_IMAGE}
 export LC_ALL=C
 
 %runscript
-python /{script_name} '$@'"""
+{self.script_base_command} /{script_name} '$@'"""
 
     def write(self) -> None:
         """Writes the recipe to disk.
easylink/implementation.py CHANGED
@@ -16,9 +16,14 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 
 from layered_config_tree import LayeredConfigTree
+from loguru import logger
 
 from easylink.utilities import paths
-from easylink.utilities.data_utils import load_yaml
+from easylink.utilities.data_utils import (
+    calculate_md5_checksum,
+    download_image,
+    load_yaml,
+)
 
 if TYPE_CHECKING:
     from easylink.graph_components import InputSlot, OutputSlot
@@ -74,14 +79,14 @@ class Implementation:
     def __repr__(self) -> str:
         return f"Implementation.{self.name}"
 
-    def validate(self) -> list[str]:
+    def validate(self, skip_image_validation: bool, images_dir: str | Path) -> list[str]:
         """Validates individual ``Implementation`` instances.
 
         Returns
         -------
         A list of logs containing any validation errors. Each item in the list
         is a distinct message about a particular validation error (e.g. if a
-        required container does not exist).
+        required image does not exist).
 
         Notes
         -----
@@ -89,7 +94,8 @@
         """
         logs = []
         logs = self._validate_expected_steps(logs)
-        logs = self._validate_container_exists(logs)
+        if not skip_image_validation:
+            logs = self._download_and_validate_image(logs, images_dir)
         return logs
 
     ##################
@@ -110,11 +116,82 @@
             )
         return logs
 
-    def _validate_container_exists(self, logs: list[str]) -> list[str]:
-        """Validates that the container to run exists."""
-        err_str = f"Container '{self.singularity_image_path}' does not exist."
-        if not Path(self.singularity_image_path).exists():
-            logs.append(err_str)
+    def _download_and_validate_image(
+        self, logs: list[str], images_dir: str | Path
+    ) -> list[str]:
+        """Downloads the image if required and validates it exists.
+
+        If the image does not exist in the specified images directory, it will
+        attempt to download it.
+        """
+        # HACK: We manually create the image path here as well as later when writing
+        # each implementation's Snakefile rule.
+        image_path = Path(images_dir) / self.singularity_image_name
+        expected_md5_checksum = self._metadata.get("md5_checksum", None)
+        record_id = self._metadata.get("zenodo_record_id", None)
+        if image_path.exists():
+            self._handle_conflicting_checksums(
+                logs, image_path, expected_md5_checksum, record_id
+            )
+        else:
+            if not record_id:
+                logs.append(
+                    f"Image '{str(image_path)}' does not exist and no Zenodo record ID "
+                    "is provided to download it."
+                )
+            if not expected_md5_checksum:
+                logs.append(
+                    f"Image '{str(image_path)}' does not exist and no MD5 checksum "
+                    "is provided to verify from the host."
+                )
+            if not record_id or not expected_md5_checksum:
+                return logs
+            download_image(
+                images_dir=images_dir,
+                record_id=record_id,
+                filename=self.singularity_image_name,
+                md5_checksum=expected_md5_checksum,
+            )
+            if not image_path.exists():
+                logs.append(
+                    f"Image '{str(image_path)}' does not exist and could not be downloaded."
+                )
+        return logs
+
+    @staticmethod
+    def _handle_conflicting_checksums(
+        logs: list[str],
+        image_path: Path,
+        expected_md5_checksum: str | None,
+        record_id: str | None,
+    ) -> list[str]:
+        # TODO: Strengthen the following logic to better handle image updates.
+        # If using the default images directory and the image already exists
+        # but with a different checksum than in the implementation metadata,
+        # re-download.
+        calculated_md5_checksum = calculate_md5_checksum(image_path)
+        if (
+            image_path.parent == paths.DEFAULT_IMAGES_DIR
+            and expected_md5_checksum
+            and calculated_md5_checksum != expected_md5_checksum
+        ):
+            if not record_id:
+                logs.append(
+                    f"Image '{str(image_path)}' exists but has a different MD5 checksum "
+                    f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
+                    "No Zenodo record ID is provided to re-download the image."
+                )
+            logger.info(
+                f"Image '{str(image_path)}' exists but has a different MD5 checksum "
+                f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
+                "Re-downloading the image."
+            )
+            download_image(
+                images_dir=image_path.parent,
+                record_id=record_id,
+                filename=image_path.name,
+                md5_checksum=expected_md5_checksum,
+            )
         return logs
 
     def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]:
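Note: `calculate_md5_checksum` and `download_image` come from `easylink/utilities/data_utils.py` (+72 lines in this release, not shown here). A streaming-MD5 sketch of what such a checksum helper typically looks like, offered as an assumption rather than the package's actual code:

import hashlib
from pathlib import Path

def md5_checksum(path: Path, chunk_size: int = 1024 * 1024) -> str:
    # Hash the file in chunks so large .sif images don't have to fit in memory.
    digest = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()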
@@ -124,9 +201,9 @@
         return env_vars
 
     @property
-    def singularity_image_path(self) -> str:
+    def singularity_image_name(self) -> str:
         """The path to the required Singularity image."""
-        return self._metadata["image_path"]
+        return self._metadata["image_name"]
 
     @property
     def script_cmd(self) -> str:
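Note: with this rename, the implementation metadata stores only the image's file name; the full path is composed against the configured images directory at validation time (see the HACK comment in `_download_and_validate_image` above). Roughly, with hypothetical values standing in for `Config.images_dir` and the metadata entry:

from pathlib import Path

images_dir = Path.home() / ".easylink_images"      # Config.images_dir default
image_name = "splink_evaluating_pairs.sif"          # implementation's image_name
image_path = images_dir / image_name                # what gets checked/downloaded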