easylink-0.1.18-py3-none-any.whl → easylink-0.1.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +15 -3
  3. easylink/configuration.py +25 -2
  4. easylink/devtools/implementation_creator.py +58 -11
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -26
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema_constants/__init__.py +2 -2
  9. easylink/pipeline_schema_constants/main.py +489 -0
  10. easylink/runner.py +7 -1
  11. easylink/step.py +89 -0
  12. easylink/steps/cascading/exclude_clustered.def +22 -0
  13. easylink/steps/cascading/exclude_clustered.py +76 -0
  14. easylink/steps/cascading/exclude_none.def +22 -0
  15. easylink/steps/cascading/exclude_none.py +76 -0
  16. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  18. easylink/steps/default/default_clusters_to_links.def +22 -0
  19. easylink/steps/default/default_clusters_to_links.py +91 -0
  20. easylink/steps/default/default_determining_exclusions.def +22 -0
  21. easylink/steps/default/default_determining_exclusions.py +81 -0
  22. easylink/steps/default/default_removing_records.def +22 -0
  23. easylink/steps/default/default_removing_records.py +59 -0
  24. easylink/steps/default/default_schema_alignment.def +22 -0
  25. easylink/steps/default/default_schema_alignment.py +53 -0
  26. easylink/steps/default/default_updating_clusters.def +22 -0
  27. easylink/steps/default/default_updating_clusters.py +67 -0
  28. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  30. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  32. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  34. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  35. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  36. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  38. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  40. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  42. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  43. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  44. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  45. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  46. easylink/utilities/data_utils.py +72 -0
  47. easylink/utilities/paths.py +4 -3
  48. easylink/utilities/validation_utils.py +509 -11
  49. {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  50. easylink-0.1.19.dist-info/RECORD +91 -0
  51. {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  52. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  53. easylink-0.1.18.dist-info/RECORD +0 -55
  54. {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  55. {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.18"
+ __version__ = "0.1.19"
easylink/cli.py CHANGED
@@ -55,7 +55,7 @@ from easylink.utilities.general_utils import (
      configure_logging_to_terminal,
      handle_exceptions,
  )
- from easylink.utilities.paths import CONTAINER_DIR
+ from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR

  SHARED_OPTIONS = [
      click.option(
@@ -154,6 +154,16 @@ def easylink():

  @easylink.command()
  @_pass_shared_options
+ @click.option(
+     "-I",
+     "--images",
+     hidden=True,
+     type=click.Path(exists=False, file_okay=False, resolve_path=True),
+     help=(
+         "The directory containing the images to run. If no value is passed, a new "
+         f"directory will be created at the home directory: {DEFAULT_IMAGES_DIR}."
+     ),
+ )
  @click.option(
      "-e",
      "--computing-environment",
@@ -171,6 +181,7 @@ def run(
      output_dir: str | None,
      no_timestamp: bool,
      schema: str,
+     images: str,
      computing_environment: str | None,
      verbose: int,
      with_debugger: bool,
@@ -196,6 +207,7 @@ def run(
          input_data=input_data,
          computing_environment=computing_environment,
          results_dir=results_dir,
+         images_dir=images,
          schema_name=schema,
      )
      logger.info("*** FINISHED ***")
@@ -263,7 +275,7 @@ easylink.add_command(devtools)
      type=click.Path(exists=False, dir_okay=True, file_okay=False, resolve_path=True),
      help=(
          "The directory to move the container to. If no value is passed, it will "
-         f"be moved to {CONTAINER_DIR} in a sub-directory named with the username."
+         f"be moved to {DEV_IMAGES_DIR} in a sub-directory named with the username."
      ),
  )
  def create_implementation(
@@ -300,7 +312,7 @@ def create_implementation(
      if not scripts:
          logger.error("No scripts provided.")
          return
-     output_dir = Path(output_dir) if output_dir else Path(f"{CONTAINER_DIR}/{os.getlogin()}")
+     output_dir = Path(output_dir) if output_dir else Path(f"{DEV_IMAGES_DIR}/{os.getlogin()}")
      if not output_dir.exists():
          # make the directory with rwxrwxr-x permissions
          output_dir.mkdir(parents=True, mode=0o775)
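
The change above adds a hidden `-I`/`--images` option to `easylink run` and passes it through to `main()` as `images_dir`. A minimal sketch of exercising the new option with click's built-in test runner follows; the shared option flags (`-p`, `-i`, `-o`) are hypothetical placeholders, since this diff does not show `SHARED_OPTIONS`.

# Sketch only: invoke `easylink run` with the new hidden -I/--images option.
# The -p/-i/-o flag names are assumptions for illustration; only -I/--images
# is confirmed by the diff above.
from click.testing import CliRunner

from easylink.cli import easylink

runner = CliRunner()
result = runner.invoke(
    easylink,
    [
        "run",
        "-p", "pipeline.yaml",            # hypothetical pipeline specification flag
        "-i", "input_data.yaml",          # hypothetical input data flag
        "-o", "results/",                 # hypothetical output directory flag
        "-I", "/shared/easylink_images",  # new in 0.1.19: local images directory
    ],
)
print(result.exit_code, result.output)
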
easylink/configuration.py CHANGED
@@ -17,6 +17,7 @@ from layered_config_tree import LayeredConfigTree
  from easylink.pipeline_schema import PipelineSchema
  from easylink.utilities.data_utils import load_yaml
  from easylink.utilities.general_utils import exit_with_validation_error
+ from easylink.utilities.paths import DEFAULT_IMAGES_DIR

  PIPELINE_ERRORS_KEY = "PIPELINE ERRORS"
  INPUT_DATA_ERRORS_KEY = "INPUT DATA ERRORS"
@@ -66,9 +67,14 @@ class Config(LayeredConfigTree):
      config_params
          A dictionary of all specifications required to run the pipeline. This
          includes the pipeline, input data, and computing environment specifications,
-         as well as the results directory.
+         as well as the results directory and images directory.
      schema_name
          The name of the schema to validate the pipeline configuration against.
+     images_dir
+         The directory containing the images or to download the images to if they
+         don't exist. If None, will default to the :data:`~easylink.utilities.paths.DEFAULT_IMAGES_DIR`.
+     command
+         The EasyLink command being run.

      Attributes
      ----------
@@ -82,6 +88,11 @@ class Config(LayeredConfigTree):
          The input data filepaths.
      schema
          The :class:`~easylink.pipeline_schema.PipelineSchema`.
+     images_dir
+         The directory containing the images or to download the images to if they
+         don't exist. If None, will default to ~/.easylink_images.
+     command
+         The EasyLink command being run.

      """

@@ -89,6 +100,8 @@ class Config(LayeredConfigTree):
          self,
          config_params: dict[str, Any],
          schema_name: str = "main",
+         images_dir: str | Path | None = None,
+         command: str = "run",
      ) -> None:
          super().__init__(layers=["initial_data", "default", "user_configured"])
          self.update(DEFAULT_ENVIRONMENT, layer="default")
@@ -101,6 +114,14 @@ class Config(LayeredConfigTree):
          self.update({"environment": {"slurm": {}}}, layer="default")
          self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
          self.schema.configure_pipeline(self.pipeline, self.input_data)
+         # use the images_dir if provided, otherwise use default
+         self.update(
+             {
+                 "images_dir": Path(images_dir) if images_dir else DEFAULT_IMAGES_DIR,
+             },
+             layer="user_configured",
+         )
+         self.update({"command": command}, layer="user_configured")
          self._validate()
          self.freeze()

@@ -303,7 +324,9 @@ def _load_input_data_paths(
          f"Input was: '{input_data_paths}'"
      )
      filepath_dict = {
-         filename: Path(filepath).resolve() for filename, filepath in input_data_paths.items()
+         # Resolve paths relative to location of the YAML file
+         filename: (Path(input_data_specification_path).parent / Path(filepath)).resolve()
+         for filename, filepath in input_data_paths.items()
      }
      return filepath_dict

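Two behavioral notes from the configuration.py changes above: `Config` now records an `images_dir` (falling back to `DEFAULT_IMAGES_DIR`) and the invoking `command`, and `_load_input_data_paths` resolves input file paths relative to the input-data YAML file instead of the current working directory. A minimal standalone sketch of that resolution rule, assuming the YAML simply maps filenames to paths:

# Sketch of the new path-resolution rule: relative paths in the input-data
# specification are resolved against the YAML file's own directory, not the CWD.
from pathlib import Path

import yaml  # assumption: any YAML loader suffices for the illustration


def resolve_input_paths(input_data_specification_path: str) -> dict[str, Path]:
    spec_path = Path(input_data_specification_path).resolve()
    # e.g. {"input_file_1": "../data/input_file_1.parquet"}
    input_data_paths = yaml.safe_load(spec_path.read_text())
    return {
        filename: (spec_path.parent / Path(filepath)).resolve()
        for filename, filepath in input_data_paths.items()
    }
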
easylink/devtools/implementation_creator.py CHANGED
@@ -69,8 +69,6 @@ class ImplementationCreator:
          for the container.
      implementation_name
          The name of the implementation. It is by definition the name of the script.
-     requirements
-         The install requirements for the implementation (if any).
      step
          The name of the step that this implementation implements.
      output_slot
@@ -93,20 +91,30 @@ class ImplementationCreator:
          for the container."""
          self.implementation_name = script_path.stem
          """The name of the implementation. It is by definition the name of the script."""
-         self.requirements = self._extract_requirements(script_path)
-         """The install requirements for the implementation (if any)."""
          self.step = self._extract_implemented_step(script_path)
          """The name of the step that this implementation implements."""
+         self.has_custom_recipe = self._extract_has_custom_recipe(script_path)
+         """Whether the user has already written the recipe for this implementation."""
+         self.script_base_command = self._extract_script_base_command(script_path)
+         """The base command to use to run the script in this implementation."""
          self.output_slot = self._extract_output_slot(script_path, self.step)
          """The name of the output slot that this implementation sends results to."""

      def create_recipe(self) -> None:
          """Builds the singularity recipe and writes it to disk."""
-
-         recipe = PythonRecipe(self.script_path, self.recipe_path, self.requirements)
+         if self.has_custom_recipe:
+             if not self.recipe_path.exists():
+                 raise ValueError(f"Could not find a custom recipe at {self.recipe_path}.")
+             return
+
+         recipe = PythonRecipe(
+             self.script_path,
+             self.recipe_path,
+             ImplementationCreator._extract_requirements(self.script_path),
+             self.script_base_command,
+         )
          recipe.build()
          recipe.write()
-         pass

      def build_container(self) -> None:
          """Builds the container from the recipe.
@@ -190,7 +198,7 @@ class ImplementationCreator:
          info[self.implementation_name] = {
              "steps": [self.step],
              "image_path": str(self.hosted_container_path),
-             "script_cmd": f"python /{self.script_path.name}",
+             "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
              "outputs": {
                  self.output_slot: "result.parquet",
              },
@@ -241,6 +249,22 @@ class ImplementationCreator:
          )
          return steps[0]

+     @staticmethod
+     def _extract_has_custom_recipe(script_path: Path) -> bool:
+         """Extracts whether the user has already written the recipe for this implementation.
+
+         The expectation is that this flag is specified within the script
+         as a comment of the format:
+
+         .. code-block:: python
+             # HAS_CUSTOM_RECIPE: true
+         """
+         has_custom_recipe = _extract_metadata("HAS_CUSTOM_RECIPE", script_path)
+         if len(has_custom_recipe) == 0:
+             return False
+         else:
+             return str(has_custom_recipe[0]).strip().lower() in ["true", "yes"]
+
      @staticmethod
      def _extract_output_slot(script_path: Path, step_name: str) -> str:
          """Extracts the name of the output slot that this script is implementing."""
@@ -307,7 +331,7 @@ class ImplementationCreator:
      def _extract_pipeline_schema_name(script_path: Path) -> str:
          """Extracts the relevant pipeline schema name.

-         The expectation is that the output slot's name is specified within the script
+         The expectation is that the pipeline schema's name is specified within the script
          as a comment of the format:

          .. code-block:: python
@@ -321,6 +345,22 @@ class ImplementationCreator:
              raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
          return schema_name

+     @staticmethod
+     def _extract_script_base_command(script_path: Path) -> str:
+         """Extracts the base command to be used to run the script.
+
+         The expectation is that the base command is specified within the script
+         as a comment of the format:
+
+         .. code-block:: python
+             # SCRIPT_BASE_COMMAND: python
+
+         If no base command is specified, "python" will be used by default.
+         """
+         base_command_list: list[str] = _extract_metadata("SCRIPT_BASE_COMMAND", script_path)
+         base_command = base_command_list[0] if base_command_list else "python"
+         return base_command
+
      @staticmethod
      def _write_metadata(info: dict[str, dict[str, str]]) -> None:
          """Writes the implementation metadata to disk.
@@ -341,10 +381,17 @@ class PythonRecipe:
          "python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899"
      )

-     def __init__(self, script_path: Path, recipe_path: Path, requirements: str) -> None:
+     def __init__(
+         self,
+         script_path: Path,
+         recipe_path: Path,
+         requirements: str,
+         script_base_command: str,
+     ) -> None:
          self.script_path = script_path
          self.recipe_path = recipe_path
          self.requirements = requirements
+         self.script_base_command = script_base_command
          self.text: str | None = None

      def build(self) -> None:
@@ -373,7 +420,7 @@ From: {self.BASE_IMAGE}
      export LC_ALL=C

  %runscript
-     python /{script_name} '$@'"""
+     {self.script_base_command} /{script_name} '$@'"""

      def write(self) -> None:
          """Writes the recipe to disk.
easylink/implementation.py CHANGED
@@ -16,9 +16,14 @@ from pathlib import Path
  from typing import TYPE_CHECKING

  from layered_config_tree import LayeredConfigTree
+ from loguru import logger

  from easylink.utilities import paths
- from easylink.utilities.data_utils import load_yaml
+ from easylink.utilities.data_utils import (
+     calculate_md5_checksum,
+     download_image,
+     load_yaml,
+ )

  if TYPE_CHECKING:
      from easylink.graph_components import InputSlot, OutputSlot
@@ -74,14 +79,14 @@ class Implementation:
      def __repr__(self) -> str:
          return f"Implementation.{self.name}"

-     def validate(self) -> list[str]:
+     def validate(self, skip_image_validation: bool, images_dir: str | Path) -> list[str]:
          """Validates individual ``Implementation`` instances.

          Returns
          -------
          A list of logs containing any validation errors. Each item in the list
          is a distinct message about a particular validation error (e.g. if a
-         required container does not exist).
+         required image does not exist).

          Notes
          -----
@@ -89,7 +94,8 @@ class Implementation:
          """
          logs = []
          logs = self._validate_expected_steps(logs)
-         logs = self._validate_container_exists(logs)
+         if not skip_image_validation:
+             logs = self._download_and_validate_image(logs, images_dir)
          return logs

      ##################
@@ -110,11 +116,82 @@ class Implementation:
          )
          return logs

-     def _validate_container_exists(self, logs: list[str]) -> list[str]:
-         """Validates that the container to run exists."""
-         err_str = f"Container '{self.singularity_image_path}' does not exist."
-         if not Path(self.singularity_image_path).exists():
-             logs.append(err_str)
+     def _download_and_validate_image(
+         self, logs: list[str], images_dir: str | Path
+     ) -> list[str]:
+         """Downloads the image if required and validates it exists.
+
+         If the image does not exist in the specified images directory, it will
+         attempt to download it.
+         """
+         # HACK: We manually create the image path here as well as later when writing
+         # each implementations Snakefile rule.
+         image_path = Path(images_dir) / self.singularity_image_name
+         expected_md5_checksum = self._metadata.get("md5_checksum", None)
+         record_id = self._metadata.get("zenodo_record_id", None)
+         if image_path.exists():
+             self._handle_conflicting_checksums(
+                 logs, image_path, expected_md5_checksum, record_id
+             )
+         else:
+             if not record_id:
+                 logs.append(
+                     f"Image '{str(image_path)}' does not exist and no Zenodo record ID "
+                     "is provided to download it."
+                 )
+             if not expected_md5_checksum:
+                 logs.append(
+                     f"Image '{str(image_path)}' does not exist and no MD5 checksum "
+                     "is provided to verify from the host."
+                 )
+             if not record_id or not expected_md5_checksum:
+                 return logs
+             download_image(
+                 images_dir=images_dir,
+                 record_id=record_id,
+                 filename=self.singularity_image_name,
+                 md5_checksum=expected_md5_checksum,
+             )
+             if not image_path.exists():
+                 logs.append(
+                     f"Image '{str(image_path)}' does not exist and could not be downloaded."
+                 )
+         return logs
+
+     @staticmethod
+     def _handle_conflicting_checksums(
+         logs: list[str],
+         image_path: Path,
+         expected_md5_checksum: str | None,
+         record_id: str | None,
+     ) -> list[str]:
+         # TODO: Strengthen the following logic to better handle image updates.
+         # If using the default images directory and the image already exists
+         # but with a different checksum than in the implementation metadata,
+         # re-download.
+         calculated_md5_checksum = calculate_md5_checksum(image_path)
+         if (
+             image_path.parent == paths.DEFAULT_IMAGES_DIR
+             and expected_md5_checksum
+             and calculated_md5_checksum != expected_md5_checksum
+         ):
+             if not record_id:
+                 logs.append(
+                     f"Image '{str(image_path)}' exists but has a different MD5 checksum "
+                     f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
+                     "No Zenodo record ID is provided to re-download the image."
+                 )
+             logger.info(
+                 f"Image '{str(image_path)}' exists but has a different MD5 checksum "
+                 f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
+                 "Re-downloading the image."
+             )
+             download_image(
+                 images_dir=image_path.parent,
+                 record_id=record_id,
+                 filename=image_path.name,
+                 md5_checksum=expected_md5_checksum,
+             )
          return logs

      def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]:
@@ -124,9 +201,9 @@ class Implementation:
          return env_vars

      @property
-     def singularity_image_path(self) -> str:
+     def singularity_image_name(self) -> str:
          """The path to the required Singularity image."""
-         return self._metadata["image_path"]
+         return self._metadata["image_name"]

      @property
      def script_cmd(self) -> str:
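
The implementation.py changes above resolve images by `singularity_image_name` inside the configured `images_dir`, compare an `md5_checksum` from the implementation metadata against the file on disk, and download from a Zenodo record when the image is missing or stale. The `calculate_md5_checksum` and `download_image` helpers come from the data_utils additions (+72 lines, not shown in this section); below is a minimal sketch of the checksum side, assuming a standard chunked hashlib computation.

# Illustrative only: the real calculate_md5_checksum lives in
# easylink/utilities/data_utils.py and may differ in signature.
import hashlib
from pathlib import Path


def calculate_md5_checksum(path: Path, chunk_size: int = 1024 * 1024) -> str:
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()


# Usage mirroring the new validation: compare against the metadata value and
# fall back to re-downloading from the Zenodo record on mismatch.
# expected = metadata["md5_checksum"]
# if calculate_md5_checksum(image_path) != expected:
#     ...  # re-download via download_image(...), then re-check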