easylink 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +15 -3
  3. easylink/configuration.py +25 -2
  4. easylink/devtools/implementation_creator.py +75 -13
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -26
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema_constants/__init__.py +2 -2
  9. easylink/pipeline_schema_constants/main.py +489 -0
  10. easylink/runner.py +7 -1
  11. easylink/step.py +89 -0
  12. easylink/steps/cascading/exclude_clustered.def +22 -0
  13. easylink/steps/cascading/exclude_clustered.py +76 -0
  14. easylink/steps/cascading/exclude_none.def +22 -0
  15. easylink/steps/cascading/exclude_none.py +76 -0
  16. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.py +109 -0
  18. easylink/steps/default/default_clusters_to_links.def +22 -0
  19. easylink/steps/default/default_clusters_to_links.py +91 -0
  20. easylink/steps/default/default_determining_exclusions.def +22 -0
  21. easylink/steps/default/default_determining_exclusions.py +81 -0
  22. easylink/steps/default/default_removing_records.def +22 -0
  23. easylink/steps/default/default_removing_records.py +59 -0
  24. easylink/steps/default/default_schema_alignment.def +22 -0
  25. easylink/steps/default/default_schema_alignment.py +53 -0
  26. easylink/steps/default/default_updating_clusters.def +22 -0
  27. easylink/steps/default/default_updating_clusters.py +67 -0
  28. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  30. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  32. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  34. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  35. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  36. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  38. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  40. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  42. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  43. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  44. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  45. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  46. easylink/utilities/data_utils.py +72 -0
  47. easylink/utilities/paths.py +4 -3
  48. easylink/utilities/validation_utils.py +509 -11
  49. {easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/METADATA +5 -1
  50. easylink-0.1.20.dist-info/RECORD +91 -0
  51. {easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/WHEEL +1 -1
  52. easylink-0.1.20.dist-info/licenses/LICENSE +28 -0
  53. easylink-0.1.18.dist-info/RECORD +0 -55
  54. {easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/entry_points.txt +0 -0
  55. {easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/top_level.txt +0 -0
easylink/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.18"
1
+ __version__ = "0.1.20"
easylink/cli.py CHANGED
@@ -55,7 +55,7 @@ from easylink.utilities.general_utils import (
55
55
  configure_logging_to_terminal,
56
56
  handle_exceptions,
57
57
  )
58
- from easylink.utilities.paths import CONTAINER_DIR
58
+ from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR
59
59
 
60
60
  SHARED_OPTIONS = [
61
61
  click.option(
@@ -154,6 +154,16 @@ def easylink():
154
154
 
155
155
  @easylink.command()
156
156
  @_pass_shared_options
157
+ @click.option(
158
+ "-I",
159
+ "--images",
160
+ hidden=True,
161
+ type=click.Path(exists=False, file_okay=False, resolve_path=True),
162
+ help=(
163
+ "The directory containing the images to run. If no value is passed, a new "
164
+ f"directory will be created at the home directory: {DEFAULT_IMAGES_DIR}."
165
+ ),
166
+ )
157
167
  @click.option(
158
168
  "-e",
159
169
  "--computing-environment",
@@ -171,6 +181,7 @@ def run(
171
181
  output_dir: str | None,
172
182
  no_timestamp: bool,
173
183
  schema: str,
184
+ images: str,
174
185
  computing_environment: str | None,
175
186
  verbose: int,
176
187
  with_debugger: bool,
@@ -196,6 +207,7 @@ def run(
196
207
  input_data=input_data,
197
208
  computing_environment=computing_environment,
198
209
  results_dir=results_dir,
210
+ images_dir=images,
199
211
  schema_name=schema,
200
212
  )
201
213
  logger.info("*** FINISHED ***")
@@ -263,7 +275,7 @@ easylink.add_command(devtools)
263
275
  type=click.Path(exists=False, dir_okay=True, file_okay=False, resolve_path=True),
264
276
  help=(
265
277
  "The directory to move the container to. If no value is passed, it will "
266
- f"be moved to {CONTAINER_DIR} in a sub-directory named with the username."
278
+ f"be moved to {DEV_IMAGES_DIR} in a sub-directory named with the username."
267
279
  ),
268
280
  )
269
281
  def create_implementation(
@@ -300,7 +312,7 @@ def create_implementation(
300
312
  if not scripts:
301
313
  logger.error("No scripts provided.")
302
314
  return
303
- output_dir = Path(output_dir) if output_dir else Path(f"{CONTAINER_DIR}/{os.getlogin()}")
315
+ output_dir = Path(output_dir) if output_dir else Path(f"{DEV_IMAGES_DIR}/{os.getlogin()}")
304
316
  if not output_dir.exists():
305
317
  # make the directory with rwxrwxr-x permissions
306
318
  output_dir.mkdir(parents=True, mode=0o775)
easylink/configuration.py CHANGED
@@ -17,6 +17,7 @@ from layered_config_tree import LayeredConfigTree
17
17
  from easylink.pipeline_schema import PipelineSchema
18
18
  from easylink.utilities.data_utils import load_yaml
19
19
  from easylink.utilities.general_utils import exit_with_validation_error
20
+ from easylink.utilities.paths import DEFAULT_IMAGES_DIR
20
21
 
21
22
  PIPELINE_ERRORS_KEY = "PIPELINE ERRORS"
22
23
  INPUT_DATA_ERRORS_KEY = "INPUT DATA ERRORS"
@@ -66,9 +67,14 @@ class Config(LayeredConfigTree):
66
67
  config_params
67
68
  A dictionary of all specifications required to run the pipeline. This
68
69
  includes the pipeline, input data, and computing environment specifications,
69
- as well as the results directory.
70
+ as well as the results directory and images directory.
70
71
  schema_name
71
72
  The name of the schema to validate the pipeline configuration against.
73
+ images_dir
74
+ The directory containing the images or to download the images to if they
75
+ don't exist. If None, will default to the :data:`~easylink.utilities.paths.DEFAULT_IMAGES_DIR`.
76
+ command
77
+ The EasyLink command being run.
72
78
 
73
79
  Attributes
74
80
  ----------
@@ -82,6 +88,11 @@ class Config(LayeredConfigTree):
82
88
  The input data filepaths.
83
89
  schema
84
90
  The :class:`~easylink.pipeline_schema.PipelineSchema`.
91
+ images_dir
92
+ The directory containing the images or to download the images to if they
93
+ don't exist. If None, will default to ~/.easylink_images.
94
+ command
95
+ The EasyLink command being run.
85
96
 
86
97
  """
87
98
 
@@ -89,6 +100,8 @@ class Config(LayeredConfigTree):
89
100
  self,
90
101
  config_params: dict[str, Any],
91
102
  schema_name: str = "main",
103
+ images_dir: str | Path | None = None,
104
+ command: str = "run",
92
105
  ) -> None:
93
106
  super().__init__(layers=["initial_data", "default", "user_configured"])
94
107
  self.update(DEFAULT_ENVIRONMENT, layer="default")
@@ -101,6 +114,14 @@ class Config(LayeredConfigTree):
101
114
  self.update({"environment": {"slurm": {}}}, layer="default")
102
115
  self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
103
116
  self.schema.configure_pipeline(self.pipeline, self.input_data)
117
+ # use the images_dir if provided, otherwise use default
118
+ self.update(
119
+ {
120
+ "images_dir": Path(images_dir) if images_dir else DEFAULT_IMAGES_DIR,
121
+ },
122
+ layer="user_configured",
123
+ )
124
+ self.update({"command": command}, layer="user_configured")
104
125
  self._validate()
105
126
  self.freeze()
106
127
 
@@ -303,7 +324,9 @@ def _load_input_data_paths(
303
324
  f"Input was: '{input_data_paths}'"
304
325
  )
305
326
  filepath_dict = {
306
- filename: Path(filepath).resolve() for filename, filepath in input_data_paths.items()
327
+ # Resolve paths relative to location of the YAML file
328
+ filename: (Path(input_data_specification_path).parent / Path(filepath)).resolve()
329
+ for filename, filepath in input_data_paths.items()
307
330
  }
308
331
  return filepath_dict
309
332
 
easylink/devtools/implementation_creator.py CHANGED
@@ -29,7 +29,7 @@ from easylink.step import (
29
29
  TemplatedStep,
30
30
  )
31
31
  from easylink.utilities.data_utils import load_yaml
32
- from easylink.utilities.paths import IMPLEMENTATION_METADATA
32
+ from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
33
33
 
34
34
 
35
35
  def main(script_path: Path, host: Path) -> None:
@@ -69,8 +69,6 @@ class ImplementationCreator:
69
69
  for the container.
70
70
  implementation_name
71
71
  The name of the implementation. It is by definition the name of the script.
72
- requirements
73
- The install requirements for the implementation (if any).
74
72
  step
75
73
  The name of the step that this implementation implements.
76
74
  output_slot
@@ -93,20 +91,30 @@ class ImplementationCreator:
93
91
  for the container."""
94
92
  self.implementation_name = script_path.stem
95
93
  """The name of the implementation. It is by definition the name of the script."""
96
- self.requirements = self._extract_requirements(script_path)
97
- """The install requirements for the implementation (if any)."""
98
94
  self.step = self._extract_implemented_step(script_path)
99
95
  """The name of the step that this implementation implements."""
96
+ self.has_custom_recipe = self._extract_has_custom_recipe(script_path)
97
+ """Whether the user has already written the recipe for this implementation."""
98
+ self.script_base_command = self._extract_script_base_command(script_path)
99
+ """The base command to use to run the script in this implementation."""
100
100
  self.output_slot = self._extract_output_slot(script_path, self.step)
101
101
  """The name of the output slot that this implementation sends results to."""
102
102
 
103
103
  def create_recipe(self) -> None:
104
104
  """Builds the singularity recipe and writes it to disk."""
105
-
106
- recipe = PythonRecipe(self.script_path, self.recipe_path, self.requirements)
105
+ if self.has_custom_recipe:
106
+ if not self.recipe_path.exists():
107
+ raise ValueError(f"Could not find a custom recipe at {self.recipe_path}.")
108
+ return
109
+
110
+ recipe = PythonRecipe(
111
+ self.script_path,
112
+ self.recipe_path,
113
+ ImplementationCreator._extract_requirements(self.script_path),
114
+ self.script_base_command,
115
+ )
107
116
  recipe.build()
108
117
  recipe.write()
109
- pass
110
118
 
111
119
  def build_container(self) -> None:
112
120
  """Builds the container from the recipe.
@@ -187,10 +195,25 @@ class ImplementationCreator:
187
195
  f"Implementation '{self.implementation_name}' already exists in the registry. "
188
196
  "Overwriting it with the latest data."
189
197
  )
198
+
199
+ # Handle the fact that developers might be saving to username subdirs
200
+ # If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
201
+ # host directory when calling `easylink devtools create-implementation`
202
+ # is DEV_IMAGES_DIR/<username>), we want to include the relative path
203
+ # to the DEV_IMAGES_DIR in the image name. This is required because ultimately
204
+ # when running a pipeline, all images are expected to be in a single directory.
205
+ image_name = (
206
+ self.hosted_container_path.name
207
+ # Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
208
+ if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
209
+ # Use the path relative to DEV_IMAGES_DIR as the image name
210
+ else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
211
+ )
212
+
190
213
  info[self.implementation_name] = {
191
214
  "steps": [self.step],
192
- "image_path": str(self.hosted_container_path),
193
- "script_cmd": f"python /{self.script_path.name}",
215
+ "image_name": str(image_name),
216
+ "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
194
217
  "outputs": {
195
218
  self.output_slot: "result.parquet",
196
219
  },
@@ -241,6 +264,22 @@ class ImplementationCreator:
241
264
  )
242
265
  return steps[0]
243
266
 
267
+ @staticmethod
268
+ def _extract_has_custom_recipe(script_path: Path) -> bool:
269
+ """Extracts whether the user has already written the recipe for this implementation.
270
+
271
+ The expectation is that this flag is specified within the script
272
+ as a comment of the format:
273
+
274
+ .. code-block:: python
275
+ # HAS_CUSTOM_RECIPE: true
276
+ """
277
+ has_custom_recipe = _extract_metadata("HAS_CUSTOM_RECIPE", script_path)
278
+ if len(has_custom_recipe) == 0:
279
+ return False
280
+ else:
281
+ return str(has_custom_recipe[0]).strip().lower() in ["true", "yes"]
282
+
244
283
  @staticmethod
245
284
  def _extract_output_slot(script_path: Path, step_name: str) -> str:
246
285
  """Extracts the name of the output slot that this script is implementing."""
@@ -307,7 +346,7 @@ class ImplementationCreator:
307
346
  def _extract_pipeline_schema_name(script_path: Path) -> str:
308
347
  """Extracts the relevant pipeline schema name.
309
348
 
310
- The expectation is that the output slot's name is specified within the script
349
+ The expectation is that the pipeline schema's name is specified within the script
311
350
  as a comment of the format:
312
351
 
313
352
  .. code-block:: python
@@ -321,6 +360,22 @@ class ImplementationCreator:
321
360
  raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
322
361
  return schema_name
323
362
 
363
+ @staticmethod
364
+ def _extract_script_base_command(script_path: Path) -> str:
365
+ """Extracts the base command to be used to run the script.
366
+
367
+ The expectation is that the base command is specified within the script
368
+ as a comment of the format:
369
+
370
+ .. code-block:: python
371
+ # SCRIPT_BASE_COMMAND: python
372
+
373
+ If no base command is specified, "python" will be used by default.
374
+ """
375
+ base_command_list: list[str] = _extract_metadata("SCRIPT_BASE_COMMAND", script_path)
376
+ base_command = base_command_list[0] if base_command_list else "python"
377
+ return base_command
378
+
324
379
  @staticmethod
325
380
  def _write_metadata(info: dict[str, dict[str, str]]) -> None:
326
381
  """Writes the implementation metadata to disk.
@@ -341,10 +396,17 @@ class PythonRecipe:
341
396
  "python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899"
342
397
  )
343
398
 
344
- def __init__(self, script_path: Path, recipe_path: Path, requirements: str) -> None:
399
+ def __init__(
400
+ self,
401
+ script_path: Path,
402
+ recipe_path: Path,
403
+ requirements: str,
404
+ script_base_command: str,
405
+ ) -> None:
345
406
  self.script_path = script_path
346
407
  self.recipe_path = recipe_path
347
408
  self.requirements = requirements
409
+ self.script_base_command = script_base_command
348
410
  self.text: str | None = None
349
411
 
350
412
  def build(self) -> None:
@@ -373,7 +435,7 @@ From: {self.BASE_IMAGE}
373
435
  export LC_ALL=C
374
436
 
375
437
  %runscript
376
- python /{script_name} '$@'"""
438
+ {self.script_base_command} /{script_name} '$@'"""
377
439
 
378
440
  def write(self) -> None:
379
441
  """Writes the recipe to disk.
easylink/implementation.py CHANGED
@@ -16,9 +16,14 @@ from pathlib import Path
16
16
  from typing import TYPE_CHECKING
17
17
 
18
18
  from layered_config_tree import LayeredConfigTree
19
+ from loguru import logger
19
20
 
20
21
  from easylink.utilities import paths
21
- from easylink.utilities.data_utils import load_yaml
22
+ from easylink.utilities.data_utils import (
23
+ calculate_md5_checksum,
24
+ download_image,
25
+ load_yaml,
26
+ )
22
27
 
23
28
  if TYPE_CHECKING:
24
29
  from easylink.graph_components import InputSlot, OutputSlot
@@ -74,14 +79,14 @@ class Implementation:
74
79
  def __repr__(self) -> str:
75
80
  return f"Implementation.{self.name}"
76
81
 
77
- def validate(self) -> list[str]:
82
+ def validate(self, skip_image_validation: bool, images_dir: str | Path) -> list[str]:
78
83
  """Validates individual ``Implementation`` instances.
79
84
 
80
85
  Returns
81
86
  -------
82
87
  A list of logs containing any validation errors. Each item in the list
83
88
  is a distinct message about a particular validation error (e.g. if a
84
- required container does not exist).
89
+ required image does not exist).
85
90
 
86
91
  Notes
87
92
  -----
@@ -89,7 +94,8 @@ class Implementation:
89
94
  """
90
95
  logs = []
91
96
  logs = self._validate_expected_steps(logs)
92
- logs = self._validate_container_exists(logs)
97
+ if not skip_image_validation:
98
+ logs = self._download_and_validate_image(logs, images_dir)
93
99
  return logs
94
100
 
95
101
  ##################
@@ -110,11 +116,82 @@ class Implementation:
110
116
  )
111
117
  return logs
112
118
 
113
- def _validate_container_exists(self, logs: list[str]) -> list[str]:
114
- """Validates that the container to run exists."""
115
- err_str = f"Container '{self.singularity_image_path}' does not exist."
116
- if not Path(self.singularity_image_path).exists():
117
- logs.append(err_str)
119
+ def _download_and_validate_image(
120
+ self, logs: list[str], images_dir: str | Path
121
+ ) -> list[str]:
122
+ """Downloads the image if required and validates it exists.
123
+
124
+ If the image does not exist in the specified images directory, it will
125
+ attempt to download it.
126
+ """
127
+ # HACK: We manually create the image path here as well as later when writing
128
+ # each implementations Snakefile rule.
129
+ image_path = Path(images_dir) / self.singularity_image_name
130
+ expected_md5_checksum = self._metadata.get("md5_checksum", None)
131
+ record_id = self._metadata.get("zenodo_record_id", None)
132
+ if image_path.exists():
133
+ self._handle_conflicting_checksums(
134
+ logs, image_path, expected_md5_checksum, record_id
135
+ )
136
+ else:
137
+ if not record_id:
138
+ logs.append(
139
+ f"Image '{str(image_path)}' does not exist and no Zenodo record ID "
140
+ "is provided to download it."
141
+ )
142
+ if not expected_md5_checksum:
143
+ logs.append(
144
+ f"Image '{str(image_path)}' does not exist and no MD5 checksum "
145
+ "is provided to verify from the host."
146
+ )
147
+ if not record_id or not expected_md5_checksum:
148
+ return logs
149
+ download_image(
150
+ images_dir=images_dir,
151
+ record_id=record_id,
152
+ filename=self.singularity_image_name,
153
+ md5_checksum=expected_md5_checksum,
154
+ )
155
+ if not image_path.exists():
156
+ logs.append(
157
+ f"Image '{str(image_path)}' does not exist and could not be downloaded."
158
+ )
159
+ return logs
160
+
161
+ @staticmethod
162
+ def _handle_conflicting_checksums(
163
+ logs: list[str],
164
+ image_path: Path,
165
+ expected_md5_checksum: str | None,
166
+ record_id: str | None,
167
+ ) -> list[str]:
168
+ # TODO: Strengthen the following logic to better handle image updates.
169
+ # If using the default images directory and the image already exists
170
+ # but with a different checksum than in the implementation metadata,
171
+ # re-download.
172
+ calculated_md5_checksum = calculate_md5_checksum(image_path)
173
+ if (
174
+ image_path.parent == paths.DEFAULT_IMAGES_DIR
175
+ and expected_md5_checksum
176
+ and calculated_md5_checksum != expected_md5_checksum
177
+ ):
178
+ if not record_id:
179
+ logs.append(
180
+ f"Image '{str(image_path)}' exists but has a different MD5 checksum "
181
+ f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
182
+ "No Zenodo record ID is provided to re-download the image."
183
+ )
184
+ logger.info(
185
+ f"Image '{str(image_path)}' exists but has a different MD5 checksum "
186
+ f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
187
+ "Re-downloading the image."
188
+ )
189
+ download_image(
190
+ images_dir=image_path.parent,
191
+ record_id=record_id,
192
+ filename=image_path.name,
193
+ md5_checksum=expected_md5_checksum,
194
+ )
118
195
  return logs
119
196
 
120
197
  def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]:
@@ -124,9 +201,9 @@ class Implementation:
124
201
  return env_vars
125
202
 
126
203
  @property
127
- def singularity_image_path(self) -> str:
204
+ def singularity_image_name(self) -> str:
128
205
  """The path to the required Singularity image."""
129
- return self._metadata["image_path"]
206
+ return self._metadata["image_name"]
130
207
 
131
208
  @property
132
209
  def script_cmd(self) -> str: