easylink 0.1.18__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +15 -3
- easylink/configuration.py +25 -2
- easylink/devtools/implementation_creator.py +58 -11
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -26
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema_constants/__init__.py +2 -2
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +7 -1
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.18.dist-info/RECORD +0 -55
- {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.18.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.18"
|
1
|
+
__version__ = "0.1.19"
|
easylink/cli.py
CHANGED
@@ -55,7 +55,7 @@ from easylink.utilities.general_utils import (
|
|
55
55
|
configure_logging_to_terminal,
|
56
56
|
handle_exceptions,
|
57
57
|
)
|
58
|
-
from easylink.utilities.paths import
|
58
|
+
from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR
|
59
59
|
|
60
60
|
SHARED_OPTIONS = [
|
61
61
|
click.option(
|
@@ -154,6 +154,16 @@ def easylink():
|
|
154
154
|
|
155
155
|
@easylink.command()
|
156
156
|
@_pass_shared_options
|
157
|
+
@click.option(
|
158
|
+
"-I",
|
159
|
+
"--images",
|
160
|
+
hidden=True,
|
161
|
+
type=click.Path(exists=False, file_okay=False, resolve_path=True),
|
162
|
+
help=(
|
163
|
+
"The directory containing the images to run. If no value is passed, a new "
|
164
|
+
f"directory will be created at the home directory: {DEFAULT_IMAGES_DIR}."
|
165
|
+
),
|
166
|
+
)
|
157
167
|
@click.option(
|
158
168
|
"-e",
|
159
169
|
"--computing-environment",
|
@@ -171,6 +181,7 @@ def run(
|
|
171
181
|
output_dir: str | None,
|
172
182
|
no_timestamp: bool,
|
173
183
|
schema: str,
|
184
|
+
images: str,
|
174
185
|
computing_environment: str | None,
|
175
186
|
verbose: int,
|
176
187
|
with_debugger: bool,
|
@@ -196,6 +207,7 @@ def run(
|
|
196
207
|
input_data=input_data,
|
197
208
|
computing_environment=computing_environment,
|
198
209
|
results_dir=results_dir,
|
210
|
+
images_dir=images,
|
199
211
|
schema_name=schema,
|
200
212
|
)
|
201
213
|
logger.info("*** FINISHED ***")
|
@@ -263,7 +275,7 @@ easylink.add_command(devtools)
|
|
263
275
|
type=click.Path(exists=False, dir_okay=True, file_okay=False, resolve_path=True),
|
264
276
|
help=(
|
265
277
|
"The directory to move the container to. If no value is passed, it will "
|
266
|
-
f"be moved to {
|
278
|
+
f"be moved to {DEV_IMAGES_DIR} in a sub-directory named with the username."
|
267
279
|
),
|
268
280
|
)
|
269
281
|
def create_implementation(
|
@@ -300,7 +312,7 @@ def create_implementation(
|
|
300
312
|
if not scripts:
|
301
313
|
logger.error("No scripts provided.")
|
302
314
|
return
|
303
|
-
output_dir = Path(output_dir) if output_dir else Path(f"{
|
315
|
+
output_dir = Path(output_dir) if output_dir else Path(f"{DEV_IMAGES_DIR}/{os.getlogin()}")
|
304
316
|
if not output_dir.exists():
|
305
317
|
# make the directory with rwxrwxr-x permissions
|
306
318
|
output_dir.mkdir(parents=True, mode=0o775)
|
easylink/configuration.py
CHANGED
@@ -17,6 +17,7 @@ from layered_config_tree import LayeredConfigTree
|
|
17
17
|
from easylink.pipeline_schema import PipelineSchema
|
18
18
|
from easylink.utilities.data_utils import load_yaml
|
19
19
|
from easylink.utilities.general_utils import exit_with_validation_error
|
20
|
+
from easylink.utilities.paths import DEFAULT_IMAGES_DIR
|
20
21
|
|
21
22
|
PIPELINE_ERRORS_KEY = "PIPELINE ERRORS"
|
22
23
|
INPUT_DATA_ERRORS_KEY = "INPUT DATA ERRORS"
|
@@ -66,9 +67,14 @@ class Config(LayeredConfigTree):
|
|
66
67
|
config_params
|
67
68
|
A dictionary of all specifications required to run the pipeline. This
|
68
69
|
includes the pipeline, input data, and computing environment specifications,
|
69
|
-
as well as the results directory.
|
70
|
+
as well as the results directory and images directory.
|
70
71
|
schema_name
|
71
72
|
The name of the schema to validate the pipeline configuration against.
|
73
|
+
images_dir
|
74
|
+
The directory containing the images or to download the images to if they
|
75
|
+
don't exist. If None, will default to the :data:`~easylink.utilities.paths.DEFAULT_IMAGES_DIR`.
|
76
|
+
command
|
77
|
+
The EasyLink command being run.
|
72
78
|
|
73
79
|
Attributes
|
74
80
|
----------
|
@@ -82,6 +88,11 @@ class Config(LayeredConfigTree):
|
|
82
88
|
The input data filepaths.
|
83
89
|
schema
|
84
90
|
The :class:`~easylink.pipeline_schema.PipelineSchema`.
|
91
|
+
images_dir
|
92
|
+
The directory containing the images or to download the images to if they
|
93
|
+
don't exist. If None, will default to ~/.easylink_images.
|
94
|
+
command
|
95
|
+
The EasyLink command being run.
|
85
96
|
|
86
97
|
"""
|
87
98
|
|
@@ -89,6 +100,8 @@ class Config(LayeredConfigTree):
|
|
89
100
|
self,
|
90
101
|
config_params: dict[str, Any],
|
91
102
|
schema_name: str = "main",
|
103
|
+
images_dir: str | Path | None = None,
|
104
|
+
command: str = "run",
|
92
105
|
) -> None:
|
93
106
|
super().__init__(layers=["initial_data", "default", "user_configured"])
|
94
107
|
self.update(DEFAULT_ENVIRONMENT, layer="default")
|
@@ -101,6 +114,14 @@ class Config(LayeredConfigTree):
|
|
101
114
|
self.update({"environment": {"slurm": {}}}, layer="default")
|
102
115
|
self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
|
103
116
|
self.schema.configure_pipeline(self.pipeline, self.input_data)
|
117
|
+
# use the images_dir if provided, otherwise use default
|
118
|
+
self.update(
|
119
|
+
{
|
120
|
+
"images_dir": Path(images_dir) if images_dir else DEFAULT_IMAGES_DIR,
|
121
|
+
},
|
122
|
+
layer="user_configured",
|
123
|
+
)
|
124
|
+
self.update({"command": command}, layer="user_configured")
|
104
125
|
self._validate()
|
105
126
|
self.freeze()
|
106
127
|
|
@@ -303,7 +324,9 @@ def _load_input_data_paths(
|
|
303
324
|
f"Input was: '{input_data_paths}'"
|
304
325
|
)
|
305
326
|
filepath_dict = {
|
306
|
-
|
327
|
+
# Resolve paths relative to location of the YAML file
|
328
|
+
filename: (Path(input_data_specification_path).parent / Path(filepath)).resolve()
|
329
|
+
for filename, filepath in input_data_paths.items()
|
307
330
|
}
|
308
331
|
return filepath_dict
|
309
332
|
|
@@ -69,8 +69,6 @@ class ImplementationCreator:
|
|
69
69
|
for the container.
|
70
70
|
implementation_name
|
71
71
|
The name of the implementation. It is by definition the name of the script.
|
72
|
-
requirements
|
73
|
-
The install requirements for the implementation (if any).
|
74
72
|
step
|
75
73
|
The name of the step that this implementation implements.
|
76
74
|
output_slot
|
@@ -93,20 +91,30 @@ class ImplementationCreator:
|
|
93
91
|
for the container."""
|
94
92
|
self.implementation_name = script_path.stem
|
95
93
|
"""The name of the implementation. It is by definition the name of the script."""
|
96
|
-
self.requirements = self._extract_requirements(script_path)
|
97
|
-
"""The install requirements for the implementation (if any)."""
|
98
94
|
self.step = self._extract_implemented_step(script_path)
|
99
95
|
"""The name of the step that this implementation implements."""
|
96
|
+
self.has_custom_recipe = self._extract_has_custom_recipe(script_path)
|
97
|
+
"""Whether the user has already written the recipe for this implementation."""
|
98
|
+
self.script_base_command = self._extract_script_base_command(script_path)
|
99
|
+
"""The base command to use to run the script in this implementation."""
|
100
100
|
self.output_slot = self._extract_output_slot(script_path, self.step)
|
101
101
|
"""The name of the output slot that this implementation sends results to."""
|
102
102
|
|
103
103
|
def create_recipe(self) -> None:
|
104
104
|
"""Builds the singularity recipe and writes it to disk."""
|
105
|
-
|
106
|
-
|
105
|
+
if self.has_custom_recipe:
|
106
|
+
if not self.recipe_path.exists():
|
107
|
+
raise ValueError(f"Could not find a custom recipe at {self.recipe_path}.")
|
108
|
+
return
|
109
|
+
|
110
|
+
recipe = PythonRecipe(
|
111
|
+
self.script_path,
|
112
|
+
self.recipe_path,
|
113
|
+
ImplementationCreator._extract_requirements(self.script_path),
|
114
|
+
self.script_base_command,
|
115
|
+
)
|
107
116
|
recipe.build()
|
108
117
|
recipe.write()
|
109
|
-
pass
|
110
118
|
|
111
119
|
def build_container(self) -> None:
|
112
120
|
"""Builds the container from the recipe.
|
@@ -190,7 +198,7 @@ class ImplementationCreator:
|
|
190
198
|
info[self.implementation_name] = {
|
191
199
|
"steps": [self.step],
|
192
200
|
"image_path": str(self.hosted_container_path),
|
193
|
-
"script_cmd": f"
|
201
|
+
"script_cmd": f"{self.script_base_command} /{self.script_path.name}",
|
194
202
|
"outputs": {
|
195
203
|
self.output_slot: "result.parquet",
|
196
204
|
},
|
@@ -241,6 +249,22 @@ class ImplementationCreator:
|
|
241
249
|
)
|
242
250
|
return steps[0]
|
243
251
|
|
252
|
+
@staticmethod
|
253
|
+
def _extract_has_custom_recipe(script_path: Path) -> bool:
|
254
|
+
"""Extracts whether the user has already written the recipe for this implementation.
|
255
|
+
|
256
|
+
The expectation is that this flag is specified within the script
|
257
|
+
as a comment of the format:
|
258
|
+
|
259
|
+
.. code-block:: python
|
260
|
+
# HAS_CUSTOM_RECIPE: true
|
261
|
+
"""
|
262
|
+
has_custom_recipe = _extract_metadata("HAS_CUSTOM_RECIPE", script_path)
|
263
|
+
if len(has_custom_recipe) == 0:
|
264
|
+
return False
|
265
|
+
else:
|
266
|
+
return str(has_custom_recipe[0]).strip().lower() in ["true", "yes"]
|
267
|
+
|
244
268
|
@staticmethod
|
245
269
|
def _extract_output_slot(script_path: Path, step_name: str) -> str:
|
246
270
|
"""Extracts the name of the output slot that this script is implementing."""
|
@@ -307,7 +331,7 @@ class ImplementationCreator:
|
|
307
331
|
def _extract_pipeline_schema_name(script_path: Path) -> str:
|
308
332
|
"""Extracts the relevant pipeline schema name.
|
309
333
|
|
310
|
-
The expectation is that the
|
334
|
+
The expectation is that the pipeline schema's name is specified within the script
|
311
335
|
as a comment of the format:
|
312
336
|
|
313
337
|
.. code-block:: python
|
@@ -321,6 +345,22 @@ class ImplementationCreator:
|
|
321
345
|
raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
|
322
346
|
return schema_name
|
323
347
|
|
348
|
+
@staticmethod
|
349
|
+
def _extract_script_base_command(script_path: Path) -> str:
|
350
|
+
"""Extracts the base command to be used to run the script.
|
351
|
+
|
352
|
+
The expectation is that the base command is specified within the script
|
353
|
+
as a comment of the format:
|
354
|
+
|
355
|
+
.. code-block:: python
|
356
|
+
# SCRIPT_BASE_COMMAND: python
|
357
|
+
|
358
|
+
If no base command is specified, "python" will be used by default.
|
359
|
+
"""
|
360
|
+
base_command_list: list[str] = _extract_metadata("SCRIPT_BASE_COMMAND", script_path)
|
361
|
+
base_command = base_command_list[0] if base_command_list else "python"
|
362
|
+
return base_command
|
363
|
+
|
324
364
|
@staticmethod
|
325
365
|
def _write_metadata(info: dict[str, dict[str, str]]) -> None:
|
326
366
|
"""Writes the implementation metadata to disk.
|
@@ -341,10 +381,17 @@ class PythonRecipe:
|
|
341
381
|
"python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899"
|
342
382
|
)
|
343
383
|
|
344
|
-
def __init__(
|
384
|
+
def __init__(
|
385
|
+
self,
|
386
|
+
script_path: Path,
|
387
|
+
recipe_path: Path,
|
388
|
+
requirements: str,
|
389
|
+
script_base_command: str,
|
390
|
+
) -> None:
|
345
391
|
self.script_path = script_path
|
346
392
|
self.recipe_path = recipe_path
|
347
393
|
self.requirements = requirements
|
394
|
+
self.script_base_command = script_base_command
|
348
395
|
self.text: str | None = None
|
349
396
|
|
350
397
|
def build(self) -> None:
|
@@ -373,7 +420,7 @@ From: {self.BASE_IMAGE}
|
|
373
420
|
export LC_ALL=C
|
374
421
|
|
375
422
|
%runscript
|
376
|
-
|
423
|
+
{self.script_base_command} /{script_name} '$@'"""
|
377
424
|
|
378
425
|
def write(self) -> None:
|
379
426
|
"""Writes the recipe to disk.
|
easylink/implementation.py
CHANGED
@@ -16,9 +16,14 @@ from pathlib import Path
|
|
16
16
|
from typing import TYPE_CHECKING
|
17
17
|
|
18
18
|
from layered_config_tree import LayeredConfigTree
|
19
|
+
from loguru import logger
|
19
20
|
|
20
21
|
from easylink.utilities import paths
|
21
|
-
from easylink.utilities.data_utils import
|
22
|
+
from easylink.utilities.data_utils import (
|
23
|
+
calculate_md5_checksum,
|
24
|
+
download_image,
|
25
|
+
load_yaml,
|
26
|
+
)
|
22
27
|
|
23
28
|
if TYPE_CHECKING:
|
24
29
|
from easylink.graph_components import InputSlot, OutputSlot
|
@@ -74,14 +79,14 @@ class Implementation:
|
|
74
79
|
def __repr__(self) -> str:
|
75
80
|
return f"Implementation.{self.name}"
|
76
81
|
|
77
|
-
def validate(self) -> list[str]:
|
82
|
+
def validate(self, skip_image_validation: bool, images_dir: str | Path) -> list[str]:
|
78
83
|
"""Validates individual ``Implementation`` instances.
|
79
84
|
|
80
85
|
Returns
|
81
86
|
-------
|
82
87
|
A list of logs containing any validation errors. Each item in the list
|
83
88
|
is a distinct message about a particular validation error (e.g. if a
|
84
|
-
required
|
89
|
+
required image does not exist).
|
85
90
|
|
86
91
|
Notes
|
87
92
|
-----
|
@@ -89,7 +94,8 @@ class Implementation:
|
|
89
94
|
"""
|
90
95
|
logs = []
|
91
96
|
logs = self._validate_expected_steps(logs)
|
92
|
-
|
97
|
+
if not skip_image_validation:
|
98
|
+
logs = self._download_and_validate_image(logs, images_dir)
|
93
99
|
return logs
|
94
100
|
|
95
101
|
##################
|
@@ -110,11 +116,82 @@ class Implementation:
|
|
110
116
|
)
|
111
117
|
return logs
|
112
118
|
|
113
|
-
def
|
114
|
-
|
115
|
-
|
116
|
-
if
|
117
|
-
|
119
|
+
def _download_and_validate_image(
|
120
|
+
self, logs: list[str], images_dir: str | Path
|
121
|
+
) -> list[str]:
|
122
|
+
"""Downloads the image if required and validates it exists.
|
123
|
+
|
124
|
+
If the image does not exist in the specified images directory, it will
|
125
|
+
attempt to download it.
|
126
|
+
"""
|
127
|
+
# HACK: We manually create the image path here as well as later when writing
|
128
|
+
each implementation's Snakefile rule.
|
129
|
+
image_path = Path(images_dir) / self.singularity_image_name
|
130
|
+
expected_md5_checksum = self._metadata.get("md5_checksum", None)
|
131
|
+
record_id = self._metadata.get("zenodo_record_id", None)
|
132
|
+
if image_path.exists():
|
133
|
+
self._handle_conflicting_checksums(
|
134
|
+
logs, image_path, expected_md5_checksum, record_id
|
135
|
+
)
|
136
|
+
else:
|
137
|
+
if not record_id:
|
138
|
+
logs.append(
|
139
|
+
f"Image '{str(image_path)}' does not exist and no Zenodo record ID "
|
140
|
+
"is provided to download it."
|
141
|
+
)
|
142
|
+
if not expected_md5_checksum:
|
143
|
+
logs.append(
|
144
|
+
f"Image '{str(image_path)}' does not exist and no MD5 checksum "
|
145
|
+
"is provided to verify from the host."
|
146
|
+
)
|
147
|
+
if not record_id or not expected_md5_checksum:
|
148
|
+
return logs
|
149
|
+
download_image(
|
150
|
+
images_dir=images_dir,
|
151
|
+
record_id=record_id,
|
152
|
+
filename=self.singularity_image_name,
|
153
|
+
md5_checksum=expected_md5_checksum,
|
154
|
+
)
|
155
|
+
if not image_path.exists():
|
156
|
+
logs.append(
|
157
|
+
f"Image '{str(image_path)}' does not exist and could not be downloaded."
|
158
|
+
)
|
159
|
+
return logs
|
160
|
+
|
161
|
+
@staticmethod
|
162
|
+
def _handle_conflicting_checksums(
|
163
|
+
logs: list[str],
|
164
|
+
image_path: Path,
|
165
|
+
expected_md5_checksum: str | None,
|
166
|
+
record_id: str | None,
|
167
|
+
) -> list[str]:
|
168
|
+
# TODO: Strengthen the following logic to better handle image updates.
|
169
|
+
# If using the default images directory and the image already exists
|
170
|
+
# but with a different checksum than in the implementation metadata,
|
171
|
+
# re-download.
|
172
|
+
calculated_md5_checksum = calculate_md5_checksum(image_path)
|
173
|
+
if (
|
174
|
+
image_path.parent == paths.DEFAULT_IMAGES_DIR
|
175
|
+
and expected_md5_checksum
|
176
|
+
and calculated_md5_checksum != expected_md5_checksum
|
177
|
+
):
|
178
|
+
if not record_id:
|
179
|
+
logs.append(
|
180
|
+
f"Image '{str(image_path)}' exists but has a different MD5 checksum "
|
181
|
+
f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
|
182
|
+
"No Zenodo record ID is provided to re-download the image."
|
183
|
+
)
|
184
|
+
logger.info(
|
185
|
+
f"Image '{str(image_path)}' exists but has a different MD5 checksum "
|
186
|
+
f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
|
187
|
+
"Re-downloading the image."
|
188
|
+
)
|
189
|
+
download_image(
|
190
|
+
images_dir=image_path.parent,
|
191
|
+
record_id=record_id,
|
192
|
+
filename=image_path.name,
|
193
|
+
md5_checksum=expected_md5_checksum,
|
194
|
+
)
|
118
195
|
return logs
|
119
196
|
|
120
197
|
def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]:
|
@@ -124,9 +201,9 @@ class Implementation:
|
|
124
201
|
return env_vars
|
125
202
|
|
126
203
|
@property
|
127
|
-
def
|
204
|
+
def singularity_image_name(self) -> str:
|
128
205
|
"""The path to the required Singularity image."""
|
129
|
-
return self._metadata["
|
206
|
+
return self._metadata["image_name"]
|
130
207
|
|
131
208
|
@property
|
132
209
|
def script_cmd(self) -> str:
|