easylink 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/graph_components.py +7 -3
- easylink/pipeline_schema.py +7 -7
- easylink/pipeline_schema_constants/__init__.py +11 -0
- easylink/pipeline_schema_constants/development.py +143 -135
- easylink/pipeline_schema_constants/testing.py +7 -3
- easylink/step.py +391 -353
- easylink/utilities/__init__.py +3 -2
- easylink/utilities/aggregator_utils.py +1 -0
- easylink/utilities/data_utils.py +98 -5
- easylink/utilities/general_utils.py +48 -10
- easylink/utilities/paths.py +9 -3
- easylink/utilities/splitter_utils.py +1 -0
- easylink/utilities/validation_utils.py +29 -0
- {easylink-0.1.7.dist-info → easylink-0.1.9.dist-info}/METADATA +1 -1
- {easylink-0.1.7.dist-info → easylink-0.1.9.dist-info}/RECORD +19 -19
- {easylink-0.1.7.dist-info → easylink-0.1.9.dist-info}/WHEEL +1 -1
- {easylink-0.1.7.dist-info → easylink-0.1.9.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.7.dist-info → easylink-0.1.9.dist-info}/top_level.txt +0 -0
easylink/utilities/__init__.py
CHANGED
easylink/utilities/data_utils.py
CHANGED
@@ -1,14 +1,43 @@
|
|
1
1
|
# mypy: ignore-errors
|
2
|
+
"""
|
3
|
+
==============
|
4
|
+
Data Utilities
|
5
|
+
==============
|
6
|
+
|
7
|
+
This module contains utility functions for handling data files and directories.
|
8
|
+
|
9
|
+
"""
|
10
|
+
|
2
11
|
import os
|
3
12
|
import shutil
|
13
|
+
from collections.abc import Callable
|
4
14
|
from datetime import datetime
|
5
15
|
from pathlib import Path
|
6
16
|
|
7
17
|
import yaml
|
8
18
|
|
9
19
|
|
10
|
-
def modify_umask(func):
|
11
|
-
"""
|
20
|
+
def modify_umask(func: Callable) -> Callable:
|
21
|
+
"""Decorates a function to modify a process's umask temporarily before calling the function.
|
22
|
+
|
23
|
+
This decorator sets the umask to 0o002, which grants write permission to the
|
24
|
+
group while preserving the umask settings for the owner and others. It ensures
|
25
|
+
that any file or directory created by the decorated function has group write
|
26
|
+
permissions. After the function executes, the decorator restores the original
|
27
|
+
umask.
|
28
|
+
|
29
|
+
Parameters
|
30
|
+
----------
|
31
|
+
func
|
32
|
+
The function to be decorated. It can be any callable that might create files
|
33
|
+
or directories during its execution.
|
34
|
+
|
35
|
+
Returns
|
36
|
+
-------
|
37
|
+
A wrapper function that, when called, modifies the umask, calls the original
|
38
|
+
function with the provided arguments, and finally restores the umask to its
|
39
|
+
original value.
|
40
|
+
"""
|
12
41
|
|
13
42
|
def wrapper(*args, **kwargs):
|
14
43
|
old_umask = os.umask(0o002)
|
@@ -22,11 +51,28 @@ def modify_umask(func):
|
|
22
51
|
|
23
52
|
@modify_umask
|
24
53
|
def create_results_directory(results_dir: Path) -> None:
|
54
|
+
"""Creates a results directory.
|
55
|
+
|
56
|
+
This creates the high-level results directory to be used for storing results
|
57
|
+
(including any missing sub-directories).
|
58
|
+
|
59
|
+
Parameters
|
60
|
+
----------
|
61
|
+
results_dir
|
62
|
+
The directory to be created.
|
63
|
+
"""
|
25
64
|
results_dir.mkdir(parents=True, exist_ok=True)
|
26
65
|
|
27
66
|
|
28
67
|
@modify_umask
|
29
68
|
def create_results_intermediates(results_dir: Path) -> None:
|
69
|
+
"""Creates required sub-directories within a given run's results directory.
|
70
|
+
|
71
|
+
Parameters
|
72
|
+
----------
|
73
|
+
results_dir
|
74
|
+
The results directory for the current run.
|
75
|
+
"""
|
30
76
|
(results_dir / "intermediate").mkdir(exist_ok=True)
|
31
77
|
(results_dir / "diagnostics").mkdir(exist_ok=True)
|
32
78
|
|
@@ -37,6 +83,21 @@ def copy_configuration_files_to_results_directory(
|
|
37
83
|
computing_environment: Path | None,
|
38
84
|
results_dir: Path,
|
39
85
|
) -> None:
|
86
|
+
"""Copies all configuration files into the results directory.
|
87
|
+
|
88
|
+
Parameters
|
89
|
+
----------
|
90
|
+
pipeline_specification
|
91
|
+
The filepath to the pipeline specification file.
|
92
|
+
input_data
|
93
|
+
The filepath to the input data specification file (_not_ the paths to the
|
94
|
+
input data themselves).
|
95
|
+
computing_environment
|
96
|
+
The filepath to the specification file defining the computing environment
|
97
|
+
to run the pipeline on.
|
98
|
+
results_dir
|
99
|
+
The directory to write results and incidental files (logs, etc.) to.
|
100
|
+
"""
|
40
101
|
shutil.copy(pipeline_specification, results_dir)
|
41
102
|
shutil.copy(input_data, results_dir)
|
42
103
|
if computing_environment:
|
@@ -44,18 +105,50 @@ def copy_configuration_files_to_results_directory(
|
|
44
105
|
|
45
106
|
|
46
107
|
def get_results_directory(output_dir: str | None, no_timestamp: bool) -> Path:
|
108
|
+
"""Determines the results directory path.
|
109
|
+
|
110
|
+
This function determines the filepath for storing results by (optionally) appending
|
111
|
+
a timestamp to the specified output directory. If no output directory is provided,
|
112
|
+
it defaults to a directory named 'results' in the current working directory.
|
113
|
+
|
114
|
+
Parameters
|
115
|
+
----------
|
116
|
+
output_dir
|
117
|
+
The directory to write results and incidental files (logs, etc.) to. If no
|
118
|
+
value is provided, results will be written to a 'results/' directory in the
|
119
|
+
current working directory.
|
120
|
+
no_timestamp
|
121
|
+
Whether or not to save the results in a timestamped sub-directory.
|
122
|
+
|
123
|
+
Returns
|
124
|
+
-------
|
125
|
+
The fully resolved path to the results directory.
|
126
|
+
"""
|
47
127
|
results_dir = Path("results" if output_dir is None else output_dir).resolve()
|
48
128
|
if not no_timestamp:
|
49
|
-
|
50
|
-
results_dir = results_dir / launch_time
|
129
|
+
results_dir = results_dir / _get_timestamp()
|
51
130
|
return results_dir
|
52
131
|
|
53
132
|
|
54
|
-
def _get_timestamp():
|
133
|
+
def _get_timestamp() -> str:
|
55
134
|
return datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
|
56
135
|
|
57
136
|
|
58
137
|
def load_yaml(filepath: str | Path) -> dict:
|
138
|
+
"""Loads and returns the contents of a YAML file.
|
139
|
+
|
140
|
+
This function uses `yaml.safe_load` to parse the YAML file, which is designed
|
141
|
+
to safely load a subset of YAML without executing arbitrary code.
|
142
|
+
|
143
|
+
Parameters
|
144
|
+
----------
|
145
|
+
filepath
|
146
|
+
The path to the YAML file to be loaded.
|
147
|
+
|
148
|
+
Returns
|
149
|
+
-------
|
150
|
+
The contents of the YAML file.
|
151
|
+
"""
|
59
152
|
with open(filepath, "r") as file:
|
60
153
|
data = yaml.safe_load(file)
|
61
154
|
return data
|
@@ -1,4 +1,13 @@
|
|
1
1
|
# mypy: ignore-errors
|
2
|
+
"""
|
3
|
+
=================
|
4
|
+
General Utilities
|
5
|
+
=================
|
6
|
+
|
7
|
+
This module contains various utility functions.
|
8
|
+
|
9
|
+
"""
|
10
|
+
|
2
11
|
import errno
|
3
12
|
import functools
|
4
13
|
import shutil
|
@@ -14,7 +23,26 @@ from loguru import logger
|
|
14
23
|
def handle_exceptions(
|
15
24
|
func: Callable, exceptions_logger: Any, with_debugger: bool
|
16
25
|
) -> Callable:
|
17
|
-
"""
|
26
|
+
"""Wraps a function to handle exceptions by logging and optionally dropping into a debugger.
|
27
|
+
|
28
|
+
Parameters
|
29
|
+
----------
|
30
|
+
func
|
31
|
+
The wrapped function that is executed and monitored for exceptions.
|
32
|
+
exceptions_logger
|
33
|
+
The logging object used to log exceptions that occur during function execution.
|
34
|
+
with_debugger
|
35
|
+
Whether or not to drop into an interactive debugger upon encountering an exception.
|
36
|
+
|
37
|
+
Returns
|
38
|
+
-------
|
39
|
+
A wrapped version of `func` that includes the exception handling logic.
|
40
|
+
|
41
|
+
Notes
|
42
|
+
-----
|
43
|
+
Exceptions `BdbQuit` and `KeyboardInterrupt` are re-raised _without_ logging
|
44
|
+
to allow for normal debugger and program exit behaviors.
|
45
|
+
"""
|
18
46
|
|
19
47
|
@functools.wraps(func)
|
20
48
|
def wrapped(*args, **kwargs):
|
@@ -35,13 +63,18 @@ def handle_exceptions(
|
|
35
63
|
return wrapped
|
36
64
|
|
37
65
|
|
38
|
-
def configure_logging_to_terminal(verbose: int):
|
39
|
-
"""
|
66
|
+
def configure_logging_to_terminal(verbose: int) -> None:
|
67
|
+
"""Configures logging output to the terminal with optional verbosity levels.
|
40
68
|
|
41
69
|
Parameters
|
42
70
|
----------
|
43
71
|
verbose
|
44
|
-
|
72
|
+
An integer indicating the verbosity level of the logging output. Higher
|
73
|
+
values produce more detailed logging information.
|
74
|
+
|
75
|
+
Notes
|
76
|
+
-----
|
77
|
+
This function clears any default logging configuration before applying the new settings.
|
45
78
|
"""
|
46
79
|
logger.remove(0) # Clear default configuration
|
47
80
|
_add_logging_sink(sys.stdout, verbose, colorize=True)
|
@@ -49,21 +82,20 @@ def configure_logging_to_terminal(verbose: int):
|
|
49
82
|
|
50
83
|
def _add_logging_sink(
|
51
84
|
sink: TextIO, verbose: int, colorize: bool = False, serialize: bool = False
|
52
|
-
):
|
85
|
+
) -> None:
|
53
86
|
"""Adds a logging sink to the global process logger.
|
54
87
|
|
55
88
|
Parameters
|
56
89
|
----------
|
57
90
|
sink
|
58
|
-
|
91
|
+
The output stream to which log messages will be directed, e.g. ``sys.stdout``.
|
59
92
|
verbose
|
60
|
-
Verbosity of the logger.
|
93
|
+
Verbosity of the logger. The log level is set to INFO if 0 and DEBUG otherwise.
|
61
94
|
colorize
|
62
95
|
Whether to use the colorization options from :mod:`loguru`.
|
63
96
|
serialize
|
64
97
|
Whether the logs should be converted to JSON before they're dumped
|
65
98
|
to the logging sink.
|
66
|
-
|
67
99
|
"""
|
68
100
|
message_format = (
|
69
101
|
"<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <green>{elapsed}</green> | "
|
@@ -104,7 +136,6 @@ def exit_with_validation_error(error_msg: dict) -> None:
|
|
104
136
|
SystemExit
|
105
137
|
Exits the program with an EINVAL (invalid argument) code due to
|
106
138
|
previously-determined validation errors.
|
107
|
-
|
108
139
|
"""
|
109
140
|
|
110
141
|
logger.error(
|
@@ -118,5 +149,12 @@ def exit_with_validation_error(error_msg: dict) -> None:
|
|
118
149
|
|
119
150
|
|
120
151
|
def is_on_slurm() -> bool:
|
121
|
-
"""Returns True if the current environment is a SLURM cluster.
|
152
|
+
"""Returns True if the current environment is a SLURM cluster.
|
153
|
+
|
154
|
+
Notes
|
155
|
+
-----
|
156
|
+
This function simply checks for the presence of the `sbatch` command to _infer_
|
157
|
+
if SLURM is installed. It does _not_ check if SLURM is currently active or
|
158
|
+
managing jobs.
|
159
|
+
"""
|
122
160
|
return shutil.which("sbatch") is not None
|
easylink/utilities/paths.py
CHANGED
@@ -2,15 +2,21 @@
|
|
2
2
|
=========
|
3
3
|
Filepaths
|
4
4
|
=========
|
5
|
+
|
6
|
+
This module contains commonly-used filepaths and directories.
|
7
|
+
|
5
8
|
"""
|
6
9
|
|
7
10
|
from pathlib import Path
|
8
11
|
|
9
12
|
# TODO: We'll need to update this to be more generic for external users and have a way of configuring this
|
10
13
|
CONTAINER_DIR = "/mnt/team/simulation_science/priv/engineering/er_ecosystem/images"
|
14
|
+
"""Path to the directory where the container images are stored."""
|
11
15
|
IMPLEMENTATION_METADATA = Path(__file__).parent.parent / "implementation_metadata.yaml"
|
12
|
-
|
13
|
-
# For now, put slurm in /tmp to avoid creating a subdir with a prolog script
|
16
|
+
"""Path to the implementation metadata file."""
|
14
17
|
EASYLINK_TEMP = {"local": Path("/tmp/easylink"), "slurm": Path("/tmp")}
|
15
|
-
|
18
|
+
"""Paths to the easylink tmp/ directory to get bound to the container's /tmp directory.
|
19
|
+
When running on slurm, we bind /tmp (rather than /tmp/easylink) to avoid creating
|
20
|
+
a subdir with a prolog script"""
|
16
21
|
SPARK_SNAKEFILE = Path(__file__).parent / "spark.smk"
|
22
|
+
"""Path to the Snakemake snakefile containing spark-specific rules."""
|
@@ -1,3 +1,13 @@
|
|
1
|
+
"""
|
2
|
+
=========================
|
3
|
+
Data Validation Utilities
|
4
|
+
=========================
|
5
|
+
|
6
|
+
This module contains utility functions for validating datasets, e.g. the validation
|
7
|
+
function(s) for processed data being passed out of one pipeline step and into the next.
|
8
|
+
|
9
|
+
"""
|
10
|
+
|
1
11
|
from pathlib import Path
|
2
12
|
|
3
13
|
import pandas as pd
|
@@ -5,6 +15,25 @@ from pyarrow import parquet as pq
|
|
5
15
|
|
6
16
|
|
7
17
|
def validate_input_file_dummy(filepath: str) -> None:
|
18
|
+
"""Validates an input file to a dummy :class:`~easylink.step.Step`.
|
19
|
+
|
20
|
+
This function is intended to be used as the :attr:`~easylink.graph_components.InputSlot.validator`
|
21
|
+
for _all_ input data at every step in the dummy/:mod:`easylink.pipeline_schema_constants.development`
|
22
|
+
pipeline schema. It simply checks for supported file types as well as the presence
|
23
|
+
of required columns.
|
24
|
+
|
25
|
+
Parameters
|
26
|
+
----------
|
27
|
+
filepath
|
28
|
+
The path to the input data file to be validated.
|
29
|
+
|
30
|
+
Raises
|
31
|
+
------
|
32
|
+
NotImplementedError
|
33
|
+
If the file type is not supported.
|
34
|
+
LookupError
|
35
|
+
If the file is missing required columns.
|
36
|
+
"""
|
8
37
|
extension = Path(filepath).suffix
|
9
38
|
if extension == ".parquet":
|
10
39
|
output_columns = set(pq.ParquetFile(filepath).schema.names)
|
@@ -1,22 +1,22 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
3
|
+
easylink/_version.py,sha256=XIaxbMbyiP-L3kguR1GhxirFblTXiHR1lMfDVITvHUI,22
|
4
4
|
easylink/cli.py,sha256=ARSKAljepNOEYd1VCS_QqBJQIBLzE3IgKiOb5-OROdY,6380
|
5
5
|
easylink/configuration.py,sha256=Ire2pMZNZ6wtSwhcWnQpYa-snX4KrhXgovlQwQ2Wxf4,12530
|
6
|
-
easylink/graph_components.py,sha256=
|
6
|
+
easylink/graph_components.py,sha256=6OipaUkCW2ESBW6bxwZVgpRAX8RuL15m4x_mGE7i4R8,12669
|
7
7
|
easylink/implementation.py,sha256=AwGl5YCKCSQo91owWj-gg9_5lBz7H_4q2z7jF0BhXs4,8992
|
8
8
|
easylink/implementation_metadata.yaml,sha256=VvlEu3Dvlmeh1MpzeYx91j22GiV-9mu3hZP5yVuW04o,6763
|
9
9
|
easylink/pipeline.py,sha256=EyCXv5p9WzTqcndXK6ukBJE6jY_fWIP_DGZQUl1wRcY,12284
|
10
10
|
easylink/pipeline_graph.py,sha256=vsY6nW_iEwZCNf_N_3CsixsKBUy_5JxGEi61-1Q-KAw,22842
|
11
|
-
easylink/pipeline_schema.py,sha256=
|
11
|
+
easylink/pipeline_schema.py,sha256=kINpvy2Fl2S3FBqgdgZCCFHEk237_36X4ltLOtk5-dE,5862
|
12
12
|
easylink/rule.py,sha256=W97LMI-vkEPipJbnSZLn2BxfYfFtvzGTKzq6YgDVri0,19913
|
13
13
|
easylink/runner.py,sha256=k9ICTToHj2xr6MGIuvlWf6YMeZ47UGgseaMByMgUGac,6271
|
14
|
-
easylink/step.py,sha256=
|
14
|
+
easylink/step.py,sha256=8EhoFOXBLWgDfb3OhmQu5g03fqElIJCWg8-Y_5azKEA,67100
|
15
15
|
easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
|
16
16
|
easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
|
17
|
-
easylink/pipeline_schema_constants/__init__.py,sha256=
|
18
|
-
easylink/pipeline_schema_constants/development.py,sha256
|
19
|
-
easylink/pipeline_schema_constants/testing.py,sha256=
|
17
|
+
easylink/pipeline_schema_constants/__init__.py,sha256=uRVjQw7_Ff5IBQw0_Jc93Fzfa-MnbPVPKsy18CCaW7E,1021
|
18
|
+
easylink/pipeline_schema_constants/development.py,sha256=kOTEqfZD5pWqP9gu7E6r9Cubf3ILtWEUxCfJfrN8znc,11547
|
19
|
+
easylink/pipeline_schema_constants/testing.py,sha256=ohcTlT_viZYxS1GkO46mjkb8IzXo6yIOqvBbb4YrOhA,10897
|
20
20
|
easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
|
21
21
|
easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
|
22
22
|
easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
|
@@ -35,16 +35,16 @@ easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=j_RmVjspmXGOhJTr10ED
|
|
35
35
|
easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
|
36
36
|
easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
|
37
37
|
easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
|
38
|
-
easylink/utilities/__init__.py,sha256=
|
39
|
-
easylink/utilities/aggregator_utils.py,sha256=
|
40
|
-
easylink/utilities/data_utils.py,sha256=
|
41
|
-
easylink/utilities/general_utils.py,sha256=
|
42
|
-
easylink/utilities/paths.py,sha256=
|
38
|
+
easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
|
39
|
+
easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
|
40
|
+
easylink/utilities/data_utils.py,sha256=CcnM3u0_MQDQo3jMs3E4IK_rz8wAsFdJ674fZxYEFZg,4620
|
41
|
+
easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
|
42
|
+
easylink/utilities/paths.py,sha256=KM1GlnsAcKbUJrC4LZKpeJfPljxe_aXP1ZhVp43TYRA,924
|
43
43
|
easylink/utilities/spark.smk,sha256=tQ7RArNQzhjbaBQQcRORB4IxxkuDx4gPHUBcWHDYJ_U,5795
|
44
|
-
easylink/utilities/splitter_utils.py,sha256=
|
45
|
-
easylink/utilities/validation_utils.py,sha256=
|
46
|
-
easylink-0.1.
|
47
|
-
easylink-0.1.
|
48
|
-
easylink-0.1.
|
49
|
-
easylink-0.1.
|
50
|
-
easylink-0.1.
|
44
|
+
easylink/utilities/splitter_utils.py,sha256=y4CbbTBgRaoXFxy-9Eu5eWx4lA4ZEcbrYpxgLIzG_kc,2602
|
45
|
+
easylink/utilities/validation_utils.py,sha256=W9r_RXcivJjfpioLhONirfwdByYttxNsVY489_sbrYQ,1683
|
46
|
+
easylink-0.1.9.dist-info/METADATA,sha256=kc6QCMEr_QU7QtZNnQu-Ic20wmV2OzoN_23Ndw6ArEc,2804
|
47
|
+
easylink-0.1.9.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
48
|
+
easylink-0.1.9.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
49
|
+
easylink-0.1.9.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
50
|
+
easylink-0.1.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|