easylink 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +9 -0
- easylink/configuration.py +18 -34
- easylink/devtools/implementation_creator.py +13 -11
- easylink/implementation.py +11 -2
- easylink/implementation_metadata.yaml +19 -0
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -4
- easylink/pipeline_schema_constants/testing.py +53 -1
- easylink/rule.py +14 -1
- easylink/runner.py +5 -7
- easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def +22 -0
- easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +18 -0
- easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def +22 -0
- easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +22 -0
- easylink/utilities/validation_utils.py +6 -0
- {easylink-0.1.16.dist-info → easylink-0.1.18.dist-info}/METADATA +1 -1
- {easylink-0.1.16.dist-info → easylink-0.1.18.dist-info}/RECORD +21 -17
- {easylink-0.1.16.dist-info → easylink-0.1.18.dist-info}/WHEEL +1 -1
- {easylink-0.1.16.dist-info → easylink-0.1.18.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.16.dist-info → easylink-0.1.18.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.
|
1
|
+
__version__ = "0.1.18"
|
easylink/cli.py
CHANGED
@@ -91,6 +91,11 @@ SHARED_OPTIONS = [
|
|
91
91
|
default=False,
|
92
92
|
help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
|
93
93
|
),
|
94
|
+
click.option(
|
95
|
+
"--schema",
|
96
|
+
hidden=True,
|
97
|
+
default="main",
|
98
|
+
),
|
94
99
|
]
|
95
100
|
|
96
101
|
VERBOSE_WITH_DEBUGGER_OPTIONS = [
|
@@ -165,6 +170,7 @@ def run(
|
|
165
170
|
input_data: str,
|
166
171
|
output_dir: str | None,
|
167
172
|
no_timestamp: bool,
|
173
|
+
schema: str,
|
168
174
|
computing_environment: str | None,
|
169
175
|
verbose: int,
|
170
176
|
with_debugger: bool,
|
@@ -190,6 +196,7 @@ def run(
|
|
190
196
|
input_data=input_data,
|
191
197
|
computing_environment=computing_environment,
|
192
198
|
results_dir=results_dir,
|
199
|
+
schema_name=schema,
|
193
200
|
)
|
194
201
|
logger.info("*** FINISHED ***")
|
195
202
|
|
@@ -201,6 +208,7 @@ def generate_dag(
|
|
201
208
|
input_data: str,
|
202
209
|
output_dir: str | None,
|
203
210
|
no_timestamp: bool,
|
211
|
+
schema: str,
|
204
212
|
verbose: int,
|
205
213
|
with_debugger: bool,
|
206
214
|
) -> None:
|
@@ -223,6 +231,7 @@ def generate_dag(
|
|
223
231
|
input_data=input_data,
|
224
232
|
computing_environment=None,
|
225
233
|
results_dir=results_dir,
|
234
|
+
schema_name=schema,
|
226
235
|
)
|
227
236
|
logger.info("*** DAG saved to result directory ***")
|
228
237
|
|
easylink/configuration.py
CHANGED
@@ -14,7 +14,7 @@ from typing import Any
|
|
14
14
|
|
15
15
|
from layered_config_tree import LayeredConfigTree
|
16
16
|
|
17
|
-
from easylink.pipeline_schema import
|
17
|
+
from easylink.pipeline_schema import PipelineSchema
|
18
18
|
from easylink.utilities.data_utils import load_yaml
|
19
19
|
from easylink.utilities.general_utils import exit_with_validation_error
|
20
20
|
|
@@ -67,9 +67,8 @@ class Config(LayeredConfigTree):
|
|
67
67
|
A dictionary of all specifications required to run the pipeline. This
|
68
68
|
includes the pipeline, input data, and computing environment specifications,
|
69
69
|
as well as the results directory.
|
70
|
-
|
71
|
-
|
72
|
-
This is primarily used for testing purposes. Defaults to the supported schemas.
|
70
|
+
schema_name
|
71
|
+
The name of the schema to validate the pipeline configuration against.
|
73
72
|
|
74
73
|
Attributes
|
75
74
|
----------
|
@@ -82,22 +81,14 @@ class Config(LayeredConfigTree):
|
|
82
81
|
input_data
|
83
82
|
The input data filepaths.
|
84
83
|
schema
|
85
|
-
The :class:`~easylink.pipeline_schema.PipelineSchema
|
86
|
-
|
87
|
-
|
88
|
-
Notes
|
89
|
-
-----
|
90
|
-
The requested pipeline is checked against a set of supported
|
91
|
-
``PipelineSchemas``. The first schema that successfully validates is assumed
|
92
|
-
to be the correct one and is attached to the ``Config`` object and its
|
93
|
-
:meth:`~easylink.pipeline_schema.PipelineSchema.configure_pipeline`
|
94
|
-
method is called.
|
84
|
+
The :class:`~easylink.pipeline_schema.PipelineSchema`.
|
85
|
+
|
95
86
|
"""
|
96
87
|
|
97
88
|
def __init__(
|
98
89
|
self,
|
99
90
|
config_params: dict[str, Any],
|
100
|
-
|
91
|
+
schema_name: str = "main",
|
101
92
|
) -> None:
|
102
93
|
super().__init__(layers=["initial_data", "default", "user_configured"])
|
103
94
|
self.update(DEFAULT_ENVIRONMENT, layer="default")
|
@@ -108,9 +99,7 @@ class Config(LayeredConfigTree):
|
|
108
99
|
# Set slurm defaults to empty dict instead of None so that we don't get errors
|
109
100
|
# in slurm_resources property
|
110
101
|
self.update({"environment": {"slurm": {}}}, layer="default")
|
111
|
-
|
112
|
-
potential_schemas = [potential_schemas]
|
113
|
-
self.update({"schema": self._get_schema(potential_schemas)}, layer="initial_data")
|
102
|
+
self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
|
114
103
|
self.schema.configure_pipeline(self.pipeline, self.input_data)
|
115
104
|
self._validate()
|
116
105
|
self.freeze()
|
@@ -173,22 +162,22 @@ class Config(LayeredConfigTree):
|
|
173
162
|
# Setup Methods #
|
174
163
|
#################
|
175
164
|
|
176
|
-
def _get_schema(self,
|
165
|
+
def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
|
177
166
|
"""Returns the requested :class:`~easylink.pipeline_schema.PipelineSchema` if it validates the requested pipeline.
|
178
167
|
|
179
168
|
Parameters
|
180
169
|
----------
|
181
|
-
|
182
|
-
``
|
170
|
+
schema_name
|
171
|
+
The name of the specific ``PipelineSchema`` to validate the pipeline configuration against.
|
183
172
|
|
184
173
|
Returns
|
185
174
|
-------
|
186
|
-
The
|
175
|
+
The requested ``PipelineSchema`` if it validates the requested pipeline configuration.
|
187
176
|
|
188
177
|
Raises
|
189
178
|
------
|
190
179
|
SystemExit
|
191
|
-
If the pipeline configuration is not valid for
|
180
|
+
If the pipeline configuration is not valid for the requested schema,
|
192
181
|
the program exits with a non-zero code and all validation errors found
|
193
182
|
are logged.
|
194
183
|
|
@@ -197,20 +186,15 @@ class Config(LayeredConfigTree):
|
|
197
186
|
This acts as the pipeline configuration file's validation method since
|
198
187
|
we can only find a matching ``PipelineSchema`` if that file is valid.
|
199
188
|
|
200
|
-
This method returns the *first* ``PipelineSchema`` that validates and does
|
201
|
-
not attempt to check additional ones.
|
202
189
|
"""
|
203
190
|
errors = defaultdict(dict)
|
204
191
|
# Validate the requested pipeline against the named schema
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
return schema
|
212
|
-
# No schemas were validated
|
213
|
-
exit_with_validation_error(dict(errors))
|
192
|
+
schema = PipelineSchema.get_schema(schema_name)
|
193
|
+
logs = schema.validate_step(self.pipeline, self.input_data)
|
194
|
+
if logs:
|
195
|
+
errors[PIPELINE_ERRORS_KEY][schema.name] = logs
|
196
|
+
exit_with_validation_error(dict(errors))
|
197
|
+
return schema
|
214
198
|
|
215
199
|
def _validate(self) -> None:
|
216
200
|
"""Validates the ``Config``.
|
@@ -19,7 +19,7 @@ from typing import cast
|
|
19
19
|
import yaml
|
20
20
|
from loguru import logger
|
21
21
|
|
22
|
-
from easylink.pipeline_schema_constants import
|
22
|
+
from easylink.pipeline_schema_constants import SCHEMA_PARAMS
|
23
23
|
from easylink.step import (
|
24
24
|
ChoiceStep,
|
25
25
|
EmbarrassinglyParallelStep,
|
@@ -244,17 +244,17 @@ class ImplementationCreator:
|
|
244
244
|
@staticmethod
|
245
245
|
def _extract_output_slot(script_path: Path, step_name: str) -> str:
|
246
246
|
"""Extracts the name of the output slot that this script is implementing."""
|
247
|
-
|
248
|
-
implementable_steps = ImplementationCreator._extract_implementable_steps(
|
247
|
+
schema_name = ImplementationCreator._extract_pipeline_schema_name(script_path)
|
248
|
+
implementable_steps = ImplementationCreator._extract_implementable_steps(schema_name)
|
249
249
|
step_names = [step.name for step in implementable_steps]
|
250
250
|
if step_name not in step_names:
|
251
251
|
raise ValueError(
|
252
|
-
f"'{step_name}' does not exist as an implementable step in the '{
|
252
|
+
f"'{step_name}' does not exist as an implementable step in the '{schema_name}' pipeline schema. "
|
253
253
|
)
|
254
254
|
duplicates = list(set([step for step in step_names if step_names.count(step) > 1]))
|
255
255
|
if duplicates:
|
256
256
|
raise ValueError(
|
257
|
-
f"Multiple implementable steps with the same name found in the '{
|
257
|
+
f"Multiple implementable steps with the same name found in the '{schema_name}' "
|
258
258
|
f"pipeline schema: {duplicates}."
|
259
259
|
)
|
260
260
|
implemented_step = [step for step in implementable_steps if step.name == step_name][0]
|
@@ -266,7 +266,7 @@ class ImplementationCreator:
|
|
266
266
|
return list(implemented_step.output_slots)[0]
|
267
267
|
|
268
268
|
@staticmethod
|
269
|
-
def _extract_implementable_steps(
|
269
|
+
def _extract_implementable_steps(schema_name: str) -> list[Step]:
|
270
270
|
"""Extracts all implementable steps from the pipeline schema.
|
271
271
|
|
272
272
|
This method recursively traverses the pipeline schema specified in the script
|
@@ -296,8 +296,7 @@ class ImplementationCreator:
|
|
296
296
|
implementable_steps.append(node)
|
297
297
|
return
|
298
298
|
|
299
|
-
schema_steps =
|
300
|
-
|
299
|
+
schema_steps, _edges = SCHEMA_PARAMS[schema_name]
|
301
300
|
implementable_steps: list[Step] = []
|
302
301
|
for schema_step in schema_steps:
|
303
302
|
_process_step(schema_step)
|
@@ -305,7 +304,7 @@ class ImplementationCreator:
|
|
305
304
|
return implementable_steps
|
306
305
|
|
307
306
|
@staticmethod
|
308
|
-
def
|
307
|
+
def _extract_pipeline_schema_name(script_path: Path) -> str:
|
309
308
|
"""Extracts the relevant pipeline schema name.
|
310
309
|
|
311
310
|
The expectation is that the output slot's name is specified within the script
|
@@ -316,8 +315,11 @@ class ImplementationCreator:
|
|
316
315
|
|
317
316
|
If no pipeline schema is specified, "main" will be used by default.
|
318
317
|
"""
|
319
|
-
|
320
|
-
|
318
|
+
schema_name_list: list[str] = _extract_metadata("PIPELINE_SCHEMA", script_path)
|
319
|
+
schema_name = "main" if len(schema_name_list) == 0 else schema_name_list[0]
|
320
|
+
if schema_name not in SCHEMA_PARAMS:
|
321
|
+
raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
|
322
|
+
return schema_name
|
321
323
|
|
322
324
|
@staticmethod
|
323
325
|
def _write_metadata(info: dict[str, dict[str, str]]) -> None:
|
easylink/implementation.py
CHANGED
@@ -135,8 +135,17 @@ class Implementation:
|
|
135
135
|
|
136
136
|
@property
|
137
137
|
def outputs(self) -> dict[str, list[str]]:
|
138
|
-
"""The expected output metadata.
|
139
|
-
|
138
|
+
"""The expected output paths. If output metadata is provided, use it. Otherwise,
|
139
|
+
assume that the output is a sub-directory with the name of the output slot.
|
140
|
+
If there is only one output slot, use '.'."""
|
141
|
+
if len(self.output_slots) == 1:
|
142
|
+
return self._metadata.get("outputs", {list(self.output_slots.keys())[0]: "."})
|
143
|
+
return {
|
144
|
+
output_slot_name: self._metadata.get("outputs", {}).get(
|
145
|
+
output_slot_name, output_slot_name
|
146
|
+
)
|
147
|
+
for output_slot_name in self.output_slots
|
148
|
+
}
|
140
149
|
|
141
150
|
|
142
151
|
class NullImplementation:
|
@@ -192,3 +192,22 @@ step_1a_and_step_1b_combined_python_pandas:
|
|
192
192
|
script_cmd: python /dummy_step.py
|
193
193
|
outputs:
|
194
194
|
step_1_main_output: result.parquet
|
195
|
+
dummy_step_1_for_output_dir_example:
|
196
|
+
steps:
|
197
|
+
- step_1_for_output_dir_example
|
198
|
+
image_path: /mnt/team/simulation_science/priv/engineering/er_ecosystem/images/zmbc/dummy_step_1_for_output_dir_example.sif
|
199
|
+
script_cmd: python /dummy_step_1_for_output_dir_example.py
|
200
|
+
outputs:
|
201
|
+
step_1_main_output_directory: output_dir/
|
202
|
+
dummy_step_1_for_output_dir_example_default:
|
203
|
+
steps:
|
204
|
+
- step_1_for_output_dir_example
|
205
|
+
image_path: /mnt/team/simulation_science/priv/engineering/er_ecosystem/images/zmbc/dummy_step_1_for_output_dir_example.sif
|
206
|
+
script_cmd: python /dummy_step_1_for_output_dir_example.py
|
207
|
+
dummy_step_2_for_output_dir_example:
|
208
|
+
steps:
|
209
|
+
- step_2_for_output_dir_example
|
210
|
+
image_path: /mnt/team/simulation_science/priv/engineering/er_ecosystem/images/zmbc/dummy_step_2_for_output_dir_example.sif
|
211
|
+
script_cmd: python /dummy_step_2_for_output_dir_example.py
|
212
|
+
outputs:
|
213
|
+
step_2_main_output: result.parquet
|
easylink/pipeline_schema.py
CHANGED
@@ -14,7 +14,7 @@ from pathlib import Path
|
|
14
14
|
from layered_config_tree import LayeredConfigTree
|
15
15
|
|
16
16
|
from easylink.graph_components import EdgeParams, ImplementationGraph
|
17
|
-
from easylink.pipeline_schema_constants import
|
17
|
+
from easylink.pipeline_schema_constants import SCHEMA_PARAMS
|
18
18
|
from easylink.step import HierarchicalStep, NonLeafConfigurationState, Step
|
19
19
|
|
20
20
|
|
@@ -39,7 +39,7 @@ class PipelineSchema(HierarchicalStep):
|
|
39
39
|
|
40
40
|
Notes
|
41
41
|
-----
|
42
|
-
|
42
|
+
A ``PipelineSchema`` is intended to be constructed by the :meth:`get_schema`
|
43
43
|
class method.
|
44
44
|
|
45
45
|
The ``PipelineSchema`` is a high-level abstraction; it represents the desired
|
@@ -159,22 +159,21 @@ class PipelineSchema(HierarchicalStep):
|
|
159
159
|
)
|
160
160
|
|
161
161
|
@classmethod
|
162
|
-
def
|
162
|
+
def get_schema(cls, name: str = "main") -> list["PipelineSchema"]:
|
163
163
|
"""Gets the requested ``PipelineSchema`` by name.
|
164
164
|
|
165
165
|
The returned ``PipelineSchema`` represents a fully supported pipeline and is
|
166
166
|
used to validate the user-requested pipeline.
|
167
167
|
|
168
|
+
Parameters
|
169
|
+
----------
|
170
|
+
name
|
171
|
+
The name of the ``PipelineSchema`` to get.
|
172
|
+
|
168
173
|
Returns
|
169
174
|
-------
|
170
|
-
|
175
|
+
The requested ``PipelineSchema``.
|
171
176
|
"""
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
]
|
176
|
-
|
177
|
-
|
178
|
-
PIPELINE_SCHEMAS = PipelineSchema._get_schemas()
|
179
|
-
"""All allowable :class:`PipelineSchemas<PipelineSchema>` to validate the requested
|
180
|
-
pipeline against."""
|
177
|
+
if name not in SCHEMA_PARAMS:
|
178
|
+
raise ValueError(f"Pipeline schema '{name}' is not supported.")
|
179
|
+
return cls(name, *SCHEMA_PARAMS[name])
|
@@ -11,12 +11,12 @@ package defines the nodes and edges required to instantiate such ``PipelineSchem
|
|
11
11
|
|
12
12
|
from easylink.pipeline_schema_constants import development, testing
|
13
13
|
|
14
|
-
|
14
|
+
SCHEMA_PARAMS = {
|
15
|
+
"main": "TODO",
|
16
|
+
# development and testing
|
15
17
|
"development": development.SCHEMA_PARAMS,
|
16
|
-
}
|
17
|
-
|
18
|
-
TESTING_SCHEMA_PARAMS = {
|
19
18
|
"integration": testing.SCHEMA_PARAMS_ONE_STEP,
|
19
|
+
"output_dir": testing.SCHEMA_PARAMS_OUTPUT_DIR,
|
20
20
|
"combine_bad_topology": testing.SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY,
|
21
21
|
"combine_bad_implementation_names": testing.SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY,
|
22
22
|
"nested_templated_steps": testing.SCHEMA_PARAMS_NESTED_TEMPLATED_STEPS,
|
@@ -26,7 +26,7 @@ from easylink.step import (
|
|
26
26
|
)
|
27
27
|
from easylink.utilities.aggregator_utils import concatenate_datasets
|
28
28
|
from easylink.utilities.splitter_utils import split_data_in_two
|
29
|
-
from easylink.utilities.validation_utils import validate_input_file_dummy
|
29
|
+
from easylink.utilities.validation_utils import validate_dir, validate_input_file_dummy
|
30
30
|
|
31
31
|
NODES_ONE_STEP = [
|
32
32
|
InputStep(),
|
@@ -582,3 +582,55 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
|
|
582
582
|
),
|
583
583
|
]
|
584
584
|
SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
|
585
|
+
|
586
|
+
NODES_OUTPUT_DIR = [
|
587
|
+
InputStep(),
|
588
|
+
Step(
|
589
|
+
step_name="step_1_for_output_dir_example",
|
590
|
+
input_slots=[
|
591
|
+
InputSlot(
|
592
|
+
name="step_1_main_input",
|
593
|
+
env_var="STEP_1_MAIN_INPUT_FILE_PATHS",
|
594
|
+
validator=validate_input_file_dummy,
|
595
|
+
)
|
596
|
+
],
|
597
|
+
output_slots=[OutputSlot("step_1_main_output_directory")],
|
598
|
+
),
|
599
|
+
Step(
|
600
|
+
step_name="step_2_for_output_dir_example",
|
601
|
+
input_slots=[
|
602
|
+
InputSlot(
|
603
|
+
name="step_2_main_input",
|
604
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH",
|
605
|
+
validator=validate_dir,
|
606
|
+
)
|
607
|
+
],
|
608
|
+
output_slots=[OutputSlot("step_2_main_output")],
|
609
|
+
),
|
610
|
+
OutputStep(
|
611
|
+
input_slots=[
|
612
|
+
InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
|
613
|
+
],
|
614
|
+
),
|
615
|
+
]
|
616
|
+
EDGES_OUTPUT_DIR = [
|
617
|
+
EdgeParams(
|
618
|
+
source_node="input_data",
|
619
|
+
target_node="step_1_for_output_dir_example",
|
620
|
+
output_slot="all",
|
621
|
+
input_slot="step_1_main_input",
|
622
|
+
),
|
623
|
+
EdgeParams(
|
624
|
+
source_node="step_1_for_output_dir_example",
|
625
|
+
target_node="step_2_for_output_dir_example",
|
626
|
+
output_slot="step_1_main_output_directory",
|
627
|
+
input_slot="step_2_main_input",
|
628
|
+
),
|
629
|
+
EdgeParams(
|
630
|
+
source_node="step_2_for_output_dir_example",
|
631
|
+
target_node="results",
|
632
|
+
output_slot="step_2_main_output",
|
633
|
+
input_slot="result",
|
634
|
+
),
|
635
|
+
]
|
636
|
+
SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
|
easylink/rule.py
CHANGED
@@ -17,6 +17,7 @@ import os
|
|
17
17
|
from abc import ABC, abstractmethod
|
18
18
|
from collections.abc import Callable
|
19
19
|
from dataclasses import dataclass
|
20
|
+
from pathlib import Path
|
20
21
|
|
21
22
|
|
22
23
|
class Rule(ABC):
|
@@ -125,6 +126,18 @@ class ImplementedRule(Rule):
|
|
125
126
|
def _build_io(self) -> str:
|
126
127
|
"""Builds the input/output portion of the rule."""
|
127
128
|
log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
|
129
|
+
# Handle output files vs directories
|
130
|
+
files = [path for path in self.output if Path(path).suffix != ""]
|
131
|
+
if len(files) == len(self.output):
|
132
|
+
output = self.output
|
133
|
+
elif len(files) == 0:
|
134
|
+
if len(self.output) != 1:
|
135
|
+
raise NotImplementedError("Multiple output directories is not supported.")
|
136
|
+
output = f"directory('{self.output[0]}')"
|
137
|
+
else:
|
138
|
+
raise NotImplementedError(
|
139
|
+
"Mixed output types (files and directories) is not supported."
|
140
|
+
)
|
128
141
|
io_str = (
|
129
142
|
f"""
|
130
143
|
rule:
|
@@ -132,7 +145,7 @@ rule:
|
|
132
145
|
message: "Running {self.step_name} implementation: {self.implementation_name}" """
|
133
146
|
+ self._build_input()
|
134
147
|
+ f"""
|
135
|
-
output: {
|
148
|
+
output: {output}
|
136
149
|
log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
|
137
150
|
container: "{self.image_path}" """
|
138
151
|
)
|
easylink/runner.py
CHANGED
@@ -19,7 +19,6 @@ from snakemake.cli import main as snake_main
|
|
19
19
|
|
20
20
|
from easylink.configuration import Config, load_params_from_specification
|
21
21
|
from easylink.pipeline import Pipeline
|
22
|
-
from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
|
23
22
|
from easylink.utilities.data_utils import (
|
24
23
|
copy_configuration_files_to_results_directory,
|
25
24
|
create_results_directory,
|
@@ -35,8 +34,8 @@ def main(
|
|
35
34
|
input_data: str | Path,
|
36
35
|
computing_environment: str | Path | None,
|
37
36
|
results_dir: str | Path,
|
38
|
-
|
39
|
-
|
37
|
+
schema_name: str = "main",
|
38
|
+
debug: bool = False,
|
40
39
|
) -> None:
|
41
40
|
"""Runs an EasyLink command.
|
42
41
|
|
@@ -60,17 +59,16 @@ def main(
|
|
60
59
|
to run the pipeline on. If None, the pipeline will be run locally.
|
61
60
|
results_dir
|
62
61
|
The directory to write results and incidental files (logs, etc.) to.
|
62
|
+
schema_name
|
63
|
+
The name of the schema to validate the pipeline configuration against.
|
63
64
|
debug
|
64
65
|
If False (the default), will suppress some of the workflow output. This
|
65
66
|
is intended to only be used for testing and development purposes.
|
66
|
-
potential_schemas
|
67
|
-
A list of potential schemas to validate the pipeline configuration against.
|
68
|
-
This is primarily used for testing purposes. Defaults to the supported schemas.
|
69
67
|
"""
|
70
68
|
config_params = load_params_from_specification(
|
71
69
|
pipeline_specification, input_data, computing_environment, results_dir
|
72
70
|
)
|
73
|
-
config = Config(config_params,
|
71
|
+
config = Config(config_params, schema_name)
|
74
72
|
pipeline = Pipeline(config)
|
75
73
|
# After validation is completed, create the results directory
|
76
74
|
create_results_directory(Path(results_dir))
|
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
Bootstrap: docker
|
3
|
+
From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
|
4
|
+
|
5
|
+
%files
|
6
|
+
./dummy_step_1_for_output_dir_example.py /dummy_step_1_for_output_dir_example.py
|
7
|
+
|
8
|
+
%post
|
9
|
+
# Create directories
|
10
|
+
mkdir -p /input_data
|
11
|
+
mkdir -p /extra_implementation_specific_input_data
|
12
|
+
mkdir -p /results
|
13
|
+
mkdir -p /diagnostics
|
14
|
+
|
15
|
+
# Install Python packages with specific versions
|
16
|
+
pip install pandas==2.1.2 pyarrow
|
17
|
+
|
18
|
+
%environment
|
19
|
+
export LC_ALL=C
|
20
|
+
|
21
|
+
%runscript
|
22
|
+
python /dummy_step_1_for_output_dir_example.py '$@'
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# PIPELINE_SCHEMA: output_dir
|
2
|
+
# STEP_NAME: step_1_for_output_dir_example
|
3
|
+
# REQUIREMENTS: pandas==2.1.2 pyarrow
|
4
|
+
|
5
|
+
import os
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
import pandas as pd
|
9
|
+
|
10
|
+
data = pd.read_parquet(os.environ["STEP_1_MAIN_INPUT_FILE_PATHS"])
|
11
|
+
|
12
|
+
print(data)
|
13
|
+
|
14
|
+
dir_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
|
15
|
+
dir_path.mkdir(parents=True, exist_ok=True)
|
16
|
+
|
17
|
+
for i in range(3):
|
18
|
+
data.to_parquet(dir_path / f"result_{i}.parquet")
|
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
Bootstrap: docker
|
3
|
+
From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
|
4
|
+
|
5
|
+
%files
|
6
|
+
./dummy_step_2_for_output_dir_example.py /dummy_step_2_for_output_dir_example.py
|
7
|
+
|
8
|
+
%post
|
9
|
+
# Create directories
|
10
|
+
mkdir -p /input_data
|
11
|
+
mkdir -p /extra_implementation_specific_input_data
|
12
|
+
mkdir -p /results
|
13
|
+
mkdir -p /diagnostics
|
14
|
+
|
15
|
+
# Install Python packages with specific versions
|
16
|
+
pip install pandas==2.1.2 pyarrow
|
17
|
+
|
18
|
+
%environment
|
19
|
+
export LC_ALL=C
|
20
|
+
|
21
|
+
%runscript
|
22
|
+
python /dummy_step_2_for_output_dir_example.py '$@'
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# PIPELINE_SCHEMA: output_dir
|
2
|
+
# STEP_NAME: step_2_for_output_dir_example
|
3
|
+
# REQUIREMENTS: pandas==2.1.2 pyarrow
|
4
|
+
|
5
|
+
import os
|
6
|
+
import shutil
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
dir_path = Path(os.environ["DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH"])
|
12
|
+
saved = False
|
13
|
+
|
14
|
+
for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
|
15
|
+
if "snakemake" in str(f):
|
16
|
+
continue
|
17
|
+
|
18
|
+
if not saved:
|
19
|
+
shutil.copy(f, os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
|
20
|
+
saved = True
|
21
|
+
|
22
|
+
print(pd.read_parquet(f))
|
@@ -50,3 +50,9 @@ def validate_input_file_dummy(filepath: str) -> None:
|
|
50
50
|
raise LookupError(
|
51
51
|
f"Data file {filepath} is missing required column(s) {missing_columns}"
|
52
52
|
)
|
53
|
+
|
54
|
+
|
55
|
+
def validate_dir(filepath: str) -> None:
|
56
|
+
input_path = Path(filepath)
|
57
|
+
if not input_path.is_dir():
|
58
|
+
raise NotADirectoryError(f"The path {filepath} is not a directory.")
|
@@ -1,23 +1,23 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
4
|
-
easylink/cli.py,sha256=
|
5
|
-
easylink/configuration.py,sha256=
|
3
|
+
easylink/_version.py,sha256=6BiuMUkhwQp6bzUZSF8np8F1NwCltEtK0sPBF__tepU,23
|
4
|
+
easylink/cli.py,sha256=v8OALTAI3WlNELrHuGQumlJFdmYML4K-XX_OtqSJYZM,9925
|
5
|
+
easylink/configuration.py,sha256=rFPTZMEBZjiXYZWesUqpppOj6ONxp3sybf5g9MiDsOY,11639
|
6
6
|
easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
|
7
|
-
easylink/implementation.py,sha256=
|
8
|
-
easylink/implementation_metadata.yaml,sha256=
|
7
|
+
easylink/implementation.py,sha256=XLSat6_IXFn-nH6X8AazmfWhDtTK4GtA7yiht9QLlQQ,11366
|
8
|
+
easylink/implementation_metadata.yaml,sha256=trq5CvSSZRmqRQ979o68L2QONvlv-ncFXS-rh3-79Uk,7558
|
9
9
|
easylink/pipeline.py,sha256=5KOYH5HyJjVlFoBRKGLs2hn5mpC3tPYG_ux3T1qSV9k,17504
|
10
10
|
easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
|
11
|
-
easylink/pipeline_schema.py,sha256=
|
12
|
-
easylink/rule.py,sha256=
|
13
|
-
easylink/runner.py,sha256=
|
11
|
+
easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
|
12
|
+
easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
|
13
|
+
easylink/runner.py,sha256=GhkPGDh9UFOb38ksqXpMKZoxXs9hZaOFzZDo2jlEp-U,6458
|
14
14
|
easylink/step.py,sha256=u1AMPrYGNVb3ZH6uB_U0dUeJvOeQ2MoVHdlC8k63AA8,85226
|
15
|
-
easylink/devtools/implementation_creator.py,sha256=
|
15
|
+
easylink/devtools/implementation_creator.py,sha256=ddzJltlzOfvzwAMuInovCbfn3IM2u_s7I_dObWV4os0,16430
|
16
16
|
easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
|
17
17
|
easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
|
18
|
-
easylink/pipeline_schema_constants/__init__.py,sha256=
|
18
|
+
easylink/pipeline_schema_constants/__init__.py,sha256=45S-Q69CugGfBroHuGR8c7Jlq1wqAy5lRtys5C_0--M,1337
|
19
19
|
easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
|
20
|
-
easylink/pipeline_schema_constants/testing.py,sha256=
|
20
|
+
easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
|
21
21
|
easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
|
22
22
|
easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
|
23
23
|
easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
|
@@ -36,6 +36,10 @@ easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=j_RmVjspmXGOhJTr10ED
|
|
36
36
|
easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
|
37
37
|
easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
|
38
38
|
easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
|
39
|
+
easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def,sha256=CkQVG-uDRQ9spAavdkZbhx2GD_fRsKZGELPrr8yltsc,550
|
40
|
+
easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=dI0OWugE35ABLcSwsI-T3C4dvuPTKXwjE52dtSsCo8Y,428
|
41
|
+
easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def,sha256=9gShg1EDJEHZcz7Z5VfZ1A4Gpm9XQes8ezn6rAZDgDM,550
|
42
|
+
easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=DMJW5TXjhELxhY4U9q2RpLjqxlS1YSosTGL2AfRnaZM,521
|
39
43
|
easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
|
40
44
|
easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
|
41
45
|
easylink/utilities/data_utils.py,sha256=CcnM3u0_MQDQo3jMs3E4IK_rz8wAsFdJ674fZxYEFZg,4620
|
@@ -43,9 +47,9 @@ easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4v
|
|
43
47
|
easylink/utilities/paths.py,sha256=KM1GlnsAcKbUJrC4LZKpeJfPljxe_aXP1ZhVp43TYRA,924
|
44
48
|
easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
|
45
49
|
easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
|
46
|
-
easylink/utilities/validation_utils.py,sha256=
|
47
|
-
easylink-0.1.
|
48
|
-
easylink-0.1.
|
49
|
-
easylink-0.1.
|
50
|
-
easylink-0.1.
|
51
|
-
easylink-0.1.
|
50
|
+
easylink/utilities/validation_utils.py,sha256=rOIeQbbXXdsuL2hI0i2gApAWfiNJXMwYH4pmw8uLrGM,1867
|
51
|
+
easylink-0.1.18.dist-info/METADATA,sha256=9RPc6nIJrkdNQxUXqVYQW26h2G3ukGuXyAmUA4razpA,3477
|
52
|
+
easylink-0.1.18.dist-info/WHEEL,sha256=QZxptf4Y1BKFRCEDxD4h2V0mBFQOVFLFEpvxHmIs52A,91
|
53
|
+
easylink-0.1.18.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
54
|
+
easylink-0.1.18.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
55
|
+
easylink-0.1.18.dist-info/RECORD,,
|
File without changes
|
File without changes
|