easylink 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.16"
1
+ __version__ = "0.1.18"
easylink/cli.py CHANGED
@@ -91,6 +91,11 @@ SHARED_OPTIONS = [
91
91
  default=False,
92
92
  help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
93
93
  ),
94
+ click.option(
95
+ "--schema",
96
+ hidden=True,
97
+ default="main",
98
+ ),
94
99
  ]
95
100
 
96
101
  VERBOSE_WITH_DEBUGGER_OPTIONS = [
@@ -165,6 +170,7 @@ def run(
165
170
  input_data: str,
166
171
  output_dir: str | None,
167
172
  no_timestamp: bool,
173
+ schema: str,
168
174
  computing_environment: str | None,
169
175
  verbose: int,
170
176
  with_debugger: bool,
@@ -190,6 +196,7 @@ def run(
190
196
  input_data=input_data,
191
197
  computing_environment=computing_environment,
192
198
  results_dir=results_dir,
199
+ schema_name=schema,
193
200
  )
194
201
  logger.info("*** FINISHED ***")
195
202
 
@@ -201,6 +208,7 @@ def generate_dag(
201
208
  input_data: str,
202
209
  output_dir: str | None,
203
210
  no_timestamp: bool,
211
+ schema: str,
204
212
  verbose: int,
205
213
  with_debugger: bool,
206
214
  ) -> None:
@@ -223,6 +231,7 @@ def generate_dag(
223
231
  input_data=input_data,
224
232
  computing_environment=None,
225
233
  results_dir=results_dir,
234
+ schema_name=schema,
226
235
  )
227
236
  logger.info("*** DAG saved to result directory ***")
228
237
 
easylink/configuration.py CHANGED
@@ -14,7 +14,7 @@ from typing import Any
14
14
 
15
15
  from layered_config_tree import LayeredConfigTree
16
16
 
17
- from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
17
+ from easylink.pipeline_schema import PipelineSchema
18
18
  from easylink.utilities.data_utils import load_yaml
19
19
  from easylink.utilities.general_utils import exit_with_validation_error
20
20
 
@@ -67,9 +67,8 @@ class Config(LayeredConfigTree):
67
67
  A dictionary of all specifications required to run the pipeline. This
68
68
  includes the pipeline, input data, and computing environment specifications,
69
69
  as well as the results directory.
70
- potential_schemas
71
- A list of potential schemas to validate the pipeline configuration against.
72
- This is primarily used for testing purposes. Defaults to the supported schemas.
70
+ schema_name
71
+ The name of the schema to validate the pipeline configuration against.
73
72
 
74
73
  Attributes
75
74
  ----------
@@ -82,22 +81,14 @@ class Config(LayeredConfigTree):
82
81
  input_data
83
82
  The input data filepaths.
84
83
  schema
85
- The :class:`~easylink.pipeline_schema.PipelineSchema` that successfully
86
- validated the requested pipeline.
87
-
88
- Notes
89
- -----
90
- The requested pipeline is checked against a set of supported
91
- ``PipelineSchemas``. The first schema that successfully validates is assumed
92
- to be the correct one and is attached to the ``Config`` object and its
93
- :meth:`~easylink.pipeline_schema.PipelineSchema.configure_pipeline`
94
- method is called.
84
+ The :class:`~easylink.pipeline_schema.PipelineSchema`.
85
+
95
86
  """
96
87
 
97
88
  def __init__(
98
89
  self,
99
90
  config_params: dict[str, Any],
100
- potential_schemas: PipelineSchema | list[PipelineSchema] = PIPELINE_SCHEMAS,
91
+ schema_name: str = "main",
101
92
  ) -> None:
102
93
  super().__init__(layers=["initial_data", "default", "user_configured"])
103
94
  self.update(DEFAULT_ENVIRONMENT, layer="default")
@@ -108,9 +99,7 @@ class Config(LayeredConfigTree):
108
99
  # Set slurm defaults to empty dict instead of None so that we don't get errors
109
100
  # in slurm_resources property
110
101
  self.update({"environment": {"slurm": {}}}, layer="default")
111
- if not isinstance(potential_schemas, list):
112
- potential_schemas = [potential_schemas]
113
- self.update({"schema": self._get_schema(potential_schemas)}, layer="initial_data")
102
+ self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
114
103
  self.schema.configure_pipeline(self.pipeline, self.input_data)
115
104
  self._validate()
116
105
  self.freeze()
@@ -173,22 +162,22 @@ class Config(LayeredConfigTree):
173
162
  # Setup Methods #
174
163
  #################
175
164
 
176
- def _get_schema(self, potential_schemas: list[PipelineSchema]) -> PipelineSchema:
165
+ def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
177
166
  """Returns the requested :class:`~easylink.pipeline_schema.PipelineSchema` if it validates the requested pipeline.
178
167
 
179
168
  Parameters
180
169
  ----------
181
- potential_schemas
182
- ``PipelineSchemas`` to validate the pipeline configuration against.
170
+ schema_name
171
+ The name of the specific ``PipelineSchema`` to validate the pipeline configuration against.
183
172
 
184
173
  Returns
185
174
  -------
186
- The first ``PipelineSchema`` that validates the requested pipeline configuration.
175
+ The requested ``PipelineSchema`` if it validates the requested pipeline configuration.
187
176
 
188
177
  Raises
189
178
  ------
190
179
  SystemExit
191
- If the pipeline configuration is not valid for any of the ``potential_schemas``,
180
+ If the pipeline configuration is not valid for the requested schema,
192
181
  the program exits with a non-zero code and all validation errors found
193
182
  are logged.
194
183
 
@@ -197,20 +186,15 @@ class Config(LayeredConfigTree):
197
186
  This acts as the pipeline configuration file's validation method since
198
187
  we can only find a matching ``PipelineSchema`` if that file is valid.
199
188
 
200
- This method returns the *first* ``PipelineSchema`` that validates and does
201
- not attempt to check additional ones.
202
189
  """
203
190
  errors = defaultdict(dict)
204
191
  # Validate the pipeline against the single requested schema
205
- for schema in potential_schemas:
206
- logs = schema.validate_step(self.pipeline, self.input_data)
207
- if logs:
208
- errors[PIPELINE_ERRORS_KEY][schema.name] = logs
209
- pass # try the next schema
210
- else: # schema was validated
211
- return schema
212
- # No schemas were validated
213
- exit_with_validation_error(dict(errors))
192
+ schema = PipelineSchema.get_schema(schema_name)
193
+ logs = schema.validate_step(self.pipeline, self.input_data)
194
+ if logs:
195
+ errors[PIPELINE_ERRORS_KEY][schema.name] = logs
196
+ exit_with_validation_error(dict(errors))
197
+ return schema
214
198
 
215
199
  def _validate(self) -> None:
216
200
  """Validates the ``Config``.
@@ -19,7 +19,7 @@ from typing import cast
19
19
  import yaml
20
20
  from loguru import logger
21
21
 
22
- from easylink.pipeline_schema_constants import ALLOWED_SCHEMA_PARAMS
22
+ from easylink.pipeline_schema_constants import SCHEMA_PARAMS
23
23
  from easylink.step import (
24
24
  ChoiceStep,
25
25
  EmbarrassinglyParallelStep,
@@ -244,17 +244,17 @@ class ImplementationCreator:
244
244
  @staticmethod
245
245
  def _extract_output_slot(script_path: Path, step_name: str) -> str:
246
246
  """Extracts the name of the output slot that this script is implementing."""
247
- schema = ImplementationCreator._extract_pipeline_schema(script_path)
248
- implementable_steps = ImplementationCreator._extract_implementable_steps(schema)
247
+ schema_name = ImplementationCreator._extract_pipeline_schema_name(script_path)
248
+ implementable_steps = ImplementationCreator._extract_implementable_steps(schema_name)
249
249
  step_names = [step.name for step in implementable_steps]
250
250
  if step_name not in step_names:
251
251
  raise ValueError(
252
- f"'{step_name}' does not exist as an implementable step in the '{schema}' pipeline schema. "
252
+ f"'{step_name}' does not exist as an implementable step in the '{schema_name}' pipeline schema. "
253
253
  )
254
254
  duplicates = list(set([step for step in step_names if step_names.count(step) > 1]))
255
255
  if duplicates:
256
256
  raise ValueError(
257
- f"Multiple implementable steps with the same name found in the '{schema}' "
257
+ f"Multiple implementable steps with the same name found in the '{schema_name}' "
258
258
  f"pipeline schema: {duplicates}."
259
259
  )
260
260
  implemented_step = [step for step in implementable_steps if step.name == step_name][0]
@@ -266,7 +266,7 @@ class ImplementationCreator:
266
266
  return list(implemented_step.output_slots)[0]
267
267
 
268
268
  @staticmethod
269
- def _extract_implementable_steps(schema: str) -> list[Step]:
269
+ def _extract_implementable_steps(schema_name: str) -> list[Step]:
270
270
  """Extracts all implementable steps from the pipeline schema.
271
271
 
272
272
  This method recursively traverses the pipeline schema specified in the script
@@ -296,8 +296,7 @@ class ImplementationCreator:
296
296
  implementable_steps.append(node)
297
297
  return
298
298
 
299
- schema_steps = ALLOWED_SCHEMA_PARAMS[schema][0]
300
-
299
+ schema_steps, _edges = SCHEMA_PARAMS[schema_name]
301
300
  implementable_steps: list[Step] = []
302
301
  for schema_step in schema_steps:
303
302
  _process_step(schema_step)
@@ -305,7 +304,7 @@ class ImplementationCreator:
305
304
  return implementable_steps
306
305
 
307
306
  @staticmethod
308
- def _extract_pipeline_schema(script_path: Path) -> str:
307
+ def _extract_pipeline_schema_name(script_path: Path) -> str:
309
308
  """Extracts the relevant pipeline schema name.
310
309
 
311
310
  The expectation is that the output slot's name is specified within the script
@@ -316,8 +315,11 @@ class ImplementationCreator:
316
315
 
317
316
  If no pipeline schema is specified, "main" will be used by default.
318
317
  """
319
- schema = _extract_metadata("PIPELINE_SCHEMA", script_path)
320
- return "main" if len(schema) == 0 else schema[0]
318
+ schema_name_list: list[str] = _extract_metadata("PIPELINE_SCHEMA", script_path)
319
+ schema_name = "main" if len(schema_name_list) == 0 else schema_name_list[0]
320
+ if schema_name not in SCHEMA_PARAMS:
321
+ raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
322
+ return schema_name
321
323
 
322
324
  @staticmethod
323
325
  def _write_metadata(info: dict[str, dict[str, str]]) -> None:
@@ -135,8 +135,17 @@ class Implementation:
135
135
 
136
136
  @property
137
137
  def outputs(self) -> dict[str, list[str]]:
138
- """The expected output metadata."""
139
- return self._metadata["outputs"]
138
+ """The expected output paths. If output metadata is provided, use it. Otherwise,
139
+ assume that the output is a sub-directory with the name of the output slot.
140
+ If there is only one output slot, use '.'."""
141
+ if len(self.output_slots) == 1:
142
+ return self._metadata.get("outputs", {list(self.output_slots.keys())[0]: "."})
143
+ return {
144
+ output_slot_name: self._metadata.get("outputs", {}).get(
145
+ output_slot_name, output_slot_name
146
+ )
147
+ for output_slot_name in self.output_slots
148
+ }
140
149
 
141
150
 
142
151
  class NullImplementation:
@@ -192,3 +192,22 @@ step_1a_and_step_1b_combined_python_pandas:
192
192
  script_cmd: python /dummy_step.py
193
193
  outputs:
194
194
  step_1_main_output: result.parquet
195
+ dummy_step_1_for_output_dir_example:
196
+ steps:
197
+ - step_1_for_output_dir_example
198
+ image_path: /mnt/team/simulation_science/priv/engineering/er_ecosystem/images/zmbc/dummy_step_1_for_output_dir_example.sif
199
+ script_cmd: python /dummy_step_1_for_output_dir_example.py
200
+ outputs:
201
+ step_1_main_output_directory: output_dir/
202
+ dummy_step_1_for_output_dir_example_default:
203
+ steps:
204
+ - step_1_for_output_dir_example
205
+ image_path: /mnt/team/simulation_science/priv/engineering/er_ecosystem/images/zmbc/dummy_step_1_for_output_dir_example.sif
206
+ script_cmd: python /dummy_step_1_for_output_dir_example.py
207
+ dummy_step_2_for_output_dir_example:
208
+ steps:
209
+ - step_2_for_output_dir_example
210
+ image_path: /mnt/team/simulation_science/priv/engineering/er_ecosystem/images/zmbc/dummy_step_2_for_output_dir_example.sif
211
+ script_cmd: python /dummy_step_2_for_output_dir_example.py
212
+ outputs:
213
+ step_2_main_output: result.parquet
@@ -14,7 +14,7 @@ from pathlib import Path
14
14
  from layered_config_tree import LayeredConfigTree
15
15
 
16
16
  from easylink.graph_components import EdgeParams, ImplementationGraph
17
- from easylink.pipeline_schema_constants import ALLOWED_SCHEMA_PARAMS
17
+ from easylink.pipeline_schema_constants import SCHEMA_PARAMS
18
18
  from easylink.step import HierarchicalStep, NonLeafConfigurationState, Step
19
19
 
20
20
 
@@ -39,7 +39,7 @@ class PipelineSchema(HierarchicalStep):
39
39
 
40
40
  Notes
41
41
  -----
42
- All ``PipelineSchema`` instances are intended to be created by the :meth:`_get_schemas`
42
+ A ``PipelineSchema`` is intended to be constructed by the :meth:`get_schema`
43
43
  class method.
44
44
 
45
45
  The ``PipelineSchema`` is a high-level abstraction; it represents the desired
@@ -159,22 +159,21 @@ class PipelineSchema(HierarchicalStep):
159
159
  )
160
160
 
161
161
  @classmethod
162
- def _get_schemas(cls) -> list["PipelineSchema"]:
162
+ def get_schema(cls, name: str = "main") -> list["PipelineSchema"]:
163
163
  """Gets the requested ``PipelineSchema`` by name.
164
164
 
165
165
  The named ``PipelineSchema`` represents a fully supported pipeline and is
166
166
  used to validate the user-requested pipeline.
167
167
 
168
+ Parameters
169
+ ----------
170
+ name
171
+ The name of the ``PipelineSchema`` to get.
172
+
168
173
  Returns
169
174
  -------
170
- All allowable ``PipelineSchemas``.
175
+ The requested ``PipelineSchema``.
171
176
  """
172
- return [
173
- cls(name, nodes=nodes, edges=edges)
174
- for name, (nodes, edges) in ALLOWED_SCHEMA_PARAMS.items()
175
- ]
176
-
177
-
178
- PIPELINE_SCHEMAS = PipelineSchema._get_schemas()
179
- """All allowable :class:`PipelineSchemas<PipelineSchema>` to validate the requested
180
- pipeline against."""
177
+ if name not in SCHEMA_PARAMS:
178
+ raise ValueError(f"Pipeline schema '{name}' is not supported.")
179
+ return cls(name, *SCHEMA_PARAMS[name])
@@ -11,12 +11,12 @@ package defines the nodes and edges required to instantiate such ``PipelineSchem
11
11
 
12
12
  from easylink.pipeline_schema_constants import development, testing
13
13
 
14
- ALLOWED_SCHEMA_PARAMS = {
14
+ SCHEMA_PARAMS = {
15
+ "main": "TODO",
16
+ # development and testing
15
17
  "development": development.SCHEMA_PARAMS,
16
- }
17
-
18
- TESTING_SCHEMA_PARAMS = {
19
18
  "integration": testing.SCHEMA_PARAMS_ONE_STEP,
19
+ "output_dir": testing.SCHEMA_PARAMS_OUTPUT_DIR,
20
20
  "combine_bad_topology": testing.SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY,
21
21
  "combine_bad_implementation_names": testing.SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY,
22
22
  "nested_templated_steps": testing.SCHEMA_PARAMS_NESTED_TEMPLATED_STEPS,
@@ -26,7 +26,7 @@ from easylink.step import (
26
26
  )
27
27
  from easylink.utilities.aggregator_utils import concatenate_datasets
28
28
  from easylink.utilities.splitter_utils import split_data_in_two
29
- from easylink.utilities.validation_utils import validate_input_file_dummy
29
+ from easylink.utilities.validation_utils import validate_dir, validate_input_file_dummy
30
30
 
31
31
  NODES_ONE_STEP = [
32
32
  InputStep(),
@@ -582,3 +582,55 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
582
582
  ),
583
583
  ]
584
584
  SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
585
+
586
+ NODES_OUTPUT_DIR = [
587
+ InputStep(),
588
+ Step(
589
+ step_name="step_1_for_output_dir_example",
590
+ input_slots=[
591
+ InputSlot(
592
+ name="step_1_main_input",
593
+ env_var="STEP_1_MAIN_INPUT_FILE_PATHS",
594
+ validator=validate_input_file_dummy,
595
+ )
596
+ ],
597
+ output_slots=[OutputSlot("step_1_main_output_directory")],
598
+ ),
599
+ Step(
600
+ step_name="step_2_for_output_dir_example",
601
+ input_slots=[
602
+ InputSlot(
603
+ name="step_2_main_input",
604
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH",
605
+ validator=validate_dir,
606
+ )
607
+ ],
608
+ output_slots=[OutputSlot("step_2_main_output")],
609
+ ),
610
+ OutputStep(
611
+ input_slots=[
612
+ InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
613
+ ],
614
+ ),
615
+ ]
616
+ EDGES_OUTPUT_DIR = [
617
+ EdgeParams(
618
+ source_node="input_data",
619
+ target_node="step_1_for_output_dir_example",
620
+ output_slot="all",
621
+ input_slot="step_1_main_input",
622
+ ),
623
+ EdgeParams(
624
+ source_node="step_1_for_output_dir_example",
625
+ target_node="step_2_for_output_dir_example",
626
+ output_slot="step_1_main_output_directory",
627
+ input_slot="step_2_main_input",
628
+ ),
629
+ EdgeParams(
630
+ source_node="step_2_for_output_dir_example",
631
+ target_node="results",
632
+ output_slot="step_2_main_output",
633
+ input_slot="result",
634
+ ),
635
+ ]
636
+ SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
easylink/rule.py CHANGED
@@ -17,6 +17,7 @@ import os
17
17
  from abc import ABC, abstractmethod
18
18
  from collections.abc import Callable
19
19
  from dataclasses import dataclass
20
+ from pathlib import Path
20
21
 
21
22
 
22
23
  class Rule(ABC):
@@ -125,6 +126,18 @@ class ImplementedRule(Rule):
125
126
  def _build_io(self) -> str:
126
127
  """Builds the input/output portion of the rule."""
127
128
  log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
129
+ # Handle output files vs directories
130
+ files = [path for path in self.output if Path(path).suffix != ""]
131
+ if len(files) == len(self.output):
132
+ output = self.output
133
+ elif len(files) == 0:
134
+ if len(self.output) != 1:
135
+ raise NotImplementedError("Multiple output directories is not supported.")
136
+ output = f"directory('{self.output[0]}')"
137
+ else:
138
+ raise NotImplementedError(
139
+ "Mixed output types (files and directories) is not supported."
140
+ )
128
141
  io_str = (
129
142
  f"""
130
143
  rule:
@@ -132,7 +145,7 @@ rule:
132
145
  message: "Running {self.step_name} implementation: {self.implementation_name}" """
133
146
  + self._build_input()
134
147
  + f"""
135
- output: {self.output}
148
+ output: {output}
136
149
  log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
137
150
  container: "{self.image_path}" """
138
151
  )
easylink/runner.py CHANGED
@@ -19,7 +19,6 @@ from snakemake.cli import main as snake_main
19
19
 
20
20
  from easylink.configuration import Config, load_params_from_specification
21
21
  from easylink.pipeline import Pipeline
22
- from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
23
22
  from easylink.utilities.data_utils import (
24
23
  copy_configuration_files_to_results_directory,
25
24
  create_results_directory,
@@ -35,8 +34,8 @@ def main(
35
34
  input_data: str | Path,
36
35
  computing_environment: str | Path | None,
37
36
  results_dir: str | Path,
38
- debug=False,
39
- potential_schemas: PipelineSchema | list[PipelineSchema] = PIPELINE_SCHEMAS,
37
+ schema_name: str = "main",
38
+ debug: bool = False,
40
39
  ) -> None:
41
40
  """Runs an EasyLink command.
42
41
 
@@ -60,17 +59,16 @@ def main(
60
59
  to run the pipeline on. If None, the pipeline will be run locally.
61
60
  results_dir
62
61
  The directory to write results and incidental files (logs, etc.) to.
62
+ schema_name
63
+ The name of the schema to validate the pipeline configuration against.
63
64
  debug
64
65
  If False (the default), will suppress some of the workflow output. This
65
66
  is intended to only be used for testing and development purposes.
66
- potential_schemas
67
- A list of potential schemas to validate the pipeline configuration against.
68
- This is primarily used for testing purposes. Defaults to the supported schemas.
69
67
  """
70
68
  config_params = load_params_from_specification(
71
69
  pipeline_specification, input_data, computing_environment, results_dir
72
70
  )
73
- config = Config(config_params, potential_schemas)
71
+ config = Config(config_params, schema_name)
74
72
  pipeline = Pipeline(config)
75
73
  # After validation is completed, create the results directory
76
74
  create_results_directory(Path(results_dir))
@@ -0,0 +1,22 @@
1
+
2
+ Bootstrap: docker
3
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
4
+
5
+ %files
6
+ ./dummy_step_1_for_output_dir_example.py /dummy_step_1_for_output_dir_example.py
7
+
8
+ %post
9
+ # Create directories
10
+ mkdir -p /input_data
11
+ mkdir -p /extra_implementation_specific_input_data
12
+ mkdir -p /results
13
+ mkdir -p /diagnostics
14
+
15
+ # Install Python packages with specific versions
16
+ pip install pandas==2.1.2 pyarrow
17
+
18
+ %environment
19
+ export LC_ALL=C
20
+
21
+ %runscript
22
+ python /dummy_step_1_for_output_dir_example.py '$@'
@@ -0,0 +1,18 @@
1
+ # PIPELINE_SCHEMA: output_dir
2
+ # STEP_NAME: step_1_for_output_dir_example
3
+ # REQUIREMENTS: pandas==2.1.2 pyarrow
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+
10
+ data = pd.read_parquet(os.environ["STEP_1_MAIN_INPUT_FILE_PATHS"])
11
+
12
+ print(data)
13
+
14
+ dir_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
15
+ dir_path.mkdir(parents=True, exist_ok=True)
16
+
17
+ for i in range(3):
18
+ data.to_parquet(dir_path / f"result_{i}.parquet")
@@ -0,0 +1,22 @@
1
+
2
+ Bootstrap: docker
3
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
4
+
5
+ %files
6
+ ./dummy_step_2_for_output_dir_example.py /dummy_step_2_for_output_dir_example.py
7
+
8
+ %post
9
+ # Create directories
10
+ mkdir -p /input_data
11
+ mkdir -p /extra_implementation_specific_input_data
12
+ mkdir -p /results
13
+ mkdir -p /diagnostics
14
+
15
+ # Install Python packages with specific versions
16
+ pip install pandas==2.1.2 pyarrow
17
+
18
+ %environment
19
+ export LC_ALL=C
20
+
21
+ %runscript
22
+ python /dummy_step_2_for_output_dir_example.py '$@'
@@ -0,0 +1,22 @@
1
+ # PIPELINE_SCHEMA: output_dir
2
+ # STEP_NAME: step_2_for_output_dir_example
3
+ # REQUIREMENTS: pandas==2.1.2 pyarrow
4
+
5
+ import os
6
+ import shutil
7
+ from pathlib import Path
8
+
9
+ import pandas as pd
10
+
11
+ dir_path = Path(os.environ["DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH"])
12
+ saved = False
13
+
14
+ for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
15
+ if "snakemake" in str(f):
16
+ continue
17
+
18
+ if not saved:
19
+ shutil.copy(f, os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
20
+ saved = True
21
+
22
+ print(pd.read_parquet(f))
@@ -50,3 +50,9 @@ def validate_input_file_dummy(filepath: str) -> None:
50
50
  raise LookupError(
51
51
  f"Data file {filepath} is missing required column(s) {missing_columns}"
52
52
  )
53
+
54
+
55
+ def validate_dir(filepath: str) -> None:
56
+ input_path = Path(filepath)
57
+ if not input_path.is_dir():
58
+ raise NotADirectoryError(f"The path {filepath} is not a directory.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easylink
3
- Version: 0.1.16
3
+ Version: 0.1.18
4
4
  Summary: Research repository for the EasyLink ER ecosystem project.
5
5
  Home-page: https://github.com/ihmeuw/easylink
6
6
  Author: The EasyLink developers
@@ -1,23 +1,23 @@
1
1
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
2
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
- easylink/_version.py,sha256=yF88-8vL8keLe6gCTumymw0UoMkWkSrJnzLru4zBCLQ,23
4
- easylink/cli.py,sha256=mv9l9XHojfhDK4hpDeV1E4iensgt6zx2ovkGBQ8x9xk,9745
5
- easylink/configuration.py,sha256=lfm8ViUpr1-O-EovTjKZbAlIht2EBv3RndN1mzYbmDE,12565
3
+ easylink/_version.py,sha256=6BiuMUkhwQp6bzUZSF8np8F1NwCltEtK0sPBF__tepU,23
4
+ easylink/cli.py,sha256=v8OALTAI3WlNELrHuGQumlJFdmYML4K-XX_OtqSJYZM,9925
5
+ easylink/configuration.py,sha256=rFPTZMEBZjiXYZWesUqpppOj6ONxp3sybf5g9MiDsOY,11639
6
6
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
7
- easylink/implementation.py,sha256=4u3QgLOrNttfU9Kd_9u_lg3in4ePoYUfO9u_udwiuh0,10878
8
- easylink/implementation_metadata.yaml,sha256=_maN5UWFZxDykYcUrDXoEKMej4jeF_rZLt3QZj72kQM,6645
7
+ easylink/implementation.py,sha256=XLSat6_IXFn-nH6X8AazmfWhDtTK4GtA7yiht9QLlQQ,11366
8
+ easylink/implementation_metadata.yaml,sha256=trq5CvSSZRmqRQ979o68L2QONvlv-ncFXS-rh3-79Uk,7558
9
9
  easylink/pipeline.py,sha256=5KOYH5HyJjVlFoBRKGLs2hn5mpC3tPYG_ux3T1qSV9k,17504
10
10
  easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
11
- easylink/pipeline_schema.py,sha256=Q2sCpsC-F2W0yxVP7ufunowDepOBrRVENXOdap9J5iY,6921
12
- easylink/rule.py,sha256=uoPj7yFFqiwvxlnhoejrZuPR3YX--y1k02uDDz3viTc,16196
13
- easylink/runner.py,sha256=cbCo5_NvvulmjjAaBCG6qCmbtJiHK-7NuDvbngdU_PY,6675
11
+ easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
12
+ easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
13
+ easylink/runner.py,sha256=GhkPGDh9UFOb38ksqXpMKZoxXs9hZaOFzZDo2jlEp-U,6458
14
14
  easylink/step.py,sha256=u1AMPrYGNVb3ZH6uB_U0dUeJvOeQ2MoVHdlC8k63AA8,85226
15
- easylink/devtools/implementation_creator.py,sha256=mkiQ9nhtQC3mhxcG8IyvejzSK0WSkwplCztPLXbpXXQ,16199
15
+ easylink/devtools/implementation_creator.py,sha256=ddzJltlzOfvzwAMuInovCbfn3IM2u_s7I_dObWV4os0,16430
16
16
  easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
17
17
  easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
18
- easylink/pipeline_schema_constants/__init__.py,sha256=HbN-NytoGuk8aTfe0Wal232UnLopFBQGe2uRjmg_igQ,1272
18
+ easylink/pipeline_schema_constants/__init__.py,sha256=45S-Q69CugGfBroHuGR8c7Jlq1wqAy5lRtys5C_0--M,1337
19
19
  easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
20
- easylink/pipeline_schema_constants/testing.py,sha256=8vVGj7opZ9Uzj7EHGMbgXyZj3_SboIeUPB0XlZkmvrM,18901
20
+ easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
21
21
  easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
22
22
  easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
23
23
  easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
@@ -36,6 +36,10 @@ easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=j_RmVjspmXGOhJTr10ED
36
36
  easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
37
37
  easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
38
38
  easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
39
+ easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def,sha256=CkQVG-uDRQ9spAavdkZbhx2GD_fRsKZGELPrr8yltsc,550
40
+ easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=dI0OWugE35ABLcSwsI-T3C4dvuPTKXwjE52dtSsCo8Y,428
41
+ easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def,sha256=9gShg1EDJEHZcz7Z5VfZ1A4Gpm9XQes8ezn6rAZDgDM,550
42
+ easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=DMJW5TXjhELxhY4U9q2RpLjqxlS1YSosTGL2AfRnaZM,521
39
43
  easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
40
44
  easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
41
45
  easylink/utilities/data_utils.py,sha256=CcnM3u0_MQDQo3jMs3E4IK_rz8wAsFdJ674fZxYEFZg,4620
@@ -43,9 +47,9 @@ easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4v
43
47
  easylink/utilities/paths.py,sha256=KM1GlnsAcKbUJrC4LZKpeJfPljxe_aXP1ZhVp43TYRA,924
44
48
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
45
49
  easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
46
- easylink/utilities/validation_utils.py,sha256=W9r_RXcivJjfpioLhONirfwdByYttxNsVY489_sbrYQ,1683
47
- easylink-0.1.16.dist-info/METADATA,sha256=xkRlfeXuPHvvZXwEHaObnpu6MsOWSF6Lu-1wi7wRlJQ,3477
48
- easylink-0.1.16.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
49
- easylink-0.1.16.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
50
- easylink-0.1.16.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
51
- easylink-0.1.16.dist-info/RECORD,,
50
+ easylink/utilities/validation_utils.py,sha256=rOIeQbbXXdsuL2hI0i2gApAWfiNJXMwYH4pmw8uLrGM,1867
51
+ easylink-0.1.18.dist-info/METADATA,sha256=9RPc6nIJrkdNQxUXqVYQW26h2G3ukGuXyAmUA4razpA,3477
52
+ easylink-0.1.18.dist-info/WHEEL,sha256=QZxptf4Y1BKFRCEDxD4h2V0mBFQOVFLFEpvxHmIs52A,91
53
+ easylink-0.1.18.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
54
+ easylink-0.1.18.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
55
+ easylink-0.1.18.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.4.0)
2
+ Generator: setuptools (80.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5