easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +24 -3
  3. easylink/configuration.py +43 -36
  4. easylink/devtools/implementation_creator.py +71 -22
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -29
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema.py +12 -13
  9. easylink/pipeline_schema_constants/__init__.py +4 -5
  10. easylink/pipeline_schema_constants/main.py +489 -0
  11. easylink/runner.py +11 -7
  12. easylink/step.py +89 -0
  13. easylink/steps/cascading/exclude_clustered.def +22 -0
  14. easylink/steps/cascading/exclude_clustered.py +76 -0
  15. easylink/steps/cascading/exclude_none.def +22 -0
  16. easylink/steps/cascading/exclude_none.py +76 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  18. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  19. easylink/steps/default/default_clusters_to_links.def +22 -0
  20. easylink/steps/default/default_clusters_to_links.py +91 -0
  21. easylink/steps/default/default_determining_exclusions.def +22 -0
  22. easylink/steps/default/default_determining_exclusions.py +81 -0
  23. easylink/steps/default/default_removing_records.def +22 -0
  24. easylink/steps/default/default_removing_records.py +59 -0
  25. easylink/steps/default/default_schema_alignment.def +22 -0
  26. easylink/steps/default/default_schema_alignment.py +53 -0
  27. easylink/steps/default/default_updating_clusters.def +22 -0
  28. easylink/steps/default/default_updating_clusters.py +67 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  30. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  32. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  34. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  35. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  36. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  38. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  40. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  42. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  43. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  44. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  45. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  46. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  47. easylink/utilities/data_utils.py +72 -0
  48. easylink/utilities/paths.py +4 -3
  49. easylink/utilities/validation_utils.py +509 -11
  50. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  51. easylink-0.1.19.dist-info/RECORD +91 -0
  52. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  53. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  54. easylink-0.1.17.dist-info/RECORD +0 -55
  55. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  56. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,489 @@
1
+ """
2
+ =============================
3
+ Main EasyLink Pipeline Schema
4
+ =============================
5
+ """
6
+
7
+ from easylink.graph_components import (
8
+ EdgeParams,
9
+ InputSlot,
10
+ InputSlotMapping,
11
+ OutputSlot,
12
+ OutputSlotMapping,
13
+ )
14
+ from easylink.step import (
15
+ HierarchicalStep,
16
+ InputStep,
17
+ LoopStep,
18
+ OutputStep,
19
+ ParallelStep,
20
+ Step,
21
+ )
22
+ from easylink.utilities.validation_utils import (
23
+ dont_validate,
24
+ validate_blocks,
25
+ validate_clusters,
26
+ validate_dataset_dir,
27
+ validate_ids_to_remove,
28
+ validate_input_dataset_or_known_clusters,
29
+ validate_links,
30
+ validate_records,
31
+ )
32
+
33
# Top-level nodes of the main pipeline schema, in order: the input step, a
# loopable "entity_resolution" hierarchical step, the
# "canonicalizing_and_downstream_analysis" step, and the output step.
NODES = [
    # NOTE: In our pipeline schema as documented, there are two inputs: input datasets and known clusters
    # However, due to limitations currently in EasyLink, we can't have multiple output slots on the InputStep.
    # Instead we have a single undifferentiated slot and make it the *implementation's* problem to differentiate
    # based on filename.
    InputStep(),
    LoopStep(
        template_step=HierarchicalStep(
            step_name="entity_resolution",
            input_slots=[
                InputSlot(
                    name="input_datasets",
                    env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
                    # NOTE: Since this originates from the InputStep, it will be a *list*
                    # of files, and this validator will be called on *each*
                    # TODO: Change this when https://jira.ihme.washington.edu/browse/MIC-6070 is implemented
                    validator=validate_input_dataset_or_known_clusters,
                ),
                InputSlot(
                    name="known_clusters",
                    env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
                    validator=validate_input_dataset_or_known_clusters,
                ),
            ],
            output_slots=[OutputSlot("clusters")],
            # Sub-steps of entity_resolution, wired by the ``edges`` below as:
            # determining_exclusions_and_removing_records -> clustering -> updating_clusters
            nodes=[
                ParallelStep(
                    # NOTE: Splitters/aggregators on the ParallelStep are implicit!
                    template_step=HierarchicalStep(
                        step_name="determining_exclusions_and_removing_records",
                        directly_implemented=False,
                        input_slots=[
                            InputSlot(
                                name="input_datasets",
                                env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
                                validator=validate_input_dataset_or_known_clusters,
                            ),
                            InputSlot(
                                name="known_clusters",
                                env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
                                validator=validate_input_dataset_or_known_clusters,
                            ),
                        ],
                        output_slots=[OutputSlot("datasets")],
                        # determining_exclusions emits "ids_to_remove", which
                        # removing_records consumes (see ``edges`` below).
                        nodes=[
                            Step(
                                step_name="determining_exclusions",
                                input_slots=[
                                    InputSlot(
                                        name="input_datasets",
                                        env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
                                        validator=validate_input_dataset_or_known_clusters,
                                    ),
                                    InputSlot(
                                        name="known_clusters",
                                        env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
                                        validator=validate_input_dataset_or_known_clusters,
                                    ),
                                ],
                                output_slots=[OutputSlot("ids_to_remove")],
                            ),
                            Step(
                                step_name="removing_records",
                                input_slots=[
                                    InputSlot(
                                        name="input_datasets",
                                        env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
                                        validator=validate_input_dataset_or_known_clusters,
                                    ),
                                    InputSlot(
                                        name="ids_to_remove",
                                        env_var="IDS_TO_REMOVE_FILE_PATH",
                                        validator=validate_ids_to_remove,
                                    ),
                                ],
                                output_slots=[OutputSlot("dataset")],
                            ),
                        ],
                        edges=[
                            EdgeParams(
                                source_node="determining_exclusions",
                                target_node="removing_records",
                                output_slot="ids_to_remove",
                                input_slot="ids_to_remove",
                            )
                        ],
                        input_slot_mappings=[
                            # NOTE: This is the edge that would normally be split,
                            # but it won't be here, because we don't want it to split
                            # the known clusters to be a separate thing!
                            InputSlotMapping(
                                parent_slot="input_datasets",
                                child_node="determining_exclusions",
                                child_slot="input_datasets",
                            ),
                            InputSlotMapping(
                                parent_slot="known_clusters",
                                child_node="determining_exclusions",
                                child_slot="known_clusters",
                            ),
                            InputSlotMapping(
                                parent_slot="input_datasets",
                                child_node="removing_records",
                                child_slot="input_datasets",
                            ),
                        ],
                        output_slot_mappings=[
                            OutputSlotMapping(
                                # Becomes multiple, after implicit cloneable aggregator
                                parent_slot="datasets",
                                child_node="removing_records",
                                child_slot="dataset",
                            )
                        ],
                    )
                ),
                HierarchicalStep(
                    step_name="clustering",
                    input_slots=[
                        InputSlot(
                            name="datasets",
                            env_var="DATASETS_FILE_PATHS",
                            validator=validate_dataset_dir,
                        ),
                        InputSlot(
                            name="known_clusters",
                            env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
                            validator=validate_input_dataset_or_known_clusters,
                        ),
                    ],
                    output_slots=[OutputSlot("new_clusters")],
                    # Sub-steps of clustering, wired by the ``edges`` below as:
                    # clusters_to_links -> linking -> links_to_clusters
                    nodes=[
                        Step(
                            step_name="clusters_to_links",
                            input_slots=[
                                InputSlot(
                                    name="known_clusters",
                                    env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
                                    validator=validate_input_dataset_or_known_clusters,
                                ),
                            ],
                            output_slots=[OutputSlot("known_links")],
                        ),
                        LoopStep(
                            template_step=HierarchicalStep(
                                step_name="linking",
                                input_slots=[
                                    InputSlot(
                                        name="datasets",
                                        env_var="DATASETS_FILE_PATHS",
                                        validator=validate_dataset_dir,
                                    ),
                                    InputSlot(
                                        name="known_links",
                                        env_var="KNOWN_LINKS_FILE_PATH",
                                        validator=validate_links,
                                    ),
                                ],
                                output_slots=[OutputSlot("links")],
                                # Sub-steps of linking, wired by the ``edges`` below as:
                                # pre-processing -> schema_alignment
                                #   -> blocking_and_filtering -> evaluating_pairs
                                nodes=[
                                    ParallelStep(
                                        template_step=LoopStep(
                                            template_step=Step(
                                                step_name="pre-processing",
                                                input_slots=[
                                                    InputSlot(
                                                        # NOTE: No splitter here, because
                                                        # not supported by EasyLink;
                                                        # the implementation must do the splitting itself.
                                                        name="dataset",
                                                        env_var="DATASET_DIR_PATHS",
                                                        validator=validate_dataset_dir,
                                                    ),
                                                ],
                                                output_slots=[OutputSlot("dataset")],
                                            ),
                                            self_edges=[
                                                EdgeParams(
                                                    source_node="pre-processing",
                                                    target_node="pre-processing",
                                                    output_slot="dataset",
                                                    input_slot="dataset",
                                                ),
                                            ],
                                        )
                                    ),
                                    Step(
                                        step_name="schema_alignment",
                                        input_slots=[
                                            InputSlot(
                                                name="datasets",
                                                env_var="DATASETS_DIR_PATHS",
                                                validator=validate_dataset_dir,
                                            ),
                                        ],
                                        output_slots=[OutputSlot("records")],
                                    ),
                                    Step(
                                        step_name="blocking_and_filtering",
                                        input_slots=[
                                            InputSlot(
                                                name="records",
                                                env_var="RECORDS_FILE_PATH",
                                                validator=validate_records,
                                            ),
                                            InputSlot(
                                                name="known_links",
                                                env_var="KNOWN_LINKS_FILE_PATH",
                                                validator=validate_links,
                                            ),
                                        ],
                                        output_slots=[OutputSlot("blocks")],
                                    ),
                                    Step(
                                        step_name="evaluating_pairs",
                                        input_slots=[
                                            InputSlot(
                                                name="blocks",
                                                env_var="BLOCKS_DIR_PATH",
                                                validator=validate_blocks,
                                            ),
                                            InputSlot(
                                                name="known_links",
                                                env_var="KNOWN_LINKS_FILE_PATH",
                                                validator=validate_links,
                                            ),
                                        ],
                                        output_slots=[OutputSlot("links")],
                                    ),
                                ],
                                edges=[
                                    EdgeParams(
                                        source_node="pre-processing",
                                        target_node="schema_alignment",
                                        output_slot="dataset",
                                        # NOTE: The implicit ParallelStep aggregator has
                                        # made this multiple (a list)
                                        input_slot="datasets",
                                    ),
                                    EdgeParams(
                                        source_node="schema_alignment",
                                        target_node="blocking_and_filtering",
                                        output_slot="records",
                                        input_slot="records",
                                    ),
                                    EdgeParams(
                                        source_node="blocking_and_filtering",
                                        target_node="evaluating_pairs",
                                        output_slot="blocks",
                                        input_slot="blocks",
                                    ),
                                ],
                                input_slot_mappings=[
                                    InputSlotMapping(
                                        parent_slot="datasets",
                                        child_node="pre-processing",
                                        child_slot="dataset",
                                    ),
                                    InputSlotMapping(
                                        parent_slot="known_links",
                                        child_node="blocking_and_filtering",
                                        child_slot="known_links",
                                    ),
                                    InputSlotMapping(
                                        parent_slot="known_links",
                                        child_node="evaluating_pairs",
                                        child_slot="known_links",
                                    ),
                                ],
                                output_slot_mappings=[
                                    OutputSlotMapping(
                                        parent_slot="links",
                                        child_node="evaluating_pairs",
                                        child_slot="links",
                                    )
                                ],
                            ),
                            # NOTE(review): presumably a self edge feeds one loop
                            # iteration's output slot into the next iteration's
                            # input slot -- confirm against LoopStep.
                            self_edges=[
                                EdgeParams(
                                    source_node="linking",
                                    target_node="linking",
                                    output_slot="links",
                                    input_slot="known_links",
                                )
                            ],
                        ),
                        Step(
                            step_name="links_to_clusters",
                            input_slots=[
                                InputSlot(
                                    name="links",
                                    env_var="LINKS_FILE_PATH",
                                    validator=validate_links,
                                ),
                            ],
                            output_slots=[OutputSlot("clusters")],
                        ),
                    ],
                    edges=[
                        EdgeParams(
                            source_node="clusters_to_links",
                            target_node="linking",
                            output_slot="known_links",
                            input_slot="known_links",
                        ),
                        EdgeParams(
                            source_node="linking",
                            target_node="links_to_clusters",
                            output_slot="links",
                            input_slot="links",
                        ),
                    ],
                    input_slot_mappings=[
                        InputSlotMapping(
                            parent_slot="datasets",
                            child_node="linking",
                            child_slot="datasets",
                        ),
                        InputSlotMapping(
                            parent_slot="known_clusters",
                            child_node="clusters_to_links",
                            child_slot="known_clusters",
                        ),
                    ],
                    output_slot_mappings=[
                        OutputSlotMapping(
                            parent_slot="new_clusters",
                            child_node="links_to_clusters",
                            child_slot="clusters",
                        ),
                    ],
                ),
                Step(
                    step_name="updating_clusters",
                    input_slots=[
                        InputSlot(
                            name="new_clusters",
                            env_var="NEW_CLUSTERS_FILE_PATH",
                            validator=validate_clusters,
                        ),
                        InputSlot(
                            name="known_clusters",
                            env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
                            validator=validate_input_dataset_or_known_clusters,
                        ),
                    ],
                    output_slots=[OutputSlot("clusters")],
                ),
            ],
            edges=[
                EdgeParams(
                    source_node="determining_exclusions_and_removing_records",
                    target_node="clustering",
                    output_slot="datasets",
                    input_slot="datasets",
                ),
                EdgeParams(
                    source_node="clustering",
                    target_node="updating_clusters",
                    output_slot="new_clusters",
                    input_slot="new_clusters",
                ),
            ],
            input_slot_mappings=[
                InputSlotMapping(
                    parent_slot="input_datasets",
                    child_node="determining_exclusions_and_removing_records",
                    child_slot="input_datasets",
                ),
                InputSlotMapping(
                    parent_slot="known_clusters",
                    child_node="determining_exclusions_and_removing_records",
                    child_slot="known_clusters",
                ),
                InputSlotMapping(
                    parent_slot="known_clusters",
                    child_node="clustering",
                    child_slot="known_clusters",
                ),
                InputSlotMapping(
                    parent_slot="known_clusters",
                    child_node="updating_clusters",
                    child_slot="known_clusters",
                ),
            ],
            output_slot_mappings=[
                OutputSlotMapping(
                    child_node="updating_clusters",
                    child_slot="clusters",
                    parent_slot="clusters",
                ),
            ],
        ),
        self_edges=[
            EdgeParams(
                source_node="entity_resolution",
                target_node="entity_resolution",
                output_slot="clusters",
                input_slot="known_clusters",
            )
        ],
    ),
    Step(
        step_name="canonicalizing_and_downstream_analysis",
        input_slots=[
            InputSlot(
                name="input_datasets",
                env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
                validator=validate_input_dataset_or_known_clusters,
            ),
            InputSlot(
                name="clusters",
                env_var="CLUSTERS_FILE_PATH",
                validator=validate_clusters,
            ),
        ],
        output_slots=[OutputSlot("analysis_output")],
    ),
    OutputStep(
        input_slots=[
            InputSlot(name="analysis_output", env_var=None, validator=dont_validate)
        ],
    ),
]
457
# Top-level edges of the main schema, one row per edge in the order
# (source_node, target_node, output_slot, input_slot). The single "all" output
# slot of "input_data" fans out to three different input slots.
EDGES = [
    EdgeParams(
        source_node=source,
        target_node=target,
        output_slot=produced,
        input_slot=consumed,
    )
    for source, target, produced, consumed in (
        ("input_data", "entity_resolution", "all", "input_datasets"),
        ("input_data", "entity_resolution", "all", "known_clusters"),
        ("input_data", "canonicalizing_and_downstream_analysis", "all", "input_datasets"),
        ("entity_resolution", "canonicalizing_and_downstream_analysis", "clusters", "clusters"),
        ("canonicalizing_and_downstream_analysis", "results", "analysis_output", "analysis_output"),
    )
]
# The (nodes, edges) pair bundled together for the main schema.
SCHEMA_PARAMS = (NODES, EDGES)
easylink/runner.py CHANGED
@@ -19,7 +19,6 @@ from snakemake.cli import main as snake_main
19
19
 
20
20
  from easylink.configuration import Config, load_params_from_specification
21
21
  from easylink.pipeline import Pipeline
22
- from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
23
22
  from easylink.utilities.data_utils import (
24
23
  copy_configuration_files_to_results_directory,
25
24
  create_results_directory,
@@ -35,8 +34,9 @@ def main(
35
34
  input_data: str | Path,
36
35
  computing_environment: str | Path | None,
37
36
  results_dir: str | Path,
38
- debug=False,
39
- potential_schemas: PipelineSchema | list[PipelineSchema] = PIPELINE_SCHEMAS,
37
+ images_dir: str | None,
38
+ schema_name: str = "main",
39
+ debug: bool = False,
40
40
  ) -> None:
41
41
  """Runs an EasyLink command.
42
42
 
@@ -60,17 +60,21 @@ def main(
60
60
  to run the pipeline on. If None, the pipeline will be run locally.
61
61
  results_dir
62
62
  The directory to write results and incidental files (logs, etc.) to.
63
+ images_dir
64
+ The directory containing the images or to download the images to if they
65
+ don't exist. If None, will default to ~/.easylink_images.
66
+ schema_name
67
+ The name of the schema to validate the pipeline configuration against.
63
68
  debug
64
69
  If False (the default), will suppress some of the workflow output. This
65
70
  is intended to only be used for testing and development purposes.
66
- potential_schemas
67
- A list of potential schemas to validate the pipeline configuration against.
68
- This is primarily used for testing purposes. Defaults to the supported schemas.
69
71
  """
70
72
  config_params = load_params_from_specification(
71
73
  pipeline_specification, input_data, computing_environment, results_dir
72
74
  )
73
- config = Config(config_params, potential_schemas)
75
+ config = Config(
76
+ config_params, schema_name=schema_name, images_dir=images_dir, command=command
77
+ )
74
78
  pipeline = Pipeline(config)
75
79
  # After validation is completed, create the results directory
76
80
  create_results_directory(Path(results_dir))
easylink/step.py CHANGED
@@ -104,8 +104,19 @@ class Step:
104
104
  during the process of flattening the ``Stepgraph``, e.g. unrolling loops, etc.
105
105
  For example, if step 1 is looped multiple times, each node would have a
106
106
  ``step_name`` of, perhaps, "step_1" but unique ``names`` ("step_1_loop_1", etc)."""
107
+
108
+ if len(set(slot.name for slot in input_slots)) != len(input_slots):
109
+ raise ValueError(f"{step_name} has duplicate input slot names!")
110
+
111
+ if len(set(s.env_var for s in input_slots)) != len(input_slots):
112
+ raise ValueError(f"{step_name} has duplicate input slot environment variables!")
113
+
107
114
  self.input_slots = {slot.name: slot for slot in input_slots}
108
115
  """A mapping of ``InputSlot`` names to their instances."""
116
+
117
+ if len(set(s.name for s in output_slots)) != len(output_slots):
118
+ raise ValueError(f"{step_name} has duplicate output slot names!")
119
+
109
120
  self.output_slots = {slot.name: slot for slot in output_slots}
110
121
  """A mapping of ``OutputSlot`` names to their instances."""
111
122
  self.slot_mappings = {
@@ -592,6 +603,10 @@ class HierarchicalStep(Step):
592
603
  attribute to allow for back-end ``HierarchicalStep`` creation that are not
593
604
  user-facing (i.e. they do not need to provide a 'substeps' configuration key)."""
594
605
 
606
+ self._check_edges_are_valid()
607
+ self._check_slot_mappings_are_valid()
608
+ self._check_validators_are_consistent()
609
+
595
610
  @property
596
611
  def config_key(self):
597
612
  """The pipeline specification key required for a ``HierarchicalStep``."""
@@ -721,6 +736,80 @@ class HierarchicalStep(Step):
721
736
  errors[f"step {extra_step}"] = [f"{extra_step} is not a valid step."]
722
737
  return errors
723
738
 
739
def _check_edges_are_valid(self):
    """Verify that every edge joins two nodes and two slots that really exist.

    An edge runs from an *output* slot on its *source* node to an *input*
    slot on its *target* node; both ends are checked in that order.

    Raises
    ------
    ValueError
        If an edge names a node that is not in the step graph, or a slot
        that the named node's step does not have.
    """
    # Pairs of (which end of the edge, which kind of slot lives on that end).
    endpoints = (("source", "output"), ("target", "input"))
    for edge in self.edges:
        for role, direction in endpoints:
            endpoint_name = getattr(edge, f"{role}_node")
            graph_nodes = self.step_graph.nodes
            if endpoint_name not in graph_nodes:
                raise ValueError(f"Edge {edge} has non-existent {role} node")

            slot_name = getattr(edge, f"{direction}_slot")
            endpoint_step = graph_nodes[endpoint_name]["step"]
            if slot_name not in getattr(endpoint_step, f"{direction}_slots"):
                raise ValueError(f"Edge {edge} has non-existent {role} slot")
752
+
753
def _check_slot_mappings_are_valid(self):
    """Verify the input and output slot mappings of this (parent) step.

    For each direction ("input", then "output") two things must hold:
    the set of parent slots named by the mappings equals the set of slots
    declared on this step, and every mapping points at a child node in the
    step graph and at a slot that exists on that child's step.

    Raises
    ------
    ValueError
        If the declared slots and mapped parent slots differ, or a mapping
        targets a missing child node or a missing slot on that child.
    """
    for direction in ("input", "output"):
        slots_attr = f"{direction}_slots"
        declared_slots = getattr(self, slots_attr)
        mappings = self.slot_mappings[direction]

        mapped_parent_slots = {mapping.parent_slot for mapping in mappings}
        if set(declared_slots) != mapped_parent_slots:
            raise ValueError(
                f"{self.step_name} {direction} slots do not match {direction} slot mappings"
            )

        for mapping in mappings:
            graph_nodes = self.step_graph.nodes
            if mapping.child_node not in graph_nodes:
                raise ValueError(
                    f"{self.step_name} {direction} slot {mapping.parent_slot} maps to non-existent child node {mapping.child_node}"
                )
            child_step = graph_nodes[mapping.child_node]["step"]
            if mapping.child_slot not in getattr(child_step, slots_attr):
                raise ValueError(
                    f"{self.step_name} {direction} slot {mapping.parent_slot} maps to non-existent slot {mapping.child_slot} on child node {mapping.child_node}"
                )
780
+
781
def _check_validators_are_consistent(self):
    """Check that input slots receiving the same data share one validator.

    Two situations are checked:

    * a parent input slot mapped onto a child input slot by an input slot
      mapping: the child's validator must equal the parent's;
    * several edges leaving the same (source node, output slot): every
      input slot they arrive at must have the same validator.

    Each edge's input slot is resolved from the target node's step by
    ``edge.input_slot``. Looking the edge up in the graph under a fixed
    multi-edge key of ``0`` would always return the *first* edge between a
    pair of nodes, so additional edges between the same two nodes (e.g. one
    output slot feeding two input slots of the same step) would never have
    their own input slot inspected.

    Raises
    ------
    ValueError
        If a mapped child slot's validator differs from its parent slot's,
        or input slots fed by the same output slot disagree on validator.
    """
    # Check that input slots mapped to by our slot mappings have consistent validators
    for sm in self.slot_mappings["input"]:
        expected_validator = self.input_slots[sm.parent_slot].validator
        child_step = self.step_graph.nodes[sm.child_node]["step"]
        child_input_slot = child_step.input_slots[sm.child_slot]
        if child_input_slot.validator != expected_validator:
            raise ValueError(
                f"{sm.child_node}'s {sm.child_slot}, which is mapped from {self.step_name}'s {sm.parent_slot}, does not have the same validator"
            )

    # Check that input slots receiving the same data have consistent validators
    validators_by_child_output_slot = {}
    for edge in self.edges:
        # Resolve the slot this particular edge targets, not merely the
        # first edge that happens to join the same two nodes.
        target_step = self.step_graph.nodes[edge.target_node]["step"]
        child_input_slot = target_step.input_slots[edge.input_slot]
        source_slot = (edge.source_node, edge.output_slot)
        if source_slot not in validators_by_child_output_slot:
            validators_by_child_output_slot[source_slot] = child_input_slot.validator
        elif child_input_slot.validator != validators_by_child_output_slot[source_slot]:
            raise ValueError(
                f"Not all input slots receiving edges from {edge.source_node}'s {edge.output_slot} have the same validator"
            )
812
+
724
813
 
725
814
  class TemplatedStep(Step, ABC):
726
815
  """A type of :class:`Step` that may contain multiplicity.
@@ -0,0 +1,22 @@
1
+
2
Bootstrap: docker
From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899

%files
    ./exclude_clustered.py /exclude_clustered.py

%post
    # Create directories
    mkdir -p /input_data
    mkdir -p /extra_implementation_specific_input_data
    mkdir -p /results
    mkdir -p /diagnostics

    # Install Python packages (only pandas is pinned to a specific version)
    pip install pandas==2.1.2 pyarrow pyyaml

%environment
    export LC_ALL=C

%runscript
    # Forward the container's command-line arguments to the script. "$@" must
    # be double-quoted: in single quotes the shell passes the literal text $@.
    python /exclude_clustered.py "$@"