easylink 0.1.17-py3-none-any.whl → 0.1.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +24 -3
- easylink/configuration.py +43 -36
- easylink/devtools/implementation_creator.py +71 -22
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -29
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -5
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +11 -7
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.17.dist-info/RECORD +0 -55
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/pipeline_schema_constants/main.py
ADDED
@@ -0,0 +1,489 @@
+"""
+=============================
+Main EasyLink Pipeline Schema
+=============================
+"""
+
+from easylink.graph_components import (
+    EdgeParams,
+    InputSlot,
+    InputSlotMapping,
+    OutputSlot,
+    OutputSlotMapping,
+)
+from easylink.step import (
+    HierarchicalStep,
+    InputStep,
+    LoopStep,
+    OutputStep,
+    ParallelStep,
+    Step,
+)
+from easylink.utilities.validation_utils import (
+    dont_validate,
+    validate_blocks,
+    validate_clusters,
+    validate_dataset_dir,
+    validate_ids_to_remove,
+    validate_input_dataset_or_known_clusters,
+    validate_links,
+    validate_records,
+)
+
+NODES = [
+    # NOTE: In our pipeline schema as documented, there are two inputs: input datasets and known clusters
+    # However, due to limitations currently in EasyLink, we can't have multiple output slots on the InputStep.
+    # Instead we have a single undifferentiated slot and make it the *implementation's* problem to differentiate
+    # based on filename.
+    InputStep(),
+    LoopStep(
+        template_step=HierarchicalStep(
+            step_name="entity_resolution",
+            input_slots=[
+                InputSlot(
+                    name="input_datasets",
+                    env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                    # NOTE: Since this originates from the InputStep, it will be a *list*
+                    # of files, and this validator will be called on *each*
+                    # TODO: Change this when https://jira.ihme.washington.edu/browse/MIC-6070 is implemented
+                    validator=validate_input_dataset_or_known_clusters,
+                ),
+                InputSlot(
+                    name="known_clusters",
+                    env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                    validator=validate_input_dataset_or_known_clusters,
+                ),
+            ],
+            output_slots=[OutputSlot("clusters")],
+            nodes=[
+                ParallelStep(
+                    # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+                    template_step=HierarchicalStep(
+                        step_name="determining_exclusions_and_removing_records",
+                        directly_implemented=False,
+                        input_slots=[
+                            InputSlot(
+                                name="input_datasets",
+                                env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                                validator=validate_input_dataset_or_known_clusters,
+                            ),
+                            InputSlot(
+                                name="known_clusters",
+                                env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                                validator=validate_input_dataset_or_known_clusters,
+                            ),
+                        ],
+                        output_slots=[OutputSlot("datasets")],
+                        nodes=[
+                            Step(
+                                step_name="determining_exclusions",
+                                input_slots=[
+                                    InputSlot(
+                                        name="input_datasets",
+                                        env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                                        validator=validate_input_dataset_or_known_clusters,
+                                    ),
+                                    InputSlot(
+                                        name="known_clusters",
+                                        env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                                        validator=validate_input_dataset_or_known_clusters,
+                                    ),
+                                ],
+                                output_slots=[OutputSlot("ids_to_remove")],
+                            ),
+                            Step(
+                                step_name="removing_records",
+                                input_slots=[
+                                    InputSlot(
+                                        name="input_datasets",
+                                        env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                                        validator=validate_input_dataset_or_known_clusters,
+                                    ),
+                                    InputSlot(
+                                        name="ids_to_remove",
+                                        env_var="IDS_TO_REMOVE_FILE_PATH",
+                                        validator=validate_ids_to_remove,
+                                    ),
+                                ],
+                                output_slots=[OutputSlot("dataset")],
+                            ),
+                        ],
+                        edges=[
+                            EdgeParams(
+                                source_node="determining_exclusions",
+                                target_node="removing_records",
+                                output_slot="ids_to_remove",
+                                input_slot="ids_to_remove",
+                            )
+                        ],
+                        input_slot_mappings=[
+                            # NOTE: This is the edge that would normally be split,
+                            # but it won't be here, because we don't want it to split
+                            # the known clusters to be a separate thing!
+                            InputSlotMapping(
+                                parent_slot="input_datasets",
+                                child_node="determining_exclusions",
+                                child_slot="input_datasets",
+                            ),
+                            InputSlotMapping(
+                                parent_slot="known_clusters",
+                                child_node="determining_exclusions",
+                                child_slot="known_clusters",
+                            ),
+                            InputSlotMapping(
+                                parent_slot="input_datasets",
+                                child_node="removing_records",
+                                child_slot="input_datasets",
+                            ),
+                        ],
+                        output_slot_mappings=[
+                            OutputSlotMapping(
+                                # Becomes multiple, after implicit cloneable aggregator
+                                parent_slot="datasets",
+                                child_node="removing_records",
+                                child_slot="dataset",
+                            )
+                        ],
+                    )
+                ),
+                HierarchicalStep(
+                    step_name="clustering",
+                    input_slots=[
+                        InputSlot(
+                            name="datasets",
+                            env_var="DATASETS_FILE_PATHS",
+                            validator=validate_dataset_dir,
+                        ),
+                        InputSlot(
+                            name="known_clusters",
+                            env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                            validator=validate_input_dataset_or_known_clusters,
+                        ),
+                    ],
+                    output_slots=[OutputSlot("new_clusters")],
+                    nodes=[
+                        Step(
+                            step_name="clusters_to_links",
+                            input_slots=[
+                                InputSlot(
+                                    name="known_clusters",
+                                    env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                                    validator=validate_input_dataset_or_known_clusters,
+                                ),
+                            ],
+                            output_slots=[OutputSlot("known_links")],
+                        ),
+                        LoopStep(
+                            template_step=HierarchicalStep(
+                                step_name="linking",
+                                input_slots=[
+                                    InputSlot(
+                                        name="datasets",
+                                        env_var="DATASETS_FILE_PATHS",
+                                        validator=validate_dataset_dir,
+                                    ),
+                                    InputSlot(
+                                        name="known_links",
+                                        env_var="KNOWN_LINKS_FILE_PATH",
+                                        validator=validate_links,
+                                    ),
+                                ],
+                                output_slots=[OutputSlot("links")],
+                                nodes=[
+                                    ParallelStep(
+                                        template_step=LoopStep(
+                                            template_step=Step(
+                                                step_name="pre-processing",
+                                                input_slots=[
+                                                    InputSlot(
+                                                        # NOTE: No splitter here, because
+                                                        # not supported by EasyLink;
+                                                        # the implementation must do the splitting itself.
+                                                        name="dataset",
+                                                        env_var="DATASET_DIR_PATHS",
+                                                        validator=validate_dataset_dir,
+                                                    ),
+                                                ],
+                                                output_slots=[OutputSlot("dataset")],
+                                            ),
+                                            self_edges=[
+                                                EdgeParams(
+                                                    source_node="pre-processing",
+                                                    target_node="pre-processing",
+                                                    output_slot="dataset",
+                                                    input_slot="dataset",
+                                                ),
+                                            ],
+                                        )
+                                    ),
+                                    Step(
+                                        step_name="schema_alignment",
+                                        input_slots=[
+                                            InputSlot(
+                                                name="datasets",
+                                                env_var="DATASETS_DIR_PATHS",
+                                                validator=validate_dataset_dir,
+                                            ),
+                                        ],
+                                        output_slots=[OutputSlot("records")],
+                                    ),
+                                    Step(
+                                        step_name="blocking_and_filtering",
+                                        input_slots=[
+                                            InputSlot(
+                                                name="records",
+                                                env_var="RECORDS_FILE_PATH",
+                                                validator=validate_records,
+                                            ),
+                                            InputSlot(
+                                                name="known_links",
+                                                env_var="KNOWN_LINKS_FILE_PATH",
+                                                validator=validate_links,
+                                            ),
+                                        ],
+                                        output_slots=[OutputSlot("blocks")],
+                                    ),
+                                    Step(
+                                        step_name="evaluating_pairs",
+                                        input_slots=[
+                                            InputSlot(
+                                                name="blocks",
+                                                env_var="BLOCKS_DIR_PATH",
+                                                validator=validate_blocks,
+                                            ),
+                                            InputSlot(
+                                                name="known_links",
+                                                env_var="KNOWN_LINKS_FILE_PATH",
+                                                validator=validate_links,
+                                            ),
+                                        ],
+                                        output_slots=[OutputSlot("links")],
+                                    ),
+                                ],
+                                edges=[
+                                    EdgeParams(
+                                        source_node="pre-processing",
+                                        target_node="schema_alignment",
+                                        output_slot="dataset",
+                                        # NOTE: The implicit ParallelStep aggregator has
+                                        # made this multiple (a list)
+                                        input_slot="datasets",
+                                    ),
+                                    EdgeParams(
+                                        source_node="schema_alignment",
+                                        target_node="blocking_and_filtering",
+                                        output_slot="records",
+                                        input_slot="records",
+                                    ),
+                                    EdgeParams(
+                                        source_node="blocking_and_filtering",
+                                        target_node="evaluating_pairs",
+                                        output_slot="blocks",
+                                        input_slot="blocks",
+                                    ),
+                                ],
+                                input_slot_mappings=[
+                                    InputSlotMapping(
+                                        parent_slot="datasets",
+                                        child_node="pre-processing",
+                                        child_slot="dataset",
+                                    ),
+                                    InputSlotMapping(
+                                        parent_slot="known_links",
+                                        child_node="blocking_and_filtering",
+                                        child_slot="known_links",
+                                    ),
+                                    InputSlotMapping(
+                                        parent_slot="known_links",
+                                        child_node="evaluating_pairs",
+                                        child_slot="known_links",
+                                    ),
+                                ],
+                                output_slot_mappings=[
+                                    OutputSlotMapping(
+                                        parent_slot="links",
+                                        child_node="evaluating_pairs",
+                                        child_slot="links",
+                                    )
+                                ],
+                            ),
+                            self_edges=[
+                                EdgeParams(
+                                    source_node="linking",
+                                    target_node="linking",
+                                    output_slot="links",
+                                    input_slot="known_links",
+                                )
+                            ],
+                        ),
+                        Step(
+                            step_name="links_to_clusters",
+                            input_slots=[
+                                InputSlot(
+                                    name="links",
+                                    env_var="LINKS_FILE_PATH",
+                                    validator=validate_links,
+                                ),
+                            ],
+                            output_slots=[OutputSlot("clusters")],
+                        ),
+                    ],
+                    edges=[
+                        EdgeParams(
+                            source_node="clusters_to_links",
+                            target_node="linking",
+                            output_slot="known_links",
+                            input_slot="known_links",
+                        ),
+                        EdgeParams(
+                            source_node="linking",
+                            target_node="links_to_clusters",
+                            output_slot="links",
+                            input_slot="links",
+                        ),
+                    ],
+                    input_slot_mappings=[
+                        InputSlotMapping(
+                            parent_slot="datasets",
+                            child_node="linking",
+                            child_slot="datasets",
+                        ),
+                        InputSlotMapping(
+                            parent_slot="known_clusters",
+                            child_node="clusters_to_links",
+                            child_slot="known_clusters",
+                        ),
+                    ],
+                    output_slot_mappings=[
+                        OutputSlotMapping(
+                            parent_slot="new_clusters",
+                            child_node="links_to_clusters",
+                            child_slot="clusters",
+                        ),
+                    ],
+                ),
+                Step(
+                    step_name="updating_clusters",
+                    input_slots=[
+                        InputSlot(
+                            name="new_clusters",
+                            env_var="NEW_CLUSTERS_FILE_PATH",
+                            validator=validate_clusters,
+                        ),
+                        InputSlot(
+                            name="known_clusters",
+                            env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                            validator=validate_input_dataset_or_known_clusters,
+                        ),
+                    ],
+                    output_slots=[OutputSlot("clusters")],
+                ),
+            ],
+            edges=[
+                EdgeParams(
+                    source_node="determining_exclusions_and_removing_records",
+                    target_node="clustering",
+                    output_slot="datasets",
+                    input_slot="datasets",
+                ),
+                EdgeParams(
+                    source_node="clustering",
+                    target_node="updating_clusters",
+                    output_slot="new_clusters",
+                    input_slot="new_clusters",
+                ),
+            ],
+            input_slot_mappings=[
+                InputSlotMapping(
+                    parent_slot="input_datasets",
+                    child_node="determining_exclusions_and_removing_records",
+                    child_slot="input_datasets",
+                ),
+                InputSlotMapping(
+                    parent_slot="known_clusters",
+                    child_node="determining_exclusions_and_removing_records",
+                    child_slot="known_clusters",
+                ),
+                InputSlotMapping(
+                    parent_slot="known_clusters",
+                    child_node="clustering",
+                    child_slot="known_clusters",
+                ),
+                InputSlotMapping(
+                    parent_slot="known_clusters",
+                    child_node="updating_clusters",
+                    child_slot="known_clusters",
+                ),
+            ],
+            output_slot_mappings=[
+                OutputSlotMapping(
+                    child_node="updating_clusters",
+                    child_slot="clusters",
+                    parent_slot="clusters",
+                ),
+            ],
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="entity_resolution",
+                target_node="entity_resolution",
+                output_slot="clusters",
+                input_slot="known_clusters",
+            )
+        ],
+    ),
+    Step(
+        step_name="canonicalizing_and_downstream_analysis",
+        input_slots=[
+            InputSlot(
+                name="input_datasets",
+                env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                validator=validate_input_dataset_or_known_clusters,
+            ),
+            InputSlot(
+                name="clusters",
+                env_var="CLUSTERS_FILE_PATH",
+                validator=validate_clusters,
+            ),
+        ],
+        output_slots=[OutputSlot("analysis_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="analysis_output", env_var=None, validator=dont_validate)
+        ],
+    ),
+]
+EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="entity_resolution",
+        output_slot="all",
+        input_slot="input_datasets",
+    ),
+    EdgeParams(
+        source_node="input_data",
+        target_node="entity_resolution",
+        output_slot="all",
+        input_slot="known_clusters",
+    ),
+    EdgeParams(
+        source_node="input_data",
+        target_node="canonicalizing_and_downstream_analysis",
+        output_slot="all",
+        input_slot="input_datasets",
+    ),
+    EdgeParams(
+        source_node="entity_resolution",
+        target_node="canonicalizing_and_downstream_analysis",
+        output_slot="clusters",
+        input_slot="clusters",
+    ),
+    EdgeParams(
+        source_node="canonicalizing_and_downstream_analysis",
+        target_node="results",
+        output_slot="analysis_output",
+        input_slot="analysis_output",
+    ),
+]
+SCHEMA_PARAMS = (NODES, EDGES)
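The module ends by packing the graph into SCHEMA_PARAMS = (NODES, EDGES). How easylink/pipeline_schema_constants/__init__.py (+4 -5 in this release) selects this module when schema_name="main" is passed through runner.main and Config is not shown in this diff; the snippet below is only a plausible, standalone sketch of such a lookup, not the packaged __init__.py.

# Hypothetical registry keyed by schema name (an assumption, not the shipped code).
from easylink.pipeline_schema_constants import main

SCHEMA_PARAMS_BY_NAME = {
    "main": main.SCHEMA_PARAMS,  # the (NODES, EDGES) tuple defined above
}


def get_schema_params(schema_name: str = "main") -> tuple:
    """Return the (nodes, edges) pair for the requested schema name."""
    try:
        return SCHEMA_PARAMS_BY_NAME[schema_name]
    except KeyError:
        raise ValueError(f"Unknown schema name: {schema_name!r}") from None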
easylink/runner.py
CHANGED
@@ -19,7 +19,6 @@ from snakemake.cli import main as snake_main
 
 from easylink.configuration import Config, load_params_from_specification
 from easylink.pipeline import Pipeline
-from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
 from easylink.utilities.data_utils import (
     copy_configuration_files_to_results_directory,
     create_results_directory,
@@ -35,8 +34,9 @@ def main(
     input_data: str | Path,
     computing_environment: str | Path | None,
     results_dir: str | Path,
-
-
+    images_dir: str | None,
+    schema_name: str = "main",
+    debug: bool = False,
 ) -> None:
     """Runs an EasyLink command.
 
@@ -60,17 +60,21 @@
         to run the pipeline on. If None, the pipeline will be run locally.
     results_dir
         The directory to write results and incidental files (logs, etc.) to.
+    images_dir
+        The directory containing the images or to download the images to if they
+        don't exist. If None, will default to ~/.easylink_images.
+    schema_name
+        The name of the schema to validate the pipeline configuration against.
     debug
         If False (the default), will suppress some of the workflow output. This
         is intended to only be used for testing and development purposes.
-    potential_schemas
-        A list of potential schemas to validate the pipeline configuration against.
-        This is primarily used for testing purposes. Defaults to the supported schemas.
     """
     config_params = load_params_from_specification(
        pipeline_specification, input_data, computing_environment, results_dir
    )
-    config = Config(
+    config = Config(
+        config_params, schema_name=schema_name, images_dir=images_dir, command=command
+    )
     pipeline = Pipeline(config)
     # After validation is completed, create the results directory
     create_results_directory(Path(results_dir))
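Taken together with the docstring changes above, a direct call to the updated runner.main now looks roughly like the sketch below. Only the parameters visible in this hunk are certain; command and pipeline_specification are inferred from the surrounding context (the command=command pass-through to Config and the load_params_from_specification call), and all values shown are placeholders.

from easylink.runner import main

main(
    command="run",                           # assumed CLI command name
    pipeline_specification="pipeline.yaml",  # placeholder paths
    input_data="input_data.yaml",
    computing_environment=None,              # None -> run the pipeline locally
    results_dir="results/",
    images_dir=None,                         # new in 0.1.19: defaults to ~/.easylink_images
    schema_name="main",                      # new in 0.1.19: validate against the "main" schema
)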
easylink/step.py
CHANGED
@@ -104,8 +104,19 @@ class Step:
         during the process of flattening the ``Stepgraph``, e.g. unrolling loops, etc.
         For example, if step 1 is looped multiple times, each node would have a
         ``step_name`` of, perhaps, "step_1" but unique ``names`` ("step_1_loop_1", etc)."""
+
+        if len(set(slot.name for slot in input_slots)) != len(input_slots):
+            raise ValueError(f"{step_name} has duplicate input slot names!")
+
+        if len(set(s.env_var for s in input_slots)) != len(input_slots):
+            raise ValueError(f"{step_name} has duplicate input slot environment variables!")
+
         self.input_slots = {slot.name: slot for slot in input_slots}
         """A mapping of ``InputSlot`` names to their instances."""
+
+        if len(set(s.name for s in output_slots)) != len(output_slots):
+            raise ValueError(f"{step_name} has duplicate output slot names!")
+
         self.output_slots = {slot.name: slot for slot in output_slots}
         """A mapping of ``OutputSlot`` names to their instances."""
         self.slot_mappings = {
@@ -592,6 +603,10 @@ class HierarchicalStep(Step):
         attribute to allow for back-end ``HierarchicalStep`` creation that are not
         user-facing (i.e. they do not need to provide a 'substeps' configuration key)."""
 
+        self._check_edges_are_valid()
+        self._check_slot_mappings_are_valid()
+        self._check_validators_are_consistent()
+
     @property
     def config_key(self):
         """The pipeline specification key required for a ``HierarchicalStep``."""
@@ -721,6 +736,80 @@ class HierarchicalStep(Step):
             errors[f"step {extra_step}"] = [f"{extra_step} is not a valid step."]
         return errors
 
+    def _check_edges_are_valid(self):
+        """Check that edges are valid, i.e. each connect two slots that actually exist."""
+        for edge in self.edges:
+            # Edges connect the *output* slot of a *source* node to the
+            # *input* slot of a *target* node
+            for slot_type, node_type in (("output", "source"), ("input", "target")):
+                node_name = getattr(edge, f"{node_type}_node")
+                if node_name not in self.step_graph.nodes:
+                    raise ValueError(f"Edge {edge} has non-existent {node_type} node")
+                if getattr(edge, f"{slot_type}_slot") not in getattr(
+                    self.step_graph.nodes[node_name]["step"], f"{slot_type}_slots"
+                ):
+                    raise ValueError(f"Edge {edge} has non-existent {node_type} slot")
+
+    def _check_slot_mappings_are_valid(self):
+        """Check that input and output slot mappings are valid.
+
+        Checks that the input and output slots on the parent step are all mapped,
+        and that all slot mappings connect a slot on self (the parent) that actually exists
+        to an slot that actually exists on a sub-step.
+        """
+        for slot_type in ["input", "output"]:
+            slots = getattr(self, f"{slot_type}_slots")
+            slot_mappings = self.slot_mappings[slot_type]
+
+            if set(slots) != set(sm.parent_slot for sm in slot_mappings):
+                raise ValueError(
+                    f"{self.step_name} {slot_type} slots do not match {slot_type} slot mappings"
+                )
+
+            for sm in slot_mappings:
+                if sm.child_node not in self.step_graph.nodes:
+                    raise ValueError(
+                        f"{self.step_name} {slot_type} slot {sm.parent_slot} maps to non-existent child node {sm.child_node}"
+                    )
+                if sm.child_slot not in getattr(
+                    self.step_graph.nodes[sm.child_node]["step"], f"{slot_type}_slots"
+                ):
+                    raise ValueError(
+                        f"{self.step_name} {slot_type} slot {sm.parent_slot} maps to non-existent slot {sm.child_slot} on child node {sm.child_node}"
+                    )
+
+    def _check_validators_are_consistent(self):
+        """Check that if two input slots will receive the same data, they have the same validator.
+
+        There are two versions of this to check: input slots that receive the same data because
+        one is mapped to the other by a slot mapping, and input slots that receive the
+        same data because they both are at the receiving end of edges from the same output slot.
+        """
+        # Check that input slots mapped to by our slot mappings have consistent validators
+        for sm in self.slot_mappings["input"]:
+            expected_validator = self.input_slots[sm.parent_slot].validator
+            child_input_slot = self.step_graph.nodes[sm.child_node]["step"].input_slots[
+                sm.child_slot
+            ]
+            if child_input_slot.validator != expected_validator:
+                raise ValueError(
+                    f"{sm.child_node}'s {sm.child_slot}, which is mapped from {self.step_name}'s {sm.parent_slot}, does not have the same validator"
+                )
+
+        # Check that input slots receiving the same data have consistent validators
+        validators_by_child_output_slot = {}
+        for edge in self.edges:
+            child_input_slot = self.step_graph.edges[(edge.source_node, edge.target_node, 0)][
+                "input_slot"
+            ]
+            source_slot = (edge.source_node, edge.output_slot)
+            if source_slot not in validators_by_child_output_slot:
+                validators_by_child_output_slot[source_slot] = child_input_slot.validator
+            elif child_input_slot.validator != validators_by_child_output_slot[source_slot]:
+                raise ValueError(
+                    f"Not all input slots receiving edges from {edge.source_node}'s {edge.output_slot} have the same validator"
+                )
+
 
 class TemplatedStep(Step, ABC):
     """A type of :class:`Step` that may contain multiplicity.
easylink/steps/cascading/exclude_clustered.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./exclude_clustered.py /exclude_clustered.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /exclude_clustered.py '$@'