easylink 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/devtools/implementation_creator.py +19 -4
- easylink/implementation.py +2 -2
- easylink/pipeline.py +13 -15
- easylink/pipeline_graph.py +10 -15
- easylink/pipeline_schema_constants/__init__.py +4 -4
- easylink/pipeline_schema_constants/development.py +4 -4
- easylink/pipeline_schema_constants/main.py +5 -5
- easylink/pipeline_schema_constants/testing.py +22 -16
- easylink/rule.py +9 -10
- easylink/step.py +34 -35
- easylink/steps/cascading/update_clusters_by_connected_components.py +18 -10
- easylink/utilities/aggregator_utils.py +2 -2
- easylink/utilities/splitter_utils.py +1 -1
- {easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/METADATA +1 -1
- {easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/RECORD +20 -20
- {easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/WHEEL +0 -0
- {easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.19"
|
1
|
+
__version__ = "0.1.21"
|
easylink/devtools/implementation_creator.py
CHANGED
@@ -21,15 +21,15 @@ from loguru import logger
|
|
21
21
|
|
22
22
|
from easylink.pipeline_schema_constants import SCHEMA_PARAMS
|
23
23
|
from easylink.step import (
|
24
|
+
AutoParallelStep,
|
24
25
|
ChoiceStep,
|
25
|
-
EmbarrassinglyParallelStep,
|
26
26
|
HierarchicalStep,
|
27
27
|
IOStep,
|
28
28
|
Step,
|
29
29
|
TemplatedStep,
|
30
30
|
)
|
31
31
|
from easylink.utilities.data_utils import load_yaml
|
32
|
-
from easylink.utilities.paths import IMPLEMENTATION_METADATA
|
32
|
+
from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
|
33
33
|
|
34
34
|
|
35
35
|
def main(script_path: Path, host: Path) -> None:
|
@@ -195,9 +195,24 @@ class ImplementationCreator:
|
|
195
195
|
f"Implementation '{self.implementation_name}' already exists in the registry. "
|
196
196
|
"Overwriting it with the latest data."
|
197
197
|
)
|
198
|
+
|
199
|
+
# Handle the fact that developers might be saving to username subdirs
|
200
|
+
# If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
|
201
|
+
# host directory when calling `easylink devtools create-implementation`
|
202
|
+
# is DEV_IMAGES_DIR/<username>), we want to include the relative path
|
203
|
+
# to the DEV_IMAGES_DIR in the image name. This is required because ultimately
|
204
|
+
# when running a pipeline, all images are expected to be in a single directory.
|
205
|
+
image_name = (
|
206
|
+
self.hosted_container_path.name
|
207
|
+
# Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
|
208
|
+
if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
|
209
|
+
# Use the path relative to DEV_IMAGES_DIR as the image name
|
210
|
+
else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
|
211
|
+
)
|
212
|
+
|
198
213
|
info[self.implementation_name] = {
|
199
214
|
"steps": [self.step],
|
200
|
-
"
|
215
|
+
"image_name": str(image_name),
|
201
216
|
"script_cmd": f"{self.script_base_command} /{self.script_path.name}",
|
202
217
|
"outputs": {
|
203
218
|
self.output_slot: "result.parquet",
|
@@ -304,7 +319,7 @@ class ImplementationCreator:
|
|
304
319
|
elif isinstance(node, TemplatedStep):
|
305
320
|
_process_step(node.template_step)
|
306
321
|
return
|
307
|
-
elif isinstance(node, EmbarrassinglyParallelStep):
|
322
|
+
elif isinstance(node, AutoParallelStep):
|
308
323
|
_process_step(node.step)
|
309
324
|
return
|
310
325
|
elif isinstance(node, ChoiceStep):
|
easylink/implementation.py
CHANGED
@@ -55,7 +55,7 @@ class Implementation:
|
|
55
55
|
implementation_config: LayeredConfigTree,
|
56
56
|
input_slots: Iterable[InputSlot] = (),
|
57
57
|
output_slots: Iterable[OutputSlot] = (),
|
58
|
-
|
58
|
+
is_auto_parallel: bool = False,
|
59
59
|
):
|
60
60
|
self.name = implementation_config.name
|
61
61
|
"""The name of this ``Implementation``."""
|
@@ -74,7 +74,7 @@ class Implementation:
|
|
74
74
|
implemented by this particular ``Implementation``."""
|
75
75
|
self.requires_spark = self._metadata.get("requires_spark", False)
|
76
76
|
"""Whether this ``Implementation`` requires a Spark environment."""
|
77
|
-
self.
|
77
|
+
self.is_auto_parallel = is_auto_parallel
|
78
78
|
|
79
79
|
def __repr__(self) -> str:
|
80
80
|
return f"Implementation.{self.name}"
|
easylink/pipeline.py
CHANGED
@@ -45,9 +45,9 @@ class Pipeline:
|
|
45
45
|
The :class:`~easylink.pipeline_graph.PipelineGraph` object.
|
46
46
|
spark_is_required
|
47
47
|
A boolean indicating whether the pipeline requires Spark.
|
48
|
-
|
48
|
+
any_auto_parallel
|
49
49
|
A boolean indicating whether any implementation in the pipeline is to be
|
50
|
-
run in
|
50
|
+
automatically run in parallel.
|
51
51
|
|
52
52
|
"""
|
53
53
|
|
@@ -55,7 +55,7 @@ class Pipeline:
|
|
55
55
|
self.config = config
|
56
56
|
self.pipeline_graph = PipelineGraph(config)
|
57
57
|
self.spark_is_required = self.pipeline_graph.spark_is_required
|
58
|
-
self.
|
58
|
+
self.any_auto_parallel = self.pipeline_graph.any_auto_parallel
|
59
59
|
|
60
60
|
# TODO [MIC-4880]: refactor into validation object
|
61
61
|
self._validate()
|
@@ -179,7 +179,7 @@ class Pipeline:
|
|
179
179
|
#################################
|
180
180
|
|
181
181
|
def _write_imports(self) -> None:
|
182
|
-
if not self.
|
182
|
+
if not self.any_auto_parallel:
|
183
183
|
imports = "from easylink.utilities import validation_utils\n"
|
184
184
|
else:
|
185
185
|
imports = """import glob
|
@@ -193,7 +193,7 @@ from easylink.utilities import aggregator_utils, splitter_utils, validation_util
|
|
193
193
|
f.write(imports)
|
194
194
|
|
195
195
|
def _write_wildcard_constraints(self) -> None:
|
196
|
-
if self.
|
196
|
+
if self.any_auto_parallel:
|
197
197
|
with open(self.snakefile_path, "a") as f:
|
198
198
|
f.write(
|
199
199
|
"""
|
@@ -301,12 +301,10 @@ use rule start_spark_worker from spark_cluster with:
|
|
301
301
|
The name of the ``Implementation`` to write the rule(s) for.
|
302
302
|
"""
|
303
303
|
|
304
|
-
|
305
|
-
node_name
|
306
|
-
)
|
304
|
+
is_auto_parallel = self.pipeline_graph.get_whether_auto_parallel(node_name)
|
307
305
|
input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
|
308
306
|
validation_files, validation_rules = self._get_validations(
|
309
|
-
node_name, input_slots,
|
307
|
+
node_name, input_slots, is_auto_parallel
|
310
308
|
)
|
311
309
|
for validation_rule in validation_rules:
|
312
310
|
validation_rule.write_to_snakefile(self.snakefile_path)
|
@@ -334,7 +332,7 @@ use rule start_spark_worker from spark_cluster with:
|
|
334
332
|
image_path=self.config.images_dir / implementation.singularity_image_name,
|
335
333
|
script_cmd=implementation.script_cmd,
|
336
334
|
requires_spark=implementation.requires_spark,
|
337
|
-
|
335
|
+
is_auto_parallel=is_auto_parallel,
|
338
336
|
).write_to_snakefile(self.snakefile_path)
|
339
337
|
|
340
338
|
def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
|
@@ -377,7 +375,7 @@ use rule start_spark_worker from spark_cluster with:
|
|
377
375
|
input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
|
378
376
|
if len(output_slots) > 1:
|
379
377
|
raise NotImplementedError(
|
380
|
-
"FIXME [MIC-5883] Multiple output slots/files of
|
378
|
+
"FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
|
381
379
|
)
|
382
380
|
if len(output_files) > 1:
|
383
381
|
raise ValueError(
|
@@ -388,7 +386,7 @@ use rule start_spark_worker from spark_cluster with:
|
|
388
386
|
output_slot_attrs = list(output_slots.values())[0]
|
389
387
|
if len(output_slot_attrs["filepaths"]) > 1:
|
390
388
|
raise NotImplementedError(
|
391
|
-
"FIXME [MIC-5883] Multiple output slots/files of
|
389
|
+
"FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
|
392
390
|
)
|
393
391
|
checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
|
394
392
|
AggregationRule(
|
@@ -404,7 +402,7 @@ use rule start_spark_worker from spark_cluster with:
|
|
404
402
|
def _get_validations(
|
405
403
|
node_name: str,
|
406
404
|
input_slots: dict[str, dict[str, str | list[str]]],
|
407
|
-
|
405
|
+
is_auto_parallel: bool,
|
408
406
|
) -> tuple[list[str], list[InputValidationRule]]:
|
409
407
|
"""Gets the validation rule and its output filepath for each slot for a given node.
|
410
408
|
|
@@ -423,10 +421,10 @@ use rule start_spark_worker from spark_cluster with:
|
|
423
421
|
validation_rules = []
|
424
422
|
|
425
423
|
for input_slot_name, input_slot_attrs in input_slots.items():
|
426
|
-
#
|
424
|
+
# auto-parallel implementations rely on snakemake wildcards
|
427
425
|
# TODO: [MIC-5787] - need to support multiple wildcards at once
|
428
426
|
validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
|
429
|
-
"-{chunk}" if
|
427
|
+
"-{chunk}" if is_auto_parallel else ""
|
430
428
|
)
|
431
429
|
validation_files.append(validation_file)
|
432
430
|
validation_rules.append(
|
easylink/pipeline_graph.py
CHANGED
@@ -72,31 +72,26 @@ class PipelineGraph(ImplementationGraph):
|
|
72
72
|
return any([implementation.requires_spark for implementation in self.implementations])
|
73
73
|
|
74
74
|
@property
|
75
|
-
def
|
75
|
+
def any_auto_parallel(self) -> bool:
|
76
76
|
"""Whether or not any :class:`~easylink.implementation.Implementation` is
|
77
|
-
to be run in
|
77
|
+
to be automatically run in parallel."""
|
78
78
|
return any(
|
79
|
-
[
|
80
|
-
self.get_whether_embarrassingly_parallel(node)
|
81
|
-
for node in self.implementation_nodes
|
82
|
-
]
|
79
|
+
[self.get_whether_auto_parallel(node) for node in self.implementation_nodes]
|
83
80
|
)
|
84
81
|
|
85
|
-
def
|
86
|
-
"""Determines whether a node is to be run in
|
82
|
+
def get_whether_auto_parallel(self, node: str) -> dict[str, bool]:
|
83
|
+
"""Determines whether a node is to be automatically run in parallel.
|
87
84
|
|
88
85
|
Parameters
|
89
86
|
----------
|
90
87
|
node
|
91
|
-
The node name to determine whether or not it is to be run in
|
92
|
-
embarrassingly parallel way.
|
88
|
+
The node name to determine whether or not it is to be automatically run in parallel.
|
93
89
|
|
94
90
|
Returns
|
95
91
|
-------
|
96
|
-
A boolean indicating whether the node is to be run in
|
97
|
-
parallel way.
|
92
|
+
A boolean indicating whether the node is to be automatically run in parallel.
|
98
93
|
"""
|
99
|
-
return self.nodes[node]["implementation"].
|
94
|
+
return self.nodes[node]["implementation"].is_auto_parallel
|
100
95
|
|
101
96
|
def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
|
102
97
|
"""Gets all of a node's input and output filepaths from its edges.
|
@@ -482,9 +477,9 @@ class PipelineGraph(ImplementationGraph):
|
|
482
477
|
str(
|
483
478
|
Path("intermediate")
|
484
479
|
/ node
|
485
|
-
#
|
480
|
+
# auto-parallel implementations rely on snakemake wildcards
|
486
481
|
# TODO: [MIC-5787] - need to support multiple wildcards at once
|
487
|
-
/ ("{chunk}" if implementation.
|
482
|
+
/ ("{chunk}" if implementation.is_auto_parallel else "")
|
488
483
|
/ imp_outputs[edge_attrs["output_slot"].name]
|
489
484
|
),
|
490
485
|
)
|
easylink/pipeline_schema_constants/__init__.py
CHANGED
@@ -23,8 +23,8 @@ SCHEMA_PARAMS = {
|
|
23
23
|
"combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
|
24
24
|
"combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
|
25
25
|
"combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
|
26
|
-
"
|
27
|
-
"
|
28
|
-
"
|
29
|
-
"
|
26
|
+
"looping_auto_parallel_step": testing.SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP,
|
27
|
+
"auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
|
28
|
+
"auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
|
29
|
+
"auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
|
30
30
|
}
|
easylink/pipeline_schema_constants/development.py
CHANGED
@@ -18,13 +18,13 @@ from easylink.graph_components import (
|
|
18
18
|
OutputSlotMapping,
|
19
19
|
)
|
20
20
|
from easylink.step import (
|
21
|
+
AutoParallelStep,
|
21
22
|
ChoiceStep,
|
22
|
-
|
23
|
+
CloneableStep,
|
23
24
|
HierarchicalStep,
|
24
25
|
InputStep,
|
25
26
|
LoopStep,
|
26
27
|
OutputStep,
|
27
|
-
ParallelStep,
|
28
28
|
Step,
|
29
29
|
)
|
30
30
|
from easylink.utilities.aggregator_utils import concatenate_datasets
|
@@ -33,7 +33,7 @@ from easylink.utilities.validation_utils import validate_input_file_dummy
|
|
33
33
|
|
34
34
|
NODES = [
|
35
35
|
InputStep(),
|
36
|
-
|
36
|
+
CloneableStep(
|
37
37
|
template_step=Step(
|
38
38
|
step_name="step_1",
|
39
39
|
input_slots=[
|
@@ -58,7 +58,7 @@ NODES = [
|
|
58
58
|
output_slots=[OutputSlot("step_2_main_output")],
|
59
59
|
),
|
60
60
|
LoopStep(
|
61
|
-
template_step=
|
61
|
+
template_step=AutoParallelStep(
|
62
62
|
step=Step(
|
63
63
|
step_name="step_3",
|
64
64
|
input_slots=[
|
easylink/pipeline_schema_constants/main.py
CHANGED
@@ -12,11 +12,11 @@ from easylink.graph_components import (
|
|
12
12
|
OutputSlotMapping,
|
13
13
|
)
|
14
14
|
from easylink.step import (
|
15
|
+
CloneableStep,
|
15
16
|
HierarchicalStep,
|
16
17
|
InputStep,
|
17
18
|
LoopStep,
|
18
19
|
OutputStep,
|
19
|
-
ParallelStep,
|
20
20
|
Step,
|
21
21
|
)
|
22
22
|
from easylink.utilities.validation_utils import (
|
@@ -56,8 +56,8 @@ NODES = [
|
|
56
56
|
],
|
57
57
|
output_slots=[OutputSlot("clusters")],
|
58
58
|
nodes=[
|
59
|
-
|
60
|
-
# NOTE: Splitters/aggregators on the
|
59
|
+
CloneableStep(
|
60
|
+
# NOTE: Splitters/aggregators on the CloneableStep are implicit!
|
61
61
|
template_step=HierarchicalStep(
|
62
62
|
step_name="determining_exclusions_and_removing_records",
|
63
63
|
directly_implemented=False,
|
@@ -190,7 +190,7 @@ NODES = [
|
|
190
190
|
],
|
191
191
|
output_slots=[OutputSlot("links")],
|
192
192
|
nodes=[
|
193
|
-
|
193
|
+
CloneableStep(
|
194
194
|
template_step=LoopStep(
|
195
195
|
template_step=Step(
|
196
196
|
step_name="pre-processing",
|
@@ -265,7 +265,7 @@ NODES = [
|
|
265
265
|
source_node="pre-processing",
|
266
266
|
target_node="schema_alignment",
|
267
267
|
output_slot="dataset",
|
268
|
-
# NOTE: The implicit
|
268
|
+
# NOTE: The implicit CloneableStep aggregator has
|
269
269
|
# made this multiple (a list)
|
270
270
|
input_slot="datasets",
|
271
271
|
),
|
easylink/pipeline_schema_constants/testing.py
CHANGED
@@ -16,12 +16,12 @@ from easylink.graph_components import (
|
|
16
16
|
OutputSlotMapping,
|
17
17
|
)
|
18
18
|
from easylink.step import (
|
19
|
-
|
19
|
+
AutoParallelStep,
|
20
|
+
CloneableStep,
|
20
21
|
HierarchicalStep,
|
21
22
|
InputStep,
|
22
23
|
LoopStep,
|
23
24
|
OutputStep,
|
24
|
-
ParallelStep,
|
25
25
|
Step,
|
26
26
|
)
|
27
27
|
from easylink.utilities.aggregator_utils import concatenate_datasets
|
@@ -215,7 +215,7 @@ SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_ST
|
|
215
215
|
NODES_NESTED_TEMPLATED_STEPS = [
|
216
216
|
InputStep(),
|
217
217
|
LoopStep(
|
218
|
-
template_step=
|
218
|
+
template_step=CloneableStep(
|
219
219
|
template_step=HierarchicalStep(
|
220
220
|
step_name="step_1",
|
221
221
|
input_slots=[
|
@@ -355,10 +355,10 @@ EDGES_TWO_STEPS = [
|
|
355
355
|
SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)
|
356
356
|
|
357
357
|
|
358
|
-
|
358
|
+
NODES_LOOPING_AUTO_PARALLEL_STEP = [
|
359
359
|
InputStep(),
|
360
360
|
LoopStep(
|
361
|
-
template_step=
|
361
|
+
template_step=AutoParallelStep(
|
362
362
|
step=Step(
|
363
363
|
step_name="step_1",
|
364
364
|
input_slots=[
|
@@ -392,13 +392,13 @@ NODES_LOOPING_EP_STEP = [
|
|
392
392
|
]
|
393
393
|
),
|
394
394
|
]
|
395
|
-
|
395
|
+
SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP = (NODES_LOOPING_AUTO_PARALLEL_STEP, EDGES_ONE_STEP)
|
396
396
|
|
397
397
|
|
398
|
-
|
398
|
+
NODES_AUTO_PARALLEL_PARALLEL_STEP = [
|
399
399
|
InputStep(),
|
400
|
-
|
401
|
-
step=
|
400
|
+
AutoParallelStep(
|
401
|
+
step=CloneableStep(
|
402
402
|
template_step=Step(
|
403
403
|
step_name="step_1",
|
404
404
|
input_slots=[
|
@@ -424,12 +424,15 @@ NODES_EP_PARALLEL_STEP = [
|
|
424
424
|
]
|
425
425
|
),
|
426
426
|
]
|
427
|
-
|
427
|
+
SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP = (
|
428
|
+
NODES_AUTO_PARALLEL_PARALLEL_STEP,
|
429
|
+
EDGES_ONE_STEP,
|
430
|
+
)
|
428
431
|
|
429
432
|
|
430
|
-
|
433
|
+
NODES_AUTO_PARALLEL_LOOP_STEP = [
|
431
434
|
InputStep(),
|
432
|
-
|
435
|
+
AutoParallelStep(
|
433
436
|
step=LoopStep(
|
434
437
|
template_step=Step(
|
435
438
|
step_name="step_1",
|
@@ -464,12 +467,12 @@ NODES_EP_LOOP_STEP = [
|
|
464
467
|
]
|
465
468
|
),
|
466
469
|
]
|
467
|
-
|
470
|
+
SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP = (NODES_AUTO_PARALLEL_LOOP_STEP, EDGES_ONE_STEP)
|
468
471
|
|
469
472
|
|
470
|
-
|
473
|
+
NODES_AUTO_PARALLEL_HIERARCHICAL_STEP = [
|
471
474
|
InputStep(),
|
472
|
-
|
475
|
+
AutoParallelStep(
|
473
476
|
step=HierarchicalStep(
|
474
477
|
step_name="step_1",
|
475
478
|
input_slots=[
|
@@ -581,7 +584,10 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
|
|
581
584
|
input_slot="result",
|
582
585
|
),
|
583
586
|
]
|
584
|
-
|
587
|
+
SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP = (
|
588
|
+
NODES_AUTO_PARALLEL_HIERARCHICAL_STEP,
|
589
|
+
EDGES_ONE_STEP_TWO_ISLOTS,
|
590
|
+
)
|
585
591
|
|
586
592
|
NODES_OUTPUT_DIR = [
|
587
593
|
InputStep(),
|
easylink/rule.py
CHANGED
@@ -111,21 +111,20 @@ class ImplementedRule(Rule):
|
|
111
111
|
"""Command to execute."""
|
112
112
|
requires_spark: bool
|
113
113
|
"""Whether or not this ``Implementation`` requires a Spark environment."""
|
114
|
-
|
115
|
-
"""Whether or not this ``Implementation`` is to be run in
|
116
|
-
parallel way."""
|
114
|
+
is_auto_parallel: bool = False
|
115
|
+
"""Whether or not this ``Implementation`` is to be automatically run in parallel."""
|
117
116
|
|
118
117
|
def build_rule(self) -> str:
|
119
118
|
"""Builds the Snakemake rule for this ``Implementation``."""
|
120
|
-
if self.
|
119
|
+
if self.is_auto_parallel and len(self.output) > 1:
|
121
120
|
raise NotImplementedError(
|
122
|
-
"Multiple output slots/files of
|
121
|
+
"Multiple output slots/files of AutoParallelSteps not yet supported"
|
123
122
|
)
|
124
123
|
return self._build_io() + self._build_resources() + self._build_shell_cmd()
|
125
124
|
|
126
125
|
def _build_io(self) -> str:
|
127
126
|
"""Builds the input/output portion of the rule."""
|
128
|
-
log_path_chunk_adder = "-{chunk}" if self.
|
127
|
+
log_path_chunk_adder = "-{chunk}" if self.is_auto_parallel else ""
|
129
128
|
# Handle output files vs directories
|
130
129
|
files = [path for path in self.output if Path(path).suffix != ""]
|
131
130
|
if len(files) == len(self.output):
|
@@ -260,7 +259,7 @@ rule:
|
|
260
259
|
class CheckpointRule(Rule):
|
261
260
|
"""A :class:`Rule` that defines a checkpoint.
|
262
261
|
|
263
|
-
When running an :class:`~easylink.implementation.Implementation` in an
|
262
|
+
When running an :class:`~easylink.implementation.Implementation` in an auto
|
264
263
|
parallel way, we do not know until runtime how many parallel jobs there will
|
265
264
|
be (e.g. we don't know beforehand how many chunks a large incoming dataset will
|
266
265
|
be split into since the incoming dataset isn't created until runtime). The
|
@@ -326,7 +325,7 @@ checkpoint:
|
|
326
325
|
class AggregationRule(Rule):
|
327
326
|
"""A :class:`Rule` that aggregates the processed chunks of output data.
|
328
327
|
|
329
|
-
When running an :class:`~easylink.implementation.Implementation` in an
|
328
|
+
When running an :class:`~easylink.implementation.Implementation` in an auto
|
330
329
|
parallel way, we need to aggregate the output files from each parallel job
|
331
330
|
into a single output file.
|
332
331
|
"""
|
@@ -347,10 +346,10 @@ class AggregationRule(Rule):
|
|
347
346
|
def build_rule(self) -> str:
|
348
347
|
"""Builds the Snakemake rule for this aggregator.
|
349
348
|
|
350
|
-
When running an :class:`~easylink.step.
|
349
|
+
When running an :class:`~easylink.step.AutoParallelStep`, we need
|
351
350
|
to aggregate the output files from each parallel job into a single output file.
|
352
351
|
This rule relies on a dynamically generated aggregation function which returns
|
353
|
-
all of the **processed** chunks (from running the ``
|
352
|
+
all of the **processed** chunks (from running the ``AutoParallelStep's``
|
354
353
|
container in parallel) and uses them as inputs to the actual aggregation
|
355
354
|
rule.
|
356
355
|
|
easylink/step.py
CHANGED
@@ -71,8 +71,8 @@ class Step:
|
|
71
71
|
The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
|
72
72
|
output_slot_mappings
|
73
73
|
The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
|
74
|
-
|
75
|
-
Whether or not this ``Step`` is to
|
74
|
+
is_auto_parallel
|
75
|
+
Whether or not this ``Step`` is to automatically run in parallel.
|
76
76
|
|
77
77
|
Notes
|
78
78
|
-----
|
@@ -91,7 +91,7 @@ class Step:
|
|
91
91
|
output_slots: Iterable[OutputSlot] = (),
|
92
92
|
input_slot_mappings: Iterable[InputSlotMapping] = (),
|
93
93
|
output_slot_mappings: Iterable[OutputSlotMapping] = (),
|
94
|
-
|
94
|
+
is_auto_parallel: bool = False,
|
95
95
|
) -> None:
|
96
96
|
if not step_name and not name:
|
97
97
|
raise ValueError("All Steps must contain a step_name, name, or both.")
|
@@ -125,8 +125,8 @@ class Step:
|
|
125
125
|
}
|
126
126
|
"""A combined dictionary containing both the ``InputSlotMappings`` and
|
127
127
|
``OutputSlotMappings`` of this ``Step``."""
|
128
|
-
self.
|
129
|
-
"""Whether or not this ``Step`` is to be run in
|
128
|
+
self.is_auto_parallel = is_auto_parallel
|
129
|
+
"""Whether or not this ``Step`` is to be automatically run in parallel."""
|
130
130
|
self.parent_step = None
|
131
131
|
"""This ``Step's`` parent ``Step``, if applicable."""
|
132
132
|
self._configuration_state = None
|
@@ -816,7 +816,7 @@ class TemplatedStep(Step, ABC):
|
|
816
816
|
|
817
817
|
A ``TemplatedStep`` is used to represents a ``Step`` that contains a specified
|
818
818
|
amount of multiplicity, such as one that is looped or run in parallel; it is
|
819
|
-
inherited by concrete :class:`LoopStep` and :class:`
|
819
|
+
inherited by concrete :class:`LoopStep` and :class:`CloneableStep` instances.
|
820
820
|
|
821
821
|
See :class:`Step` for inherited attributes.
|
822
822
|
|
@@ -1206,7 +1206,7 @@ class LoopStep(TemplatedStep):
|
|
1206
1206
|
return {"input": input_mappings, "output": output_mappings}
|
1207
1207
|
|
1208
1208
|
|
1209
|
-
class ParallelStep(TemplatedStep):
|
1209
|
+
class CloneableStep(TemplatedStep):
|
1210
1210
|
"""A type of :class:`TemplatedStep` that creates multiple copies in parallel
|
1211
1211
|
with no dependencies between them.
|
1212
1212
|
|
@@ -1216,13 +1216,13 @@ class ParallelStep(TemplatedStep):
|
|
1216
1216
|
|
1217
1217
|
@property
|
1218
1218
|
def config_key(self):
|
1219
|
-
"""The pipeline specification key required for a ``
|
1220
|
-
return "
|
1219
|
+
"""The pipeline specification key required for a ``CloneableStep``."""
|
1220
|
+
return "clones"
|
1221
1221
|
|
1222
1222
|
@property
|
1223
1223
|
def node_prefix(self):
|
1224
|
-
"""The prefix to be used in the ``
|
1225
|
-
return "
|
1224
|
+
"""The prefix to be used in the ``CloneableStep`` node name."""
|
1225
|
+
return "clone"
|
1226
1226
|
|
1227
1227
|
def _update_step_graph(self, num_repeats: int) -> StepGraph:
|
1228
1228
|
"""Updates the :class:`~easylink.graph_components.StepGraph` to include parallelization.
|
@@ -1276,10 +1276,10 @@ class ParallelStep(TemplatedStep):
|
|
1276
1276
|
return {"input": input_mappings, "output": output_mappings}
|
1277
1277
|
|
1278
1278
|
|
1279
|
-
class EmbarrassinglyParallelStep(Step):
|
1279
|
+
class AutoParallelStep(Step):
|
1280
1280
|
"""A :class:`Step` that is run in parallel on the backend.
|
1281
1281
|
|
1282
|
-
An ``
|
1282
|
+
An ``AutoParallelStep`` is different than a :class:`CloneableStep`
|
1283
1283
|
in that it is not configured by the user to be run in parallel - it completely
|
1284
1284
|
happens on the back end for performance reasons.
|
1285
1285
|
|
@@ -1288,8 +1288,8 @@ class EmbarrassinglyParallelStep(Step):
|
|
1288
1288
|
Parameters
|
1289
1289
|
----------
|
1290
1290
|
step
|
1291
|
-
The ``Step`` to be run in
|
1292
|
-
|
1291
|
+
The ``Step`` to be automatically run in parallel. To run multiple steps in
|
1292
|
+
parallel, use a :class:`HierarchicalStep`.
|
1293
1293
|
slot_splitter_mapping
|
1294
1294
|
A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
|
1295
1295
|
to the actual splitter function to be used.
|
@@ -1308,7 +1308,7 @@ class EmbarrassinglyParallelStep(Step):
|
|
1308
1308
|
super().__init__(
|
1309
1309
|
step_name=None,
|
1310
1310
|
name=step.name,
|
1311
|
-
|
1311
|
+
is_auto_parallel=True,
|
1312
1312
|
)
|
1313
1313
|
self.slot_splitter_mapping = slot_splitter_mapping
|
1314
1314
|
"""A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
|
@@ -1328,14 +1328,14 @@ class EmbarrassinglyParallelStep(Step):
|
|
1328
1328
|
|
1329
1329
|
@Step.name.setter
|
1330
1330
|
def name(self, value: str) -> None:
|
1331
|
-
"""Changes the name of the ``
|
1331
|
+
"""Changes the name of the ``AutoParallelStep`` and the underlying :class:`Step` to the given value."""
|
1332
1332
|
self._name = value
|
1333
1333
|
self.step._name = value
|
1334
1334
|
|
1335
1335
|
def _validate(self) -> None:
|
1336
|
-
"""Validates the ``
|
1336
|
+
"""Validates the ``AutoParallelStep``.
|
1337
1337
|
|
1338
|
-
``
|
1338
|
+
``AutoParallelSteps`` are not configured by the user to be run
|
1339
1339
|
in parallel. Since it happens on the back end, we need to do somewhat unique
|
1340
1340
|
validations during construction. Specifically,
|
1341
1341
|
- one and only one :class:`~easylink.graph_components.InputSlot` *must*
|
@@ -1348,17 +1348,17 @@ class EmbarrassinglyParallelStep(Step):
|
|
1348
1348
|
# check that only one input slot has a splitter assigned
|
1349
1349
|
if len(self.slot_splitter_mapping) != 1:
|
1350
1350
|
errors.append(
|
1351
|
-
f"
|
1351
|
+
f"AutoParallelStep '{self.step_name}' is attempting to define "
|
1352
1352
|
f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
|
1353
1353
|
)
|
1354
1354
|
if len(self.slot_splitter_mapping) == 0:
|
1355
1355
|
errors.append(
|
1356
|
-
f"
|
1356
|
+
f"AutoParallelStep '{self.step_name}' does not have any input slots with a "
|
1357
1357
|
"splitter method assigned; one and only one input slot must have a splitter."
|
1358
1358
|
)
|
1359
1359
|
if len(self.slot_splitter_mapping) > 1:
|
1360
1360
|
errors.append(
|
1361
|
-
f"
|
1361
|
+
f"AutoParallelStep '{self.step_name}' has multiple input slots with "
|
1362
1362
|
"splitter methods assigned; one and only one input slot must have a splitter.\n"
|
1363
1363
|
f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
|
1364
1364
|
)
|
@@ -1371,7 +1371,7 @@ class EmbarrassinglyParallelStep(Step):
|
|
1371
1371
|
]
|
1372
1372
|
if len(missing_aggregators) != 0:
|
1373
1373
|
errors.append(
|
1374
|
-
f"
|
1374
|
+
f"AutoParallelStep '{self.step_name}' has output slots without "
|
1375
1375
|
f"aggregator methods assigned: {missing_aggregators}"
|
1376
1376
|
)
|
1377
1377
|
if errors:
|
@@ -1451,7 +1451,7 @@ class EmbarrassinglyParallelStep(Step):
|
|
1451
1451
|
aggregator_node_name = f"{self.name}_aggregate"
|
1452
1452
|
if len(self.output_slots) > 1:
|
1453
1453
|
raise NotImplementedError(
|
1454
|
-
"FIXME [MIC-5883] Multiple output slots/files of
|
1454
|
+
"FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
|
1455
1455
|
)
|
1456
1456
|
output_slot = list(self.output_slots.values())[0]
|
1457
1457
|
aggregator_step = AggregatorStep(
|
@@ -1464,7 +1464,7 @@ class EmbarrassinglyParallelStep(Step):
|
|
1464
1464
|
self._update_slot_mappings(splitter_step, aggregator_step)
|
1465
1465
|
# Add the key back to the expanded config
|
1466
1466
|
expanded_config = LayeredConfigTree({self.step.name: step_config})
|
1467
|
-
#
|
1467
|
+
# AutoParallelSteps are by definition non-leaf steps
|
1468
1468
|
self._configuration_state = NonLeafConfigurationState(
|
1469
1469
|
self, expanded_config, combined_implementations, input_data_config
|
1470
1470
|
)
|
@@ -1513,7 +1513,7 @@ class EmbarrassinglyParallelStep(Step):
|
|
1513
1513
|
# Add the Step -> AggregatorStep edge
|
1514
1514
|
if len(self.step.output_slots) > 1:
|
1515
1515
|
raise NotImplementedError(
|
1516
|
-
"
|
1516
|
+
"AutoParallelStep does not support multiple output slots."
|
1517
1517
|
)
|
1518
1518
|
self.step_graph.add_edge_from_params(
|
1519
1519
|
EdgeParams(
|
@@ -1562,7 +1562,7 @@ class SplitterStep(StandaloneStep):
|
|
1562
1562
|
"""A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.
|
1563
1563
|
|
1564
1564
|
A ``SplitterStep`` is intended to be used in conjunction with a corresponding
|
1565
|
-
:class:`AggregatorStep` and only during construction of an :class:`
|
1565
|
+
:class:`AggregatorStep` and only during construction of an :class:`AutoParallelStep`.
|
1566
1566
|
|
1567
1567
|
See :class:`Step` for inherited attributes.
|
1568
1568
|
|
@@ -1613,7 +1613,7 @@ class AggregatorStep(StandaloneStep):
|
|
1613
1613
|
"""A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.Outputslot>` after parallel processing.
|
1614
1614
|
|
1615
1615
|
An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
|
1616
|
-
:class:`SplitterStep` and only during construction of an :class:`
|
1616
|
+
:class:`SplitterStep` and only during construction of an :class:`AutoParallelStep`.
|
1617
1617
|
|
1618
1618
|
See :class:`Step` for inherited attributes.
|
1619
1619
|
|
@@ -1918,10 +1918,9 @@ class LeafConfigurationState(ConfigurationState):
|
|
1918
1918
|
"""
|
1919
1919
|
step = self._step
|
1920
1920
|
if self.is_combined:
|
1921
|
-
if step.
|
1921
|
+
if step.is_auto_parallel:
|
1922
1922
|
raise NotImplementedError(
|
1923
|
-
"Combining implementations with
|
1924
|
-
"is not supported."
|
1923
|
+
"Combining implementations with auto-parallel steps is not supported."
|
1925
1924
|
)
|
1926
1925
|
implementation = PartialImplementation(
|
1927
1926
|
combined_name=self.step_config[COMBINED_IMPLEMENTATION_KEY],
|
@@ -1935,7 +1934,7 @@ class LeafConfigurationState(ConfigurationState):
|
|
1935
1934
|
implementation_config=self.implementation_config,
|
1936
1935
|
input_slots=step.input_slots.values(),
|
1937
1936
|
output_slots=step.output_slots.values(),
|
1938
|
-
|
1937
|
+
is_auto_parallel=step.is_auto_parallel,
|
1939
1938
|
)
|
1940
1939
|
implementation_graph.add_node_from_implementation(
|
1941
1940
|
step.implementation_node_name,
|
@@ -1985,7 +1984,7 @@ class LeafConfigurationState(ConfigurationState):
|
|
1985
1984
|
if mapping.parent_slot == edge.input_slot
|
1986
1985
|
]
|
1987
1986
|
for mapping in mappings:
|
1988
|
-
# FIXME [MIC-5771]: Fix
|
1987
|
+
# FIXME [MIC-5771]: Fix CloneableSteps
|
1989
1988
|
if (
|
1990
1989
|
"input_data_file" in self.step_config
|
1991
1990
|
and edge.source_node == "pipeline_graph_input_data"
|
@@ -2070,8 +2069,8 @@ class NonLeafConfigurationState(ConfigurationState):
|
|
2070
2069
|
"""
|
2071
2070
|
for node in self._step.step_graph.nodes:
|
2072
2071
|
substep = self._step.step_graph.nodes[node]["step"]
|
2073
|
-
if self._step.
|
2074
|
-
substep.
|
2072
|
+
if self._step.is_auto_parallel:
|
2073
|
+
substep.is_auto_parallel = True
|
2075
2074
|
substep.add_nodes_to_implementation_graph(implementation_graph)
|
2076
2075
|
|
2077
2076
|
def add_edges_to_implementation_graph(
|
@@ -60,12 +60,14 @@ new_clusters_df = load_file(new_clusters_filepath)
|
|
60
60
|
def merge_clusters(known_clusters_df, new_clusters_df):
|
61
61
|
# Combine both dataframes
|
62
62
|
combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
|
63
|
-
|
64
|
-
|
65
|
-
|
63
|
+
combined_df["Input Record Key"] = (
|
64
|
+
combined_df["Input Record Dataset"]
|
65
|
+
+ "-__-"
|
66
|
+
+ combined_df["Input Record ID"].astype(int).astype(str)
|
67
|
+
)
|
66
68
|
|
67
69
|
# Group by Cluster ID to get connected records
|
68
|
-
cluster_groups = combined_df.groupby("Cluster ID")["Input Record
|
70
|
+
cluster_groups = combined_df.groupby("Cluster ID")["Input Record Key"].apply(list)
|
69
71
|
|
70
72
|
# Build a graph of all connections implied by cluster IDs
|
71
73
|
G = nx.Graph()
|
@@ -75,8 +77,8 @@ def merge_clusters(known_clusters_df, new_clusters_df):
|
|
75
77
|
G.add_edge(group[i], group[j])
|
76
78
|
|
77
79
|
# Add isolated nodes (records with unique clusters)
|
78
|
-
|
79
|
-
G.add_nodes_from(
|
80
|
+
all_keys = set(combined_df["Input Record Key"])
|
81
|
+
G.add_nodes_from(all_keys)
|
80
82
|
|
81
83
|
# Compute connected components
|
82
84
|
components = list(nx.connected_components(G))
|
@@ -84,13 +86,19 @@ def merge_clusters(known_clusters_df, new_clusters_df):
|
|
84
86
|
# Assign new cluster IDs
|
85
87
|
merged_data = []
|
86
88
|
for cluster_id, records in enumerate(components, start=1):
|
87
|
-
for
|
88
|
-
merged_data.append((
|
89
|
+
for record_key in records:
|
90
|
+
merged_data.append((record_key, cluster_id))
|
89
91
|
|
90
92
|
# Build the final DataFrame
|
91
|
-
merged_df = pd.DataFrame(merged_data, columns=["Input Record
|
93
|
+
merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
|
94
|
+
|
95
|
+
merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
|
96
|
+
"Input Record Key"
|
97
|
+
].str.split("-__-", n=1, expand=True)
|
98
|
+
|
99
|
+
merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
|
92
100
|
|
93
|
-
return merged_df
|
101
|
+
return merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
|
94
102
|
|
95
103
|
|
96
104
|
output_df = merge_clusters(known_clusters_df, new_clusters_df)
|
@@ -4,8 +4,8 @@ Data Aggregating Utilities
|
|
4
4
|
==========================
|
5
5
|
|
6
6
|
This module contains utility functions for aggregating datasets. One primary use
|
7
|
-
case for this is combine the results of
|
8
|
-
|
7
|
+
case for this is to combine the results of sections that were automatically run
|
8
|
+
in parallel.
|
9
9
|
|
10
10
|
Note that it is critical that all data aggregating utility functions are definied
|
11
11
|
in this module; easylink will not be able to find them otherwise.
|
@@ -4,7 +4,7 @@ Data Splitting Utilities
|
|
4
4
|
========================
|
5
5
|
|
6
6
|
This module contains utility functions for splitting datasets into smaller datasets.
|
7
|
-
One primary use case for this is to run sections of the pipeline in an
|
7
|
+
One primary use case for this is to run sections of the pipeline in an auto
|
8
8
|
parallel manner.
|
9
9
|
|
10
10
|
Note that it is critical that all data splitting utility functions are definied
|
@@ -1,30 +1,30 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
3
|
+
easylink/_version.py,sha256=qEmNtjnOwhDYQ0cHPPtUkUaghzD2xl0thJEznl4giYw,23
|
4
4
|
easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
|
5
5
|
easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
|
6
6
|
easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
|
7
|
-
easylink/implementation.py,sha256=
|
7
|
+
easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
|
8
8
|
easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
|
9
|
-
easylink/pipeline.py,sha256=
|
10
|
-
easylink/pipeline_graph.py,sha256=
|
9
|
+
easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
|
10
|
+
easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
|
11
11
|
easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
|
12
|
-
easylink/rule.py,sha256=
|
12
|
+
easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
|
13
13
|
easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
|
14
|
-
easylink/step.py,sha256=
|
15
|
-
easylink/devtools/implementation_creator.py,sha256=
|
14
|
+
easylink/step.py,sha256=SqOxinHyRaLCEnB_y5dvhGMaRLyphQDCpVsQ3160c9U,89588
|
15
|
+
easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
|
16
16
|
easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
|
17
17
|
easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
|
18
|
-
easylink/pipeline_schema_constants/__init__.py,sha256=
|
19
|
-
easylink/pipeline_schema_constants/development.py,sha256=
|
20
|
-
easylink/pipeline_schema_constants/main.py,sha256=
|
21
|
-
easylink/pipeline_schema_constants/testing.py,sha256=
|
18
|
+
easylink/pipeline_schema_constants/__init__.py,sha256=SMNXz49DSwx05PHMKUsunJsgMOqsBJaAHA1fmIOJsUU,1445
|
19
|
+
easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
|
20
|
+
easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
|
21
|
+
easylink/pipeline_schema_constants/testing.py,sha256=G7szRMyY48dL8kUHWq2MeMaV2G0F-AdAPsQxFzdUnFI,20567
|
22
22
|
easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
|
23
23
|
easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
|
24
24
|
easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
|
25
25
|
easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
|
26
26
|
easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
|
27
|
-
easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
|
27
|
+
easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=43D5GBmPXSgxcjgbJTvEoGFvPzBCGqYgBaT42pncNNw,3661
|
28
28
|
easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
|
29
29
|
easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
|
30
30
|
easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
|
@@ -76,16 +76,16 @@ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr
|
|
76
76
|
easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
|
77
77
|
easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
|
78
78
|
easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
|
79
|
-
easylink/utilities/aggregator_utils.py,sha256=
|
79
|
+
easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
|
80
80
|
easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
|
81
81
|
easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
|
82
82
|
easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
|
83
83
|
easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
|
84
|
-
easylink/utilities/splitter_utils.py,sha256=
|
84
|
+
easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
|
85
85
|
easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
|
86
|
-
easylink-0.1.
|
87
|
-
easylink-0.1.
|
88
|
-
easylink-0.1.
|
89
|
-
easylink-0.1.
|
90
|
-
easylink-0.1.
|
91
|
-
easylink-0.1.
|
86
|
+
easylink-0.1.21.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
|
87
|
+
easylink-0.1.21.dist-info/METADATA,sha256=wdHGbqg2d4yte9ep9mO_GAr2EbUmEAVHHjPg6LsvMLE,3565
|
88
|
+
easylink-0.1.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
89
|
+
easylink-0.1.21.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
90
|
+
easylink-0.1.21.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
91
|
+
easylink-0.1.21.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|