easylink 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.1.19"
+__version__ = "0.1.21"
easylink/devtools/implementation_creator.py CHANGED
@@ -21,15 +21,15 @@ from loguru import logger
 
 from easylink.pipeline_schema_constants import SCHEMA_PARAMS
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
     HierarchicalStep,
     IOStep,
     Step,
     TemplatedStep,
 )
 from easylink.utilities.data_utils import load_yaml
-from easylink.utilities.paths import IMPLEMENTATION_METADATA
+from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
 
 
 def main(script_path: Path, host: Path) -> None:
@@ -195,9 +195,24 @@ class ImplementationCreator:
             f"Implementation '{self.implementation_name}' already exists in the registry. "
             "Overwriting it with the latest data."
         )
+
+        # Handle the fact that developers might be saving to username subdirs
+        # If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
+        # host directory when calling `easylink devtools create-implementation`
+        # is DEV_IMAGES_DIR/<username>), we want to include the relative path
+        # to the DEV_IMAGES_DIR in the image name. This is required because ultimately
+        # when running a pipeline, all images are expected to be in a single directory.
+        image_name = (
+            self.hosted_container_path.name
+            # Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
+            if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
+            # Use the path relative to DEV_IMAGES_DIR as the image name
+            else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
+        )
+
         info[self.implementation_name] = {
             "steps": [self.step],
-            "image_path": str(self.hosted_container_path),
+            "image_name": str(image_name),
             "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
             "outputs": {
                 self.output_slot: "result.parquet",
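To make the naming rule above concrete, here is a minimal standalone sketch of the same logic, assuming a hypothetical DEV_IMAGES_DIR location and hypothetical image paths:

from pathlib import Path

DEV_IMAGES_DIR = Path("/mnt/dev_images")  # hypothetical location for this sketch

def image_name_for(hosted_container_path: Path) -> str:
    # Images hosted under DEV_IMAGES_DIR keep their path relative to it (e.g. a
    # <username>/ subdirectory) as the image name, so names remain unique once
    # all images are collected into a single directory at pipeline runtime.
    if hosted_container_path.is_relative_to(DEV_IMAGES_DIR):
        return str(hosted_container_path.relative_to(DEV_IMAGES_DIR))
    return hosted_container_path.name

assert image_name_for(Path("/mnt/dev_images/jdoe/my_step.sif")) == "jdoe/my_step.sif"
assert image_name_for(Path("/tmp/images/my_step.sif")) == "my_step.sif"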
@@ -304,7 +319,7 @@ class ImplementationCreator:
         elif isinstance(node, TemplatedStep):
             _process_step(node.template_step)
             return
-        elif isinstance(node, EmbarrassinglyParallelStep):
+        elif isinstance(node, AutoParallelStep):
             _process_step(node.step)
             return
         elif isinstance(node, ChoiceStep):
easylink/implementation.py CHANGED
@@ -55,7 +55,7 @@ class Implementation:
         implementation_config: LayeredConfigTree,
         input_slots: Iterable[InputSlot] = (),
         output_slots: Iterable[OutputSlot] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
     ):
         self.name = implementation_config.name
         """The name of this ``Implementation``."""
@@ -74,7 +74,7 @@ class Implementation:
         implemented by this particular ``Implementation``."""
         self.requires_spark = self._metadata.get("requires_spark", False)
         """Whether this ``Implementation`` requires a Spark environment."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
+        self.is_auto_parallel = is_auto_parallel
 
     def __repr__(self) -> str:
         return f"Implementation.{self.name}"
easylink/pipeline.py CHANGED
@@ -45,9 +45,9 @@ class Pipeline:
         The :class:`~easylink.pipeline_graph.PipelineGraph` object.
     spark_is_required
         A boolean indicating whether the pipeline requires Spark.
-    any_embarrassingly_parallel
+    any_auto_parallel
         A boolean indicating whether any implementation in the pipeline is to be
-        run in an embarrassingly parallel manner.
+        automatically run in parallel.
 
     """
 
@@ -55,7 +55,7 @@ class Pipeline:
         self.config = config
         self.pipeline_graph = PipelineGraph(config)
         self.spark_is_required = self.pipeline_graph.spark_is_required
-        self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel
+        self.any_auto_parallel = self.pipeline_graph.any_auto_parallel
 
         # TODO [MIC-4880]: refactor into validation object
         self._validate()
@@ -179,7 +179,7 @@
     #################################
 
     def _write_imports(self) -> None:
-        if not self.any_embarrassingly_parallel:
+        if not self.any_auto_parallel:
             imports = "from easylink.utilities import validation_utils\n"
         else:
             imports = """import glob
@@ -193,7 +193,7 @@ from easylink.utilities import aggregator_utils, splitter_utils, validation_util
             f.write(imports)
 
     def _write_wildcard_constraints(self) -> None:
-        if self.any_embarrassingly_parallel:
+        if self.any_auto_parallel:
             with open(self.snakefile_path, "a") as f:
                 f.write(
                     """
@@ -301,12 +301,10 @@ use rule start_spark_worker from spark_cluster with:
             The name of the ``Implementation`` to write the rule(s) for.
         """
 
-        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
-            node_name
-        )
+        is_auto_parallel = self.pipeline_graph.get_whether_auto_parallel(node_name)
         input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
         validation_files, validation_rules = self._get_validations(
-            node_name, input_slots, is_embarrassingly_parallel
+            node_name, input_slots, is_auto_parallel
         )
         for validation_rule in validation_rules:
             validation_rule.write_to_snakefile(self.snakefile_path)
@@ -334,7 +332,7 @@ use rule start_spark_worker from spark_cluster with:
             image_path=self.config.images_dir / implementation.singularity_image_name,
             script_cmd=implementation.script_cmd,
             requires_spark=implementation.requires_spark,
-            is_embarrassingly_parallel=is_embarrassingly_parallel,
+            is_auto_parallel=is_auto_parallel,
         ).write_to_snakefile(self.snakefile_path)
 
     def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
@@ -377,7 +375,7 @@ use rule start_spark_worker from spark_cluster with:
         input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
         if len(output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         if len(output_files) > 1:
             raise ValueError(
@@ -388,7 +386,7 @@ use rule start_spark_worker from spark_cluster with:
         output_slot_attrs = list(output_slots.values())[0]
         if len(output_slot_attrs["filepaths"]) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
         AggregationRule(
@@ -404,7 +402,7 @@ use rule start_spark_worker from spark_cluster with:
     def _get_validations(
         node_name: str,
         input_slots: dict[str, dict[str, str | list[str]]],
-        is_embarrassingly_parallel: bool,
+        is_auto_parallel: bool,
     ) -> tuple[list[str], list[InputValidationRule]]:
         """Gets the validation rule and its output filepath for each slot for a given node.
 
@@ -423,10 +421,10 @@ use rule start_spark_worker from spark_cluster with:
         validation_rules = []
 
         for input_slot_name, input_slot_attrs in input_slots.items():
-            # embarrassingly parallel implementations rely on snakemake wildcards
+            # auto-parallel implementations rely on snakemake wildcards
             # TODO: [MIC-5787] - need to support multiple wildcards at once
            validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
-                "-{chunk}" if is_embarrassingly_parallel else ""
+                "-{chunk}" if is_auto_parallel else ""
            )
            validation_files.append(validation_file)
            validation_rules.append(
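The wildcard suffix logic above is easy to see with a toy example; the node and slot names here are hypothetical:

# Illustrative only: auto-parallel nodes get a per-chunk validation file via
# the snakemake "{chunk}" wildcard; everything else gets a single file.
node_name, input_slot_name = "step_3_main", "step_3_main_input"  # hypothetical
base = f"input_validations/{node_name}/{input_slot_name}_validator"
print(base + "-{chunk}")  # if is_auto_parallel
print(base)               # otherwise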
easylink/pipeline_graph.py CHANGED
@@ -72,31 +72,26 @@ class PipelineGraph(ImplementationGraph):
         return any([implementation.requires_spark for implementation in self.implementations])
 
     @property
-    def any_embarrassingly_parallel(self) -> bool:
+    def any_auto_parallel(self) -> bool:
         """Whether or not any :class:`~easylink.implementation.Implementation` is
-        to be run in an embarrassingly parallel way."""
+        to be automatically run in parallel."""
         return any(
-            [
-                self.get_whether_embarrassingly_parallel(node)
-                for node in self.implementation_nodes
-            ]
+            [self.get_whether_auto_parallel(node) for node in self.implementation_nodes]
         )
 
-    def get_whether_embarrassingly_parallel(self, node: str) -> dict[str, bool]:
-        """Determines whether a node is to be run in an embarrassingly parallel way.
+    def get_whether_auto_parallel(self, node: str) -> dict[str, bool]:
+        """Determines whether a node is to be automatically run in parallel.
 
         Parameters
         ----------
         node
-            The node name to determine whether or not it is to be run in an
-            embarrassingly parallel way.
+            The node name to determine whether or not it is to be automatically run in parallel.
 
         Returns
         -------
-            A boolean indicating whether the node is to be run in an embarrassingly
-            parallel way.
+            A boolean indicating whether the node is to be automatically run in parallel.
         """
-        return self.nodes[node]["implementation"].is_embarrassingly_parallel
+        return self.nodes[node]["implementation"].is_auto_parallel
 
     def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
         """Gets all of a node's input and output filepaths from its edges.
@@ -482,9 +477,9 @@ class PipelineGraph(ImplementationGraph):
                 str(
                     Path("intermediate")
                     / node
-                    # embarrassingly parallel implementations rely on snakemake wildcards
+                    # auto-parallel implementations rely on snakemake wildcards
                     # TODO: [MIC-5787] - need to support multiple wildcards at once
-                    / ("{chunk}" if implementation.is_embarrassingly_parallel else "")
+                    / ("{chunk}" if implementation.is_auto_parallel else "")
                     / imp_outputs[edge_attrs["output_slot"].name]
                 ),
             )
easylink/pipeline_schema_constants/__init__.py CHANGED
@@ -23,8 +23,8 @@ SCHEMA_PARAMS = {
     "combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
-    "looping_ep_step": testing.SCHEMA_PARAMS_LOOPING_EP_STEP,
-    "ep_parallel_step": testing.SCHEMA_PARAMS_EP_PARALLEL_STEP,
-    "ep_loop_step": testing.SCHEMA_PARAMS_EP_LOOP_STEP,
-    "ep_hierarchical_step": testing.SCHEMA_PARAMS_EP_HIERARCHICAL_STEP,
+    "looping_auto_parallel_step": testing.SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP,
+    "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
+    "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
+    "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
 }
easylink/pipeline_schema_constants/development.py CHANGED
@@ -18,13 +18,13 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -33,7 +33,7 @@ from easylink.utilities.validation_utils import validate_input_file_dummy
 
 NODES = [
     InputStep(),
-    ParallelStep(
+    CloneableStep(
         template_step=Step(
             step_name="step_1",
             input_slots=[
@@ -58,7 +58,7 @@ NODES = [
         output_slots=[OutputSlot("step_2_main_output")],
     ),
     LoopStep(
-        template_step=EmbarrassinglyParallelStep(
+        template_step=AutoParallelStep(
            step=Step(
                step_name="step_3",
                input_slots=[
easylink/pipeline_schema_constants/main.py CHANGED
@@ -12,11 +12,11 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.validation_utils import (
@@ -56,8 +56,8 @@ NODES = [
         ],
         output_slots=[OutputSlot("clusters")],
         nodes=[
-            ParallelStep(
-                # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+            CloneableStep(
+                # NOTE: Splitters/aggregators on the CloneableStep are implicit!
                 template_step=HierarchicalStep(
                     step_name="determining_exclusions_and_removing_records",
                     directly_implemented=False,
@@ -190,7 +190,7 @@ NODES = [
         ],
         output_slots=[OutputSlot("links")],
         nodes=[
-            ParallelStep(
+            CloneableStep(
                 template_step=LoopStep(
                     template_step=Step(
                         step_name="pre-processing",
@@ -265,7 +265,7 @@ NODES = [
                     source_node="pre-processing",
                     target_node="schema_alignment",
                     output_slot="dataset",
-                    # NOTE: The implicit ParallelStep aggregator has
+                    # NOTE: The implicit CloneableStep aggregator has
                    # made this multiple (a list)
                    input_slot="datasets",
                ),
easylink/pipeline_schema_constants/testing.py CHANGED
@@ -16,12 +16,12 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
-    EmbarrassinglyParallelStep,
+    AutoParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -215,7 +215,7 @@ SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_ST
 NODES_NESTED_TEMPLATED_STEPS = [
     InputStep(),
     LoopStep(
-        template_step=ParallelStep(
+        template_step=CloneableStep(
             template_step=HierarchicalStep(
                 step_name="step_1",
                 input_slots=[
@@ -355,10 +355,10 @@ EDGES_TWO_STEPS = [
 SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)
 
 
-NODES_LOOPING_EP_STEP = [
+NODES_LOOPING_AUTO_PARALLEL_STEP = [
     InputStep(),
     LoopStep(
-        template_step=EmbarrassinglyParallelStep(
+        template_step=AutoParallelStep(
             step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -392,13 +392,13 @@ NODES_LOOPING_EP_STEP = [
         ]
     ),
 ]
-SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP = (NODES_LOOPING_AUTO_PARALLEL_STEP, EDGES_ONE_STEP)
 
 
-NODES_EP_PARALLEL_STEP = [
+NODES_AUTO_PARALLEL_PARALLEL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
-        step=ParallelStep(
+    AutoParallelStep(
+        step=CloneableStep(
             template_step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -424,12 +424,15 @@ NODES_EP_PARALLEL_STEP = [
         ]
     ),
 ]
-SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP = (
+    NODES_AUTO_PARALLEL_PARALLEL_STEP,
+    EDGES_ONE_STEP,
+)
 
 
-NODES_EP_LOOP_STEP = [
+NODES_AUTO_PARALLEL_LOOP_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=LoopStep(
             template_step=Step(
                 step_name="step_1",
@@ -464,12 +467,12 @@ NODES_EP_LOOP_STEP = [
         ]
     ),
 ]
-SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP = (NODES_AUTO_PARALLEL_LOOP_STEP, EDGES_ONE_STEP)
 
 
-NODES_EP_HIERARCHICAL_STEP = [
+NODES_AUTO_PARALLEL_HIERARCHICAL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=HierarchicalStep(
             step_name="step_1",
             input_slots=[
@@ -581,7 +584,10 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
         input_slot="result",
     ),
 ]
-SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
+SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP = (
+    NODES_AUTO_PARALLEL_HIERARCHICAL_STEP,
+    EDGES_ONE_STEP_TWO_ISLOTS,
+)
 
 NODES_OUTPUT_DIR = [
     InputStep(),
easylink/rule.py CHANGED
@@ -111,21 +111,20 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
-    is_embarrassingly_parallel: bool = False
-    """Whether or not this ``Implementation`` is to be run in an embarrassingly
-    parallel way."""
+    is_auto_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be automatically run in parallel."""
 
     def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        if self.is_embarrassingly_parallel and len(self.output) > 1:
+        if self.is_auto_parallel and len(self.output) > 1:
             raise NotImplementedError(
-                "Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         return self._build_io() + self._build_resources() + self._build_shell_cmd()
 
     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-        log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
+        log_path_chunk_adder = "-{chunk}" if self.is_auto_parallel else ""
         # Handle output files vs directories
         files = [path for path in self.output if Path(path).suffix != ""]
         if len(files) == len(self.output):
@@ -260,7 +259,7 @@ rule:
 class CheckpointRule(Rule):
     """A :class:`Rule` that defines a checkpoint.
 
-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we do not know until runtime how many parallel jobs there will
     be (e.g. we don't know beforehand how many chunks a large incoming dataset will
     be split into since the incoming dataset isn't created until runtime). The
@@ -326,7 +325,7 @@ checkpoint:
 class AggregationRule(Rule):
     """A :class:`Rule` that aggregates the processed chunks of output data.
 
-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we need to aggregate the output files from each parallel job
     into a single output file.
     """
@@ -347,10 +346,10 @@ class AggregationRule(Rule):
     def build_rule(self) -> str:
         """Builds the Snakemake rule for this aggregator.
 
-        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        When running an :class:`~easylink.step.AutoParallelStep`, we need
         to aggregate the output files from each parallel job into a single output file.
         This rule relies on a dynamically generated aggregation function which returns
-        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        all of the **processed** chunks (from running the ``AutoParallelStep's``
         container in parallel) and uses them as inputs to the actual aggregation
         rule.
 
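To make the checkpoint-and-aggregation interplay described above concrete, here is a minimal sketch of the standard Snakemake pattern it relies on, with hypothetical rule and path names (illustrative Snakefile code, not the code EasyLink actually generates):

import os

def get_processed_chunks(wildcards):
    # Hypothetical names throughout. checkpoints.<name>.get() only returns once
    # the splitter checkpoint has actually run, so by this point the number of
    # chunks is known and the chunk directories can be globbed.
    split_dir = checkpoints.step_3_split.get(**wildcards).output[0]
    chunks = glob_wildcards(os.path.join(split_dir, "{chunk}", "input.parquet")).chunk
    # The processed per-chunk outputs become the aggregation rule's inputs.
    return expand("intermediate/step_3/{chunk}/result.parquet", chunk=chunks)

(checkpoints, glob_wildcards, and expand are globals that Snakemake injects into a Snakefile.)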
easylink/step.py CHANGED
@@ -71,8 +71,8 @@ class Step:
         The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
     output_slot_mappings
         The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
-    is_embarrassingly_parallel
-        Whether or not this ``Step`` is to be run in an embarrassingly parallel manner.
+    is_auto_parallel
+        Whether or not this ``Step`` is to automatically run in parallel.
 
     Notes
     -----
@@ -91,7 +91,7 @@ class Step:
         output_slots: Iterable[OutputSlot] = (),
         input_slot_mappings: Iterable[InputSlotMapping] = (),
         output_slot_mappings: Iterable[OutputSlotMapping] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
     ) -> None:
         if not step_name and not name:
             raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -125,8 +125,8 @@ class Step:
         }
         """A combined dictionary containing both the ``InputSlotMappings`` and
         ``OutputSlotMappings`` of this ``Step``."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
-        """Whether or not this ``Step`` is to be run in an embarrassingly parallel manner."""
+        self.is_auto_parallel = is_auto_parallel
+        """Whether or not this ``Step`` is to be automatically run in parallel."""
         self.parent_step = None
         """This ``Step's`` parent ``Step``, if applicable."""
         self._configuration_state = None
@@ -816,7 +816,7 @@ class TemplatedStep(Step, ABC):
 
     A ``TemplatedStep`` is used to represents a ``Step`` that contains a specified
     amount of multiplicity, such as one that is looped or run in parallel; it is
-    inherited by concrete :class:`LoopStep` and :class:`ParallelStep` instances.
+    inherited by concrete :class:`LoopStep` and :class:`CloneableStep` instances.
 
     See :class:`Step` for inherited attributes.
 
@@ -1206,7 +1206,7 @@ class LoopStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}
 
 
-class ParallelStep(TemplatedStep):
+class CloneableStep(TemplatedStep):
     """A type of :class:`TemplatedStep` that creates multiple copies in parallel
     with no dependencies between them.
 
@@ -1216,13 +1216,13 @@ class ParallelStep(TemplatedStep):
 
     @property
     def config_key(self):
-        """The pipeline specification key required for a ``ParallelStep``."""
-        return "parallel"
+        """The pipeline specification key required for a ``CloneableStep``."""
+        return "clones"
 
     @property
     def node_prefix(self):
-        """The prefix to be used in the ``ParallelStep`` node name."""
-        return "parallel_split"
+        """The prefix to be used in the ``CloneableStep`` node name."""
+        return "clone"
 
     def _update_step_graph(self, num_repeats: int) -> StepGraph:
         """Updates the :class:`~easylink.graph_components.StepGraph` to include parallelization.
@@ -1276,10 +1276,10 @@
         return {"input": input_mappings, "output": output_mappings}
 
 
-class EmbarrassinglyParallelStep(Step):
+class AutoParallelStep(Step):
     """A :class:`Step` that is run in parallel on the backend.
 
-    An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+    An ``AutoParallelStep`` is different than a :class:`CloneableStep`
     in that it is not configured by the user to be run in parallel - it completely
     happens on the back end for performance reasons.
 
@@ -1288,8 +1288,8 @@ class EmbarrassinglyParallelStep(Step):
     Parameters
     ----------
     step
-        The ``Step`` to be run in an embarrassingly parallel manner. To run multiple
-        steps in parallel, use a :class:`HierarchicalStep`.
+        The ``Step`` to be automatically run in parallel. To run multiple steps in
+        parallel, use a :class:`HierarchicalStep`.
     slot_splitter_mapping
         A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
         to the actual splitter function to be used.
@@ -1308,7 +1308,7 @@ class EmbarrassinglyParallelStep(Step):
         super().__init__(
             step_name=None,
             name=step.name,
-            is_embarrassingly_parallel=True,
+            is_auto_parallel=True,
         )
         self.slot_splitter_mapping = slot_splitter_mapping
         """A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
@@ -1328,14 +1328,14 @@ class EmbarrassinglyParallelStep(Step):
 
     @Step.name.setter
     def name(self, value: str) -> None:
-        """Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
+        """Changes the name of the ``AutoParallelStep`` and the underlying :class:`Step` to the given value."""
         self._name = value
         self.step._name = value
 
     def _validate(self) -> None:
-        """Validates the ``EmbarrassinglyParallelStep``.
+        """Validates the ``AutoParallelStep``.
 
-        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        ``AutoParallelSteps`` are not configured by the user to be run
         in parallel. Since it happens on the back end, we need to do somewhat unique
         validations during construction. Specifically,
         - one and only one :class:`~easylink.graph_components.InputSlot` *must*
@@ -1348,17 +1348,17 @@ class EmbarrassinglyParallelStep(Step):
         # check that only one input slot has a splitter assigned
         if len(self.slot_splitter_mapping) != 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
+                f"AutoParallelStep '{self.step_name}' is attempting to define "
                 f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
             )
         if len(self.slot_splitter_mapping) == 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                f"AutoParallelStep '{self.step_name}' does not have any input slots with a "
                 "splitter method assigned; one and only one input slot must have a splitter."
             )
         if len(self.slot_splitter_mapping) > 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                f"AutoParallelStep '{self.step_name}' has multiple input slots with "
                 "splitter methods assigned; one and only one input slot must have a splitter.\n"
                 f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
             )
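A hedged construction sketch of these invariants: exactly one input slot carries a splitter, and every output slot carries an aggregator. The slot names, env_var value, splitter function, and the slot_aggregator_mapping parameter name are assumptions for illustration; concatenate_datasets and validate_input_file_dummy are imported elsewhere in this diff.

from easylink.graph_components import InputSlot, OutputSlot
from easylink.step import AutoParallelStep, Step
from easylink.utilities.aggregator_utils import concatenate_datasets
from easylink.utilities.validation_utils import validate_input_file_dummy

def my_splitter(*args, **kwargs):
    """Placeholder; real splitters live in easylink.utilities.splitter_utils."""

step = AutoParallelStep(
    step=Step(
        step_name="step_1",
        input_slots=[
            InputSlot(
                name="step_1_main_input",
                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",  # assumed
                validator=validate_input_file_dummy,
            )
        ],
        output_slots=[OutputSlot("step_1_main_output")],
    ),
    # one and only one input slot may (and must) have a splitter
    slot_splitter_mapping={"step_1_main_input": my_splitter},
    # every output slot needs an aggregator; parameter name is assumed
    slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
)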
@@ -1371,7 +1371,7 @@
         ]
         if len(missing_aggregators) != 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"AutoParallelStep '{self.step_name}' has output slots without "
                 f"aggregator methods assigned: {missing_aggregators}"
             )
         if errors:
@@ -1451,7 +1451,7 @@ class EmbarrassinglyParallelStep(Step):
         aggregator_node_name = f"{self.name}_aggregate"
         if len(self.output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         output_slot = list(self.output_slots.values())[0]
         aggregator_step = AggregatorStep(
@@ -1464,7 +1464,7 @@ class EmbarrassinglyParallelStep(Step):
         self._update_slot_mappings(splitter_step, aggregator_step)
         # Add the key back to the expanded config
         expanded_config = LayeredConfigTree({self.step.name: step_config})
-        # EmbarrassinglyParallelSteps are by definition non-leaf steps
+        # AutoParallelSteps are by definition non-leaf steps
         self._configuration_state = NonLeafConfigurationState(
             self, expanded_config, combined_implementations, input_data_config
         )
@@ -1513,7 +1513,7 @@ class EmbarrassinglyParallelStep(Step):
         # Add the Step -> AggregatorStep edge
         if len(self.step.output_slots) > 1:
             raise NotImplementedError(
-                "EmbarrassinglyParallelStep does not support multiple output slots."
+                "AutoParallelStep does not support multiple output slots."
             )
         self.step_graph.add_edge_from_params(
             EdgeParams(
@@ -1562,7 +1562,7 @@ class SplitterStep(StandaloneStep):
     """A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.
 
     A ``SplitterStep`` is intended to be used in conjunction with a corresponding
-    :class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`AggregatorStep` and only during construction of an :class:`AutoParallelStep`.
 
     See :class:`Step` for inherited attributes.
 
@@ -1613,7 +1613,7 @@ class AggregatorStep(StandaloneStep):
     """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.Outputslot>` after parallel processing.
 
     An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
-    :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`SplitterStep` and only during construction of an :class:`AutoParallelStep`.
 
     See :class:`Step` for inherited attributes.
 
@@ -1918,10 +1918,9 @@ class LeafConfigurationState(ConfigurationState):
         """
         step = self._step
         if self.is_combined:
-            if step.is_embarrassingly_parallel:
+            if step.is_auto_parallel:
                 raise NotImplementedError(
-                    "Combining implementations with embarrassingly parallel steps "
-                    "is not supported."
+                    "Combining implementations with auto-parallel steps is not supported."
                 )
             implementation = PartialImplementation(
                 combined_name=self.step_config[COMBINED_IMPLEMENTATION_KEY],
@@ -1935,7 +1934,7 @@ class LeafConfigurationState(ConfigurationState):
                 implementation_config=self.implementation_config,
                 input_slots=step.input_slots.values(),
                 output_slots=step.output_slots.values(),
-                is_embarrassingly_parallel=step.is_embarrassingly_parallel,
+                is_auto_parallel=step.is_auto_parallel,
             )
             implementation_graph.add_node_from_implementation(
                 step.implementation_node_name,
@@ -1985,7 +1984,7 @@ class LeafConfigurationState(ConfigurationState):
             if mapping.parent_slot == edge.input_slot
         ]
         for mapping in mappings:
-            # FIXME [MIC-5771]: Fix ParallelSteps
+            # FIXME [MIC-5771]: Fix CloneableSteps
             if (
                 "input_data_file" in self.step_config
                 and edge.source_node == "pipeline_graph_input_data"
@@ -2070,8 +2069,8 @@ class NonLeafConfigurationState(ConfigurationState):
         """
         for node in self._step.step_graph.nodes:
             substep = self._step.step_graph.nodes[node]["step"]
-            if self._step.is_embarrassingly_parallel:
-                substep.is_embarrassingly_parallel = True
+            if self._step.is_auto_parallel:
+                substep.is_auto_parallel = True
             substep.add_nodes_to_implementation_graph(implementation_graph)
 
     def add_edges_to_implementation_graph(
easylink/steps/cascading/update_clusters_by_connected_components.py CHANGED
@@ -60,12 +60,14 @@ new_clusters_df = load_file(new_clusters_filepath)
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
     combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
-
-    # Drop records with missing cluster IDs
-    combined_df = combined_df.dropna(subset=["Cluster ID"])
+    combined_df["Input Record Key"] = (
+        combined_df["Input Record Dataset"]
+        + "-__-"
+        + combined_df["Input Record ID"].astype(int).astype(str)
+    )
 
     # Group by Cluster ID to get connected records
-    cluster_groups = combined_df.groupby("Cluster ID")["Input Record ID"].apply(list)
+    cluster_groups = combined_df.groupby("Cluster ID")["Input Record Key"].apply(list)
 
     # Build a graph of all connections implied by cluster IDs
     G = nx.Graph()
@@ -75,8 +77,8 @@ def merge_clusters(known_clusters_df, new_clusters_df):
                 G.add_edge(group[i], group[j])
 
     # Add isolated nodes (records with unique clusters)
-    all_ids = set(combined_df["Input Record ID"])
-    G.add_nodes_from(all_ids)
+    all_keys = set(combined_df["Input Record Key"])
+    G.add_nodes_from(all_keys)
 
     # Compute connected components
     components = list(nx.connected_components(G))
@@ -84,13 +86,19 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Assign new cluster IDs
     merged_data = []
     for cluster_id, records in enumerate(components, start=1):
-        for record_id in records:
-            merged_data.append((record_id, cluster_id))
+        for record_key in records:
+            merged_data.append((record_key, cluster_id))
 
     # Build the final DataFrame
-    merged_df = pd.DataFrame(merged_data, columns=["Input Record ID", "Cluster ID"])
+    merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
+
+    merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
+        "Input Record Key"
+    ].str.split("-__-", n=1, expand=True)
+
+    merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
 
-    return merged_df
+    return merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
 
 
 output_df = merge_clusters(known_clusters_df, new_clusters_df)
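The composite key introduced above exists to avoid cross-dataset ID collisions; a toy illustration with hypothetical dataset names and values:

import pandas as pd

# Two different datasets can both contain Input Record ID 1. Clustering on the
# bare ID would wrongly merge them into one graph node; the "<dataset>-__-<id>"
# key used above keeps them distinct.
df = pd.DataFrame(
    {
        "Input Record Dataset": ["census", "taxes"],
        "Input Record ID": [1, 1],
        "Cluster ID": [10, 20],
    }
)
df["Input Record Key"] = (
    df["Input Record Dataset"] + "-__-" + df["Input Record ID"].astype(int).astype(str)
)
print(df["Input Record Key"].tolist())  # ['census-__-1', 'taxes-__-1']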
easylink/utilities/aggregator_utils.py CHANGED
@@ -4,8 +4,8 @@ Data Aggregating Utilities
 ==========================
 
 This module contains utility functions for aggregating datasets. One primary use
-case for this is combine the results of running sections of the pipeline in an
-embarrassingly parallel manner.
+case for this is to combine the results of sections that were automatically run
+in parallel.
 
 Note that it is critical that all data aggregating utility functions are definied
 in this module; easylink will not be able to find them otherwise.
easylink/utilities/splitter_utils.py CHANGED
@@ -4,7 +4,7 @@ Data Splitting Utilities
 ========================
 
 This module contains utility functions for splitting datasets into smaller datasets.
-One primary use case for this is to run sections of the pipeline in an embarrassingly
+One primary use case for this is to run sections of the pipeline in an auto
 parallel manner.
 
 Note that it is critical that all data splitting utility functions are definied
easylink-0.1.19.dist-info/METADATA → easylink-0.1.21.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easylink
-Version: 0.1.19
+Version: 0.1.21
 Summary: Research repository for the EasyLink ER ecosystem project.
 Home-page: https://github.com/ihmeuw/easylink
 Author: The EasyLink developers
easylink-0.1.19.dist-info/RECORD → easylink-0.1.21.dist-info/RECORD RENAMED
@@ -1,30 +1,30 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=cAJAbAh288a9AL-3yxwFzEM1L26izSJ6wma5aiml_9Y,23
+easylink/_version.py,sha256=qEmNtjnOwhDYQ0cHPPtUkUaghzD2xl0thJEznl4giYw,23
 easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
 easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
-easylink/implementation.py,sha256=H46WjW9O3csaVAU7qLto3aOu1bSfVOBS0ZySBBX05o0,14544
+easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
 easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
-easylink/pipeline.py,sha256=LC0mwboLfe84Mbju9manJjN00Kup4jauiugLlgGCz6I,17884
-easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
+easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
+easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
-easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
+easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
 easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
-easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
-easylink/devtools/implementation_creator.py,sha256=1WQOOrjQYOhjjp8MQM9j1xoeAp-SW51A1f1oW4G792I,18251
+easylink/step.py,sha256=SqOxinHyRaLCEnB_y5dvhGMaRLyphQDCpVsQ3160c9U,89588
+easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
 easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
-easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
-easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
-easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
-easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
+easylink/pipeline_schema_constants/__init__.py,sha256=SMNXz49DSwx05PHMKUsunJsgMOqsBJaAHA1fmIOJsUU,1445
+easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
+easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
+easylink/pipeline_schema_constants/testing.py,sha256=G7szRMyY48dL8kUHWq2MeMaV2G0F-AdAPsQxFzdUnFI,20567
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
 easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
 easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=w7tAOs2QtIIcpTDxw2P_dqMIR-BFa-wi-OmZwrKyhmg,3309
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=43D5GBmPXSgxcjgbJTvEoGFvPzBCGqYgBaT42pncNNw,3661
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
@@ -76,16 +76,16 @@ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
 easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
-easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
+easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
 easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
-easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
+easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
 easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
-easylink-0.1.19.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
-easylink-0.1.19.dist-info/METADATA,sha256=nFZA-jZKgZUG4DdiDqY-pNOTfdt1H3QeiwNzvo27vpg,3565
-easylink-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-easylink-0.1.19.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
-easylink-0.1.19.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
-easylink-0.1.19.dist-info/RECORD,,
+easylink-0.1.21.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.1.21.dist-info/METADATA,sha256=wdHGbqg2d4yte9ep9mO_GAr2EbUmEAVHHjPg6LsvMLE,3565
+easylink-0.1.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.1.21.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.1.21.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.1.21.dist-info/RECORD,,