easylink 0.1.24__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +19 -10
- easylink/implementation_metadata.yaml +94 -44
- easylink/runner.py +174 -1
- easylink/steps/cascading/accept_all_pairs.def +22 -0
- easylink/steps/cascading/accept_all_pairs.py +26 -0
- easylink/steps/cascading/exclude_clustered.py +11 -2
- easylink/steps/cascading/exclude_none.py +1 -1
- easylink/steps/cascading/one_to_many_links_to_clusters.def +22 -0
- easylink/steps/cascading/one_to_many_links_to_clusters.py +109 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +17 -4
- easylink/steps/default/default_determining_exclusions.py +1 -1
- easylink/steps/default/default_removing_records.py +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +12 -6
- easylink/steps/splink/splink_evaluating_pairs.py +2 -1
- easylink/steps/splink/splink_links_to_clusters.py +2 -0
- easylink/utilities/general_utils.py +18 -8
- easylink/utilities/validation_utils.py +6 -6
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/METADATA +27 -11
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/RECORD +24 -20
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/WHEEL +0 -0
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.24"
+__version__ = "0.2.0"
easylink/cli.py
CHANGED
@@ -201,16 +201,25 @@ def run(
     main = handle_exceptions(
         func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
     )
-
-
-
-
-
-
-
-
-
-
+    try:
+        main(
+            command="run",
+            pipeline_specification=pipeline_specification,
+            input_data=input_data,
+            computing_environment=computing_environment,
+            results_dir=results_dir,
+            images_dir=images,
+            schema_name=schema,
+        )
+    except SystemExit as e:
+        # Snakemake uses SystemExit with exit code 0 for success, non-zero for failure
+        if e.code == 0:
+            logger.info("\033[32m*** FINISHED ***\033[0m")  # Green
+        else:
+            logger.error(
+                f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m"
+            )  # Red
+            raise
 
 
 @easylink.command()
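For context on the new run() behavior: Snakemake signals both success and failure by raising SystemExit, so the CLI now inspects the exit code instead of treating every exit as an error. A minimal standalone sketch of that convention (the helper name is illustrative, not from the package)::

    import sys

    def run_and_report(pipeline):
        # Treat SystemExit(0) as success and anything else as a failure,
        # mirroring the try/except added to cli.py above.
        try:
            pipeline()
        except SystemExit as e:
            if e.code == 0:
                print("*** FINISHED ***")
            else:
                print(f"ERROR: Pipeline failed with exit code {e.code}")
                raise

    run_and_report(lambda: sys.exit(0))  # prints: *** FINISHED ***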
easylink/implementation_metadata.yaml
CHANGED
@@ -2,7 +2,7 @@ step_1_python_pandas:
   steps:
     - step_1
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
   steps:
     - step_1a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
   steps:
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
   steps:
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
   steps:
     - step_3
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
   steps:
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
   steps:
     - step_5
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
   steps:
     - step_6
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
   steps:
     - step_4a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
   steps:
     - step_4b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -106,7 +106,7 @@ step_4b_r:
   steps:
     - step_4b
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
   steps:
     - step_1
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
   steps:
     - step_2
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
   steps:
     - step_3
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
   steps:
     - step_4
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   env:
@@ -158,7 +158,7 @@ step_1_r:
   steps:
     - step_1
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -168,7 +168,7 @@ step_2_r:
   steps:
     - step_2
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -178,7 +178,7 @@ step_3_r:
   steps:
     - step_3
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -188,7 +188,7 @@ step_4_r:
   steps:
     - step_4
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
     - step_3
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
     - step_1a
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -241,131 +241,181 @@ step_1a_and_step_1b_combined_python_pandas:
 dummy_step_1_for_output_dir_example:
   steps:
     - step_1_for_output_dir_example
-  image_name:
+  image_name: dummy_step_1_for_output_dir_example.sif
   script_cmd: python /dummy_step_1_for_output_dir_example.py
   outputs:
     step_1_main_output_directory: output_dir/
 dummy_step_1_for_output_dir_example_default:
   steps:
     - step_1_for_output_dir_example
-  image_name:
+  image_name: dummy_step_1_for_output_dir_example.sif
   script_cmd: python /dummy_step_1_for_output_dir_example.py
 dummy_step_2_for_output_dir_example:
   steps:
     - step_2_for_output_dir_example
-  image_name:
+  image_name: dummy_step_2_for_output_dir_example.sif
   script_cmd: python /dummy_step_2_for_output_dir_example.py
   outputs:
     step_2_main_output: result.parquet
 default_removing_records:
   steps:
     - removing_records
-  image_name:
+  image_name: default_removing_records.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 05123136e756bfa57f1d7d5a3315f2f6
   script_cmd: python /default_removing_records.py
   outputs:
     dataset: dataset
 default_clusters_to_links:
   steps:
     - clusters_to_links
-  image_name:
+  image_name: default_clusters_to_links.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0d00d1272bd8193f60727791097aa065
   script_cmd: python /default_clusters_to_links.py
   outputs:
     known_links: result.parquet
 default_determining_exclusions:
   steps:
     - determining_exclusions
-  image_name:
+  image_name: default_determining_exclusions.sif
+  zenodo_record_id: 15778354
+  md5_checksum: f4e9f740d8dd7599bfbb2b9eb54ced38
   script_cmd: python /default_determining_exclusions.py
   outputs:
     ids_to_remove: result.parquet
 default_updating_clusters:
   steps:
     - updating_clusters
-  image_name:
+  image_name: default_updating_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: cc6bd29e099c2523347fa04545aa35c9
   script_cmd: python /default_updating_clusters.py
   outputs:
     clusters: clusters.parquet
-dummy_canonicalizing_and_downstream_analysis
+# NOTE: This was made from dummy_canonicalizing_and_downstream_analysis.py,
+# if rebuilding change the name of that file to save_clusters.py
+save_clusters:
   steps:
     - canonicalizing_and_downstream_analysis
-  image_name:
+  image_name: save_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 384ab2be668cbadc45160a674f621022
   script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
   outputs:
     analysis_output: result.parquet
-dummy_pre-processing
+# NOTE: This was made from dummy_pre-processing.py,
+# if rebuilding change the name of that file to no_pre-processing.py
+no_pre-processing:
   steps:
     - pre-processing
-  image_name:
+  image_name: no_pre-processing.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 9a9c080cf145078152501cf96bf61f27
   script_cmd: python /dummy_pre-processing.py
   outputs:
     dataset: dataset
 default_schema_alignment:
   steps:
     - schema_alignment
-  image_name:
+  image_name: default_schema_alignment.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 3166587f9cfec478b999a17074d628f7
   script_cmd: python /default_schema_alignment.py
   outputs:
     records: result.parquet
 splink_blocking_and_filtering:
   steps:
     - blocking_and_filtering
-  image_name:
+  image_name: splink_blocking_and_filtering.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 3f8777c5751d7550762be078d87e7db2
   script_cmd: python /splink_blocking_and_filtering.py
   outputs:
     blocks: blocks
 splink_evaluating_pairs:
   steps:
     - evaluating_pairs
-  image_name:
+  image_name: splink_evaluating_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
   script_cmd: python /splink_evaluating_pairs.py
   outputs:
     links: result.parquet
 splink_links_to_clusters:
   steps:
     - links_to_clusters
-  image_name:
+  image_name: splink_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 81a71aa2ce6544953f3edb88d4ee6ec1
   script_cmd: python /splink_links_to_clusters.py
   outputs:
     clusters: result.parquet
 fastLink_evaluating_pairs:
   steps:
     - evaluating_pairs
-  image_name:
+  image_name: fastLink_evaluating_pairs.sif
   script_cmd: Rscript /fastLink_evaluating_pairs.R
   outputs:
     links: result.parquet
 fastLink_links_to_clusters:
   steps:
     - links_to_clusters
-  image_name:
+  image_name: fastLink_links_to_clusters.sif
   script_cmd: Rscript /fastLink_links_to_clusters.R
   outputs:
     clusters: result.parquet
 exclude_clustered:
   steps:
     - determining_exclusions
-  image_name:
+  image_name: exclude_clustered.sif
+  zenodo_record_id: 15778354
+  md5_checksum: db51f68ea24d114ed2b83a1382b6e6b6
   script_cmd: python /exclude_clustered.py
   outputs:
     ids_to_remove: result.parquet
 exclude_none:
   steps:
     - determining_exclusions
-  image_name:
+  image_name: exclude_none.sif
+  zenodo_record_id: 15778354
+  md5_checksum: af12b6dde2aace9dab08d352368b16a1
   script_cmd: python /exclude_none.py
   outputs:
     ids_to_remove: result.parquet
 update_clusters_by_connected_components:
   steps:
     - updating_clusters
-  image_name:
+  image_name: update_clusters_by_connected_components.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 806b0fe86a3306d74391678ed951b054
   script_cmd: python /update_clusters_by_connected_components.py
   outputs:
     clusters: result.parquet
 middle_name_to_initial:
   steps:
     - pre-processing
-  image_name:
+  image_name: middle_name_to_initial.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 89db9c3318300cda9d538cde08c3c323
   script_cmd: python /middle_name_to_initial.py
   outputs:
     dataset: dataset
+one_to_many_links_to_clusters:
+  steps:
+    - links_to_clusters
+  image_name: one_to_many_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0bf6f0b8663b4c2f99995a2213dc541a
+  script_cmd: python /one_to_many_links_to_clusters.py
+  outputs:
+    clusters: result.parquet
+accept_all_pairs:
+  steps:
+    - evaluating_pairs
+  image_name: accept_all_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: c71c88d159c3d7343ebc39cd37224bd9
+  script_cmd: python /accept_all_pairs.py
+  outputs:
+    links: result.parquet
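Each implementation entry now records a zenodo_record_id and md5_checksum alongside its image_name. A hypothetical helper (not part of the package) sketching how such a checksum could be checked against a downloaded .sif image::

    import hashlib

    def md5sum(path: str) -> str:
        # Stream the file in 1 MiB chunks so large container images are not read into memory at once.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    # Compare the result against the recorded md5_checksum,
    # e.g. 9177b8e168fcc9cae91bf61265f2185c for python_pandas.sif.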
easylink/runner.py
CHANGED
@@ -11,6 +11,9 @@ be called from the ``easylink.cli`` module.
 import os
 import socket
 import subprocess
+import threading
+import time
+from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 
 from graphviz import Source
@@ -123,7 +126,177 @@ def main(
     argv.extend(environment_args)
     logger.info(f"Running Snakemake")
     logger.debug(f"Snakemake arguments: {argv}")
-
+
+    # Run snakemake
+    if debug:
+        snake_main(argv)
+    else:
+        _run_snakemake_with_filtered_output(argv, Path(results_dir))
+
+
+def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
+    """Runs Snakemake with simplified log filtering.
+
+    Parameters
+    ----------
+    argv
+        Snakemake command line arguments.
+    results_dir
+        Directory to save the full Snakemake log.
+    """
+    snakemake_log_file = results_dir / "pipeline.log"
+
+    # Create a filtering output handler that processes lines in real-time
+    class FilteringOutput:
+        """Handles real-time filtering and logging of Snakemake output.
+
+        This class writes all snakemake output to a log file and selectively logs
+        filtered lines to the logger for user visibility.
+
+        Parameters
+        ----------
+        log_file_path
+            The path to the log file where all output will be written.
+
+        """
+
+        def __init__(self, log_file_path: Path):
+            self.log_file = open(log_file_path, "w")
+            self.buffer = ""
+            self.last_output_time = time.time()
+            self.heartbeat_timer = None
+            self.dots_printed = False  # Track if we've printed progress dots
+            self._start_heartbeat()
+
+        def _start_heartbeat(self):
+            """Start a timer that prints progress dots during long-running containers."""
+
+            def heartbeat():
+                current_time = time.time()
+                if current_time - self.last_output_time > 30:  # 30 seconds since last output
+                    # Print a dot to show progress - use original stdout if available
+                    if hasattr(self, "original_stdout") and self.original_stdout:
+                        self.original_stdout.write(".")
+                        self.original_stdout.flush()
+                        self.dots_printed = True  # Mark that we've printed dots
+                    self.last_output_time = current_time
+                # Schedule next heartbeat
+                self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+                self.heartbeat_timer.daemon = True
+                self.heartbeat_timer.start()
+
+            # Start first heartbeat after 30 seconds
+            self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+            self.heartbeat_timer.daemon = True
+            self.heartbeat_timer.start()
+
+        def write(self, text: str) -> int:
+            # Write to log file
+            self.log_file.write(text)
+            self.log_file.flush()
+
+            # Process and log filtered output
+            self.buffer += text
+            while "\n" in self.buffer:
+                line, self.buffer = self.buffer.split("\n", 1)
+                if line.strip():
+                    filtered_line = _filter_snakemake_output(line.strip())
+                    if filtered_line:
+                        # Add newline after dots if we've printed any
+                        if (
+                            self.dots_printed
+                            and hasattr(self, "original_stdout")
+                            and self.original_stdout
+                        ):
+                            self.original_stdout.write("\n")
+                            self.original_stdout.flush()
+                            self.dots_printed = False  # Reset the flag
+                        logger.info(filtered_line)
+                        self.last_output_time = time.time()  # Reset heartbeat timer
+
+            return len(text)
+
+        def flush(self):
+            self.log_file.flush()
+
+        def close(self):
+            # Stop heartbeat timer
+            if self.heartbeat_timer:
+                self.heartbeat_timer.cancel()
+
+            # Process and log any remaining buffer content
+            if self.buffer.strip():
+                filtered_line = _filter_snakemake_output(self.buffer.strip())
+                if filtered_line:
+                    # Add newline after dots if we've printed any
+                    if (
+                        self.dots_printed
+                        and hasattr(self, "original_stdout")
+                        and self.original_stdout
+                    ):
+                        self.original_stdout.write("\n")
+                        self.original_stdout.flush()
+                        self.dots_printed = False
+                    logger.info(filtered_line)
+            self.log_file.close()
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            self.close()
+
+    # Create the filtering output handler and ensure the log file is always closed
+    # Save original stdout for progress dots before redirection
+    import sys
+
+    original_stdout = sys.stdout
+
+    with FilteringOutput(snakemake_log_file) as filtering_output:
+        # Pass original stdout to filtering output for progress dots
+        filtering_output.original_stdout = original_stdout
+        try:
+            # Redirect both stdout and stderr to our filtering handler
+            with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
+                snake_main(argv)
+        except SystemExit:
+            # Snakemake uses SystemExit for both success and failure
+            logger.info(
+                f"Pipeline finished running - full log saved to: {snakemake_log_file}"
+            )
+            # Always re-raise to allow test frameworks to detect completion
+            raise
+
+
+def _filter_snakemake_output(line: str) -> str:
+    """Filter for Snakemake output.
+
+    Parameters
+    ----------
+    line
+        A single line of Snakemake output.
+
+    Returns
+    -------
+    The filtered line for display.
+    """
+    # Skip empty lines
+    if not line.strip():
+        return ""
+
+    if line.startswith("localrule "):
+        # Show localrule names (without the "localrule" prefix)
+        # Extract rule name (remove "localrule " prefix and colon at the end)
+        filtered_line = line.replace("localrule ", "").rstrip(":")
+    elif line.startswith("Job ") and ":" in line:
+        # Show Job messages
+        # Extract everything after "Job ##: "
+        parts = line.split(":", 1)
+        filtered_line = parts[1].strip() if len(parts) > 1 else ""
+    else:
+        # Suppress everything else
+        filtered_line = ""
+    return filtered_line
 
 
 def _get_singularity_args(config: Config) -> str:
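The effect of the new log filtering is easiest to see on sample lines. This self-contained snippet re-implements the same rules as _filter_snakemake_output, purely for illustration::

    def filter_line(line: str) -> str:
        # Keep rule names and "Job N: ..." messages; suppress everything else.
        if line.startswith("localrule "):
            return line.replace("localrule ", "").rstrip(":")
        if line.startswith("Job ") and ":" in line:
            return line.split(":", 1)[1].strip()
        return ""

    assert filter_line("localrule splink_evaluating_pairs:") == "splink_evaluating_pairs"
    assert filter_line("Job 7: some job message") == "some job message"
    assert filter_line("Building DAG of jobs...") == ""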
easylink/steps/cascading/accept_all_pairs.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./accept_all_pairs.py /accept_all_pairs.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /accept_all_pairs.py '$@'
easylink/steps/cascading/accept_all_pairs.py
ADDED
@@ -0,0 +1,26 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow
+
+import os
+from pathlib import Path
+
+import pandas as pd
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+
+    pairs = pd.read_parquet(block_dir / "pairs.parquet")
+
+    all_predictions.append(pairs.assign(Probability=1.0))
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
easylink/steps/cascading/exclude_clustered.py
CHANGED
@@ -62,12 +62,21 @@ clusters_filepath = clusters_filepaths[0]
 
 # Exclude records that have been clustered
 clusters_df = load_file(clusters_filepath)
+# NOTE: We defined "clustered" for these purposes as clustered *with* anything else.
+# Simply putting a record into its own cluster does not indicate to us that it has
+# been sufficiently clustered to ignore.
+cluster_sizes = clusters_df.groupby("Cluster ID").size()
+clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
+clusters_df = clusters_df[clusters_df["size"] > 1]
+
 dataset_df = load_file(dataset_path)
 clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
-    clusters_df["Input Record
+    clusters_df[clusters_df["Input Record Dataset"] == splitter_choice][
+        "Input Record ID"
+    ].unique()
 )
 
-IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+IDS_TO_REMOVE = pd.DataFrame({"Input Record ID": list(clustered_record_ids)})
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/exclude_none.py
CHANGED
@@ -67,7 +67,7 @@ clusters_df = load_file(clusters_filepath)
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/one_to_many_links_to_clusters.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./one_to_many_links_to_clusters.py /one_to_many_links_to_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /one_to_many_links_to_clusters.py '$@'
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# STEP_NAME: links_to_clusters
|
2
|
+
# REQUIREMENTS: pandas pyarrow networkx
|
3
|
+
|
4
|
+
import os
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
import networkx as nx
|
8
|
+
import pandas as pd
|
9
|
+
|
10
|
+
links = pd.read_parquet(os.environ["LINKS_FILE_PATH"])
|
11
|
+
output_path = Path(os.environ["OUTPUT_PATHS"])
|
12
|
+
|
13
|
+
no_duplicates_dataset = os.environ["NO_DUPLICATES_DATASET"]
|
14
|
+
break_ties_method = os.getenv("BREAK_TIES_METHOD", "drop")
|
15
|
+
|
16
|
+
left_no_duplicates_dataset = links["Left Record Dataset"] == no_duplicates_dataset
|
17
|
+
right_no_duplicates_dataset = links["Right Record Dataset"] == no_duplicates_dataset
|
18
|
+
|
19
|
+
if (left_no_duplicates_dataset & right_no_duplicates_dataset).any():
|
20
|
+
raise ValueError(
|
21
|
+
f"Provided links include links within the no_duplicates_dataset ({no_duplicates_dataset})"
|
22
|
+
)
|
23
|
+
|
24
|
+
if not (left_no_duplicates_dataset | right_no_duplicates_dataset).all():
|
25
|
+
raise ValueError(
|
26
|
+
f"Provided links include links that don't involve the no_duplicates_dataset ({no_duplicates_dataset})"
|
27
|
+
)
|
28
|
+
|
29
|
+
# Get the no-duplicates dataset all on the right
|
30
|
+
id_cols = [
|
31
|
+
"Left Record Dataset",
|
32
|
+
"Left Record ID",
|
33
|
+
"Right Record Dataset",
|
34
|
+
"Right Record ID",
|
35
|
+
]
|
36
|
+
switched_id_cols = [
|
37
|
+
"Right Record Dataset",
|
38
|
+
"Right Record ID",
|
39
|
+
"Left Record Dataset",
|
40
|
+
"Left Record ID",
|
41
|
+
]
|
42
|
+
links.loc[left_no_duplicates_dataset, id_cols] = links.loc[
|
43
|
+
left_no_duplicates_dataset, switched_id_cols
|
44
|
+
].to_numpy()
|
45
|
+
links[["Left Record ID", "Right Record ID"]] = links[
|
46
|
+
["Left Record ID", "Right Record ID"]
|
47
|
+
].astype(int)
|
48
|
+
|
49
|
+
links["Left Record Key"] = (
|
50
|
+
links["Left Record Dataset"] + "-__-" + links["Left Record ID"].astype(int).astype(str)
|
51
|
+
)
|
52
|
+
links["Right Record Key"] = (
|
53
|
+
links["Right Record Dataset"] + "-__-" + links["Right Record ID"].astype(int).astype(str)
|
54
|
+
)
|
55
|
+
|
56
|
+
links_to_accept = (
|
57
|
+
links[links["Probability"] >= float(os.environ["THRESHOLD_MATCH_PROBABILITY"])]
|
58
|
+
# Pre-emptively break probability ties by right record key for the highest_id method
|
59
|
+
.sort_values(["Probability", "Right Record Key"], ascending=False)
|
60
|
+
# No duplicates in the *right* means only one link per *left* record
|
61
|
+
.groupby(["Left Record Key"]).first()
|
62
|
+
)
|
63
|
+
|
64
|
+
if break_ties_method == "drop":
|
65
|
+
num_tied = (
|
66
|
+
links_to_accept.merge(links, on=["Left Record Key", "Probability"])
|
67
|
+
.groupby(["Left Record Key"])
|
68
|
+
.size()
|
69
|
+
)
|
70
|
+
print("Ties:")
|
71
|
+
print(num_tied)
|
72
|
+
print(num_tied.describe())
|
73
|
+
links_to_accept = links_to_accept[num_tied == 1]
|
74
|
+
elif break_ties_method == "highest_id":
|
75
|
+
# Done above pre-emptively
|
76
|
+
pass
|
77
|
+
else:
|
78
|
+
raise ValueError(f"Unknown break_ties_method {break_ties_method}")
|
79
|
+
|
80
|
+
# NOTE: We only include nodes involved in an accepted link in our cluster.
|
81
|
+
# If a node isn't involved in an accepted link, that could just represent
|
82
|
+
# that we haven't evaluated the right pairs involving it, not confidence that
|
83
|
+
# it is a singleton.
|
84
|
+
G = nx.from_pandas_edgelist(
|
85
|
+
links_to_accept.reset_index()[["Left Record Key", "Right Record Key"]].rename(
|
86
|
+
columns={"Left Record Key": "source", "Right Record Key": "target"}
|
87
|
+
)
|
88
|
+
)
|
89
|
+
|
90
|
+
# Compute connected components
|
91
|
+
components = list(nx.connected_components(G))
|
92
|
+
|
93
|
+
# Assign new cluster IDs
|
94
|
+
merged_data = []
|
95
|
+
for cluster_id, records in enumerate(components, start=1):
|
96
|
+
for record_key in records:
|
97
|
+
merged_data.append((record_key, cluster_id))
|
98
|
+
|
99
|
+
# Build the final DataFrame
|
100
|
+
merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
|
101
|
+
|
102
|
+
merged_df[["Input Record Dataset", "Input Record ID"]] = (
|
103
|
+
merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
|
104
|
+
if not merged_df.empty
|
105
|
+
else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
|
106
|
+
)
|
107
|
+
merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
|
108
|
+
|
109
|
+
merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]].to_parquet(output_path)
|
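The final clustering step in the new script is a plain connected-components pass over the accepted links. A small self-contained illustration (toy record keys, not package data)::

    import networkx as nx
    import pandas as pd

    # Accepted links become edges; each connected component becomes one cluster,
    # so several left records may share a single right (no-duplicates) record.
    edges = pd.DataFrame(
        {
            "source": ["census-__-1", "census-__-2"],
            "target": ["reference-__-10", "reference-__-10"],
        }
    )
    G = nx.from_pandas_edgelist(edges)
    print([sorted(c) for c in nx.connected_components(G)])
    # [['census-__-1', 'census-__-2', 'reference-__-10']]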
easylink/steps/cascading/update_clusters_by_connected_components.py
CHANGED
@@ -59,7 +59,18 @@ new_clusters_df = load_file(new_clusters_filepath)
 
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
-    combined_df = pd.concat(
+    combined_df = pd.concat(
+        [
+            # Ensure cluster names are unique
+            known_clusters_df.assign(
+                **{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}
+            ),
+            new_clusters_df.assign(
+                **{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}
+            ),
+        ],
+        ignore_index=True,
+    )
     combined_df["Input Record Key"] = (
         combined_df["Input Record Dataset"]
         + "-__-"
@@ -92,9 +103,11 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Build the final DataFrame
     merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
 
-    merged_df[["Input Record Dataset", "Input Record ID"]] =
-        "Input Record Key"
-
+    merged_df[["Input Record Dataset", "Input Record ID"]] = (
+        merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+        if not merged_df.empty
+        else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+    )
 
     merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
 
easylink/steps/default/default_determining_exclusions.py
CHANGED
@@ -72,7 +72,7 @@ if len(clusters_df) > 0:
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/default/default_removing_records.py
CHANGED
@@ -52,7 +52,7 @@ results_dir.mkdir(exist_ok=True, parents=True)
 dataset = load_file(dataset_path)
 ids_to_remove = load_file(ids_filepath)
 
-dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])]
 
 output_path = results_dir / Path(dataset_path).name
 logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
easylink/steps/splink/splink_blocking_and_filtering.py
CHANGED
@@ -90,12 +90,18 @@ blocked_pairs = (
     .drop(columns=["match_key"])
 )
 
-blocked_pairs[["Left Record Dataset", "Left Record ID"]] =
-    "join_key_l"
-
-
-
-
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = (
+    blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
+)
+
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = (
+    blocked_pairs.pop("join_key_r").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Right Record Dataset", "Right Record ID"])
+)
+
 blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
     ["Left Record ID", "Right Record ID"]
 ].astype(int)
easylink/steps/splink/splink_evaluating_pairs.py
CHANGED
@@ -35,6 +35,7 @@ for block_dir in blocks_dir.iterdir():
             comparisons.append(cl.LevenshteinAtThresholds(column))
         else:
             raise ValueError(f"Unknown comparison method {method}")
+    # TODO: check both datasets contain all the columns
 
     # Create the Splink linker in dedupe mode
     settings = SettingsCreator(
@@ -135,7 +136,7 @@ for block_dir in blocks_dir.iterdir():
 
     sqls = predict_from_comparison_vectors_sqls_using_settings(
         linker._settings_obj,
-        float(os.
+        float(os.getenv("THRESHOLD_MATCH_PROBABILITY", 0)),
         threshold_match_weight=None,
         sql_infinity_expression=linker._infinity_expression,
     )
easylink/steps/splink/splink_links_to_clusters.py
CHANGED
@@ -53,6 +53,8 @@ cc = (
 # Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
 cc[["Input Record Dataset", "Input Record ID"]] = (
     cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+    if not cc.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
 )
 cc = cc.drop(columns=["Record Key"])
 cc["Input Record ID"] = cc["Input Record ID"].astype(int)
easylink/utilities/general_utils.py
CHANGED
@@ -97,24 +97,34 @@ def _add_logging_sink(
         Whether the logs should be converted to JSON before they're dumped
         to the logging sink.
     """
-
-
-
-
+
+    def format_message(record):
+        elapsed_seconds = int(record["elapsed"].total_seconds())
+        hours = elapsed_seconds // 3600
+        minutes = (elapsed_seconds % 3600) // 60
+        seconds = elapsed_seconds % 60
+        elapsed_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+        time_str = record["time"].strftime("%Y-%m-%d %H:%M:%S")
+
+        if colorize:
+            return f"\033[32m{time_str}\033[0m | \033[32m{elapsed_str}\033[0m | {record['message']}\n"
+        else:
+            return f"{time_str} | {elapsed_str} | {record['message']}\n"
+
     if verbose == 0:
         logger.add(
             sink,
-            colorize=
+            colorize=False,  # We handle colors in format_message
             level="INFO",
-            format=
+            format=format_message,
             serialize=serialize,
         )
     elif verbose >= 1:
         logger.add(
             sink,
-            colorize=
+            colorize=False,  # We handle colors in format_message
             level="DEBUG",
-            format=
+            format=format_message,
             serialize=serialize,
         )
 
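The elapsed-time formatting added in format_message is plain integer arithmetic; for example, 3725 elapsed seconds renders as 01:02:05::

    elapsed_seconds = 3725
    hours = elapsed_seconds // 3600           # 1
    minutes = (elapsed_seconds % 3600) // 60  # 2
    seconds = elapsed_seconds % 60            # 5
    print(f"{hours:02d}:{minutes:02d}:{seconds:02d}")  # 01:02:05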
easylink/utilities/validation_utils.py
CHANGED
@@ -341,8 +341,8 @@ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
 def validate_ids_to_remove(filepath: str) -> None:
     """Validates a file containing IDs to remove.
 
-    - The file must contain a single column: "Record ID".
-    - "Record ID" must have unique values.
+    - The file must contain a single column: "Input Record ID".
+    - "Input Record ID" must have unique values.
 
     Parameters
     ----------
@@ -352,13 +352,13 @@ def validate_ids_to_remove(filepath: str) -> None:
     Raises
     ------
     LookupError
-        If the file is missing the "Record ID" column.
+        If the file is missing the "Input Record ID" column.
     ValueError
-        If the "Record ID" column is not unique.
+        If the "Input Record ID" column is not unique.
     """
-    _validate_required_columns(filepath, {"Record ID"})
+    _validate_required_columns(filepath, {"Input Record ID"})
     df = _read_file(filepath)
-    _validate_unique_column(df, "Record ID", filepath)
+    _validate_unique_column(df, "Input Record ID", filepath)
 
 
 def validate_records(filepath: str) -> None:
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easylink
-Version: 0.1.24
+Version: 0.2.0
 Summary: Research repository for the EasyLink ER ecosystem project.
 Home-page: https://github.com/ihmeuw/easylink
 Author: The EasyLink developers
@@ -78,34 +78,50 @@ Installation
 
 .. _installation:
 
+**NOTE: This package requires AMD64 CPU architecture - it is not compatible with
+Apple's ARM64 architecture (e.g. M1 and newer Macs).**
+
 There are a few things to install in order to use this package:
 
--
+- Set up Linux.
+
+  Singularity (and thus EasyLink) requires Linux to run. If you are not already
+  using Linux, you will need to set up a virtual machine; refer to the
+  `Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
 
-
-
-
-  ``singularity --version``. For an existing installation, your
+- Install Singularity.
+
+  First check if you already have Singularity installed by running the command
+  ``singularity --version``. For an existing installation, your Singularity version
   number is printed.
 
+  If Singularity is not yet installed, you will need to install it;
+  refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
+
+  Note that this requires administrator privileges; you may need to request installation
+  from your system admin if you are working in a shared computing environment.
+
 - Install conda.
 
   We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
   check if you already have conda installed by running the command ``conda --version``.
   For an existing installation, a version will be displayed.
 
--
+- Create a conda environment with python and graphviz installed.
+
+  ::
+
+    $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
+    $ conda activate easylink
+
+- Install easylink in the environment.
 
 Option 1 - Install from PyPI with pip::
 
-    $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
-    $ conda activate easylink
     $ pip install easylink
 
 Option 2 - Build from source with pip::
 
-    $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
-    $ conda activate easylink
     $ pip install git+https://github.com/ihmeuw/easylink.git
 
 .. _end_installation:
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/RECORD
CHANGED
@@ -1,34 +1,38 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
-easylink/cli.py,sha256=
+easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
+easylink/cli.py,sha256=3Xoqclhn7mEHzuqyuVUjt67-V3Fqu0_Jr3B3lCdIuAg,10704
 easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
 easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
-easylink/implementation_metadata.yaml,sha256=
+easylink/implementation_metadata.yaml,sha256=ahuSVk5Ur1V0F8EsLZO5apkNC2bWv2RsytNaiWGo9Yk,12562
 easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
 easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
 easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
-easylink/runner.py,sha256=
+easylink/runner.py,sha256=h39MbWHgTs-VwkPxk76186si76e8UTf1hySqepqUSS8,13155
 easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
 easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
 easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
 easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
 easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
+easylink/steps/cascading/accept_all_pairs.def,sha256=kwZMF3H0mqCBcO1Y2parJXFBLp4e9bLQoVIYU7zZ8xY,486
+easylink/steps/cascading/accept_all_pairs.py,sha256=eF_rmqcZtL3vI1u-TJejOcKX2Qou-AbaLI7qAAGjoGI,703
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
-easylink/steps/cascading/exclude_clustered.py,sha256=
+easylink/steps/cascading/exclude_clustered.py,sha256=T60deNb91_ZFg5K190G-Q7BC5EYrEdLPhFEK7Togv0Y,3048
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
-easylink/steps/cascading/exclude_none.py,sha256=
+easylink/steps/cascading/exclude_none.py,sha256=DesKAO-UcPqKKtUS92OHU25YDXMJLiBEcGLk69UYWDk,2481
+easylink/steps/cascading/one_to_many_links_to_clusters.def,sha256=BVFusUydsV3hY1en16OVr3TPqzwst-cEVBwvb8dtpqA,534
+easylink/steps/cascading/one_to_many_links_to_clusters.py,sha256=7QSJxW3mmR3LIjWBzzgi3vcOsmoYOsiSJn6iYGppHLA,3789
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=jhpMgewztCXrRxBw2FnH2HjIybpp7GcHe4kjTMgQOyg,4059
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
-easylink/steps/default/default_determining_exclusions.py,sha256=
+easylink/steps/default/default_determining_exclusions.py,sha256=RpYHFAral4uYevgiOsYqUHYgsEIejV5NhYdQ3q7VeU0,2635
 easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
-easylink/steps/default/default_removing_records.py,sha256=
+easylink/steps/default/default_removing_records.py,sha256=I_xGdWftlwP7H8HdxfblSG2YFVqA986KOECVwMCn4fk,1925
 easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
 easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
 easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
@@ -70,22 +74,22 @@ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4Ag
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
 easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
-easylink/steps/splink/splink_blocking_and_filtering.py,sha256=
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=3WMBmNEECB9Kxu4D6PAesZzBrhHTdpFEgvnGPsV4bww,5475
 easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
-easylink/steps/splink/splink_evaluating_pairs.py,sha256=
+easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
-easylink/steps/splink/splink_links_to_clusters.py,sha256=
+easylink/steps/splink/splink_links_to_clusters.py,sha256=Brpy3ZKSBpBUeOitg1ZgDvuMVwILH0QBkLXRJN8LXno,2015
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
 easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
-easylink/utilities/general_utils.py,sha256=
+easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaAYr5wRU,5033
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
 easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
-easylink/utilities/validation_utils.py,sha256=
-easylink-0.
-easylink-0.
-easylink-0.
-easylink-0.
-easylink-0.
-easylink-0.
+easylink/utilities/validation_utils.py,sha256=1naksMPStw_xIOqskX6DE99f16Y7eCcVF9I5ZILjMvI,18453
+easylink-0.2.0.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.2.0.dist-info/METADATA,sha256=HxtOiOMe9hTRcK6HL6sLTTQNeP9X7hrhiodTpEMUeOA,4218
+easylink-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.2.0.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.2.0.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.2.0.dist-info/RECORD,,
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/WHEEL
File without changes
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/entry_points.txt
File without changes
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE
File without changes
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/top_level.txt
File without changes