easylink 0.1.25__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as published to a supported registry, and is provided for informational purposes only.
- easylink/_version.py +1 -1
- easylink/cli.py +8 -3
- easylink/implementation_metadata.yaml +62 -38
- easylink/runner.py +66 -10
- easylink/steps/cascading/accept_all_pairs.def +22 -0
- easylink/steps/cascading/accept_all_pairs.py +26 -0
- easylink/steps/cascading/exclude_clustered.py +11 -2
- easylink/steps/cascading/exclude_none.py +1 -1
- easylink/steps/cascading/one_to_many_links_to_clusters.def +22 -0
- easylink/steps/cascading/one_to_many_links_to_clusters.py +109 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +17 -4
- easylink/steps/default/default_determining_exclusions.py +1 -1
- easylink/steps/default/default_removing_records.py +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +12 -6
- easylink/steps/splink/splink_links_to_clusters.py +2 -0
- easylink/utilities/validation_utils.py +6 -6
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/METADATA +1 -1
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/RECORD +22 -18
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/WHEEL +0 -0
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.25"
+__version__ = "0.2.0"
easylink/cli.py
CHANGED
@@ -211,9 +211,14 @@ def run(
             images_dir=images,
             schema_name=schema,
         )
-    except SystemExit:
-        # Snakemake uses SystemExit
-
+    except SystemExit as e:
+        # Snakemake uses SystemExit with exit code 0 for success, non-zero for failure
+        if e.code == 0:
+            logger.info("\033[32m*** FINISHED ***\033[0m")  # Green
+        else:
+            logger.error(
+                f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m"
+            )  # Red
         raise
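Note: the new handler distinguishes success from failure by inspecting the exit code Snakemake passes to SystemExit. A minimal standalone sketch of the same pattern (illustrative only, not the package's actual code, which logs through its configured logger):

    import sys

    def run_pipeline() -> None:
        sys.exit(0)  # stand-in for Snakemake, which exits 0 on success

    try:
        run_pipeline()
    except SystemExit as e:
        if e.code == 0:
            print("\033[32m*** FINISHED ***\033[0m")  # green
        else:
            print(f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m")  # red
        raise  # re-raise so the process still exits with Snakemake's code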
|
easylink/implementation_metadata.yaml
CHANGED
@@ -2,7 +2,7 @@ step_1_python_pandas:
   steps:
     - step_1
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
   steps:
     - step_1a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
   steps:
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
   steps:
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
   steps:
     - step_3
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
   steps:
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
   steps:
     - step_5
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
   steps:
     - step_6
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
   steps:
     - step_4a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
   steps:
     - step_4b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -106,7 +106,7 @@ step_4b_r:
   steps:
     - step_4b
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
   steps:
     - step_1
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
   steps:
     - step_2
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
   steps:
     - step_3
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
   steps:
     - step_4
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   env:
@@ -158,7 +158,7 @@ step_1_r:
   steps:
     - step_1
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -168,7 +168,7 @@ step_2_r:
   steps:
     - step_2
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -178,7 +178,7 @@ step_3_r:
   steps:
     - step_3
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -188,7 +188,7 @@ step_4_r:
   steps:
     - step_4
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
     - step_3
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
     - step_1a
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -261,8 +261,8 @@ default_removing_records:
   steps:
     - removing_records
   image_name: default_removing_records.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: 05123136e756bfa57f1d7d5a3315f2f6
   script_cmd: python /default_removing_records.py
   outputs:
     dataset: dataset
@@ -270,7 +270,7 @@ default_clusters_to_links:
   steps:
     - clusters_to_links
   image_name: default_clusters_to_links.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 0d00d1272bd8193f60727791097aa065
   script_cmd: python /default_clusters_to_links.py
   outputs:
@@ -279,8 +279,8 @@ default_determining_exclusions:
   steps:
     - determining_exclusions
   image_name: default_determining_exclusions.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: f4e9f740d8dd7599bfbb2b9eb54ced38
   script_cmd: python /default_determining_exclusions.py
   outputs:
     ids_to_remove: result.parquet
@@ -288,7 +288,7 @@ default_updating_clusters:
   steps:
     - updating_clusters
   image_name: default_updating_clusters.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: cc6bd29e099c2523347fa04545aa35c9
   script_cmd: python /default_updating_clusters.py
   outputs:
@@ -299,7 +299,7 @@ save_clusters:
   steps:
     - canonicalizing_and_downstream_analysis
   image_name: save_clusters.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 384ab2be668cbadc45160a674f621022
   script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
   outputs:
@@ -310,7 +310,7 @@ no_pre-processing:
   steps:
     - pre-processing
   image_name: no_pre-processing.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9a9c080cf145078152501cf96bf61f27
   script_cmd: python /dummy_pre-processing.py
   outputs:
@@ -319,7 +319,7 @@ default_schema_alignment:
   steps:
     - schema_alignment
   image_name: default_schema_alignment.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 3166587f9cfec478b999a17074d628f7
   script_cmd: python /default_schema_alignment.py
   outputs:
@@ -328,8 +328,8 @@ splink_blocking_and_filtering:
   steps:
     - blocking_and_filtering
   image_name: splink_blocking_and_filtering.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: 3f8777c5751d7550762be078d87e7db2
   script_cmd: python /splink_blocking_and_filtering.py
   outputs:
     blocks: blocks
@@ -337,7 +337,7 @@ splink_evaluating_pairs:
   steps:
     - evaluating_pairs
   image_name: splink_evaluating_pairs.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
   script_cmd: python /splink_evaluating_pairs.py
   outputs:
@@ -346,8 +346,8 @@ splink_links_to_clusters:
   steps:
     - links_to_clusters
   image_name: splink_links_to_clusters.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: 81a71aa2ce6544953f3edb88d4ee6ec1
   script_cmd: python /splink_links_to_clusters.py
   outputs:
     clusters: result.parquet
@@ -369,6 +369,8 @@ exclude_clustered:
   steps:
     - determining_exclusions
   image_name: exclude_clustered.sif
+  zenodo_record_id: 15778354
+  md5_checksum: db51f68ea24d114ed2b83a1382b6e6b6
   script_cmd: python /exclude_clustered.py
   outputs:
     ids_to_remove: result.parquet
@@ -376,6 +378,8 @@ exclude_none:
   steps:
     - determining_exclusions
   image_name: exclude_none.sif
+  zenodo_record_id: 15778354
+  md5_checksum: af12b6dde2aace9dab08d352368b16a1
   script_cmd: python /exclude_none.py
   outputs:
     ids_to_remove: result.parquet
@@ -383,6 +387,8 @@ update_clusters_by_connected_components:
   steps:
     - updating_clusters
   image_name: update_clusters_by_connected_components.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 806b0fe86a3306d74391678ed951b054
   script_cmd: python /update_clusters_by_connected_components.py
   outputs:
     clusters: result.parquet
@@ -390,8 +396,26 @@ middle_name_to_initial:
   steps:
     - pre-processing
   image_name: middle_name_to_initial.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 89db9c3318300cda9d538cde08c3c323
   script_cmd: python /middle_name_to_initial.py
   outputs:
     dataset: dataset
+one_to_many_links_to_clusters:
+  steps:
+    - links_to_clusters
+  image_name: one_to_many_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0bf6f0b8663b4c2f99995a2213dc541a
+  script_cmd: python /one_to_many_links_to_clusters.py
+  outputs:
+    clusters: result.parquet
+accept_all_pairs:
+  steps:
+    - evaluating_pairs
+  image_name: accept_all_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: c71c88d159c3d7343ebc39cd37224bd9
+  script_cmd: python /accept_all_pairs.py
+  outputs:
+    links: result.parquet
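Note: every implementation entry now pins its container image to Zenodo record 15778354 and records an md5_checksum, so a downloaded .sif can be verified before use. A sketch of such a check, making no assumptions about EasyLink's internals (verify_image is an illustrative helper, not part of the package):

    import hashlib
    from pathlib import Path

    def verify_image(image_path: Path, expected_md5: str) -> bool:
        digest = hashlib.md5()
        with open(image_path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
                digest.update(chunk)
        return digest.hexdigest() == expected_md5

    # e.g. verify_image(Path("python_pandas.sif"), "9177b8e168fcc9cae91bf61265f2185c")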
easylink/runner.py
CHANGED
@@ -11,6 +11,8 @@ be called from the ``easylink.cli`` module.
 import os
 import socket
 import subprocess
+import threading
+import time
 from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 
@@ -161,6 +163,32 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
         def __init__(self, log_file_path: Path):
             self.log_file = open(log_file_path, "w")
             self.buffer = ""
+            self.last_output_time = time.time()
+            self.heartbeat_timer = None
+            self.dots_printed = False  # Track if we've printed progress dots
+            self._start_heartbeat()
+
+        def _start_heartbeat(self):
+            """Start a timer that prints progress dots during long-running containers."""
+
+            def heartbeat():
+                current_time = time.time()
+                if current_time - self.last_output_time > 30:  # 30 seconds since last output
+                    # Print a dot to show progress - use original stdout if available
+                    if hasattr(self, "original_stdout") and self.original_stdout:
+                        self.original_stdout.write(".")
+                        self.original_stdout.flush()
+                        self.dots_printed = True  # Mark that we've printed dots
+                    self.last_output_time = current_time
+                # Schedule next heartbeat
+                self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+                self.heartbeat_timer.daemon = True
+                self.heartbeat_timer.start()
+
+            # Start first heartbeat after 30 seconds
+            self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+            self.heartbeat_timer.daemon = True
+            self.heartbeat_timer.start()
 
         def write(self, text: str) -> int:
             # Write to log file
@@ -172,9 +200,19 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             while "\n" in self.buffer:
                 line, self.buffer = self.buffer.split("\n", 1)
                 if line.strip():
-                    filtered_line = _filter_snakemake_output_simple(line.strip())
+                    filtered_line = _filter_snakemake_output(line.strip())
                     if filtered_line:
+                        # Add newline after dots if we've printed any
+                        if (
+                            self.dots_printed
+                            and hasattr(self, "original_stdout")
+                            and self.original_stdout
+                        ):
+                            self.original_stdout.write("\n")
+                            self.original_stdout.flush()
+                            self.dots_printed = False  # Reset the flag
                         logger.info(filtered_line)
+                        self.last_output_time = time.time()  # Reset heartbeat timer
 
             return len(text)
 
@@ -182,10 +220,23 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             self.log_file.flush()
 
         def close(self):
+            # Stop heartbeat timer
+            if self.heartbeat_timer:
+                self.heartbeat_timer.cancel()
+
             # Process and log any remaining buffer content
             if self.buffer.strip():
-                filtered_line = _filter_snakemake_output_simple(self.buffer.strip())
+                filtered_line = _filter_snakemake_output(self.buffer.strip())
                 if filtered_line:
+                    # Add newline after dots if we've printed any
+                    if (
+                        self.dots_printed
+                        and hasattr(self, "original_stdout")
+                        and self.original_stdout
+                    ):
+                        self.original_stdout.write("\n")
+                        self.original_stdout.flush()
+                        self.dots_printed = False
                     logger.info(filtered_line)
             self.log_file.close()
 
@@ -196,7 +247,14 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             self.close()
 
     # Create the filtering output handler and ensure the log file is always closed
+    # Save original stdout for progress dots before redirection
+    import sys
+
+    original_stdout = sys.stdout
+
     with FilteringOutput(snakemake_log_file) as filtering_output:
+        # Pass original stdout to filtering output for progress dots
+        filtering_output.original_stdout = original_stdout
         try:
             # Redirect both stdout and stderr to our filtering handler
             with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
@@ -210,9 +268,8 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             raise
 
 
-def _filter_snakemake_output_simple(line: str) -> str | None:
-    """
-    Simple filter for Snakemake output showing only localrules and Job messages.
+def _filter_snakemake_output(line: str) -> str:
+    """Filter for Snakemake output.
 
     Parameters
     ----------
@@ -221,12 +278,11 @@ def _filter_snakemake_output_simple(line: str) -> str | None:
 
     Returns
     -------
-
-        The filtered line for display, or None to suppress the line.
+        The filtered line for display.
     """
     # Skip empty lines
     if not line.strip():
-        return
+        return ""
 
     if line.startswith("localrule "):
         # Show localrule names (without the "localrule" prefix)
@@ -236,10 +292,10 @@ def _filter_snakemake_output_simple(line: str) -> str | None:
         # Show Job messages
         # Extract everything after "Job ##: "
         parts = line.split(":", 1)
-        filtered_line = parts[1].strip() if len(parts) > 1 else None
+        filtered_line = parts[1].strip() if len(parts) > 1 else ""
     else:
         # Suppress everything else
-        filtered_line = None
+        filtered_line = ""
     return filtered_line
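Note: the heartbeat added above re-arms a daemon threading.Timer every 30 seconds and emits a dot only when nothing has been written in the meantime, so long-running containers still show signs of life. The same pattern in isolation (a sketch, not the FilteringOutput class itself):

    import sys
    import threading
    import time

    class Heartbeat:
        def __init__(self, interval: float = 30.0):
            self.interval = interval
            self.last_output_time = time.time()
            self._arm()

        def _arm(self) -> None:
            self.timer = threading.Timer(self.interval, self._beat)
            self.timer.daemon = True  # never keep the process alive
            self.timer.start()

        def _beat(self) -> None:
            if time.time() - self.last_output_time > self.interval:
                sys.stdout.write(".")  # quiet period: show a dot of progress
                sys.stdout.flush()
                self.last_output_time = time.time()
            self._arm()  # schedule the next check

        def notify_output(self) -> None:
            self.last_output_time = time.time()  # real output postpones the dots

        def stop(self) -> None:
            self.timer.cancel()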
|
easylink/steps/cascading/accept_all_pairs.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./accept_all_pairs.py /accept_all_pairs.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /accept_all_pairs.py '$@'
easylink/steps/cascading/accept_all_pairs.py
ADDED
@@ -0,0 +1,26 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow
+
+import os
+from pathlib import Path
+
+import pandas as pd
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+
+    pairs = pd.read_parquet(block_dir / "pairs.parquet")
+
+    all_predictions.append(pairs.assign(Probability=1.0))
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
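Note: accept_all_pairs reads each block's pairs.parquet under BLOCKS_DIR_PATH and emits every pair with Probability 1.0, useful when blocking alone should determine the links. A local smoke test under assumed paths (the /tmp locations and single-block layout are invented for illustration):

    import os
    from pathlib import Path

    import pandas as pd

    blocks = Path("/tmp/blocks")
    (blocks / "block_0").mkdir(parents=True, exist_ok=True)
    pd.DataFrame(
        {
            "Left Record Dataset": ["a"],
            "Left Record ID": [1],
            "Right Record Dataset": ["b"],
            "Right Record ID": [2],
        }
    ).to_parquet(blocks / "block_0" / "pairs.parquet")

    os.environ["BLOCKS_DIR_PATH"] = str(blocks)
    os.environ["DIAGNOSTICS_DIRECTORY"] = "/tmp/diagnostics"
    os.environ["OUTPUT_PATHS"] = "/tmp/results/result.parquet"
    # Running accept_all_pairs.py now writes the single pair with Probability 1.0.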
easylink/steps/cascading/exclude_clustered.py
CHANGED
@@ -62,12 +62,21 @@ clusters_filepath = clusters_filepaths[0]
 
 # Exclude records that have been clustered
 clusters_df = load_file(clusters_filepath)
+# NOTE: We defined "clustered" for these purposes as clustered *with* anything else.
+# Simply putting a record into its own cluster does not indicate to us that it has
+# been sufficiently clustered to ignore.
+cluster_sizes = clusters_df.groupby("Cluster ID").size()
+clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
+clusters_df = clusters_df[clusters_df["size"] > 1]
+
 dataset_df = load_file(dataset_path)
 clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
-    clusters_df["Input Record ID"].unique()
+    clusters_df[clusters_df["Input Record Dataset"] == splitter_choice][
+        "Input Record ID"
+    ].unique()
 )
 
-IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+IDS_TO_REMOVE = pd.DataFrame({"Input Record ID": list(clustered_record_ids)})
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
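Note: the size filter keeps only records whose cluster has more than one member, since a singleton cluster says nothing about whether a record has been resolved. The same filter can be written with groupby().transform("size"), avoiding the intermediate .loc lookup (a sketch on a stand-in frame, not the step's actual code):

    import pandas as pd

    clusters_df = pd.DataFrame(
        {"Input Record ID": [1, 2, 3], "Cluster ID": ["a", "a", "b"]}
    )
    clusters_df = clusters_df[
        clusters_df.groupby("Cluster ID")["Cluster ID"].transform("size") > 1
    ]
    # Records 1 and 2 remain candidates for exclusion; record 3 sits alone in
    # cluster "b" and is not treated as clustered.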
easylink/steps/cascading/exclude_none.py
CHANGED
@@ -67,7 +67,7 @@ clusters_df = load_file(clusters_filepath)
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/one_to_many_links_to_clusters.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./one_to_many_links_to_clusters.py /one_to_many_links_to_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /one_to_many_links_to_clusters.py '$@'
easylink/steps/cascading/one_to_many_links_to_clusters.py
ADDED
@@ -0,0 +1,109 @@
+# STEP_NAME: links_to_clusters
+# REQUIREMENTS: pandas pyarrow networkx
+
+import os
+from pathlib import Path
+
+import networkx as nx
+import pandas as pd
+
+links = pd.read_parquet(os.environ["LINKS_FILE_PATH"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+
+no_duplicates_dataset = os.environ["NO_DUPLICATES_DATASET"]
+break_ties_method = os.getenv("BREAK_TIES_METHOD", "drop")
+
+left_no_duplicates_dataset = links["Left Record Dataset"] == no_duplicates_dataset
+right_no_duplicates_dataset = links["Right Record Dataset"] == no_duplicates_dataset
+
+if (left_no_duplicates_dataset & right_no_duplicates_dataset).any():
+    raise ValueError(
+        f"Provided links include links within the no_duplicates_dataset ({no_duplicates_dataset})"
+    )
+
+if not (left_no_duplicates_dataset | right_no_duplicates_dataset).all():
+    raise ValueError(
+        f"Provided links include links that don't involve the no_duplicates_dataset ({no_duplicates_dataset})"
+    )
+
+# Get the no-duplicates dataset all on the right
+id_cols = [
+    "Left Record Dataset",
+    "Left Record ID",
+    "Right Record Dataset",
+    "Right Record ID",
+]
+switched_id_cols = [
+    "Right Record Dataset",
+    "Right Record ID",
+    "Left Record Dataset",
+    "Left Record ID",
+]
+links.loc[left_no_duplicates_dataset, id_cols] = links.loc[
+    left_no_duplicates_dataset, switched_id_cols
+].to_numpy()
+links[["Left Record ID", "Right Record ID"]] = links[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+links["Left Record Key"] = (
+    links["Left Record Dataset"] + "-__-" + links["Left Record ID"].astype(int).astype(str)
+)
+links["Right Record Key"] = (
+    links["Right Record Dataset"] + "-__-" + links["Right Record ID"].astype(int).astype(str)
+)
+
+links_to_accept = (
+    links[links["Probability"] >= float(os.environ["THRESHOLD_MATCH_PROBABILITY"])]
+    # Pre-emptively break probability ties by right record key for the highest_id method
+    .sort_values(["Probability", "Right Record Key"], ascending=False)
+    # No duplicates in the *right* means only one link per *left* record
+    .groupby(["Left Record Key"]).first()
+)
+
+if break_ties_method == "drop":
+    num_tied = (
+        links_to_accept.merge(links, on=["Left Record Key", "Probability"])
+        .groupby(["Left Record Key"])
+        .size()
+    )
+    print("Ties:")
+    print(num_tied)
+    print(num_tied.describe())
+    links_to_accept = links_to_accept[num_tied == 1]
+elif break_ties_method == "highest_id":
+    # Done above pre-emptively
+    pass
+else:
+    raise ValueError(f"Unknown break_ties_method {break_ties_method}")
+
+# NOTE: We only include nodes involved in an accepted link in our cluster.
+# If a node isn't involved in an accepted link, that could just represent
+# that we haven't evaluated the right pairs involving it, not confidence that
+# it is a singleton.
+G = nx.from_pandas_edgelist(
+    links_to_accept.reset_index()[["Left Record Key", "Right Record Key"]].rename(
+        columns={"Left Record Key": "source", "Right Record Key": "target"}
+    )
+)
+
+# Compute connected components
+components = list(nx.connected_components(G))
+
+# Assign new cluster IDs
+merged_data = []
+for cluster_id, records in enumerate(components, start=1):
+    for record_key in records:
+        merged_data.append((record_key, cluster_id))
+
+# Build the final DataFrame
+merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
+
+merged_df[["Input Record Dataset", "Input Record ID"]] = (
+    merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+    if not merged_df.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+)
+merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
+
+merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]].to_parquet(output_path)
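Note: a small worked example of the tie-breaking behavior, assuming NO_DUPLICATES_DATASET=census and THRESHOLD_MATCH_PROBABILITY=0.5 (the data is invented for illustration):

    import pandas as pd

    links = pd.DataFrame(
        {
            "Left Record Dataset": ["survey", "survey", "survey"],
            "Left Record ID": [1, 1, 2],
            "Right Record Dataset": ["census", "census", "census"],
            "Right Record ID": [10, 11, 10],
            "Probability": [0.9, 0.9, 0.8],
        }
    )
    # survey-1 is tied between census-10 and census-11 at probability 0.9:
    #   BREAK_TIES_METHOD=drop        -> survey-1 is dropped; cluster {survey-2, census-10}
    #   BREAK_TIES_METHOD=highest_id  -> survey-1 keeps census-11 (the larger right
    #                                    record key); clusters {survey-1, census-11}
    #                                    and {survey-2, census-10}
    # Each survey record links to at most one census record, while a census record
    # may collect several survey records (hence one-to-many).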
easylink/steps/cascading/update_clusters_by_connected_components.py
CHANGED
@@ -59,7 +59,18 @@ new_clusters_df = load_file(new_clusters_filepath)
 
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
-    combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
+    combined_df = pd.concat(
+        [
+            # Ensure cluster names are unique
+            known_clusters_df.assign(
+                **{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}
+            ),
+            new_clusters_df.assign(
+                **{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}
+            ),
+        ],
+        ignore_index=True,
+    )
     combined_df["Input Record Key"] = (
         combined_df["Input Record Dataset"]
         + "-__-"
@@ -92,9 +103,11 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Build the final DataFrame
     merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
 
-    merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
-        "Input Record Key"
-    ].str.split("-__-", n=1, expand=True)
+    merged_df[["Input Record Dataset", "Input Record ID"]] = (
+        merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+        if not merged_df.empty
+        else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+    )
 
     merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
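Note: the emptiness guard here (and in the splink steps below) works around a pandas edge case: splitting an empty Series with expand=True produces a frame without the expected two columns, so the two-column assignment can raise ValueError. A minimal reproduction of the failure mode (behavior may vary slightly across pandas versions):

    import pandas as pd

    df = pd.DataFrame({"Input Record Key": pd.Series([], dtype=str)})
    try:
        df[["Input Record Dataset", "Input Record ID"]] = df[
            "Input Record Key"
        ].str.split("-__-", n=1, expand=True)
    except ValueError as e:
        print(e)  # the split yielded fewer than two columns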
|
easylink/steps/default/default_determining_exclusions.py
CHANGED
@@ -72,7 +72,7 @@ if len(clusters_df) > 0:
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/default/default_removing_records.py
CHANGED
@@ -52,7 +52,7 @@ results_dir.mkdir(exist_ok=True, parents=True)
 dataset = load_file(dataset_path)
 ids_to_remove = load_file(ids_filepath)
 
-dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])]
 
 output_path = results_dir / Path(dataset_path).name
 logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
easylink/steps/splink/splink_blocking_and_filtering.py
CHANGED
@@ -90,12 +90,18 @@ blocked_pairs = (
     .drop(columns=["match_key"])
 )
 
-blocked_pairs[["Left Record Dataset", "Left Record ID"]] = blocked_pairs.pop(
-    "join_key_l"
-).str.split("-__-", n=1, expand=True)
-blocked_pairs[["Right Record Dataset", "Right Record ID"]] = blocked_pairs.pop(
-    "join_key_r"
-).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = (
+    blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
+)
+
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = (
+    blocked_pairs.pop("join_key_r").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Right Record Dataset", "Right Record ID"])
+)
+
 blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
     ["Left Record ID", "Right Record ID"]
 ].astype(int)
easylink/steps/splink/splink_links_to_clusters.py
CHANGED
@@ -53,6 +53,8 @@ cc = (
 # Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
 cc[["Input Record Dataset", "Input Record ID"]] = (
     cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+    if not cc.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
 )
 cc = cc.drop(columns=["Record Key"])
 cc["Input Record ID"] = cc["Input Record ID"].astype(int)
easylink/utilities/validation_utils.py
CHANGED
@@ -341,8 +341,8 @@ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
 def validate_ids_to_remove(filepath: str) -> None:
     """Validates a file containing IDs to remove.
 
-    - The file must contain a single column: "Record ID".
-    - "Record ID" must have unique values.
+    - The file must contain a single column: "Input Record ID".
+    - "Input Record ID" must have unique values.
 
     Parameters
     ----------
@@ -352,13 +352,13 @@ def validate_ids_to_remove(filepath: str) -> None:
     Raises
     ------
     LookupError
-        If the file is missing the "Record ID" column.
+        If the file is missing the "Input Record ID" column.
     ValueError
-        If the "Record ID" column is not unique.
+        If the "Input Record ID" column is not unique.
     """
-    _validate_required_columns(filepath, {"Record ID"})
+    _validate_required_columns(filepath, {"Input Record ID"})
     df = _read_file(filepath)
-    _validate_unique_column(df, "Record ID", filepath)
+    _validate_unique_column(df, "Input Record ID", filepath)
 
 
 def validate_records(filepath: str) -> None:
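Note: validate_ids_to_remove now checks for the renamed "Input Record ID" column, matching what the determining-exclusions steps write. The check amounts to a required-column plus uniqueness test; a self-contained sketch of equivalent logic (the package's own helpers, such as _validate_required_columns, are not shown in this diff):

    import pandas as pd

    def validate_ids_to_remove_df(df: pd.DataFrame, filepath: str) -> None:
        if "Input Record ID" not in df.columns:
            raise LookupError(f"{filepath} is missing the 'Input Record ID' column")
        if df["Input Record ID"].duplicated().any():
            raise ValueError(f"'Input Record ID' in {filepath} is not unique")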
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/RECORD
CHANGED
@@ -1,34 +1,38 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
-easylink/cli.py,sha256=
+easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
+easylink/cli.py,sha256=3Xoqclhn7mEHzuqyuVUjt67-V3Fqu0_Jr3B3lCdIuAg,10704
 easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
 easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
-easylink/implementation_metadata.yaml,sha256=
+easylink/implementation_metadata.yaml,sha256=ahuSVk5Ur1V0F8EsLZO5apkNC2bWv2RsytNaiWGo9Yk,12562
 easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
 easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
 easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
-easylink/runner.py,sha256=
+easylink/runner.py,sha256=h39MbWHgTs-VwkPxk76186si76e8UTf1hySqepqUSS8,13155
 easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
 easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
 easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
 easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
 easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
+easylink/steps/cascading/accept_all_pairs.def,sha256=kwZMF3H0mqCBcO1Y2parJXFBLp4e9bLQoVIYU7zZ8xY,486
+easylink/steps/cascading/accept_all_pairs.py,sha256=eF_rmqcZtL3vI1u-TJejOcKX2Qou-AbaLI7qAAGjoGI,703
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
-easylink/steps/cascading/exclude_clustered.py,sha256=
+easylink/steps/cascading/exclude_clustered.py,sha256=T60deNb91_ZFg5K190G-Q7BC5EYrEdLPhFEK7Togv0Y,3048
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
-easylink/steps/cascading/exclude_none.py,sha256=
+easylink/steps/cascading/exclude_none.py,sha256=DesKAO-UcPqKKtUS92OHU25YDXMJLiBEcGLk69UYWDk,2481
+easylink/steps/cascading/one_to_many_links_to_clusters.def,sha256=BVFusUydsV3hY1en16OVr3TPqzwst-cEVBwvb8dtpqA,534
+easylink/steps/cascading/one_to_many_links_to_clusters.py,sha256=7QSJxW3mmR3LIjWBzzgi3vcOsmoYOsiSJn6iYGppHLA,3789
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=jhpMgewztCXrRxBw2FnH2HjIybpp7GcHe4kjTMgQOyg,4059
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
-easylink/steps/default/default_determining_exclusions.py,sha256=
+easylink/steps/default/default_determining_exclusions.py,sha256=RpYHFAral4uYevgiOsYqUHYgsEIejV5NhYdQ3q7VeU0,2635
 easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
-easylink/steps/default/default_removing_records.py,sha256=
+easylink/steps/default/default_removing_records.py,sha256=I_xGdWftlwP7H8HdxfblSG2YFVqA986KOECVwMCn4fk,1925
 easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
 easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
 easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
@@ -70,11 +74,11 @@ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4Ag
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
 easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
-easylink/steps/splink/splink_blocking_and_filtering.py,sha256=
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=3WMBmNEECB9Kxu4D6PAesZzBrhHTdpFEgvnGPsV4bww,5475
 easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
 easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
-easylink/steps/splink/splink_links_to_clusters.py,sha256=
+easylink/steps/splink/splink_links_to_clusters.py,sha256=Brpy3ZKSBpBUeOitg1ZgDvuMVwILH0QBkLXRJN8LXno,2015
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
 easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
@@ -82,10 +86,10 @@ easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaA
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
 easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
-easylink/utilities/validation_utils.py,sha256=
-easylink-0.1.25.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
-easylink-0.1.25.dist-info/METADATA,sha256=
-easylink-0.1.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-easylink-0.1.25.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
-easylink-0.1.25.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
-easylink-0.1.25.dist-info/RECORD,,
+easylink/utilities/validation_utils.py,sha256=1naksMPStw_xIOqskX6DE99f16Y7eCcVF9I5ZILjMvI,18453
+easylink-0.2.0.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.2.0.dist-info/METADATA,sha256=HxtOiOMe9hTRcK6HL6sLTTQNeP9X7hrhiodTpEMUeOA,4218
+easylink-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.2.0.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.2.0.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.2.0.dist-info/RECORD,,
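Note: each RECORD row is a three-field CSV entry of path, hash, and size, where the hash field is "sha256=" followed by an unpadded urlsafe-base64 digest (per the wheel spec); the RECORD file itself carries no hash, hence its trailing ",,". A parsing sketch:

    import csv
    from io import StringIO

    row = "easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22"
    path, hash_spec, size = next(csv.reader(StringIO(row)))
    algorithm, _, digest = hash_spec.partition("=")
    # algorithm == "sha256"; digest is the file's unpadded urlsafe-base64 hash;
    # size is the file length in bytes.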
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/WHEEL
File without changes
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/entry_points.txt
File without changes
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE
File without changes
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/top_level.txt
File without changes