easylink-0.1.17-py3-none-any.whl → easylink-0.1.19-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (56)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +24 -3
  3. easylink/configuration.py +43 -36
  4. easylink/devtools/implementation_creator.py +71 -22
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -29
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema.py +12 -13
  9. easylink/pipeline_schema_constants/__init__.py +4 -5
  10. easylink/pipeline_schema_constants/main.py +489 -0
  11. easylink/runner.py +11 -7
  12. easylink/step.py +89 -0
  13. easylink/steps/cascading/exclude_clustered.def +22 -0
  14. easylink/steps/cascading/exclude_clustered.py +76 -0
  15. easylink/steps/cascading/exclude_none.def +22 -0
  16. easylink/steps/cascading/exclude_none.py +76 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  18. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  19. easylink/steps/default/default_clusters_to_links.def +22 -0
  20. easylink/steps/default/default_clusters_to_links.py +91 -0
  21. easylink/steps/default/default_determining_exclusions.def +22 -0
  22. easylink/steps/default/default_determining_exclusions.py +81 -0
  23. easylink/steps/default/default_removing_records.def +22 -0
  24. easylink/steps/default/default_removing_records.py +59 -0
  25. easylink/steps/default/default_schema_alignment.def +22 -0
  26. easylink/steps/default/default_schema_alignment.py +53 -0
  27. easylink/steps/default/default_updating_clusters.def +22 -0
  28. easylink/steps/default/default_updating_clusters.py +67 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  30. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  32. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  34. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  35. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  36. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  38. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  40. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  42. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  43. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  44. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  45. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  46. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  47. easylink/utilities/data_utils.py +72 -0
  48. easylink/utilities/paths.py +4 -3
  49. easylink/utilities/validation_utils.py +509 -11
  50. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  51. easylink-0.1.19.dist-info/RECORD +91 -0
  52. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  53. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  54. easylink-0.1.17.dist-info/RECORD +0 -55
  55. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  56. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/steps/default/default_updating_clusters.def
@@ -0,0 +1,22 @@
+
+ Bootstrap: docker
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+ %files
+ ./default_updating_clusters.py /default_updating_clusters.py
+
+ %post
+ # Create directories
+ mkdir -p /input_data
+ mkdir -p /extra_implementation_specific_input_data
+ mkdir -p /results
+ mkdir -p /diagnostics
+
+ # Install Python packages with specific versions
+ pip install pandas==2.1.2 pyarrow pyyaml
+
+ %environment
+ export LC_ALL=C
+
+ %runscript
+ python /default_updating_clusters.py '$@'
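For orientation, a definition file like this is built into an image and then run by the pipeline with configuration passed entirely through environment variables. A minimal sketch of that lifecycle, assuming Apptainer is installed (the --env and --bind flags exist in recent Apptainer/Singularity releases) and using placeholder host paths; the variable names come from default_updating_clusters.py in the next hunk:

import subprocess

# Build the image from the definition file (hypothetical local filenames).
subprocess.run(
    ["apptainer", "build", "default_updating_clusters.sif", "default_updating_clusters.def"],
    check=True,
)

# Run it the way %runscript is exercised: no CLI arguments, just environment
# variables carrying input/output paths, with host directories bind-mounted
# onto the directories created in %post.
subprocess.run(
    [
        "apptainer", "run",
        "--bind", "./data:/input_data",      # hypothetical host directories
        "--bind", "./results:/results",
        "--env", "NEW_CLUSTERS_FILE_PATH=/input_data/new_clusters.parquet",
        "--env", "KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS=/input_data/known_clusters.parquet",
        "--env", "DUMMY_CONTAINER_OUTPUT_PATHS=/results/clusters.parquet",
        "default_updating_clusters.sif",
    ],
    check=True,
)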
easylink/steps/default/default_updating_clusters.py
@@ -0,0 +1,67 @@
+ # STEP_NAME: updating_clusters
+
+ # REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+ # PIPELINE_SCHEMA: main
+
+ import logging
+ import os
+ from pathlib import Path
+
+ import pandas as pd
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def load_file(file_path, file_format=None):
+     logging.info(f"Loading file {file_path} with format {file_format}")
+     if file_format is None:
+         file_format = file_path.split(".")[-1]
+     if file_format == "parquet":
+         return pd.read_parquet(file_path)
+     raise ValueError(f"Unknown file format {file_format}")
+
+
+ # LOAD INPUTS and SAVE OUTPUTS
+
+ # NEW_CLUSTERS_FILE_PATH is a path to a single file
+ new_clusters_filepath = os.environ["NEW_CLUSTERS_FILE_PATH"]
+
+ # KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+ # There is one item in it that is a file with "clusters" in the filename.
+ # That's the only item we're interested in here.
+ # The other items may be there if this is coming from the user's input,
+ # due to our workaround for only having one slot of user input.
+ known_clusters_filepaths = [
+     path
+     for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+     if "clusters" in Path(path).stem
+ ]
+ if len(known_clusters_filepaths) > 1:
+     raise ValueError("Multiple known clusters files found")
+ if len(known_clusters_filepaths) == 0:
+     raise ValueError("No known clusters file found")
+
+ known_clusters_filepath = known_clusters_filepaths[0]
+ known_clusters_df = load_file(known_clusters_filepath)
+
+ if len(known_clusters_df) > 0:
+     raise ValueError(
+         "Default implementation of updating_clusters passed a non-empty set of known clusters"
+     )
+
+ # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
+ results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
+
+ clusters_df = load_file(new_clusters_filepath)
+
+
+ logging.info(
+     f"Writing output for dataset from input {new_clusters_filepath} to {results_filepath}"
+ )
+ clusters_df.to_parquet(results_filepath)
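Because the script is driven entirely by environment variables, it can be smoke-tested outside the container. A sketch, assuming the clusters files use the Input Record Dataset / Input Record ID / Cluster ID columns seen elsewhere in this diff (all filenames and values here are made up):

import os
import subprocess

import pandas as pd

cluster_cols = ["Input Record Dataset", "Input Record ID", "Cluster ID"]

# The default implementation insists the known clusters be empty.
pd.DataFrame(columns=cluster_cols).to_parquet("known_clusters.parquet")
pd.DataFrame(
    [["ds_a", 1, "c1"], ["ds_a", 2, "c1"]], columns=cluster_cols
).to_parquet("new_clusters.parquet")

env = {
    **os.environ,
    "NEW_CLUSTERS_FILE_PATH": "new_clusters.parquet",
    # Only the entry with "clusters" in its stem is used; the other entry
    # mimics the extra user-input files the comment in the script mentions.
    "KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS": "input_file_1.parquet,known_clusters.parquet",
    "DUMMY_CONTAINER_OUTPUT_PATHS": "results/clusters.parquet",
}
subprocess.run(["python", "default_updating_clusters.py"], env=env, check=True)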
easylink/steps/fastLink/fastLink_evaluating_pairs.R
@@ -0,0 +1,136 @@
+ # STEP_NAME: evaluating_pairs
+ # HAS_CUSTOM_RECIPE: true
+ # SCRIPT_BASE_COMMAND: Rscript
+
+ library(fastLink)
+ library(arrow)
+ library(dplyr)
+ library(stringr)
+
+ # Check required environment variables
+ required_env_vars <- c(
+   "BLOCKS_DIR_PATH",
+   "DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY",
+   "DUMMY_CONTAINER_OUTPUT_PATHS",
+   "COMPARISONS",
+   "THRESHOLD_MATCH_PROBABILITY"
+ )
+ missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
+ if (length(missing_vars) > 0) {
+   stop(sprintf(
+     "The following required environment variables are not set or are empty: %s",
+     paste(missing_vars, collapse = ", ")
+   ))
+ }
+
+ blocks_dir <- Sys.getenv("BLOCKS_DIR_PATH")
+ diagnostics_dir <- Sys.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY")
+ output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+ comparisons <- strsplit(Sys.getenv("COMPARISONS"), ",")[[1]]
+
+ all_predictions <- list()
+
+ block_dirs <- list.dirs(blocks_dir, recursive = FALSE, full.names = TRUE)
+
+ for (block_dir in block_dirs) {
+   records_path <- file.path(block_dir, "records.parquet")
+   pairs_path <- file.path(block_dir, "pairs.parquet")
+   if (!file.exists(records_path)) {
+     stop(sprintf("File not found: %s", records_path))
+   }
+   if (!file.exists(pairs_path)) {
+     stop(sprintf("File not found: %s", pairs_path))
+   }
+
+   records <- read_parquet(records_path) %>%
+     mutate(
+       unique_id = paste0(`Input Record Dataset`, "::", `Input Record ID`)
+     )
+   pairs <- read_parquet(pairs_path) %>%
+     mutate(
+       join_key_l = paste0(`Left Record Dataset`, "::", `Left Record ID`),
+       join_key_r = paste0(`Right Record Dataset`, "::", `Right Record ID`)
+     )
+
+   # Subset records to only those that appear in pairs (left and right separately)
+   left_ids <- unique(pairs$join_key_l)
+   right_ids <- unique(pairs$join_key_r)
+   dfA <- records %>% filter(unique_id %in% left_ids)
+   dfB <- records %>% filter(unique_id %in% right_ids)
+
+   # Prepare comparison columns and fastLink match flags
+   comparison_cols <- sapply(comparisons, function(x) strsplit(x, ":")[[1]][1])
+   comparison_methods <- sapply(comparisons, function(x) strsplit(x, ":")[[1]][2])
+
+   stringdist.match <- comparison_cols[comparison_methods == "stringdist" | comparison_methods == "partial"]
+   partial.match <- comparison_cols[comparison_methods == "partial"]
+
+   # Estimate the number of candidate pairs (after subsetting)
+   nA <- nrow(dfA)
+   nB <- nrow(dfB)
+   n_requested_pairs <- nrow(pairs)
+   n_possible_pairs <- nA * nB
+   if (n_possible_pairs > 10 * n_requested_pairs) {
+     warning(sprintf(
+       "fastLink will compute %d candidate pairs, which is more than 10x the %d requested pairs.",
+       n_possible_pairs, n_requested_pairs
+     ))
+   }
+
+   # Trick: add a dummy column to dfB to avoid fastLink's dedupe restriction if dfA and dfB are identical
+   dfB_b <- dfB
+   if (identical(dfA, dfB)) {
+     dfB_b[["__dummy__"]] <- 1
+   }
+
+   # Run fastLink on all pairs (the Cartesian product of the reduced sets)
+   fl_out <- fastLink::fastLink(
+     dfA = dfA,
+     dfB = dfB_b,
+     varnames = comparison_cols,
+     stringdist.match = stringdist.match,
+     partial.match = partial.match,
+     threshold.match = as.numeric(Sys.getenv("THRESHOLD_MATCH_PROBABILITY")),
+     dedupe.matches = FALSE,
+     return.all = TRUE
+   )
+
+   inds_a <- fl_out$matches$inds.a
+   inds_b <- fl_out$matches$inds.b
+   posteriors <- fl_out$posterior
+
+   # Build the matched-pairs dataframe
+   dfA_match <- dfA[inds_a, , drop = FALSE]
+   dfB_match <- dfB[inds_b, , drop = FALSE]
+   matches <- data.frame(
+     join_key_l = dfA_match$unique_id,
+     join_key_r = dfB_match$unique_id,
+     Probability = posteriors
+   )
+
+   # Subset to only the requested pairs
+   matches <- matches %>%
+     semi_join(pairs, by = c("join_key_l", "join_key_r"))
+
+   # Parse dataset and record ID back out of the join keys
+   predictions <- matches %>%
+     transmute(
+       `Left Record Dataset` = str_split_fixed(join_key_l, "::", 2)[, 1],
+       `Left Record ID` = as.integer(str_split_fixed(join_key_l, "::", 2)[, 2]),
+       `Right Record Dataset` = str_split_fixed(join_key_r, "::", 2)[, 1],
+       `Right Record ID` = as.integer(str_split_fixed(join_key_r, "::", 2)[, 2]),
+       Probability
+     )
+
+   all_predictions[[length(all_predictions) + 1]] <- predictions
+
+   # Optionally, save diagnostics (e.g., a histogram of match probabilities)
+   chart_path <- file.path(diagnostics_dir, paste0("match_weights_chart_", basename(block_dir), ".png"))
+   png(chart_path)
+   hist(predictions$Probability, main = "Match Probabilities", xlab = "Probability")
+   dev.off()
+ }
+
+ all_predictions_df <- bind_rows(all_predictions)
+ print(all_predictions_df)
+ write_parquet(all_predictions_df, output_path)
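The loop above expects BLOCKS_DIR_PATH to contain one subdirectory per block, each holding records.parquet and pairs.parquet with the column names used in the R code, and COMPARISONS to be a comma-separated list of column:method entries (methods other than stringdist and partial fall through to fastLink's default exact matching). A sketch of that input layout with invented datasets, IDs, and values:

import os
from pathlib import Path

import pandas as pd

block = Path("blocks") / "block_0"
block.mkdir(parents=True, exist_ok=True)

# records.parquet: one row per record, keyed by dataset + record ID.
pd.DataFrame(
    {
        "Input Record Dataset": ["ds_a", "ds_a", "ds_b"],
        "Input Record ID": [1, 2, 1],
        "first_name": ["jane", "john", "jane"],
    }
).to_parquet(block / "records.parquet")

# pairs.parquet: the candidate pairs to evaluate within this block.
pd.DataFrame(
    {
        "Left Record Dataset": ["ds_a", "ds_a"],
        "Left Record ID": [1, 2],
        "Right Record Dataset": ["ds_b", "ds_b"],
        "Right Record ID": [1, 1],
    }
).to_parquet(block / "pairs.parquet")

os.environ["BLOCKS_DIR_PATH"] = "blocks"
os.environ["COMPARISONS"] = "first_name:stringdist"
os.environ["THRESHOLD_MATCH_PROBABILITY"] = "0.85"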
easylink/steps/fastLink/fastLink_evaluating_pairs.def
@@ -0,0 +1,21 @@
+
+ Bootstrap: docker
+ From: rocker/r2u@sha256:02311f32a3e58b73027a9f572836ed4772fc530c9678b953604d875ea58ddde4
+
+ %files
+ ./fastLink_evaluating_pairs.R /fastLink_evaluating_pairs.R
+
+ %post
+ # Create directories
+ mkdir -p /input_data
+ mkdir -p /extra_implementation_specific_input_data
+ mkdir -p /results
+ mkdir -p /diagnostics
+
+ Rscript -e 'install.packages(c("arrow", "dplyr", "stringr", "fastLink"))'
+
+ %environment
+ export LC_ALL=C
+
+ %runscript
+ Rscript /fastLink_evaluating_pairs.R '$@'
easylink/steps/fastLink/fastLink_links_to_clusters.R
@@ -0,0 +1,128 @@
+ # STEP_NAME: links_to_clusters
+ # HAS_CUSTOM_RECIPE: true
+ # SCRIPT_BASE_COMMAND: Rscript
+
+ options(error = function() traceback(3))
+
+ library(fastLink)
+ library(arrow)
+ library(dplyr)
+ library(stringr)
+
+ # Check required environment variables
+ required_env_vars <- c(
+   "LINKS_FILE_PATH",
+   "DUMMY_CONTAINER_OUTPUT_PATHS",
+   "THRESHOLD_MATCH_PROBABILITY"
+ )
+ missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
+ if (length(missing_vars) > 0) {
+   stop(sprintf(
+     "The following required environment variables are not set or are empty: %s",
+     paste(missing_vars, collapse = ", ")
+   ))
+ }
+
+ links_file_path <- Sys.getenv("LINKS_FILE_PATH")
+ output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+
+ if (!file.exists(links_file_path)) {
+   stop(sprintf("File not found: %s", links_file_path))
+ }
+
+ links <- read_parquet(links_file_path)
+
+ # Filter links by threshold
+ threshold <- as.numeric(Sys.getenv("THRESHOLD_MATCH_PROBABILITY"))
+ links <- links %>%
+   filter(Probability >= threshold)
+
+ if (nrow(links) == 0) {
+   clusters <- data.frame(
+     `Input Record Dataset` = character(),
+     `Input Record ID` = integer(),
+     `Cluster ID` = character(), check.names = FALSE
+   )
+ } else {
+   # NOTE: fastLink's dedupeMatches function is quite tightly coupled with other parts
+   # of the fastLink pipeline.
+   # It takes a bunch of objects, usually created by earlier steps in the pipeline,
+   # which need to have particular formats.
+   # Here, we "spoof" these objects using carefully constructed data.frames in order to
+   # use the linear sum assignment algorithm in isolation (potentially with links not
+   # from fastLink).
+   # ChatGPT helped me write the code below.
+
+   # Combine dataset and record ID for unique keys
+   links <- links %>%
+     mutate(
+       Left_Record_Key = paste0(`Left Record Dataset`, "::", `Left Record ID`),
+       Right_Record_Key = paste0(`Right Record Dataset`, "::", `Right Record ID`)
+     )
+
+   n <- nrow(links)
+
+   # 1) Build matchesA / matchesB using the combined keys without touching `links`
+   matchesA <- data.frame(
+     idA = links[["Left_Record_Key"]],
+     idB = links[["Right_Record_Key"]],
+     prob = links[["Probability"]],
+     stringsAsFactors = FALSE
+   )
+   matchesB <- matchesA  # identical copy
+
+   # 2) Dummy one-to-one pattern key
+   patterns <- data.frame(
+     patID = seq_len(n)
+   )
+
+   # 3) Fake EM object, with dummy gamma columns
+   EM <- list(
+     patterns.w = data.frame(
+       patID = patterns$patID,
+       counts = 1,
+       weights = 1,
+       p.gamma.j.m = 1,
+       p.gamma.j.u = 1
+     ),
+     # convert match probabilities into log-odds scores
+     zeta.j = log(matchesA$prob / (1 - matchesA$prob))
+   )
+
+   # 4) Map each row back to its original record keys
+   matchesLink <- data.frame(
+     inds.a = links[["Left_Record_Key"]],
+     inds.b = links[["Right_Record_Key"]],
+     stringsAsFactors = FALSE
+   )
+
+   # 5) Call dedupeMatches with linear programming
+   deduped <- dedupeMatches(
+     matchesA = matchesA,
+     matchesB = matchesB,
+     EM = EM,
+     matchesLink = matchesLink,
+     patterns = patterns,
+     linprog = TRUE
+   )
+
+   # Parse dataset and record ID back out of the keys for both sides, then bind
+   clusters <- deduped$matchesLink %>%
+     transmute(
+       `Input Record Dataset` = str_split_fixed(inds.a, "::", 2)[, 1],
+       `Input Record ID` = as.integer(str_split_fixed(inds.a, "::", 2)[, 2]),
+       `Cluster ID` = paste0("cluster_", inds.a, "_", inds.b)
+     ) %>%
+     bind_rows(
+       deduped$matchesLink %>%
+         transmute(
+           `Input Record Dataset` = str_split_fixed(inds.b, "::", 2)[, 1],
+           `Input Record ID` = as.integer(str_split_fixed(inds.b, "::", 2)[, 2]),
+           `Cluster ID` = paste0("cluster_", inds.a, "_", inds.b)
+         )
+     )
+ }
+
+ print(clusters)
+ # Write clusters to output
+ arrow::write_parquet(clusters, output_path)
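To make the contract concrete: the script reads a links table, drops pairs below THRESHOLD_MATCH_PROBABILITY, resolves the survivors to a one-to-one assignment via dedupeMatches, and emits one row per record with a shared Cluster ID. A small pandas sketch of the data shapes (values invented; the assignment step itself is what dedupeMatches contributes):

import pandas as pd

# Hypothetical links input, matching the columns the R script reads.
links = pd.DataFrame(
    {
        "Left Record Dataset": ["ds_a", "ds_a"],
        "Left Record ID": [1, 2],
        "Right Record Dataset": ["ds_b", "ds_b"],
        "Right Record ID": [1, 1],
        "Probability": [0.97, 0.40],
    }
)

# Threshold filtering, as in the R script; only the first pair survives here.
links = links[links["Probability"] >= 0.85]

# Each surviving pair then yields two output rows sharing a Cluster ID built
# from the "dataset::id" keys, e.g. cluster_ds_a::1_ds_b::1 for both ds_a/1
# and ds_b/1.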
easylink/steps/fastLink/fastLink_links_to_clusters.def
@@ -0,0 +1,21 @@
+
+ Bootstrap: docker
+ From: rocker/r2u@sha256:02311f32a3e58b73027a9f572836ed4772fc530c9678b953604d875ea58ddde4
+
+ %files
+ ./fastLink_links_to_clusters.R /fastLink_links_to_clusters.R
+
+ %post
+ # Create directories
+ mkdir -p /input_data
+ mkdir -p /extra_implementation_specific_input_data
+ mkdir -p /results
+ mkdir -p /diagnostics
+
+ Rscript -e 'install.packages(c("arrow", "dplyr", "stringr", "fastLink"))'
+
+ %environment
+ export LC_ALL=C
+
+ %runscript
+ Rscript /fastLink_links_to_clusters.R '$@'
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def
@@ -0,0 +1,22 @@
+
+ Bootstrap: docker
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+ %files
+ ./dummy_canonicalizing_and_downstream_analysis.py /dummy_canonicalizing_and_downstream_analysis.py
+
+ %post
+ # Create directories
+ mkdir -p /input_data
+ mkdir -p /extra_implementation_specific_input_data
+ mkdir -p /results
+ mkdir -p /diagnostics
+
+ # Install Python packages with specific versions
+ pip install pandas==2.1.2 pyarrow pyyaml
+
+ %environment
+ export LC_ALL=C
+
+ %runscript
+ python /dummy_canonicalizing_and_downstream_analysis.py '$@'
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py
@@ -0,0 +1,42 @@
+ # STEP_NAME: canonicalizing_and_downstream_analysis
+
+ # REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+ # PIPELINE_SCHEMA: main
+
+ import logging
+ import os
+ from itertools import chain, combinations
+ from pathlib import Path
+
+ import pandas as pd
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def load_file(file_path, file_format=None):
+     logging.info(f"Loading file {file_path} with format {file_format}")
+     if file_format is None:
+         file_format = file_path.split(".")[-1]
+     if file_format == "parquet":
+         return pd.read_parquet(file_path)
+     raise ValueError(f"Unknown file format {file_format}")
+
+
+ # LOAD INPUTS and SAVE OUTPUTS
+
+ # For the dummy implementation, we just load the clusters and write them out as-is
+
+ # CLUSTERS_FILE_PATH is a path to a single file
+ clusters_path = os.environ["CLUSTERS_FILE_PATH"]
+ # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
+ results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+ clusters_df = load_file(clusters_path)
+
+ logging.info(f"Writing output for dataset from input {clusters_path} to {results_filepath}")
+ clusters_df.to_parquet(results_filepath)
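Note that, unlike default_updating_clusters.py, this script does not create the output file's parent directory, so a caller has to. A minimal invocation sketch with placeholder paths:

import os
import subprocess

os.makedirs("results", exist_ok=True)
env = {
    **os.environ,
    "CLUSTERS_FILE_PATH": "clusters.parquet",
    "DUMMY_CONTAINER_OUTPUT_PATHS": "results/results.parquet",
}
subprocess.run(
    ["python", "dummy_canonicalizing_and_downstream_analysis.py"], env=env, check=True
)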