easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +24 -3
- easylink/configuration.py +43 -36
- easylink/devtools/implementation_creator.py +71 -22
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -29
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -5
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +11 -7
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.17.dist-info/RECORD +0 -55
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/steps/default/default_updating_clusters.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_updating_clusters.py /default_updating_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_updating_clusters.py '$@'
easylink/steps/default/default_updating_clusters.py
@@ -0,0 +1,67 @@
+# STEP_NAME: updating_clusters
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# NEW_CLUSTERS_FILE_PATH is a path to a single file
+new_clusters_filepath = os.environ["NEW_CLUSTERS_FILE_PATH"]
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+known_clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(known_clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(known_clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+known_clusters_filepath = known_clusters_filepaths[0]
+known_clusters_df = load_file(known_clusters_filepath)
+
+if len(known_clusters_df) > 0:
+    raise ValueError(
+        "Default implementation of updating_clusters passed a non-empty set of known clusters"
+    )
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
+
+clusters_df = load_file(new_clusters_filepath)
+
+
+logging.info(
+    f"Writing output for dataset from input {new_clusters_filepath} to {results_filepath}"
+)
+clusters_df.to_parquet(results_filepath)
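The script above is driven entirely by environment variables, so it can be exercised outside the container. A minimal sketch of that contract (all paths and column names here are hypothetical; only the clusters schema is taken from elsewhere in this diff):

```python
import os
import subprocess
import pandas as pd

# Hypothetical local paths; in a real run EasyLink bind-mounts these into the container.
pd.DataFrame(
    {"Input Record Dataset": [], "Input Record ID": [], "Cluster ID": []}
).to_parquet("known_clusters.parquet")  # must be empty, or the script raises
pd.DataFrame(
    {"Input Record Dataset": ["ds1"], "Input Record ID": [1], "Cluster ID": [1]}
).to_parquet("new_clusters.parquet")

env = dict(
    os.environ,
    NEW_CLUSTERS_FILE_PATH="new_clusters.parquet",
    # Comma-separated list; only the entry with "clusters" in its stem is loaded,
    # so the dataset entry is filtered out by name and never opened.
    KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS="input_file_1.parquet,known_clusters.parquet",
    DUMMY_CONTAINER_OUTPUT_PATHS="results/clusters.parquet",
)
subprocess.run(["python", "default_updating_clusters.py"], env=env, check=True)
```

The script creates the parent directory of DUMMY_CONTAINER_OUTPUT_PATHS itself, so only the two input parquet files need to exist beforehand.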
easylink/steps/fastLink/fastLink_evaluating_pairs.R
@@ -0,0 +1,136 @@
+# STEP_NAME: evaluating_pairs
+# HAS_CUSTOM_RECIPE: true
+# SCRIPT_BASE_COMMAND: Rscript
+
+library(fastLink)
+library(arrow)
+library(dplyr)
+library(stringr)
+
+# Check required environment variables
+required_env_vars <- c(
+  "BLOCKS_DIR_PATH",
+  "DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY",
+  "DUMMY_CONTAINER_OUTPUT_PATHS",
+  "COMPARISONS",
+  "THRESHOLD_MATCH_PROBABILITY"
+)
+missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
+if (length(missing_vars) > 0) {
+  stop(sprintf(
+    "The following required environment variables are not set or are empty: %s",
+    paste(missing_vars, collapse = ", ")
+  ))
+}
+
+blocks_dir <- Sys.getenv("BLOCKS_DIR_PATH")
+diagnostics_dir <- Sys.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY")
+output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+comparisons <- strsplit(Sys.getenv("COMPARISONS"), ",")[[1]]
+
+all_predictions <- list()
+
+block_dirs <- list.dirs(blocks_dir, recursive = FALSE, full.names = TRUE)
+
+for (block_dir in block_dirs) {
+  records_path <- file.path(block_dir, "records.parquet")
+  pairs_path <- file.path(block_dir, "pairs.parquet")
+  if (!file.exists(records_path)) {
+    stop(sprintf("File not found: %s", records_path))
+  }
+  if (!file.exists(pairs_path)) {
+    stop(sprintf("File not found: %s", pairs_path))
+  }
+
+  records <- read_parquet(records_path) %>%
+    mutate(
+      unique_id = paste0(`Input Record Dataset`, "::", `Input Record ID`)
+    )
+  pairs <- read_parquet(pairs_path) %>%
+    mutate(
+      join_key_l = paste0(`Left Record Dataset`, "::", `Left Record ID`),
+      join_key_r = paste0(`Right Record Dataset`, "::", `Right Record ID`)
+    )
+
+  # Subset records to only those that appear in pairs (left and right separately)
+  left_ids <- unique(pairs$join_key_l)
+  right_ids <- unique(pairs$join_key_r)
+  dfA <- records %>% filter(unique_id %in% left_ids)
+  dfB <- records %>% filter(unique_id %in% right_ids)
+
+  # Prepare comparison columns and fastLink match flags
+  comparison_cols <- sapply(comparisons, function(x) strsplit(x, ":")[[1]][1])
+  comparison_methods <- sapply(comparisons, function(x) strsplit(x, ":")[[1]][2])
+
+  stringdist.match <- comparison_cols[comparison_methods == "stringdist" | comparison_methods == "partial"]
+  partial.match <- comparison_cols[comparison_methods == "partial"]
+
+  # Estimate number of candidate pairs (after subsetting)
+  nA <- nrow(dfA)
+  nB <- nrow(dfB)
+  n_requested_pairs <- nrow(pairs)
+  n_possible_pairs <- nA * nB
+  if (n_possible_pairs > 10 * n_requested_pairs) {
+    warning(sprintf(
+      "fastLink will compute %d candidate pairs, which is more than 10x the %d requested pairs.",
+      n_possible_pairs, n_requested_pairs
+    ))
+  }
+
+  # Trick: add a dummy column to dfB to avoid fastLink dedupe restriction if dfA and dfB are identical
+  dfB_b <- dfB
+  if (identical(dfA, dfB)) {
+    dfB_b[["__dummy__"]] <- 1
+  }
+
+  # Run fastLink on all pairs (Cartesian product of reduced sets)
+  fl_out <- fastLink::fastLink(
+    dfA = dfA,
+    dfB = dfB_b,
+    varnames = comparison_cols,
+    stringdist.match = stringdist.match,
+    partial.match = partial.match,
+    threshold.match = as.numeric(Sys.getenv("THRESHOLD_MATCH_PROBABILITY")),
+    dedupe.matches = FALSE,
+    return.all = TRUE
+  )
+
+  inds_a <- fl_out$matches$inds.a
+  inds_b <- fl_out$matches$inds.b
+  posteriors <- fl_out$posterior
+
+  # Build matched pairs dataframe
+  dfA_match <- dfA[inds_a, , drop = FALSE]
+  dfB_match <- dfB[inds_b, , drop = FALSE]
+  matches <- data.frame(
+    join_key_l = dfA_match$unique_id,
+    join_key_r = dfB_match$unique_id,
+    Probability = posteriors
+  )
+
+  # Subset to only requested pairs
+  matches <- matches %>%
+    semi_join(pairs, by = c("join_key_l", "join_key_r"))
+
+  # Parse out dataset and record ID from join keys
+  predictions <- matches %>%
+    transmute(
+      `Left Record Dataset` = str_split_fixed(join_key_l, "::", 2)[,1],
+      `Left Record ID` = as.integer(str_split_fixed(join_key_l, "::", 2)[,2]),
+      `Right Record Dataset` = str_split_fixed(join_key_r, "::", 2)[,1],
+      `Right Record ID` = as.integer(str_split_fixed(join_key_r, "::", 2)[,2]),
+      Probability
+    )
+
+  all_predictions[[length(all_predictions) + 1]] <- predictions
+
+  # Optionally, save diagnostics (e.g., plot of match probabilities)
+  chart_path <- file.path(diagnostics_dir, paste0("match_weights_chart_", basename(block_dir), ".png"))
+  png(chart_path)
+  hist(predictions$Probability, main = "Match Probabilities", xlab = "Probability")
+  dev.off()
+}
+
+all_predictions_df <- bind_rows(all_predictions)
+print(all_predictions_df)
+write_parquet(all_predictions_df, output_path)
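The R script walks one records.parquet/pairs.parquet pair per subdirectory of BLOCKS_DIR_PATH, and parses COMPARISONS as comma-separated "column:method" entries (methods "stringdist" and "partial" set the corresponding fastLink flags). A minimal sketch of a fixture satisfying that contract; the comparison column, values, and paths are hypothetical, while the record/pair column names follow the script:

```python
import os
import pandas as pd

# Hypothetical fixture matching the layout the script walks:
# $BLOCKS_DIR_PATH/<block>/records.parquet and $BLOCKS_DIR_PATH/<block>/pairs.parquet
block_dir = "blocks/block_0"
os.makedirs(block_dir, exist_ok=True)
os.makedirs("diagnostics", exist_ok=True)

pd.DataFrame(
    {
        "Input Record Dataset": ["ds1", "ds2"],
        "Input Record ID": [1, 1],
        "first_name": ["alice", "alicia"],  # hypothetical comparison column
    }
).to_parquet(f"{block_dir}/records.parquet")

pd.DataFrame(
    {
        "Left Record Dataset": ["ds1"],
        "Left Record ID": [1],
        "Right Record Dataset": ["ds2"],
        "Right Record ID": [1],
    }
).to_parquet(f"{block_dir}/pairs.parquet")

# Environment the script requires; COMPARISONS entries are "column:method".
os.environ.update(
    BLOCKS_DIR_PATH="blocks",
    DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY="diagnostics",
    DUMMY_CONTAINER_OUTPUT_PATHS="predictions.parquet",
    COMPARISONS="first_name:stringdist",
    THRESHOLD_MATCH_PROBABILITY="0.85",
)
```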
easylink/steps/fastLink/fastLink_evaluating_pairs.def
@@ -0,0 +1,21 @@
+
+Bootstrap: docker
+From: rocker/r2u@sha256:02311f32a3e58b73027a9f572836ed4772fc530c9678b953604d875ea58ddde4
+
+%files
+    ./fastLink_evaluating_pairs.R /fastLink_evaluating_pairs.R
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    Rscript -e 'install.packages(c("arrow", "dplyr", "stringr", "fastLink"))'
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    Rscript /fastLink_evaluating_pairs.R '$@'
easylink/steps/fastLink/fastLink_links_to_clusters.R
@@ -0,0 +1,128 @@
+# STEP_NAME: links_to_clusters
+# HAS_CUSTOM_RECIPE: true
+# SCRIPT_BASE_COMMAND: Rscript
+
+options(error = function() traceback(3))
+
+library(fastLink)
+library(arrow)
+library(dplyr)
+library(stringr)
+
+# Check required environment variables
+required_env_vars <- c(
+  "LINKS_FILE_PATH",
+  "DUMMY_CONTAINER_OUTPUT_PATHS",
+  "THRESHOLD_MATCH_PROBABILITY"
+)
+missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
+if (length(missing_vars) > 0) {
+  stop(sprintf(
+    "The following required environment variables are not set or are empty: %s",
+    paste(missing_vars, collapse = ", ")
+  ))
+}
+
+links_file_path <- Sys.getenv("LINKS_FILE_PATH")
+output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+
+if (!file.exists(links_file_path)) {
+  stop(sprintf("File not found: %s", links_file_path))
+}
+
+links <- read_parquet(links_file_path)
+
+# Filter links by threshold
+threshold <- as.numeric(Sys.getenv("THRESHOLD_MATCH_PROBABILITY"))
+links <- links %>%
+  filter(Probability >= threshold)
+
+if (nrow(links) == 0) {
+  clusters <- data.frame(
+    `Input Record Dataset` = character(),
+    `Input Record ID` = character(),
+    `Cluster ID` = integer()
+  )
+} else {
+  # NOTE: fastLink's dedupeMatches function is quite tightly coupled with other parts
+  # of the fastLink pipeline.
+  # It takes a bunch of objects, usually created by earlier steps in the pipeline,
+  # which need to have particular formats.
+  # Here, we "spoof" these objects using carefully constructed data.frames in order to
+  # use the linear sum assignment algorithm in isolation (potentially with links not
+  # from fastLink).
+  # ChatGPT helped me write the below code.
+
+  # Combine dataset and record ID for unique keys
+  links <- links %>%
+    mutate(
+      Left_Record_Key = paste0(`Left Record Dataset`, "::", `Left Record ID`),
+      Right_Record_Key = paste0(`Right Record Dataset`, "::", `Right Record ID`)
+    )
+
+  n <- nrow(links)
+
+  # 1) Build matchesA / matchesB using the combined keys without touching `links`
+  matchesA <- data.frame(
+    idA = links[["Left_Record_Key"]],
+    idB = links[["Right_Record_Key"]],
+    prob = links[["Probability"]],
+    stringsAsFactors = FALSE
+  )
+  matchesB <- matchesA # identical copy
+
+  # 2) Dummy one-to-one pattern key
+  patterns <- data.frame(
+    patID = seq_len(n)
+  )
+
+  # 3) Fake EM object, with dummy gamma columns
+  EM <- list(
+    patterns.w = data.frame(
+      patID = patterns$patID,
+      counts = 1,
+      weights = 1,
+      p.gamma.j.m = 1,
+      p.gamma.j.u = 1
+    ),
+    # convert match-probabilities into log-odds scores
+    zeta.j = log(matchesA$prob / (1 - matchesA$prob))
+  )
+
+  # 4) Map each row back to its original record keys
+  matchesLink <- data.frame(
+    inds.a = links[["Left_Record_Key"]],
+    inds.b = links[["Right_Record_Key"]],
+    stringsAsFactors = FALSE
+  )
+
+  # 5) Call dedupeMatches with linear programming
+  deduped <- dedupeMatches(
+    matchesA = matchesA,
+    matchesB = matchesB,
+    EM = EM,
+    matchesLink = matchesLink,
+    patterns = patterns,
+    linprog = TRUE
+  )
+
+  # Parse out dataset and record ID from keys for both sides, then bind
+  clusters <- deduped$matchesLink %>%
+    transmute(
+      `Input Record Dataset` = str_split_fixed(inds.a, "::", 2)[,1],
+      `Input Record ID` = as.integer(str_split_fixed(inds.a, "::", 2)[,2]),
+      `Cluster ID` = paste0("cluster_", inds.a, "_", inds.b)
+    ) %>%
+    bind_rows(
+      deduped$matchesLink %>%
+        transmute(
+          `Input Record Dataset` = str_split_fixed(inds.b, "::", 2)[,1],
+          `Input Record ID` = as.integer(str_split_fixed(inds.b, "::", 2)[,2]),
+          `Cluster ID` = paste0("cluster_", inds.a, "_", inds.b)
+        )
+    )
+}
+
+print(clusters)
+# Write clusters to output
+arrow::write_parquet(clusters, output_path)
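The "spoofed" objects above exist only to reach fastLink's linear-sum-assignment deduplication with externally supplied links. The same one-to-one selection can be written directly; here is a sketch using scipy's linear_sum_assignment (a Hungarian-algorithm solver, not used by this package and shown only as an analogue; the keys and probabilities are hypothetical):

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

links = [  # (left key, right key, posterior probability)
    ("dsA::1", "dsB::1", 0.95),
    ("dsA::1", "dsB::2", 0.90),
    ("dsA::2", "dsB::2", 0.85),
]
lefts = sorted({l for l, _, _ in links})
rights = sorted({r for _, r, _ in links})
li = {k: i for i, k in enumerate(lefts)}
ri = {k: i for i, k in enumerate(rights)}

# Score matrix: log-odds for observed links (mirroring zeta.j above),
# a large negative score for pairs that were never linked.
score = np.full((len(lefts), len(rights)), -1e9)
for l, r, p in links:
    score[li[l], ri[r]] = np.log(p / (1 - p))

# Maximize total log-odds subject to each record matching at most once,
# then drop assignments that land on non-links.
rows, cols = linear_sum_assignment(score, maximize=True)
kept = [(lefts[i], rights[j]) for i, j in zip(rows, cols) if score[i, j] > -1e9]
print(kept)  # [('dsA::1', 'dsB::1'), ('dsA::2', 'dsB::2')]
```

Note how dsA::1 keeps its higher-probability link to dsB::1, freeing dsB::2 for dsA::2, which is exactly the conflict resolution dedupeMatches(linprog = TRUE) performs.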
easylink/steps/fastLink/fastLink_links_to_clusters.def
@@ -0,0 +1,21 @@
+
+Bootstrap: docker
+From: rocker/r2u@sha256:02311f32a3e58b73027a9f572836ed4772fc530c9678b953604d875ea58ddde4
+
+%files
+    ./fastLink_links_to_clusters.R /fastLink_links_to_clusters.R
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    Rscript -e 'install.packages(c("arrow", "dplyr", "stringr", "fastLink"))'
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    Rscript /fastLink_links_to_clusters.R '$@'
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./dummy_canonicalizing_and_downstream_analysis.py /dummy_canonicalizing_and_downstream_analysis.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /dummy_canonicalizing_and_downstream_analysis.py '$@'
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py
@@ -0,0 +1,42 @@
+# STEP_NAME: canonicalizing_and_downstream_analysis
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from itertools import chain, combinations
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# For dummy we will load the clusters and output (only) them as-is
+
+# CLUSTERS_FILE_PATH is a path to a single file
+clusters_path = os.environ["CLUSTERS_FILE_PATH"]
+# DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+clusters_df = load_file(clusters_path)
+
+logging.info(f"Writing output for dataset from input {clusters_path} to {results_filepath}")
+clusters_df.to_parquet(results_filepath)