easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +24 -3
- easylink/configuration.py +43 -36
- easylink/devtools/implementation_creator.py +71 -22
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -29
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -5
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +11 -7
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.17.dist-info/RECORD +0 -55
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0

easylink/steps/cascading/exclude_clustered.py
@@ -0,0 +1,76 @@
+# STEP_NAME: determining_exclusions
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+# Exclude records that have been clustered
+clusters_df = load_file(clusters_filepath)
+dataset_df = load_file(dataset_path)
+clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
+    clusters_df["Input Record ID"].unique()
+)
+
+IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
+IDS_TO_REMOVE.to_parquet(results_filepath)
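
A minimal sketch of what this step computes, using toy frames in place of the parquet inputs (the column names come from the script above; the data values are invented for illustration):

    import pandas as pd

    # Hypothetical stand-ins for an input dataset and the known_clusters file
    dataset_df = pd.DataFrame({"Record ID": [1, 2, 3]})
    clusters_df = pd.DataFrame({"Input Record ID": [2, 3, 9], "Cluster ID": [1, 1, 2]})

    # The same intersection the script takes: records already assigned to a known cluster
    clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
        clusters_df["Input Record ID"].unique()
    )
    ids_to_remove = pd.DataFrame({"Record ID": sorted(clustered_record_ids)})
    print(ids_to_remove)  # Record IDs 2 and 3 are flagged for removal
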

easylink/steps/cascading/exclude_none.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./exclude_none.py /exclude_none.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /exclude_none.py '$@'
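
These `.def` files are Singularity/Apptainer build recipes: each one copies the step script into the image, creates the directories EasyLink mounts at runtime, and pins the Python dependencies at build time. A recipe like this would typically be built and run with `apptainer build exclude_none.sif exclude_none.def` and `apptainer run exclude_none.sif` (standard Apptainer commands, not taken from this diff). Note that `%runscript` wraps `$@` in single quotes, so the script receives the literal string `$@` rather than forwarded arguments; that appears harmless here, since these scripts read all of their inputs from environment variables.
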

easylink/steps/cascading/exclude_none.py
@@ -0,0 +1,76 @@
+# STEP_NAME: determining_exclusions
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+clusters_df = load_file(clusters_filepath)
+
+# don't need to actually load the dataset,
+# since we will just save an empty ids_to_remove
+
+# SAVE OUTPUTS
+
+IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
+IDS_TO_REMOVE.to_parquet(results_filepath)

easylink/steps/cascading/update_clusters_by_connected_components.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./update_clusters_by_connected_components.py /update_clusters_by_connected_components.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /update_clusters_by_connected_components.py '$@'

easylink/steps/cascading/update_clusters_by_connected_components.py
@@ -0,0 +1,101 @@
+# STEP_NAME: updating_clusters
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml networkx
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import networkx as nx
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# NEW_CLUSTERS_FILE_PATH is a path to a single file
+new_clusters_filepath = os.environ["NEW_CLUSTERS_FILE_PATH"]
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+known_clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(known_clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(known_clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+known_clusters_filepath = known_clusters_filepaths[0]
+known_clusters_df = load_file(known_clusters_filepath)
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
+
+new_clusters_df = load_file(new_clusters_filepath)
+
+
+def merge_clusters(known_clusters_df, new_clusters_df):
+    # Combine both dataframes
+    combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
+
+    # Drop records with missing cluster IDs
+    combined_df = combined_df.dropna(subset=["Cluster ID"])
+
+    # Group by Cluster ID to get connected records
+    cluster_groups = combined_df.groupby("Cluster ID")["Input Record ID"].apply(list)
+
+    # Build a graph of all connections implied by cluster IDs
+    G = nx.Graph()
+    for group in cluster_groups:
+        for i in range(len(group)):
+            for j in range(i + 1, len(group)):
+                G.add_edge(group[i], group[j])
+
+    # Add isolated nodes (records with unique clusters)
+    all_ids = set(combined_df["Input Record ID"])
+    G.add_nodes_from(all_ids)
+
+    # Compute connected components
+    components = list(nx.connected_components(G))
+
+    # Assign new cluster IDs
+    merged_data = []
+    for cluster_id, records in enumerate(components, start=1):
+        for record_id in records:
+            merged_data.append((record_id, cluster_id))
+
+    # Build the final DataFrame
+    merged_df = pd.DataFrame(merged_data, columns=["Input Record ID", "Cluster ID"])
+
+    return merged_df
+
+
+output_df = merge_clusters(known_clusters_df, new_clusters_df)
+
+logging.info(
+    f"Writing output for dataset from input {new_clusters_filepath} to {results_filepath}"
+)
+output_df.to_parquet(results_filepath)
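
`merge_clusters` treats every shared cluster ID as a set of edges and then takes connected components, so any two clusters that overlap on a record are unioned. A toy run of the same logic (frames invented for illustration):

    import networkx as nx
    import pandas as pd

    known = pd.DataFrame({"Input Record ID": ["a", "b"], "Cluster ID": [1, 1]})
    new = pd.DataFrame({"Input Record ID": ["b", "c"], "Cluster ID": [7, 7]})

    combined = pd.concat([known, new], ignore_index=True).dropna(subset=["Cluster ID"])
    G = nx.Graph()
    G.add_nodes_from(combined["Input Record ID"])
    for _, ids in combined.groupby("Cluster ID")["Input Record ID"]:
        ids = list(ids)
        G.add_edges_from(
            (ids[i], ids[j]) for i in range(len(ids)) for j in range(i + 1, len(ids))
        )

    # "b" appears in both clusters, so a, b, and c collapse into one component
    print(list(nx.connected_components(G)))  # [{'a', 'b', 'c'}]
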

easylink/steps/default/default_clusters_to_links.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_clusters_to_links.py /default_clusters_to_links.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_clusters_to_links.py '$@'

easylink/steps/default/default_clusters_to_links.py
@@ -0,0 +1,91 @@
+# STEP_NAME: clusters_to_links
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# code example from pipeline schema docs
+def clusters_to_links(clusters_df):
+    # Merge the dataframe with itself on Cluster ID to get all pairs within each cluster
+    merged = clusters_df.merge(
+        clusters_df,
+        on="Cluster ID",
+        suffixes=("_left", "_right"),
+    )
+
+    # Compare tuples row-wise to keep only unique pairs (left < right)
+    mask = (merged["Input Record Dataset_left"] < merged["Input Record Dataset_right"]) | (
+        (merged["Input Record Dataset_left"] == merged["Input Record Dataset_right"])
+        & (merged["Input Record ID_left"] < merged["Input Record ID_right"])
+    )
+    filtered = merged[mask]
+
+    # Build the output DataFrame
+    links_df = filtered[
+        [
+            "Input Record Dataset_left",
+            "Input Record ID_left",
+            "Input Record Dataset_right",
+            "Input Record ID_right",
+        ]
+    ].copy()
+    links_df.columns = [
+        "Left Record Dataset",
+        "Left Record ID",
+        "Right Record Dataset",
+        "Right Record ID",
+    ]
+    links_df["Probability"] = 1.0
+    return links_df


+# LOAD INPUTS and SAVE OUTPUTS
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+clusters_df = load_file(clusters_filepath)
+links_df = clusters_to_links(clusters_df)
+logging.info(
+    f"Writing output for dataset from input {clusters_filepath} to {results_filepath}"
+)
+links_df.to_parquet(results_filepath)
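
Applied to a toy clustering, the self-merge on `Cluster ID` followed by the (dataset, ID) ordering mask keeps exactly one row per unordered pair, so singleton clusters produce no links (data invented for illustration):

    import pandas as pd

    clusters_df = pd.DataFrame(
        {
            "Input Record Dataset": ["A", "B", "A"],
            "Input Record ID": [1, 1, 2],
            "Cluster ID": [1, 1, 2],
        }
    )
    merged = clusters_df.merge(clusters_df, on="Cluster ID", suffixes=("_left", "_right"))
    mask = (merged["Input Record Dataset_left"] < merged["Input Record Dataset_right"]) | (
        (merged["Input Record Dataset_left"] == merged["Input Record Dataset_right"])
        & (merged["Input Record ID_left"] < merged["Input Record ID_right"])
    )
    # Exactly one link survives: (A, 1) <-> (B, 1); singleton cluster 2 contributes none
    print(merged[mask][["Input Record ID_left", "Input Record ID_right"]])
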

easylink/steps/default/default_determining_exclusions.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_determining_exclusions.py /default_determining_exclusions.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_determining_exclusions.py '$@'

easylink/steps/default/default_determining_exclusions.py
@@ -0,0 +1,81 @@
+# STEP_NAME: determining_exclusions
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+clusters_df = load_file(clusters_filepath)
+
+if len(clusters_df) > 0:
+    raise ValueError(
+        "Default implementation of determining_exclusions passed a non-empty set of known clusters"
+    )
+
+# don't need to actually load the dataset,
+# since we will just save an empty ids_to_remove
+
+# SAVE OUTPUTS
+
+IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
+IDS_TO_REMOVE.to_parquet(results_filepath)

easylink/steps/default/default_removing_records.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_removing_records.py /default_removing_records.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_removing_records.py '$@'

easylink/steps/default/default_removing_records.py
@@ -0,0 +1,59 @@
+# STEP_NAME: removing_records
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters.parquet" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# IDS_TO_REMOVE_FILE_PATH is a single filepath (Cloneable section)
+ids_filepath = os.environ["IDS_TO_REMOVE_FILE_PATH"]
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+results_dir.mkdir(exist_ok=True, parents=True)
+
+dataset = load_file(dataset_path)
+ids_to_remove = load_file(ids_filepath)
+
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+
+output_path = results_dir / Path(dataset_path).name
+logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+dataset.to_parquet(output_path)
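
One detail worth flagging: `ids_to_remove` is a DataFrame, and pandas' `Series.isin` treats a DataFrame argument as an iterable of its column labels, so the filter above only excludes records whose ID happens to equal the string "Record ID". A sketch of the presumably intended row-level filter, passing the column rather than the frame:

    import pandas as pd

    dataset = pd.DataFrame({"Record ID": [1, 2, 3]})
    ids_to_remove = pd.DataFrame({"Record ID": [2]})

    # Membership test against the actual IDs, not the frame's column labels
    dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Record ID"])]
    print(dataset)  # Record IDs 1 and 3 remain
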

easylink/steps/default/default_schema_alignment.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_schema_alignment.py /default_schema_alignment.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_schema_alignment.py '$@'

easylink/steps/default/default_schema_alignment.py
@@ -0,0 +1,53 @@
+# STEP_NAME: schema_alignment
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# DATASETS_DIR_PATHS is list of directories
+dataset_dirs = os.environ["DATASETS_DIR_PATHS"].split(",")
+
+datasets = {}
+
+for dir in dataset_dirs:
+    for root, dirs, files in os.walk(dir):
+        for file in files:
+            if file.startswith("."):
+                continue
+            datasets[Path(file).stem] = load_file(os.path.join(root, file))
+
+records = pd.concat(
+    [df.assign(**{"Input Record Dataset": dataset}) for dataset, df in datasets.items()],
+    ignore_index=True,
+    sort=False,
+)
+
+records = records.rename(columns={"Record ID": "Input Record ID"})
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single filepath
+output_path = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+logging.info(f"Writing output to {output_path}")
+records.to_parquet(output_path)
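
The alignment step simply stacks every input dataset, tags each row with the dataset it came from, and renames the ID column. A toy equivalent (frames invented for illustration; the real inputs are the parquet files found under `DATASETS_DIR_PATHS`):

    import pandas as pd

    datasets = {
        "input_file_1": pd.DataFrame({"Record ID": [1, 2], "name": ["ann", "bob"]}),
        "input_file_2": pd.DataFrame({"Record ID": [1], "name": ["cal"]}),
    }
    records = pd.concat(
        [df.assign(**{"Input Record Dataset": name}) for name, df in datasets.items()],
        ignore_index=True,
        sort=False,
    ).rename(columns={"Record ID": "Input Record ID"})
    print(records)  # three rows, each tagged with its source dataset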