easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- easylink/_version.py +1 -1
- easylink/cli.py +24 -3
- easylink/configuration.py +43 -36
- easylink/devtools/implementation_creator.py +71 -22
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -29
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -5
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +11 -7
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.17.dist-info/RECORD +0 -55
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/steps/rl-dummy/input_data/input_file_1.parquet
ADDED
Binary file
easylink/steps/rl-dummy/input_data/input_file_2.parquet
ADDED
Binary file
easylink/steps/rl-dummy/input_data/known_clusters.parquet
ADDED
Binary file
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./dummy_pre-processing.py /dummy_pre-processing.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /dummy_pre-processing.py '$@'
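This definition file, like the other step containers in this release, copies the step script into the image, creates the standard /input_data, /extra_implementation_specific_input_data, /results, and /diagnostics directories, pins the Python dependencies, and invokes the script from %runscript. A minimal build sketch, assuming Apptainer is available locally (EasyLink distributes pre-built images; the .sif name here is illustrative):

# Hypothetical local build of the definition file above; assumes Apptainer is
# installed. Running the image additionally requires the environment variables
# that the step script reads (see the script that follows).
import subprocess

subprocess.run(
    ["apptainer", "build", "dummy_pre-processing.sif", "dummy_pre-processing.def"],
    check=True,
)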
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py
ADDED
@@ -0,0 +1,59 @@
+# STEP_NAME: pre-processing
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# DATASET_DIR_PATHS is list of directories, each containing one file
+dataset_paths = os.environ["DATASET_DIR_PATHS"].split(",")
+logging.info(f"{dataset_paths=}")
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+logging.info(f"splitter_choice={splitter_choice}")
+dataset_path = None
+for path in dataset_paths:
+    path = Path(path)
+    # NOTE: We iterate the dir here, but it should only have one non-hidden
+    # file in it. We don't validate that here as it is checked in the validator.
+    for path_to_check in path.iterdir():
+        if path_to_check.stem == splitter_choice:
+            dataset_path = str(path_to_check)
+            break
+
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+results_dir.mkdir(exist_ok=True, parents=True)
+
+output_path = results_dir / Path(dataset_path).name
+
+dataset = load_file(dataset_path)
+
+# NOTE: No actual pre-processing here, we save as-is.
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+dataset.to_parquet(output_path)
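The script is driven entirely by environment variables rather than command-line arguments. A hedged sketch of a local smoke test, with all paths and the dataset name invented for illustration (in a real run EasyLink sets these variables inside the container):

# Hypothetical smoke test for dummy_pre-processing.py; every path and the
# dataset name below are placeholders, not values shipped with the package.
import os
import subprocess

env = dict(
    os.environ,
    DATASET_DIR_PATHS="/tmp/input_data/file1,/tmp/input_data/file2",
    INPUT_DATASET="input_file_1",
    DUMMY_CONTAINER_OUTPUT_PATHS="/tmp/results/dataset",
)
subprocess.run(["python", "dummy_pre-processing.py"], env=env, check=True)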
easylink/steps/splink/splink_blocking_and_filtering.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./splink_blocking_and_filtering.py /splink_blocking_and_filtering.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow splink==4.0.7
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /splink_blocking_and_filtering.py '$@'
easylink/steps/splink/splink_blocking_and_filtering.py
ADDED
@@ -0,0 +1,130 @@
+# STEP_NAME: blocking_and_filtering
+# REQUIREMENTS: pandas pyarrow splink==4.0.7
+
+import os
+
+import pandas as pd
+
+records = pd.read_parquet(os.environ["RECORDS_FILE_PATH"])
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+import splink
+
+blocking_rules = os.environ["BLOCKING_RULES"].split(",")
+
+from splink import Linker, SettingsCreator
+
+# Create the Splink linker in dedupe mode
+settings = SettingsCreator(
+    link_type="link_and_dedupe",
+    blocking_rules_to_generate_predictions=blocking_rules,
+    comparisons=[],
+)
+from splink import DuckDBAPI
+
+grouped = records.rename(columns={"Input Record ID": "unique_id"}).groupby(
+    "Input Record Dataset"
+)
+
+db_api = DuckDBAPI()
+linker = Linker(
+    [df for _, df in grouped],
+    settings,
+    db_api=db_api,
+    input_table_aliases=[name for name, _ in grouped],
+)
+
+# Copied/adapted from https://github.com/moj-analytical-services/splink/blob/3eb1921eaff6b8471d3ebacd3238eb514f62c844/splink/internals/linker_components/inference.py#L86-L131
+from splink.internals.pipeline import CTEPipeline
+from splink.internals.vertically_concatenate import compute_df_concat_with_tf
+
+pipeline = CTEPipeline()
+
+# In duckdb, calls to random() in a CTE pipeline cause problems:
+# https://gist.github.com/RobinL/d329e7004998503ce91b68479aa41139
+df_concat_with_tf = compute_df_concat_with_tf(linker, pipeline)
+pipeline = CTEPipeline([df_concat_with_tf])
+
+blocking_input_tablename_l = "__splink__df_concat_with_tf"
+blocking_input_tablename_r = "__splink__df_concat_with_tf"
+
+link_type = linker._settings_obj._link_type
+
+# If exploded blocking rules exist, we need to materialise
+# the tables of ID pairs
+from splink.internals.blocking import materialise_exploded_id_tables
+
+exploding_br_with_id_tables = materialise_exploded_id_tables(
+    link_type=link_type,
+    blocking_rules=linker._settings_obj._blocking_rules_to_generate_predictions,
+    db_api=linker._db_api,
+    splink_df_dict=linker._input_tables_dict,
+    source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
+    unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
+)
+
+from splink.internals.blocking import block_using_rules_sqls
+
+sqls = block_using_rules_sqls(
+    input_tablename_l=blocking_input_tablename_l,
+    input_tablename_r=blocking_input_tablename_r,
+    blocking_rules=linker._settings_obj._blocking_rules_to_generate_predictions,
+    link_type=link_type,
+    source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
+    unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
+)
+
+pipeline.enqueue_list_of_sqls(sqls)
+
+blocked_pairs = (
+    linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    .as_pandas_dataframe()
+    .drop(columns=["match_key"])
+)
+
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = blocked_pairs.pop(
+    "join_key_l"
+).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = blocked_pairs.pop(
+    "join_key_r"
+).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+# Now ensure correct ordering
+wrong_order_dataset = (
+    blocked_pairs["Left Record Dataset"] > blocked_pairs["Right Record Dataset"]
+)
+id_cols = ["Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"]
+switched_id_cols = [
+    "Right Record Dataset",
+    "Right Record ID",
+    "Left Record Dataset",
+    "Left Record ID",
+]
+blocked_pairs.loc[wrong_order_dataset, id_cols] = blocked_pairs.loc[
+    wrong_order_dataset, switched_id_cols
+].values
+
+wrong_order_ids = (
+    blocked_pairs["Left Record Dataset"] == blocked_pairs["Right Record Dataset"]
+) & (blocked_pairs["Left Record ID"] > blocked_pairs["Right Record ID"])
+blocked_pairs.loc[wrong_order_ids, id_cols] = blocked_pairs.loc[
+    wrong_order_ids, switched_id_cols
+].values
+blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+print(blocked_pairs)
+
+from pathlib import Path
+
+output_path = Path(results_dir) / "block_0"
+output_path.mkdir(exist_ok=True, parents=True)
+
+records.to_parquet(output_path / "records.parquet", index=False)
+blocked_pairs.to_parquet(output_path / "pairs.parquet", index=False)
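The blocking step reads a single records parquet plus a comma-separated list of Splink blocking rules from the environment and writes records.parquet and pairs.parquet into a block_0 subdirectory of the output directory. A hedged sketch of the inputs it expects; the required column names come from the script, while the blocking rule, the extra column, and the paths are invented:

# Hypothetical inputs for splink_blocking_and_filtering.py; only the required
# column names are taken from the script, everything else is illustrative.
import os

import pandas as pd

records = pd.DataFrame(
    {
        "Input Record Dataset": ["input_file_1", "input_file_2"],
        "Input Record ID": [1, 1],
        "first_name": ["alice", "alice"],
    }
)
records.to_parquet("/tmp/records.parquet")

os.environ["RECORDS_FILE_PATH"] = "/tmp/records.parquet"
# Rules are comma-separated, so an individual rule cannot contain a comma.
os.environ["BLOCKING_RULES"] = "l.first_name = r.first_name"
os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"] = "/tmp/blocks"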
easylink/steps/splink/splink_evaluating_pairs.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./splink_evaluating_pairs.py /splink_evaluating_pairs.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow splink==4.0.7
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /splink_evaluating_pairs.py '$@'
easylink/steps/splink/splink_evaluating_pairs.py
ADDED
@@ -0,0 +1,164 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow splink==4.0.7
+
+import os
+from pathlib import Path
+
+import pandas as pd
+import splink
+import splink.comparison_library as cl
+from splink import Linker, SettingsCreator
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+    encoded_comparisons = os.environ["COMPARISONS"].split(",")
+
+    comparisons = []
+    for encoded_comparison in encoded_comparisons:
+        column, method = encoded_comparison.split(":")
+        if method == "exact":
+            comparisons.append(cl.ExactMatch(column))
+        elif method == "name":
+            comparisons.append(cl.NameComparison(column))
+        elif method == "dob":
+            comparisons.append(cl.DateOfBirthComparison(column))
+        else:
+            raise ValueError(f"Unknown comparison method {method}")
+
+    # Create the Splink linker in dedupe mode
+    settings = SettingsCreator(
+        link_type="link_and_dedupe",
+        blocking_rules_to_generate_predictions=[],
+        comparisons=comparisons,
+        probability_two_random_records_match=float(
+            os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"]
+        ),
+    )
+
+    grouped = (
+        pd.read_parquet(block_dir / "records.parquet")
+        .rename(columns={"Input Record ID": "unique_id"})
+        .groupby("Input Record Dataset")
+    )
+
+    from splink import DuckDBAPI
+
+    db_api = DuckDBAPI()
+    linker = Linker(
+        [df for _, df in grouped],
+        settings,
+        db_api=db_api,
+        input_table_aliases=[name for name, _ in grouped],
+    )
+
+    linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
+
+    blocking_rules_for_training = os.environ["BLOCKING_RULES_FOR_TRAINING"].split(",")
+
+    for blocking_rule_for_training in blocking_rules_for_training:
+        linker.training.estimate_parameters_using_expectation_maximisation(
+            blocking_rule_for_training
+        )
+
+    chart_path = diagnostics_dir / f"match_weights_chart_{block_dir}.html"
+    chart_path.parent.mkdir(exist_ok=True, parents=True)
+    linker.visualisations.match_weights_chart().save(chart_path)
+
+    # Copied/adapted from https://github.com/moj-analytical-services/splink/blob/3eb1921eaff6b8471d3ebacd3238eb514f62c844/splink/internals/linker_components/inference.py#L264-L293
+    from splink.internals.pipeline import CTEPipeline
+    from splink.internals.vertically_concatenate import compute_df_concat_with_tf
+
+    pipeline = CTEPipeline()
+
+    # In duckdb, calls to random() in a CTE pipeline cause problems:
+    # https://gist.github.com/RobinL/d329e7004998503ce91b68479aa41139
+    pairs = (
+        pd.read_parquet(block_dir / "pairs.parquet")
+        .assign(
+            join_key_l=lambda df: df["Left Record Dataset"]
+            + "-__-"
+            + df["Left Record ID"].astype(int).astype(str),
+            join_key_r=lambda df: df["Right Record Dataset"]
+            + "-__-"
+            + df["Right Record ID"].astype(int).astype(str),
+        )
+        .drop(
+            columns=[
+                "Left Record Dataset",
+                "Left Record ID",
+                "Right Record Dataset",
+                "Right Record ID",
+            ]
+        )
+        .assign(match_key=0)
+    )  # What is this?
+    db_api._table_registration(pairs, "__splink__blocked_id_pairs")
+    df_concat_with_tf = compute_df_concat_with_tf(linker, pipeline)
+    pipeline = CTEPipeline(
+        [
+            db_api.table_to_splink_dataframe(
+                "__splink__blocked_id_pairs", "__splink__blocked_id_pairs"
+            ),
+            df_concat_with_tf,
+        ]
+    )
+
+    from splink.internals.comparison_vector_values import (
+        compute_comparison_vector_values_from_id_pairs_sqls,
+    )
+
+    sqls = compute_comparison_vector_values_from_id_pairs_sqls(
+        linker._settings_obj._columns_to_select_for_blocking,
+        linker._settings_obj._columns_to_select_for_comparison_vector_values,
+        input_tablename_l="__splink__df_concat_with_tf",
+        input_tablename_r="__splink__df_concat_with_tf",
+        source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
+        unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
+    )
+    pipeline.enqueue_list_of_sqls(sqls)
+
+    from splink.internals.predict import (
+        predict_from_comparison_vectors_sqls_using_settings,
+    )
+
+    sqls = predict_from_comparison_vectors_sqls_using_settings(
+        linker._settings_obj,
+        float(os.environ["THRESHOLD_MATCH_PROBABILITY"]),
+        threshold_match_weight=None,
+        sql_infinity_expression=linker._infinity_expression,
+    )
+    pipeline.enqueue_list_of_sqls(sqls)
+
+    predictions = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+    linker._predict_warning()
+
+    all_predictions.append(predictions.as_pandas_dataframe())
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)[
+    [
+        "source_dataset_l",
+        "unique_id_l",
+        "source_dataset_r",
+        "unique_id_r",
+        "match_probability",
+    ]
+].rename(
+    columns={
+        "source_dataset_l": "Left Record Dataset",
+        "unique_id_l": "Left Record ID",
+        "source_dataset_r": "Right Record Dataset",
+        "unique_id_r": "Right Record ID",
+        "match_probability": "Probability",
+    }
+)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
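The comparison configuration is packed into a single COMPARISONS variable as comma-separated column:method pairs, where method must be one of exact, name, or dob. A hedged sketch of the full environment this script reads; the variable names come from the script and all values are placeholders:

# Hypothetical environment for splink_evaluating_pairs.py; variable names are
# taken from the script above, values are placeholders only.
import os

os.environ["BLOCKS_DIR_PATH"] = "/tmp/blocks"
os.environ["DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY"] = "/tmp/diagnostics"
os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"] = "/tmp/results/links.parquet"
os.environ["COMPARISONS"] = "first_name:name,last_name:name,dob:dob,zipcode:exact"
os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"] = "0.0001"
os.environ["BLOCKING_RULES_FOR_TRAINING"] = "l.dob = r.dob"
os.environ["THRESHOLD_MATCH_PROBABILITY"] = "0.8"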
easylink/steps/splink/splink_links_to_clusters.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./splink_links_to_clusters.py /splink_links_to_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow splink==4.0.7
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /splink_links_to_clusters.py '$@'
easylink/steps/splink/splink_links_to_clusters.py
ADDED
@@ -0,0 +1,63 @@
+# STEP_NAME: links_to_clusters
+# REQUIREMENTS: pandas pyarrow splink==4.0.7
+
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+# Adapted from example on https://moj-analytical-services.github.io/splink/api_docs/clustering.html
+from splink import DuckDBAPI
+from splink.clustering import cluster_pairwise_predictions_at_threshold
+
+links = pd.read_parquet(os.environ["LINKS_FILE_PATH"]).rename(
+    columns={
+        "Probability": "match_probability",
+    }
+)
+
+# Create unique record keys by concatenating Input Record Dataset and Record ID for both left and right
+links["Left Record Key"] = (
+    links["Left Record Dataset"].astype(str) + "-__-" + links["Left Record ID"].astype(str)
+)
+links["Right Record Key"] = (
+    links["Right Record Dataset"].astype(str) + "-__-" + links["Right Record ID"].astype(str)
+)
+
+dummy_records_df = pd.DataFrame(
+    {
+        "Record Key": np.unique(
+            list(links["Left Record Key"]) + list(links["Right Record Key"])
+        )
+    }
+)
+output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+
+db_api = DuckDBAPI()
+
+cc = (
+    cluster_pairwise_predictions_at_threshold(
+        dummy_records_df,
+        links,
+        node_id_column_name='"Record Key"',
+        edge_id_column_name_left='"Left Record Key"',
+        edge_id_column_name_right='"Right Record Key"',
+        db_api=db_api,
+        threshold_match_probability=float(os.environ["THRESHOLD_MATCH_PROBABILITY"]),
+    )
+    .as_pandas_dataframe()
+    .rename(columns={"cluster_id": "Cluster ID"})
+)
+
+# Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
+cc[["Input Record Dataset", "Input Record ID"]] = (
+    cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+)
+cc = cc.drop(columns=["Record Key"])
+cc["Input Record ID"] = cc["Input Record ID"].astype(int)
+cc = cc[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
+
+print(cc)
+
+cc.to_parquet(output_path)
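The clustering step expects a links parquet with left/right dataset and ID columns plus a Probability column, and writes a three-column clusters parquet (Input Record Dataset, Input Record ID, Cluster ID). A hedged sketch of a minimal input; the column names match what the script reads, the row values are made up:

# Hypothetical minimal links file for splink_links_to_clusters.py; column names
# match the script, the single row is invented.
import pandas as pd

links = pd.DataFrame(
    {
        "Left Record Dataset": ["input_file_1"],
        "Left Record ID": [1],
        "Right Record Dataset": ["input_file_2"],
        "Right Record ID": [2],
        "Probability": [0.95],
    }
)
links.to_parquet("/tmp/links.parquet")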
easylink/utilities/data_utils.py
CHANGED
@@ -8,13 +8,17 @@ This module contains utility functions for handling data files and directories.
 
 """
 
+import hashlib
 import os
 import shutil
 from collections.abc import Callable
 from datetime import datetime
 from pathlib import Path
 
+import requests
 import yaml
+from loguru import logger
+from tqdm import tqdm
 
 
 def modify_umask(func: Callable) -> Callable:
@@ -152,3 +156,71 @@ def load_yaml(filepath: str | Path) -> dict:
     with open(filepath, "r") as file:
         data = yaml.safe_load(file)
     return data
+
+
+@modify_umask
+def download_image(
+    images_dir: str | Path, record_id: int, filename: str, md5_checksum: str
+) -> None:
+    """Downloads an image from zenodo.
+
+    Parameters
+    ----------
+    images_dir
+        The directory to download the image to.
+    record_id
+        The zenodo record ID that the image is a part of.
+    filename
+        The name of the image file to download.
+    md5_checksum
+        The expected MD5 checksum of the image file.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the image file was not downloaded.
+    ValueError
+        If the MD5 checksum of the downloaded file does not match the expected checksum.
+    """
+
+    images_dir = Path(images_dir).resolve()
+    if not images_dir.exists():
+        images_dir.mkdir(parents=True, exist_ok=True)
+
+    url = f"https://zenodo.org/record/{record_id}/files/{filename}?download=1"
+
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("Content-Length", 0))
+    output_path = images_dir / filename
+    logger.info(f"Downloading {filename} to {output_path}...")
+    with open(output_path, "wb") as file, tqdm(
+        total=total_size, unit="B", unit_scale=True, desc=filename
+    ) as progress_bar:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                file.write(chunk)
+                progress_bar.update(len(chunk))
+
+    if not output_path.exists():
+        raise FileNotFoundError(f"Failed to download the image: {filename}")
+
+    # Verify MD5 checksum
+    calculated_md5_checksum = calculate_md5_checksum(output_path)
+    if calculated_md5_checksum != md5_checksum:
+        raise ValueError(
+            f"MD5 checksum does not match for {filename}.\n"
+            f"Try manually downloading the image and then moving it to the {images_dir} directory.\n"
+            f"Download the image by visiting this link: {url}"
+        )
+
+
+def calculate_md5_checksum(output_path: Path) -> str:
+    md5_hash = hashlib.md5()
+    with open(output_path, "rb") as file:
+        while chunk := file.read(8192):
+            md5_hash.update(chunk)
+
+    calculated_md5_checksum = md5_hash.hexdigest()
+    return calculated_md5_checksum
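The new download_image helper fetches a container image from a Zenodo record and verifies it against an expected MD5 checksum, raising if the file is missing or the checksum does not match. A hedged usage sketch; the record ID, filename, and checksum are placeholders, not real Zenodo metadata:

# Hypothetical call to the new helper; record_id, filename, and md5_checksum
# are placeholders, not real Zenodo metadata.
from easylink.utilities.data_utils import download_image
from easylink.utilities.paths import DEFAULT_IMAGES_DIR

download_image(
    images_dir=DEFAULT_IMAGES_DIR,
    record_id=1234567,
    filename="splink_evaluating_pairs.sif",
    md5_checksum="0123456789abcdef0123456789abcdef",
)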
easylink/utilities/paths.py
CHANGED
@@ -9,9 +9,10 @@ This module contains commonly-used filepaths and directories.
 
 from pathlib import Path
 
-
-
-
+DEV_IMAGES_DIR = "/mnt/team/simulation_science/priv/engineering/er_ecosystem/images"
+"""Path to the directory where the development/dummy pipeline images are stored."""
+DEFAULT_IMAGES_DIR = Path.home() / ".easylink_images"
+"""Default subdirectory for storing downloaded images."""
 IMPLEMENTATION_METADATA = Path(__file__).parent.parent / "implementation_metadata.yaml"
 """Path to the implementation metadata file."""
 EASYLINK_TEMP = {"local": Path("/tmp/easylink"), "slurm": Path("/tmp")}