easylink 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +4 -3
- easylink/implementation_metadata.yaml +53 -46
- easylink/pipeline_schema.py +3 -3
- easylink/pipeline_schema_constants/__init__.py +1 -0
- easylink/pipeline_schema_constants/testing.py +124 -1
- easylink/rule.py +5 -5
- easylink/step.py +46 -14
- easylink/steps/cascading/exclude_clustered.py +2 -2
- easylink/steps/cascading/exclude_none.py +2 -2
- easylink/steps/cascading/update_clusters_by_connected_components.py +2 -2
- easylink/steps/default/default_clusters_to_links.py +2 -2
- easylink/steps/default/default_determining_exclusions.py +2 -2
- easylink/steps/default/default_removing_records.py +2 -2
- easylink/steps/default/default_schema_alignment.py +3 -2
- easylink/steps/default/default_updating_clusters.py +2 -2
- easylink/steps/dev/README.md +1 -1
- easylink/steps/dev/python_pandas/dummy_step.py +4 -4
- easylink/steps/dev/python_pandas/python_pandas.def +2 -13
- easylink/steps/dev/python_pyspark/dummy_step.py +5 -7
- easylink/steps/dev/python_pyspark/python_pyspark.def +2 -12
- easylink/steps/dev/r/dummy_step.R +2 -2
- easylink/steps/dev/r/r-image.def +2 -12
- easylink/steps/example/middle_name_to_initial.def +22 -0
- easylink/steps/example/middle_name_to_initial.py +60 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +4 -4
- easylink/steps/fastLink/fastLink_links_to_clusters.R +2 -2
- easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +1 -1
- easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +2 -2
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +2 -2
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +2 -2
- easylink/steps/splink/splink_blocking_and_filtering.def +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +32 -6
- easylink/steps/splink/splink_evaluating_pairs.py +14 -4
- easylink/steps/splink/splink_links_to_clusters.py +1 -1
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/METADATA +1 -1
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/RECORD +41 -41
- easylink/images/spark_cluster/Dockerfile +0 -16
- easylink/images/spark_cluster/README.md +0 -15
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/WHEEL +0 -0
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/top_level.txt +0 -0
@@ -45,8 +45,8 @@ if dataset_path is None:
 
 # IDS_TO_REMOVE_FILE_PATH is a single filepath (Cloneable section)
 ids_filepath = os.environ["IDS_TO_REMOVE_FILE_PATH"]
-#
-results_dir = Path(os.environ["
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["OUTPUT_PATHS"])
 results_dir.mkdir(exist_ok=True, parents=True)
 
 dataset = load_file(dataset_path)
@@ -42,11 +42,12 @@ records = pd.concat(
     ignore_index=True,
     sort=False,
 )
+# TODO: check both datasets contain all the columns
 
 records = records.rename(columns={"Record ID": "Input Record ID"})
 
-#
-output_path = os.environ["
+# OUTPUT_PATHS is a single filepath
+output_path = os.environ["OUTPUT_PATHS"]
 Path(output_path).parent.mkdir(exist_ok=True, parents=True)
 
 logging.info(f"Writing output to {output_path}")
@@ -54,8 +54,8 @@ if len(known_clusters_df) > 0:
         "Default implementation of updating_clusters passed a non-empty set of known clusters"
     )
 
-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a path to a single file (clusters.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]
 Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
 
 clusters_df = load_file(new_clusters_filepath)
easylink/steps/dev/README.md CHANGED
@@ -46,7 +46,7 @@ is `DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS`, but you can also specify *what* the
 You can (optionally) provide another input file at `/extra_implementation_specific_input_data/input*` (Parquet or CSV) or a different path passed as `DUMMY_CONTAINER_EXTRA_IMPLEMENTATION_SPECIFIC_INPUT_FILE_PATH`.
 This is meant to represent an input that is specific to a given implementation.
 
-Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `
+Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `OUTPUT_PATHS`.
 If `DUMMY_CONTAINER_OUTPUT_FILE_TYPE` is `csv` it will be in CSV format, otherwise it will be Parquet.
 
 The environment variable `DUMMY_CONTAINER_BROKEN` makes the container return data that does not meet the specification.
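For orientation, the output contract described in this README follows the same pattern as the dummy steps changed below: `OUTPUT_PATHS` holds a comma-separated list of destinations, with `/results/result.<ext>` as the fallback. The following is a minimal sketch of that contract, not code from the package; the DataFrame `df` is a placeholder.

    import os

    import pandas as pd

    # Placeholder output data (assumption, not package data).
    df = pd.DataFrame({"Record ID": [1, 2], "value": ["a", "b"]})

    # Mirrors the dummy steps: OUTPUT_PATHS is a comma-separated list of output paths,
    # defaulting to /results/result.<ext> based on the requested format.
    output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
    output_file_paths = os.getenv(
        "OUTPUT_PATHS", f"/results/result.{output_file_format}"
    ).split(",")

    for path in output_file_paths:
        if output_file_format == "csv":
            df.to_csv(path, index=False)
        else:
            df.to_parquet(path)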
@@ -101,9 +101,9 @@ else:
     df.drop(columns=columns_to_drop, inplace=True)
 
 output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
-output_file_paths = os.getenv(
-    "
-)
+output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+    ","
+)
 
 diagnostics["num_output_files"] = len(output_file_paths)
 diagnostics["output_file_paths"] = output_file_paths
@@ -117,7 +117,7 @@ for output_file_path in output_file_paths:
     else:
         raise ValueError()
 
-diagnostics_dir = os.getenv("
+diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
 try:
     with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
         yaml.dump(diagnostics, f, default_flow_style=False)
@@ -1,3 +1,4 @@
+
 Bootstrap: docker
 From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
 
@@ -16,18 +17,6 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
 
 %environment
     export LC_ALL=C
-    export PYTHONPATH=/app:$PYTHONPATH
 
 %runscript
-    python /dummy_step.py
-
-%labels
-    Author Patrick Nast
-    Version v1.0
-    Description Python Pandas Implementation
-
-%startscript
-    # These directories should be bound when running the container
-    mkdir -p /results
-    mkdir -p /diagnostics
-    mkdir -p /input_data
+    python /dummy_step.py '$@'
@@ -17,9 +17,7 @@ logging.basicConfig(
 pyspark_log = logging.getLogger("pyspark")
 pyspark_log.setLevel(logging.WARNING)
 
-spark = SparkSession.builder.master(
-    os.getenv("DUMMY_CONTAINER_SPARK_MASTER_URL")
-).getOrCreate()
+spark = SparkSession.builder.master(os.getenv("SPARK_MASTER_URL")).getOrCreate()
 
 
 def load_file(file_path, file_format=None):
@@ -115,9 +113,9 @@ else:
     df = df.drop(*columns_to_drop)
 
 output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
-output_file_paths = os.getenv(
-    "
-)
+output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+    ","
+)
 
 diagnostics["num_output_files"] = len(output_file_paths)
 diagnostics["output_file_paths"] = output_file_paths
@@ -132,7 +130,7 @@ for output_file_path in output_file_paths:
     else:
         raise ValueError()
 
-diagnostics_dir = os.getenv("
+diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
 try:
     with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
         yaml.dump(diagnostics, f, default_flow_style=False)
@@ -1,3 +1,4 @@
+
 Bootstrap: docker
 From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7c033813edfebc
 
@@ -22,15 +23,4 @@ From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7c033813edfebc
 
 %runscript
     cd /workdir
-    python3 /code/dummy_step.py
-
-%labels
-    Author Patrick Nast
-    Version v1.0
-    Description Python Pyspark Implementation
-
-%startscript
-    # These directories should be bound when running the container
-    mkdir -p /results
-    mkdir -p /diagnostics
-    mkdir -p /input_data
+    python3 /code/dummy_step.py '$@'
@@ -106,7 +106,7 @@ if (broken) {
 }
 
 output_file_format <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
-output_file_paths <- strsplit(Sys.getenv("
+output_file_paths <- strsplit(Sys.getenv("OUTPUT_PATHS", paste0("/results/result.", output_file_format)), ",")[[1]]
 
 diagnostics$num_output_files <- length(output_file_paths)
 diagnostics$output_file_paths <- output_file_paths
@@ -123,7 +123,7 @@ for (output_file_path in output_file_paths) {
   }
 }
 
-diagnostics_dir <- Sys.getenv("
+diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
 if (dir.exists(diagnostics_dir) && file.access(diagnostics_dir, mode = 2) == 0) {
   write_yaml(diagnostics, file.path(diagnostics_dir, 'diagnostics.yaml'))
 }
easylink/steps/dev/r/r-image.def CHANGED
@@ -1,3 +1,4 @@
+
 Bootstrap: docker
 From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f353630fe8e0ee2f876
 
@@ -15,15 +16,4 @@ From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f353630fe8e0ee2f876
     export LC_ALL=C
 
 %runscript
-    Rscript /dummy_step.R
-
-%labels
-    Author Patrick Nast
-    Version v1.0
-    Description R Implementation
-
-%startscript
-    # These directories should be bound when running the container
-    mkdir -p /results
-    mkdir -p /diagnostics
-    mkdir -p /input_data
+    Rscript /dummy_step.R '$@'
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./middle_name_to_initial.py /middle_name_to_initial.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /middle_name_to_initial.py '$@'
@@ -0,0 +1,60 @@
+# STEP_NAME: pre-processing
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# DATASET_DIR_PATHS is list of directories, each containing one file
+dataset_paths = os.environ["DATASET_DIR_PATHS"].split(",")
+logging.info(f"{dataset_paths=}")
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+logging.info(f"splitter_choice={splitter_choice}")
+dataset_path = None
+for path in dataset_paths:
+    path = Path(path)
+    # NOTE: We iterate the dir here, but it should only have one non-hidden
+    # file in it. We don't validate that here as it is checked in the validator.
+    for path_to_check in path.iterdir():
+        if path_to_check.stem == splitter_choice:
+            dataset_path = str(path_to_check)
+            break
+
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["OUTPUT_PATHS"])
+results_dir.mkdir(exist_ok=True, parents=True)
+
+output_path = results_dir / Path(dataset_path).name
+
+dataset = load_file(dataset_path)
+
+# add middle initial column from middle name
+dataset["middle_initial"] = dataset["middle_name"].str[0]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+dataset.to_parquet(output_path)
@@ -10,8 +10,8 @@ library(stringr)
 # Check required environment variables
 required_env_vars <- c(
   "BLOCKS_DIR_PATH",
-  "
-  "
+  "DIAGNOSTICS_DIRECTORY",
+  "OUTPUT_PATHS",
   "COMPARISONS",
   "THRESHOLD_MATCH_PROBABILITY"
 )
@@ -24,8 +24,8 @@ if (length(missing_vars) > 0) {
 }
 
 blocks_dir <- Sys.getenv("BLOCKS_DIR_PATH")
-diagnostics_dir <- Sys.getenv("
-output_path <- Sys.getenv("
+diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY")
+output_path <- Sys.getenv("OUTPUT_PATHS")
 comparisons <- strsplit(Sys.getenv("COMPARISONS"), ",")[[1]]
 
 all_predictions <- list()
@@ -12,7 +12,7 @@ library(stringr)
 # Check required environment variables
 required_env_vars <- c(
   "LINKS_FILE_PATH",
-  "
+  "OUTPUT_PATHS",
   "THRESHOLD_MATCH_PROBABILITY"
 )
 missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
@@ -24,7 +24,7 @@ if (length(missing_vars) > 0) {
 }
 
 links_file_path <- Sys.getenv("LINKS_FILE_PATH")
-output_path <- Sys.getenv("
+output_path <- Sys.getenv("OUTPUT_PATHS")
 
 if (!file.exists(links_file_path)) {
   stop(sprintf("File not found: %s", links_file_path))
@@ -11,7 +11,7 @@ data = pd.read_parquet(os.environ["STEP_1_MAIN_INPUT_FILE_PATHS"])
 
 print(data)
 
-dir_path = Path(os.environ["
+dir_path = Path(os.environ["OUTPUT_PATHS"])
 dir_path.mkdir(parents=True, exist_ok=True)
 
 for i in range(3):
@@ -8,7 +8,7 @@ from pathlib import Path
 
 import pandas as pd
 
-dir_path = Path(os.environ["
+dir_path = Path(os.environ["MAIN_INPUT_DIR_PATH"])
 saved = False
 
 for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
@@ -16,7 +16,7 @@ for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
         continue
 
     if not saved:
-        shutil.copy(f, os.environ["
+        shutil.copy(f, os.environ["OUTPUT_PATHS"])
         saved = True
 
     print(pd.read_parquet(f))
@@ -33,8 +33,8 @@ def load_file(file_path, file_format=None):
 
 # CLUSTERS_FILE_PATH is a path to a single file
 clusters_path = os.environ["CLUSTERS_FILE_PATH"]
-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a path to a single file (results.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]
 
 clusters_df = load_file(clusters_path)
 
@@ -45,8 +45,8 @@ for path in dataset_paths:
 if dataset_path is None:
     raise ValueError(f"No dataset matching {splitter_choice} found")
 
-#
-results_dir = Path(os.environ["
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["OUTPUT_PATHS"])
 results_dir.mkdir(exist_ok=True, parents=True)
 
 output_path = results_dir / Path(dataset_path).name
@@ -13,7 +13,7 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
     mkdir -p /diagnostics
 
     # Install Python packages with specific versions
-    pip install pandas pyarrow splink==4.0.7
+    pip install pandas pyarrow splink==4.0.7 vl-convert-python
 
 %environment
     export LC_ALL=C
@@ -1,5 +1,5 @@
 # STEP_NAME: blocking_and_filtering
-# REQUIREMENTS: pandas pyarrow splink==4.0.7
+# REQUIREMENTS: pandas pyarrow splink==4.0.7 vl-convert-python
 
 import os
 
@@ -7,22 +7,27 @@ import pandas as pd
 
 records = pd.read_parquet(os.environ["RECORDS_FILE_PATH"])
 
-#
-results_dir = os.environ["
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = os.environ["OUTPUT_PATHS"]
 
 import splink
 
 blocking_rules = os.environ["BLOCKING_RULES"].split(",")
 
-
+link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")
+
+from splink import DuckDBAPI, Linker, SettingsCreator
 
 # Create the Splink linker in dedupe mode
 settings = SettingsCreator(
-    link_type="link_and_dedupe",
+    link_type="link_only" if link_only else "link_and_dedupe",
     blocking_rules_to_generate_predictions=blocking_rules,
     comparisons=[],
 )
 from splink import DuckDBAPI
+from splink.blocking_analysis import (
+    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
+)
 
 grouped = records.rename(columns={"Input Record ID": "unique_id"}).groupby(
     "Input Record Dataset"
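For context on the `LINK_ONLY` toggle introduced above: Splink's `link_only` mode scores candidate pairs only across the input datasets, while `link_and_dedupe` also scores pairs within each dataset. Below is a minimal sketch of the toggle using placeholder tables and a placeholder blocking rule, not the package's actual inputs or code.

    import os

    import pandas as pd
    from splink import DuckDBAPI, Linker, SettingsCreator

    # Placeholder input tables (assumptions, not package data).
    df_a = pd.DataFrame({"unique_id": [1, 2], "surname": ["Smith", "Jones"]})
    df_b = pd.DataFrame({"unique_id": [3, 4], "surname": ["Smith", "Doe"]})

    # Same truthy-string parsing as the diff above.
    link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")

    settings = SettingsCreator(
        link_type="link_only" if link_only else "link_and_dedupe",
        blocking_rules_to_generate_predictions=["l.surname = r.surname"],  # placeholder rule
        comparisons=[],
    )
    linker = Linker([df_a, df_b], settings, db_api=DuckDBAPI(), input_table_aliases=["a", "b"])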
@@ -52,6 +57,7 @@ blocking_input_tablename_r = "__splink__df_concat_with_tf"
 
 link_type = linker._settings_obj._link_type
 
+
 # If exploded blocking rules exist, we need to materialise
 # the tables of ID pairs
 from splink.internals.blocking import materialise_exploded_id_tables
@@ -98,7 +104,12 @@ blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
 wrong_order_dataset = (
     blocked_pairs["Left Record Dataset"] > blocked_pairs["Right Record Dataset"]
 )
-id_cols = [
+id_cols = [
+    "Left Record Dataset",
+    "Left Record ID",
+    "Right Record Dataset",
+    "Right Record ID",
+]
 switched_id_cols = [
     "Right Record Dataset",
     "Right Record ID",
@@ -128,3 +139,18 @@ output_path.mkdir(exist_ok=True, parents=True)
 
 records.to_parquet(output_path / "records.parquet", index=False)
 blocked_pairs.to_parquet(output_path / "pairs.parquet", index=False)
+
+records["unique_id"] = (
+    str(records["Input Record Dataset"]) + "_" + str(records["Input Record ID"])
+)
+db_api = DuckDBAPI()
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+chart_path = diagnostics_dir / f"blocking_cumulative_comparisons_chart_block_0.png"
+cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+    table_or_tables=records,
+    blocking_rules=blocking_rules,
+    db_api=db_api,
+    link_type=link_type,
+    unique_id_column_name="unique_id",
+    source_dataset_column_name="Input Record Dataset",
+).save(chart_path)
@@ -10,9 +10,10 @@ import splink.comparison_library as cl
 from splink import Linker, SettingsCreator
 
 blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
-diagnostics_dir = Path(os.environ["
-output_path = Path(os.environ["
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
 Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")
 
 all_predictions = []
 
@@ -30,17 +31,20 @@ for block_dir in blocks_dir.iterdir():
             comparisons.append(cl.NameComparison(column))
         elif method == "dob":
             comparisons.append(cl.DateOfBirthComparison(column))
+        elif method == "levenshtein":
+            comparisons.append(cl.LevenshteinAtThresholds(column))
         else:
             raise ValueError(f"Unknown comparison method {method}")
 
     # Create the Splink linker in dedupe mode
     settings = SettingsCreator(
-        link_type="link_and_dedupe",
+        link_type="link_only" if link_only else "link_and_dedupe",
         blocking_rules_to_generate_predictions=[],
         comparisons=comparisons,
         probability_two_random_records_match=float(
             os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"]
         ),
+        retain_intermediate_calculation_columns=True,
     )
 
     grouped = (
@@ -59,7 +63,7 @@ for block_dir in blocks_dir.iterdir():
         input_table_aliases=[name for name, _ in grouped],
     )
 
-    linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
+    linker.training.estimate_u_using_random_sampling(max_pairs=5e6, seed=1234)
 
     blocking_rules_for_training = os.environ["BLOCKING_RULES_FOR_TRAINING"].split(",")
 
@@ -143,6 +147,12 @@ for block_dir in blocks_dir.iterdir():
 
     all_predictions.append(predictions.as_pandas_dataframe())
 
+    comparisons_path = diagnostics_dir / f"comparisons_chart_{block_dir}.html"
+    comparisons_path.parent.mkdir(exist_ok=True, parents=True)
+    linker.visualisations.comparison_viewer_dashboard(
+        predictions, comparisons_path, overwrite=True
+    )
+
 all_predictions = pd.concat(all_predictions, ignore_index=True)[
     [
         "source_dataset_l",