easylink 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. easylink/_version.py +1 -1
  2. easylink/configuration.py +4 -3
  3. easylink/implementation_metadata.yaml +53 -46
  4. easylink/pipeline_schema.py +3 -3
  5. easylink/pipeline_schema_constants/__init__.py +1 -0
  6. easylink/pipeline_schema_constants/testing.py +124 -1
  7. easylink/rule.py +5 -5
  8. easylink/step.py +46 -14
  9. easylink/steps/cascading/exclude_clustered.py +2 -2
  10. easylink/steps/cascading/exclude_none.py +2 -2
  11. easylink/steps/cascading/update_clusters_by_connected_components.py +2 -2
  12. easylink/steps/default/default_clusters_to_links.py +2 -2
  13. easylink/steps/default/default_determining_exclusions.py +2 -2
  14. easylink/steps/default/default_removing_records.py +2 -2
  15. easylink/steps/default/default_schema_alignment.py +3 -2
  16. easylink/steps/default/default_updating_clusters.py +2 -2
  17. easylink/steps/dev/README.md +1 -1
  18. easylink/steps/dev/python_pandas/dummy_step.py +4 -4
  19. easylink/steps/dev/python_pandas/python_pandas.def +2 -13
  20. easylink/steps/dev/python_pyspark/dummy_step.py +5 -7
  21. easylink/steps/dev/python_pyspark/python_pyspark.def +2 -12
  22. easylink/steps/dev/r/dummy_step.R +2 -2
  23. easylink/steps/dev/r/r-image.def +2 -12
  24. easylink/steps/example/middle_name_to_initial.def +22 -0
  25. easylink/steps/example/middle_name_to_initial.py +60 -0
  26. easylink/steps/fastLink/fastLink_evaluating_pairs.R +4 -4
  27. easylink/steps/fastLink/fastLink_links_to_clusters.R +2 -2
  28. easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +1 -1
  29. easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +2 -2
  30. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +2 -2
  31. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +2 -2
  32. easylink/steps/splink/splink_blocking_and_filtering.def +1 -1
  33. easylink/steps/splink/splink_blocking_and_filtering.py +32 -6
  34. easylink/steps/splink/splink_evaluating_pairs.py +14 -4
  35. easylink/steps/splink/splink_links_to_clusters.py +1 -1
  36. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/METADATA +1 -1
  37. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/RECORD +41 -41
  38. easylink/images/spark_cluster/Dockerfile +0 -16
  39. easylink/images/spark_cluster/README.md +0 -15
  40. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/WHEEL +0 -0
  41. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/entry_points.txt +0 -0
  42. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/licenses/LICENSE +0 -0
  43. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/top_level.txt +0 -0
@@ -45,8 +45,8 @@ if dataset_path is None:
 
  # IDS_TO_REMOVE_FILE_PATH is a single filepath (Cloneable section)
  ids_filepath = os.environ["IDS_TO_REMOVE_FILE_PATH"]
- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
- results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = Path(os.environ["OUTPUT_PATHS"])
  results_dir.mkdir(exist_ok=True, parents=True)
 
  dataset = load_file(dataset_path)
@@ -42,11 +42,12 @@ records = pd.concat(
  ignore_index=True,
  sort=False,
  )
+ # TODO: check both datasets contain all the columns
 
  records = records.rename(columns={"Record ID": "Input Record ID"})
 
- # DUMMY_CONTAINER_OUTPUT_PATHS is a single filepath
- output_path = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single filepath
+ output_path = os.environ["OUTPUT_PATHS"]
  Path(output_path).parent.mkdir(exist_ok=True, parents=True)
 
  logging.info(f"Writing output to {output_path}")
@@ -54,8 +54,8 @@ if len(known_clusters_df) > 0:
  "Default implementation of updating_clusters passed a non-empty set of known clusters"
  )
 
- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (clusters.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]
  Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
 
  clusters_df = load_file(new_clusters_filepath)
@@ -46,7 +46,7 @@ is `DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS`, but you can also specify *what* the
  You can (optionally) provide another input file at `/extra_implementation_specific_input_data/input*` (Parquet or CSV) or a different path passed as `DUMMY_CONTAINER_EXTRA_IMPLEMENTATION_SPECIFIC_INPUT_FILE_PATH`.
  This is meant to represent an input that is specific to a given implementation.
 
- Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `DUMMY_CONTAINER_OUTPUT_PATHS`.
+ Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `OUTPUT_PATHS`.
  If `DUMMY_CONTAINER_OUTPUT_FILE_TYPE` is `csv` it will be in CSV format, otherwise it will be Parquet.
 
  The environment variable `DUMMY_CONTAINER_BROKEN` makes the container return data that does not meet the specification.
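The README hunk above documents the renamed environment-variable interface for the dummy steps. Below is a minimal, hypothetical sketch of reading those variables from a step script; the default locations (`/results`, `/diagnostics`) come from the README, but the snippet itself is illustrative and not code shipped in the package:

```python
# Illustrative only: mirrors the getenv-with-default pattern used in the
# dummy_step.py hunks below; not part of easylink itself.
import os
from pathlib import Path

output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
# OUTPUT_PATHS may be a comma-separated list of output files
output_file_paths = os.getenv(
    "OUTPUT_PATHS", f"/results/result.{output_file_format}"
).split(",")
diagnostics_dir = Path(os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics"))

for path in output_file_paths:
    Path(path).parent.mkdir(parents=True, exist_ok=True)
```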
@@ -101,9 +101,9 @@ else:
  df.drop(columns=columns_to_drop, inplace=True)
 
  output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
- output_file_paths = os.getenv(
- "DUMMY_CONTAINER_OUTPUT_PATHS", f"/results/result.{output_file_format}"
- ).split(",")
+ output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+ ","
+ )
 
  diagnostics["num_output_files"] = len(output_file_paths)
  diagnostics["output_file_paths"] = output_file_paths
@@ -117,7 +117,7 @@ for output_file_path in output_file_paths:
  else:
  raise ValueError()
 
- diagnostics_dir = os.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY", "/diagnostics")
+ diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
  try:
  with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
  yaml.dump(diagnostics, f, default_flow_style=False)
@@ -1,3 +1,4 @@
+
  Bootstrap: docker
  From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
 
@@ -16,18 +17,6 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a72
 
  %environment
  export LC_ALL=C
- export PYTHONPATH=/app:$PYTHONPATH
 
  %runscript
- python /dummy_step.py "$@"
-
- %labels
- Author Patrick Nast
- Version v1.0
- Description Python Pandas Implementation
-
- %startscript
- # These directories should be bound when running the container
- mkdir -p /results
- mkdir -p /diagnostics
- mkdir -p /input_data
+ python /dummy_step.py '$@'
@@ -17,9 +17,7 @@ logging.basicConfig(
  pyspark_log = logging.getLogger("pyspark")
  pyspark_log.setLevel(logging.WARNING)
 
- spark = SparkSession.builder.master(
- os.getenv("DUMMY_CONTAINER_SPARK_MASTER_URL")
- ).getOrCreate()
+ spark = SparkSession.builder.master(os.getenv("SPARK_MASTER_URL")).getOrCreate()
 
 
  def load_file(file_path, file_format=None):
@@ -115,9 +113,9 @@ else:
  df = df.drop(*columns_to_drop)
 
  output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
- output_file_paths = os.getenv(
- "DUMMY_CONTAINER_OUTPUT_PATHS", f"/results/result.{output_file_format}"
- ).split(",")
+ output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+ ","
+ )
 
  diagnostics["num_output_files"] = len(output_file_paths)
  diagnostics["output_file_paths"] = output_file_paths
@@ -132,7 +130,7 @@ for output_file_path in output_file_paths:
  else:
  raise ValueError()
 
- diagnostics_dir = os.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY", "/diagnostics")
+ diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
  try:
  with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
  yaml.dump(diagnostics, f, default_flow_style=False)
@@ -1,3 +1,4 @@
+
  Bootstrap: docker
  From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7c033813edfebc
 
@@ -22,15 +23,4 @@ From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7
 
  %runscript
  cd /workdir
- python3 /code/dummy_step.py "$@"
-
- %labels
- Author Patrick Nast
- Version v1.0
- Description Python Pyspark Implementation
-
- %startscript
- # These directories should be bound when running the container
- mkdir -p /results
- mkdir -p /diagnostics
- mkdir -p /input_data
+ python3 /code/dummy_step.py '$@'
@@ -106,7 +106,7 @@ if (broken) {
  }
 
  output_file_format <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
- output_file_paths <- strsplit(Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS", paste0("/results/result.", output_file_format)), ",")[[1]]
+ output_file_paths <- strsplit(Sys.getenv("OUTPUT_PATHS", paste0("/results/result.", output_file_format)), ",")[[1]]
 
  diagnostics$num_output_files <- length(output_file_paths)
  diagnostics$output_file_paths <- output_file_paths
@@ -123,7 +123,7 @@ for (output_file_path in output_file_paths) {
  }
  }
 
- diagnostics_dir <- Sys.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY", "/diagnostics")
+ diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
  if (dir.exists(diagnostics_dir) && file.access(diagnostics_dir, mode = 2) == 0) {
  write_yaml(diagnostics, file.path(diagnostics_dir, 'diagnostics.yaml'))
  }
@@ -1,3 +1,4 @@
+
  Bootstrap: docker
  From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f353630fe8e0ee2f876
 
@@ -15,15 +16,4 @@ From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f3536
  export LC_ALL=C
 
  %runscript
- Rscript /dummy_step.R "$@"
-
- %labels
- Author Patrick Nast
- Version v1.0
- Description R Implementation
-
- %startscript
- # These directories should be bound when running the container
- mkdir -p /results
- mkdir -p /diagnostics
- mkdir -p /input_data
+ Rscript /dummy_step.R '$@'
@@ -0,0 +1,22 @@
+
+ Bootstrap: docker
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+ %files
+ ./middle_name_to_initial.py /middle_name_to_initial.py
+
+ %post
+ # Create directories
+ mkdir -p /input_data
+ mkdir -p /extra_implementation_specific_input_data
+ mkdir -p /results
+ mkdir -p /diagnostics
+
+ # Install Python packages with specific versions
+ pip install pandas==2.1.2 pyarrow pyyaml
+
+ %environment
+ export LC_ALL=C
+
+ %runscript
+ python /middle_name_to_initial.py '$@'
@@ -0,0 +1,60 @@
+ # STEP_NAME: pre-processing
+ # REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+ import logging
+ import os
+ from pathlib import Path
+
+ import pandas as pd
+
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(message)s",
+ handlers=[logging.StreamHandler()],
+ )
+
+
+ def load_file(file_path, file_format=None):
+ logging.info(f"Loading file {file_path} with format {file_format}")
+ if file_format is None:
+ file_format = file_path.split(".")[-1]
+ if file_format == "parquet":
+ return pd.read_parquet(file_path)
+ raise ValueError(f"Unknown file format {file_format}")
+
+
+ # LOAD INPUTS and SAVE OUTPUTS
+
+ # DATASET_DIR_PATHS is list of directories, each containing one file
+ dataset_paths = os.environ["DATASET_DIR_PATHS"].split(",")
+ logging.info(f"{dataset_paths=}")
+
+ # for workaround, choose path based on INPUT_DATASET configuration
+ splitter_choice = os.environ["INPUT_DATASET"]
+ logging.info(f"splitter_choice={splitter_choice}")
+ dataset_path = None
+ for path in dataset_paths:
+ path = Path(path)
+ # NOTE: We iterate the dir here, but it should only have one non-hidden
+ # file in it. We don't validate that here as it is checked in the validator.
+ for path_to_check in path.iterdir():
+ if path_to_check.stem == splitter_choice:
+ dataset_path = str(path_to_check)
+ break
+
+ if dataset_path is None:
+ raise ValueError(f"No dataset matching {splitter_choice} found")
+
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = Path(os.environ["OUTPUT_PATHS"])
+ results_dir.mkdir(exist_ok=True, parents=True)
+
+ output_path = results_dir / Path(dataset_path).name
+
+ dataset = load_file(dataset_path)
+
+ # add middle initial column from middle name
+ dataset["middle_initial"] = dataset["middle_name"].str[0]
+
+ logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+ dataset.to_parquet(output_path)
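For orientation, here is a hypothetical local run of the new step script outside the container, showing the environment variables it reads (`DATASET_DIR_PATHS`, `INPUT_DATASET`, `OUTPUT_PATHS`); the concrete paths and the dataset name are placeholders, not values used by EasyLink:

```python
# Hypothetical invocation sketch; the env-var names match what
# middle_name_to_initial.py reads above, but the paths and dataset
# name are made-up placeholders for illustration.
import os
import subprocess

env = dict(
    os.environ,
    DATASET_DIR_PATHS="/tmp/dataset_a,/tmp/dataset_b",  # each dir holds one parquet file
    INPUT_DATASET="dataset_a_file",                      # stem of the file to select
    OUTPUT_PATHS="/tmp/results",                         # output directory
)
subprocess.run(["python", "middle_name_to_initial.py"], env=env, check=True)
```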
@@ -10,8 +10,8 @@ library(stringr)
  # Check required environment variables
  required_env_vars <- c(
  "BLOCKS_DIR_PATH",
- "DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY",
- "DUMMY_CONTAINER_OUTPUT_PATHS",
+ "DIAGNOSTICS_DIRECTORY",
+ "OUTPUT_PATHS",
  "COMPARISONS",
  "THRESHOLD_MATCH_PROBABILITY"
  )
@@ -24,8 +24,8 @@ if (length(missing_vars) > 0) {
  }
 
  blocks_dir <- Sys.getenv("BLOCKS_DIR_PATH")
- diagnostics_dir <- Sys.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY")
- output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+ diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY")
+ output_path <- Sys.getenv("OUTPUT_PATHS")
  comparisons <- strsplit(Sys.getenv("COMPARISONS"), ",")[[1]]
 
  all_predictions <- list()
@@ -12,7 +12,7 @@ library(stringr)
  # Check required environment variables
  required_env_vars <- c(
  "LINKS_FILE_PATH",
- "DUMMY_CONTAINER_OUTPUT_PATHS",
+ "OUTPUT_PATHS",
  "THRESHOLD_MATCH_PROBABILITY"
  )
  missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
@@ -24,7 +24,7 @@ if (length(missing_vars) > 0) {
  }
 
  links_file_path <- Sys.getenv("LINKS_FILE_PATH")
- output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+ output_path <- Sys.getenv("OUTPUT_PATHS")
 
  if (!file.exists(links_file_path)) {
  stop(sprintf("File not found: %s", links_file_path))
@@ -11,7 +11,7 @@ data = pd.read_parquet(os.environ["STEP_1_MAIN_INPUT_FILE_PATHS"])
 
  print(data)
 
- dir_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ dir_path = Path(os.environ["OUTPUT_PATHS"])
  dir_path.mkdir(parents=True, exist_ok=True)
 
  for i in range(3):
@@ -8,7 +8,7 @@ from pathlib import Path
 
  import pandas as pd
 
- dir_path = Path(os.environ["DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH"])
+ dir_path = Path(os.environ["MAIN_INPUT_DIR_PATH"])
  saved = False
 
  for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
@@ -16,7 +16,7 @@ for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
  continue
 
  if not saved:
- shutil.copy(f, os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ shutil.copy(f, os.environ["OUTPUT_PATHS"])
  saved = True
 
  print(pd.read_parquet(f))
@@ -33,8 +33,8 @@ def load_file(file_path, file_format=None):
 
  # CLUSTERS_FILE_PATH is a path to a single file
  clusters_path = os.environ["CLUSTERS_FILE_PATH"]
- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]
 
  clusters_df = load_file(clusters_path)
 
@@ -45,8 +45,8 @@ for path in dataset_paths:
  if dataset_path is None:
  raise ValueError(f"No dataset matching {splitter_choice} found")
 
- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
- results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = Path(os.environ["OUTPUT_PATHS"])
  results_dir.mkdir(exist_ok=True, parents=True)
 
  output_path = results_dir / Path(dataset_path).name
@@ -13,7 +13,7 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a72
  mkdir -p /diagnostics
 
  # Install Python packages with specific versions
- pip install pandas pyarrow splink==4.0.7
+ pip install pandas pyarrow splink==4.0.7 vl-convert-python
 
  %environment
  export LC_ALL=C
@@ -1,5 +1,5 @@
  # STEP_NAME: blocking_and_filtering
- # REQUIREMENTS: pandas pyarrow splink==4.0.7
+ # REQUIREMENTS: pandas pyarrow splink==4.0.7 vl-convert-python
 
  import os
 
@@ -7,22 +7,27 @@ import pandas as pd
 
  records = pd.read_parquet(os.environ["RECORDS_FILE_PATH"])
 
- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
- results_dir = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = os.environ["OUTPUT_PATHS"]
 
  import splink
 
  blocking_rules = os.environ["BLOCKING_RULES"].split(",")
 
- from splink import Linker, SettingsCreator
+ link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")
+
+ from splink import DuckDBAPI, Linker, SettingsCreator
 
  # Create the Splink linker in dedupe mode
  settings = SettingsCreator(
- link_type="link_and_dedupe",
+ link_type="link_only" if link_only else "link_and_dedupe",
  blocking_rules_to_generate_predictions=blocking_rules,
  comparisons=[],
  )
  from splink import DuckDBAPI
+ from splink.blocking_analysis import (
+ cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
+ )
 
  grouped = records.rename(columns={"Input Record ID": "unique_id"}).groupby(
  "Input Record Dataset"
@@ -52,6 +57,7 @@ blocking_input_tablename_r = "__splink__df_concat_with_tf"
 
  link_type = linker._settings_obj._link_type
 
+
  # If exploded blocking rules exist, we need to materialise
  # the tables of ID pairs
  from splink.internals.blocking import materialise_exploded_id_tables
@@ -98,7 +104,12 @@ blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
  wrong_order_dataset = (
  blocked_pairs["Left Record Dataset"] > blocked_pairs["Right Record Dataset"]
  )
- id_cols = ["Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"]
+ id_cols = [
+ "Left Record Dataset",
+ "Left Record ID",
+ "Right Record Dataset",
+ "Right Record ID",
+ ]
  switched_id_cols = [
  "Right Record Dataset",
  "Right Record ID",
@@ -128,3 +139,18 @@ output_path.mkdir(exist_ok=True, parents=True)
 
  records.to_parquet(output_path / "records.parquet", index=False)
  blocked_pairs.to_parquet(output_path / "pairs.parquet", index=False)
+
+ records["unique_id"] = (
+ str(records["Input Record Dataset"]) + "_" + str(records["Input Record ID"])
+ )
+ db_api = DuckDBAPI()
+ diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+ chart_path = diagnostics_dir / f"blocking_cumulative_comparisons_chart_block_0.png"
+ cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+ table_or_tables=records,
+ blocking_rules=blocking_rules,
+ db_api=db_api,
+ link_type=link_type,
+ unique_id_column_name="unique_id",
+ source_dataset_column_name="Input Record Dataset",
+ ).save(chart_path)
@@ -10,9 +10,10 @@ import splink.comparison_library as cl
  from splink import Linker, SettingsCreator
 
  blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
- diagnostics_dir = Path(os.environ["DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY"])
- output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+ output_path = Path(os.environ["OUTPUT_PATHS"])
  Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+ link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")
 
  all_predictions = []
 
@@ -30,17 +31,20 @@ for block_dir in blocks_dir.iterdir():
  comparisons.append(cl.NameComparison(column))
  elif method == "dob":
  comparisons.append(cl.DateOfBirthComparison(column))
+ elif method == "levenshtein":
+ comparisons.append(cl.LevenshteinAtThresholds(column))
  else:
  raise ValueError(f"Unknown comparison method {method}")
 
  # Create the Splink linker in dedupe mode
  settings = SettingsCreator(
- link_type="link_and_dedupe",
+ link_type="link_only" if link_only else "link_and_dedupe",
  blocking_rules_to_generate_predictions=[],
  comparisons=comparisons,
  probability_two_random_records_match=float(
  os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"]
  ),
+ retain_intermediate_calculation_columns=True,
  )
 
  grouped = (
@@ -59,7 +63,7 @@ for block_dir in blocks_dir.iterdir():
  input_table_aliases=[name for name, _ in grouped],
  )
 
- linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
+ linker.training.estimate_u_using_random_sampling(max_pairs=5e6, seed=1234)
 
  blocking_rules_for_training = os.environ["BLOCKING_RULES_FOR_TRAINING"].split(",")
 
@@ -143,6 +147,12 @@ for block_dir in blocks_dir.iterdir():
 
  all_predictions.append(predictions.as_pandas_dataframe())
 
+ comparisons_path = diagnostics_dir / f"comparisons_chart_{block_dir}.html"
+ comparisons_path.parent.mkdir(exist_ok=True, parents=True)
+ linker.visualisations.comparison_viewer_dashboard(
+ predictions, comparisons_path, overwrite=True
+ )
+
  all_predictions = pd.concat(all_predictions, ignore_index=True)[
  [
  "source_dataset_l",
@@ -32,7 +32,7 @@ dummy_records_df = pd.DataFrame(
  )
  }
  )
- output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ output_path = Path(os.environ["OUTPUT_PATHS"])
 
  db_api = DuckDBAPI()
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: easylink
- Version: 0.1.21
+ Version: 0.1.23
  Summary: Research repository for the EasyLink ER ecosystem project.
  Home-page: https://github.com/ihmeuw/easylink
  Author: The EasyLink developers