easylink 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. easylink/_version.py +1 -1
  2. easylink/implementation_metadata.yaml +53 -46
  3. easylink/pipeline_schema_constants/testing.py +1 -1
  4. easylink/rule.py +5 -5
  5. easylink/step.py +1 -1
  6. easylink/steps/cascading/exclude_clustered.py +2 -2
  7. easylink/steps/cascading/exclude_none.py +2 -2
  8. easylink/steps/cascading/update_clusters_by_connected_components.py +2 -2
  9. easylink/steps/default/default_clusters_to_links.py +2 -2
  10. easylink/steps/default/default_determining_exclusions.py +2 -2
  11. easylink/steps/default/default_removing_records.py +2 -2
  12. easylink/steps/default/default_schema_alignment.py +3 -2
  13. easylink/steps/default/default_updating_clusters.py +2 -2
  14. easylink/steps/dev/README.md +1 -1
  15. easylink/steps/dev/python_pandas/dummy_step.py +4 -4
  16. easylink/steps/dev/python_pandas/python_pandas.def +2 -13
  17. easylink/steps/dev/python_pyspark/dummy_step.py +5 -7
  18. easylink/steps/dev/python_pyspark/python_pyspark.def +2 -12
  19. easylink/steps/dev/r/dummy_step.R +2 -2
  20. easylink/steps/dev/r/r-image.def +2 -12
  21. easylink/steps/example/middle_name_to_initial.def +22 -0
  22. easylink/steps/example/middle_name_to_initial.py +60 -0
  23. easylink/steps/fastLink/fastLink_evaluating_pairs.R +4 -4
  24. easylink/steps/fastLink/fastLink_links_to_clusters.R +2 -2
  25. easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +1 -1
  26. easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +2 -2
  27. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +2 -2
  28. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +2 -2
  29. easylink/steps/splink/splink_blocking_and_filtering.def +1 -1
  30. easylink/steps/splink/splink_blocking_and_filtering.py +32 -6
  31. easylink/steps/splink/splink_evaluating_pairs.py +14 -4
  32. easylink/steps/splink/splink_links_to_clusters.py +1 -1
  33. {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/METADATA +1 -1
  34. {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/RECORD +38 -36
  35. {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/WHEEL +0 -0
  36. {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/entry_points.txt +0 -0
  37. {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/licenses/LICENSE +0 -0
  38. {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/top_level.txt +0 -0
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.22"
+ __version__ = "0.1.23"
easylink/implementation_metadata.yaml CHANGED
@@ -2,8 +2,8 @@ step_1_python_pandas:
  steps:
  - step_1
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  outputs:
  step_1_main_output: result.parquet
@@ -11,8 +11,8 @@ step_1a_python_pandas:
  steps:
  - step_1a
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -22,8 +22,8 @@ step_1b_python_pandas:
  steps:
  - step_1b
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -33,8 +33,8 @@ step_2_python_pandas:
  steps:
  - step_2
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  outputs:
  step_2_main_output: result.parquet
@@ -42,8 +42,8 @@ step_3_python_pandas:
  steps:
  - step_3
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  outputs:
  step_3_main_output: result.parquet
@@ -51,8 +51,8 @@ step_4_python_pandas:
  steps:
  - step_4
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -62,8 +62,8 @@ step_5_python_pandas:
  steps:
  - step_5
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -73,8 +73,8 @@ step_6_python_pandas:
  steps:
  - step_6
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -84,8 +84,8 @@ step_4a_python_pandas:
  steps:
  - step_4a
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -95,8 +95,8 @@ step_4b_python_pandas:
  steps:
  - step_4b
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -106,8 +106,8 @@ step_4b_r:
  steps:
  - step_4b
  image_name: r-image.sif
- zenodo_record_id: 15611084
- md5_checksum: 9410af1317aabc332604cbec33b59d42
+ zenodo_record_id: 15733426
+ md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
  script_cmd: Rscript /dummy_step.R
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -117,8 +117,8 @@ step_1_python_pyspark:
  steps:
  - step_1
  image_name: python_pyspark.sif
- zenodo_record_id: 15611084
- md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+ zenodo_record_id: 15733426
+ md5_checksum: c948577ab0607411dd4b640622d9ec3a
  script_cmd: python3 /code/dummy_step.py
  outputs:
  step_1_main_output: result.parquet
@@ -127,8 +127,8 @@ step_2_python_pyspark:
  steps:
  - step_2
  image_name: python_pyspark.sif
- zenodo_record_id: 15611084
- md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+ zenodo_record_id: 15733426
+ md5_checksum: c948577ab0607411dd4b640622d9ec3a
  script_cmd: python3 /code/dummy_step.py
  outputs:
  step_2_main_output: result.parquet
@@ -137,8 +137,8 @@ step_3_python_pyspark:
  steps:
  - step_3
  image_name: python_pyspark.sif
- zenodo_record_id: 15611084
- md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+ zenodo_record_id: 15733426
+ md5_checksum: c948577ab0607411dd4b640622d9ec3a
  script_cmd: python3 /code/dummy_step.py
  outputs:
  step_3_main_output: result.parquet
@@ -147,8 +147,8 @@ step_4_python_pyspark:
  steps:
  - step_4
  image_name: python_pyspark.sif
- zenodo_record_id: 15611084
- md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+ zenodo_record_id: 15733426
+ md5_checksum: c948577ab0607411dd4b640622d9ec3a
  script_cmd: python3 /code/dummy_step.py
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -158,8 +158,8 @@ step_1_r:
  steps:
  - step_1
  image_name: r-image.sif
- zenodo_record_id: 15611084
- md5_checksum: 9410af1317aabc332604cbec33b59d42
+ zenodo_record_id: 15733426
+ md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
  script_cmd: Rscript /dummy_step.R
  outputs:
  step_1_main_output: result.parquet
@@ -168,8 +168,8 @@ step_2_r:
  steps:
  - step_2
  image_name: r-image.sif
- zenodo_record_id: 15611084
- md5_checksum: 9410af1317aabc332604cbec33b59d42
+ zenodo_record_id: 15733426
+ md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
  script_cmd: Rscript /dummy_step.R
  outputs:
  step_2_main_output: result.parquet
@@ -178,8 +178,8 @@ step_3_r:
  steps:
  - step_3
  image_name: r-image.sif
- zenodo_record_id: 15611084
- md5_checksum: 9410af1317aabc332604cbec33b59d42
+ zenodo_record_id: 15733426
+ md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
  script_cmd: Rscript /dummy_step.R
  outputs:
  step_3_main_output: result.parquet
@@ -188,8 +188,8 @@ step_4_r:
  steps:
  - step_4
  image_name: r-image.sif
- zenodo_record_id: 15611084
- md5_checksum: 9410af1317aabc332604cbec33b59d42
+ zenodo_record_id: 15733426
+ md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
  script_cmd: Rscript /dummy_step.R
  env:
  INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -201,8 +201,8 @@ step_1_and_step_2_combined_python_pandas:
  - step_1
  - step_2
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  outputs:
  step_2_main_output: result.parquet
@@ -211,8 +211,8 @@ step_1_and_step_2_parallel_python_pandas:
  - step_1
  - step_2
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  env:
  INPUT_ENV_VARS: STEP_1_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,STEP_2_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS
@@ -223,8 +223,8 @@ step_3_and_step_4_combined_python_pandas:
  - step_3
  - step_4
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  outputs:
  step_4_main_output: result.parquet
@@ -233,8 +233,8 @@ step_1a_and_step_1b_combined_python_pandas:
  - step_1a
  - step_1b
  image_name: python_pandas.sif
- zenodo_record_id: 15611084
- md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+ zenodo_record_id: 15733426
+ md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
  script_cmd: python /dummy_step.py
  outputs:
  step_1_main_output: result.parquet
@@ -362,3 +362,10 @@ update_clusters_by_connected_components:
  script_cmd: python /update_clusters_by_connected_components.py
  outputs:
  clusters: result.parquet
+ middle_name_to_initial:
+ steps:
+ - pre-processing
+ image_name: main/middle_name_to_initial.sif
+ script_cmd: python /middle_name_to_initial.py
+ outputs:
+ dataset: dataset
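The pattern in this file is visible at a glance: every dev container entry moves from Zenodo record 15611084 to 15733426 with fresh md5 checksums (the images were rebuilt for this release), and one new implementation, middle_name_to_initial, is registered against the pre-processing step. As a minimal sketch (the file path and helper name are illustrative; only the checksum comes from this diff), the md5_checksum field lets a downloaded image be verified like so:

    import hashlib

    def md5sum(path: str) -> str:
        # Stream the .sif image in 1 MiB chunks to avoid loading it whole.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    # 0.1.23 checksum for python_pandas.sif, per implementation_metadata.yaml
    assert md5sum("python_pandas.sif") == "9177b8e168fcc9cae91bf61265f2185c"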
easylink/pipeline_schema_constants/testing.py CHANGED
@@ -607,7 +607,7 @@ NODES_OUTPUT_DIR = [
  input_slots=[
  InputSlot(
  name="step_2_main_input",
- env_var="DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH",
+ env_var="MAIN_INPUT_DIR_PATH",
  validator=validate_dir,
  )
  ],
easylink/rule.py CHANGED
@@ -182,15 +182,15 @@ rule:
  # TODO [MIC-5787]: handle multiple wildcards, e.g.
  # output_paths = ",".join(self.output)
  # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
- # and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
+ # and then in shell cmd: export OUTPUT_PATHS={output_paths}/{wildcards_subdir}

  # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
  output_files = ",".join(self.output).replace("{chunk}", "{wildcards.chunk}")
  shell_cmd = f"""
  shell:
  '''
- export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
- export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
+ export OUTPUT_PATHS={output_files}
+ export DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
  for input_slot_attrs in self.input_slots.values():
  # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
  input_files = ",".join(input_slot_attrs["filepaths"]).replace(
@@ -200,8 +200,8 @@ rule:
  export {input_slot_attrs["env_var"]}={input_files}"""
  if self.requires_spark:
  shell_cmd += f"""
- read -r DUMMY_CONTAINER_SPARK_MASTER_URL < {{input.master_url}}
- export DUMMY_CONTAINER_SPARK_MASTER_URL"""
+ read -r SPARK_MASTER_URL < {{input.master_url}}
+ export SPARK_MASTER_URL"""
  for var_name, var_value in self.envvars.items():
  shell_cmd += f"""
  export {var_name}={var_value}"""
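The rule.py change is the heart of this release: the DUMMY_CONTAINER_ prefix is dropped from the environment variables that EasyLink exports into each step container, and the per-step scripts below are updated to match. A minimal sketch of a step script consuming the renamed variables (the variable names come from this diff; the fallback defaults and the loop body are illustrative):

    import os
    from pathlib import Path

    # OUTPUT_PATHS is exported as a comma-separated list of output paths
    output_paths = os.environ.get("OUTPUT_PATHS", "/results/result.parquet").split(",")
    # DIAGNOSTICS_DIRECTORY is a single directory for run diagnostics
    diagnostics_dir = Path(os.environ.get("DIAGNOSTICS_DIRECTORY", "/diagnostics"))
    diagnostics_dir.mkdir(parents=True, exist_ok=True)

    for path in output_paths:
        # Ensure each output's parent directory exists before writing
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        print(f"would write a result to {path}")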
easylink/step.py CHANGED
@@ -1134,7 +1134,7 @@ class LoopStep(TemplatedStep):
  @property
  def config_key(self):
  """The pipeline specification key required for a ``LoopStep``."""
- return "iterate"
+ return "iterations"

  @property
  def node_prefix(self):
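Note this is a breaking change for pipeline specifications: a LoopStep section must now use the key iterations where 0.1.22 accepted iterate. A hedged sketch of the affected spec fragment, expressed as a Python dict (the step and implementation names here are placeholders, not from this diff):

    spec = {
        "some_loop_step": {
            "iterations": [  # was "iterate" in 0.1.22
                {"implementation": {"name": "step_3_python_pandas"}},
                {"implementation": {"name": "step_3_python_pandas"}},
            ],
        },
    }
    assert "iterations" in spec["some_loop_step"]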
easylink/steps/cascading/exclude_clustered.py CHANGED
@@ -69,8 +69,8 @@ clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(

  IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
  IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/exclude_none.py CHANGED
@@ -69,8 +69,8 @@ clusters_df = load_file(clusters_filepath)

  IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
  IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/update_clusters_by_connected_components.py CHANGED
@@ -50,8 +50,8 @@ if len(known_clusters_filepaths) == 0:
  known_clusters_filepath = known_clusters_filepaths[0]
  known_clusters_df = load_file(known_clusters_filepath)

- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (clusters.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]
  Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)

  new_clusters_df = load_file(new_clusters_filepath)
easylink/steps/default/default_clusters_to_links.py CHANGED
@@ -80,8 +80,8 @@ if len(clusters_filepaths) == 0:

  clusters_filepath = clusters_filepaths[0]

- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  clusters_df = load_file(clusters_filepath)
  links_df = clusters_to_links(clusters_df)
easylink/steps/default/default_determining_exclusions.py CHANGED
@@ -74,8 +74,8 @@ if len(clusters_df) > 0:

  IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
  IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/default/default_removing_records.py CHANGED
@@ -45,8 +45,8 @@ if dataset_path is None:

  # IDS_TO_REMOVE_FILE_PATH is a single filepath (Cloneable section)
  ids_filepath = os.environ["IDS_TO_REMOVE_FILE_PATH"]
- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
- results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = Path(os.environ["OUTPUT_PATHS"])
  results_dir.mkdir(exist_ok=True, parents=True)

  dataset = load_file(dataset_path)
easylink/steps/default/default_schema_alignment.py CHANGED
@@ -42,11 +42,12 @@ records = pd.concat(
  ignore_index=True,
  sort=False,
  )
+ # TODO: check both datasets contain all the columns

  records = records.rename(columns={"Record ID": "Input Record ID"})

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single filepath
- output_path = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single filepath
+ output_path = os.environ["OUTPUT_PATHS"]
  Path(output_path).parent.mkdir(exist_ok=True, parents=True)

  logging.info(f"Writing output to {output_path}")
easylink/steps/default/default_updating_clusters.py CHANGED
@@ -54,8 +54,8 @@ if len(known_clusters_df) > 0:
  "Default implementation of updating_clusters passed a non-empty set of known clusters"
  )

- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (clusters.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]
  Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)

  clusters_df = load_file(new_clusters_filepath)
easylink/steps/dev/README.md CHANGED
@@ -46,7 +46,7 @@ is `DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS`, but you can also specify *what* the
  You can (optionally) provide another input file at `/extra_implementation_specific_input_data/input*` (Parquet or CSV) or a different path passed as `DUMMY_CONTAINER_EXTRA_IMPLEMENTATION_SPECIFIC_INPUT_FILE_PATH`.
  This is meant to represent an input that is specific to a given implementation.

- Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `DUMMY_CONTAINER_OUTPUT_PATHS`.
+ Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `OUTPUT_PATHS`.
  If `DUMMY_CONTAINER_OUTPUT_FILE_TYPE` is `csv` it will be in CSV format, otherwise it will be Parquet.

  The environment variable `DUMMY_CONTAINER_BROKEN` makes the container return data that does not meet the specification.
easylink/steps/dev/python_pandas/dummy_step.py CHANGED
@@ -101,9 +101,9 @@ else:
  df.drop(columns=columns_to_drop, inplace=True)

  output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
- output_file_paths = os.getenv(
-     "DUMMY_CONTAINER_OUTPUT_PATHS", f"/results/result.{output_file_format}"
- ).split(",")
+ output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+     ","
+ )

  diagnostics["num_output_files"] = len(output_file_paths)
  diagnostics["output_file_paths"] = output_file_paths
@@ -117,7 +117,7 @@ for output_file_path in output_file_paths:
  else:
  raise ValueError()

- diagnostics_dir = os.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY", "/diagnostics")
+ diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
  try:
  with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
  yaml.dump(diagnostics, f, default_flow_style=False)
easylink/steps/dev/python_pandas/python_pandas.def CHANGED
@@ -1,3 +1,4 @@
+
  Bootstrap: docker
  From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899

@@ -16,18 +17,6 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a72

  %environment
  export LC_ALL=C
- export PYTHONPATH=/app:$PYTHONPATH

  %runscript
- python /dummy_step.py "$@"
-
- %labels
- Author Patrick Nast
- Version v1.0
- Description Python Pandas Implementation
-
- %startscript
- # These directories should be bound when running the container
- mkdir -p /results
- mkdir -p /diagnostics
- mkdir -p /input_data
+ python /dummy_step.py '$@'
easylink/steps/dev/python_pyspark/dummy_step.py CHANGED
@@ -17,9 +17,7 @@ logging.basicConfig(
  pyspark_log = logging.getLogger("pyspark")
  pyspark_log.setLevel(logging.WARNING)

- spark = SparkSession.builder.master(
-     os.getenv("DUMMY_CONTAINER_SPARK_MASTER_URL")
- ).getOrCreate()
+ spark = SparkSession.builder.master(os.getenv("SPARK_MASTER_URL")).getOrCreate()


  def load_file(file_path, file_format=None):
@@ -115,9 +113,9 @@ else:
  df = df.drop(*columns_to_drop)

  output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
- output_file_paths = os.getenv(
-     "DUMMY_CONTAINER_OUTPUT_PATHS", f"/results/result.{output_file_format}"
- ).split(",")
+ output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+     ","
+ )

  diagnostics["num_output_files"] = len(output_file_paths)
  diagnostics["output_file_paths"] = output_file_paths
@@ -132,7 +130,7 @@ for output_file_path in output_file_paths:
  else:
  raise ValueError()

- diagnostics_dir = os.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY", "/diagnostics")
+ diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
  try:
  with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
  yaml.dump(diagnostics, f, default_flow_style=False)
easylink/steps/dev/python_pyspark/python_pyspark.def CHANGED
@@ -1,3 +1,4 @@
+
  Bootstrap: docker
  From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7c033813edfebc

@@ -22,15 +23,4 @@ From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7

  %runscript
  cd /workdir
- python3 /code/dummy_step.py "$@"
-
- %labels
- Author Patrick Nast
- Version v1.0
- Description Python Pyspark Implementation
-
- %startscript
- # These directories should be bound when running the container
- mkdir -p /results
- mkdir -p /diagnostics
- mkdir -p /input_data
+ python3 /code/dummy_step.py '$@'
easylink/steps/dev/r/dummy_step.R CHANGED
@@ -106,7 +106,7 @@ if (broken) {
  }

  output_file_format <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
- output_file_paths <- strsplit(Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS", paste0("/results/result.", output_file_format)), ",")[[1]]
+ output_file_paths <- strsplit(Sys.getenv("OUTPUT_PATHS", paste0("/results/result.", output_file_format)), ",")[[1]]

  diagnostics$num_output_files <- length(output_file_paths)
  diagnostics$output_file_paths <- output_file_paths
@@ -123,7 +123,7 @@ for (output_file_path in output_file_paths) {
  }
  }

- diagnostics_dir <- Sys.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY", "/diagnostics")
+ diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
  if (dir.exists(diagnostics_dir) && file.access(diagnostics_dir, mode = 2) == 0) {
  write_yaml(diagnostics, file.path(diagnostics_dir, 'diagnostics.yaml'))
  }
easylink/steps/dev/r/r-image.def CHANGED
@@ -1,3 +1,4 @@
+
  Bootstrap: docker
  From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f353630fe8e0ee2f876

@@ -15,15 +16,4 @@ From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f3536
  export LC_ALL=C

  %runscript
- Rscript /dummy_step.R "$@"
-
- %labels
- Author Patrick Nast
- Version v1.0
- Description R Implementation
-
- %startscript
- # These directories should be bound when running the container
- mkdir -p /results
- mkdir -p /diagnostics
- mkdir -p /input_data
+ Rscript /dummy_step.R '$@'
easylink/steps/example/middle_name_to_initial.def ADDED
@@ -0,0 +1,22 @@
+
+ Bootstrap: docker
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+ %files
+ ./middle_name_to_initial.py /middle_name_to_initial.py
+
+ %post
+ # Create directories
+ mkdir -p /input_data
+ mkdir -p /extra_implementation_specific_input_data
+ mkdir -p /results
+ mkdir -p /diagnostics
+
+ # Install Python packages with specific versions
+ pip install pandas==2.1.2 pyarrow pyyaml
+
+ %environment
+ export LC_ALL=C
+
+ %runscript
+ python /middle_name_to_initial.py '$@'
easylink/steps/example/middle_name_to_initial.py ADDED
@@ -0,0 +1,60 @@
+ # STEP_NAME: pre-processing
+ # REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+ import logging
+ import os
+ from pathlib import Path
+
+ import pandas as pd
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def load_file(file_path, file_format=None):
+     logging.info(f"Loading file {file_path} with format {file_format}")
+     if file_format is None:
+         file_format = file_path.split(".")[-1]
+     if file_format == "parquet":
+         return pd.read_parquet(file_path)
+     raise ValueError(f"Unknown file format {file_format}")
+
+
+ # LOAD INPUTS and SAVE OUTPUTS
+
+ # DATASET_DIR_PATHS is list of directories, each containing one file
+ dataset_paths = os.environ["DATASET_DIR_PATHS"].split(",")
+ logging.info(f"{dataset_paths=}")
+
+ # for workaround, choose path based on INPUT_DATASET configuration
+ splitter_choice = os.environ["INPUT_DATASET"]
+ logging.info(f"splitter_choice={splitter_choice}")
+ dataset_path = None
+ for path in dataset_paths:
+     path = Path(path)
+     # NOTE: We iterate the dir here, but it should only have one non-hidden
+     # file in it. We don't validate that here as it is checked in the validator.
+     for path_to_check in path.iterdir():
+         if path_to_check.stem == splitter_choice:
+             dataset_path = str(path_to_check)
+             break
+
+ if dataset_path is None:
+     raise ValueError(f"No dataset matching {splitter_choice} found")
+
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = Path(os.environ["OUTPUT_PATHS"])
+ results_dir.mkdir(exist_ok=True, parents=True)
+
+ output_path = results_dir / Path(dataset_path).name
+
+ dataset = load_file(dataset_path)
+
+ # add middle initial column from middle name
+ dataset["middle_initial"] = dataset["middle_name"].str[0]
+
+ logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+ dataset.to_parquet(output_path)
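The new example step derives the initial with pandas' vectorized string indexing. A small aside (not part of the package): .str[0] propagates missing values instead of raising, so records with no middle name simply get a missing middle_initial.

    import pandas as pd

    # Element-wise first character; None/NaN and empty strings both yield NaN.
    s = pd.Series(["Marie", None, ""])
    print(s.str[0].tolist())  # ['M', nan, nan]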
easylink/steps/fastLink/fastLink_evaluating_pairs.R CHANGED
@@ -10,8 +10,8 @@ library(stringr)
  # Check required environment variables
  required_env_vars <- c(
    "BLOCKS_DIR_PATH",
-   "DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY",
-   "DUMMY_CONTAINER_OUTPUT_PATHS",
+   "DIAGNOSTICS_DIRECTORY",
+   "OUTPUT_PATHS",
    "COMPARISONS",
    "THRESHOLD_MATCH_PROBABILITY"
  )
@@ -24,8 +24,8 @@ if (length(missing_vars) > 0) {
  }

  blocks_dir <- Sys.getenv("BLOCKS_DIR_PATH")
- diagnostics_dir <- Sys.getenv("DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY")
- output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+ diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY")
+ output_path <- Sys.getenv("OUTPUT_PATHS")
  comparisons <- strsplit(Sys.getenv("COMPARISONS"), ",")[[1]]

  all_predictions <- list()
easylink/steps/fastLink/fastLink_links_to_clusters.R CHANGED
@@ -12,7 +12,7 @@ library(stringr)
  # Check required environment variables
  required_env_vars <- c(
    "LINKS_FILE_PATH",
-   "DUMMY_CONTAINER_OUTPUT_PATHS",
+   "OUTPUT_PATHS",
    "THRESHOLD_MATCH_PROBABILITY"
  )
  missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
@@ -24,7 +24,7 @@ if (length(missing_vars) > 0) {
  }

  links_file_path <- Sys.getenv("LINKS_FILE_PATH")
- output_path <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_PATHS")
+ output_path <- Sys.getenv("OUTPUT_PATHS")

  if (!file.exists(links_file_path)) {
  stop(sprintf("File not found: %s", links_file_path))
easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py CHANGED
@@ -11,7 +11,7 @@ data = pd.read_parquet(os.environ["STEP_1_MAIN_INPUT_FILE_PATHS"])

  print(data)

- dir_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ dir_path = Path(os.environ["OUTPUT_PATHS"])
  dir_path.mkdir(parents=True, exist_ok=True)

  for i in range(3):
easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path

  import pandas as pd

- dir_path = Path(os.environ["DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH"])
+ dir_path = Path(os.environ["MAIN_INPUT_DIR_PATH"])
  saved = False

  for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
@@ -16,7 +16,7 @@ for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
  continue

  if not saved:
-     shutil.copy(f, os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+     shutil.copy(f, os.environ["OUTPUT_PATHS"])
  saved = True

  print(pd.read_parquet(f))
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py CHANGED
@@ -33,8 +33,8 @@ def load_file(file_path, file_format=None):

  # CLUSTERS_FILE_PATH is a path to a single file
  clusters_path = os.environ["CLUSTERS_FILE_PATH"]
- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  clusters_df = load_file(clusters_path)
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py CHANGED
@@ -45,8 +45,8 @@ for path in dataset_paths:
  if dataset_path is None:
  raise ValueError(f"No dataset matching {splitter_choice} found")

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
- results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = Path(os.environ["OUTPUT_PATHS"])
  results_dir.mkdir(exist_ok=True, parents=True)

  output_path = results_dir / Path(dataset_path).name
easylink/steps/splink/splink_blocking_and_filtering.def CHANGED
@@ -13,7 +13,7 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a72
  mkdir -p /diagnostics

  # Install Python packages with specific versions
- pip install pandas pyarrow splink==4.0.7
+ pip install pandas pyarrow splink==4.0.7 vl-convert-python

  %environment
  export LC_ALL=C
easylink/steps/splink/splink_blocking_and_filtering.py CHANGED
@@ -1,5 +1,5 @@
  # STEP_NAME: blocking_and_filtering
- # REQUIREMENTS: pandas pyarrow splink==4.0.7
+ # REQUIREMENTS: pandas pyarrow splink==4.0.7 vl-convert-python

  import os

@@ -7,22 +7,27 @@ import pandas as pd

  records = pd.read_parquet(os.environ["RECORDS_FILE_PATH"])

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
- results_dir = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a directory ('dataset')
+ results_dir = os.environ["OUTPUT_PATHS"]

  import splink

  blocking_rules = os.environ["BLOCKING_RULES"].split(",")

- from splink import Linker, SettingsCreator
+ link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")
+
+ from splink import DuckDBAPI, Linker, SettingsCreator

  # Create the Splink linker in dedupe mode
  settings = SettingsCreator(
-     link_type="link_and_dedupe",
+     link_type="link_only" if link_only else "link_and_dedupe",
      blocking_rules_to_generate_predictions=blocking_rules,
      comparisons=[],
  )
  from splink import DuckDBAPI
+ from splink.blocking_analysis import (
+     cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
+ )

  grouped = records.rename(columns={"Input Record ID": "unique_id"}).groupby(
      "Input Record Dataset"
@@ -52,6 +57,7 @@ blocking_input_tablename_r = "__splink__df_concat_with_tf"

  link_type = linker._settings_obj._link_type

+
  # If exploded blocking rules exist, we need to materialise
  # the tables of ID pairs
  from splink.internals.blocking import materialise_exploded_id_tables
@@ -98,7 +104,12 @@ blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
  wrong_order_dataset = (
      blocked_pairs["Left Record Dataset"] > blocked_pairs["Right Record Dataset"]
  )
- id_cols = ["Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"]
+ id_cols = [
+     "Left Record Dataset",
+     "Left Record ID",
+     "Right Record Dataset",
+     "Right Record ID",
+ ]
  switched_id_cols = [
      "Right Record Dataset",
      "Right Record ID",
@@ -128,3 +139,18 @@ output_path.mkdir(exist_ok=True, parents=True)

  records.to_parquet(output_path / "records.parquet", index=False)
  blocked_pairs.to_parquet(output_path / "pairs.parquet", index=False)
+
+ records["unique_id"] = (
+     str(records["Input Record Dataset"]) + "_" + str(records["Input Record ID"])
+ )
+ db_api = DuckDBAPI()
+ diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+ chart_path = diagnostics_dir / f"blocking_cumulative_comparisons_chart_block_0.png"
+ cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+     table_or_tables=records,
+     blocking_rules=blocking_rules,
+     db_api=db_api,
+     link_type=link_type,
+     unique_id_column_name="unique_id",
+     source_dataset_column_name="Input Record Dataset",
+ ).save(chart_path)
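Both splink steps now gate link_type on a LINK_ONLY environment variable parsed with a truthy-string check, and the new diagnostics chart is presumably why vl-convert-python was added to the requirements (Splink's Altair charts need it to be saved as .png). The flag-parsing pattern from this diff, in isolation (the helper name env_flag is ours, not the package's):

    import os

    def env_flag(name: str, default: str = "false") -> bool:
        # Same truthy check the splink steps use: "true", "yes", or "1",
        # case-insensitively, count as enabled.
        return os.getenv(name, default).lower() in ("true", "yes", "1")

    link_type = "link_only" if env_flag("LINK_ONLY") else "link_and_dedupe"
    print(link_type)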
easylink/steps/splink/splink_evaluating_pairs.py CHANGED
@@ -10,9 +10,10 @@ import splink.comparison_library as cl
  from splink import Linker, SettingsCreator

  blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
- diagnostics_dir = Path(os.environ["DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY"])
- output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+ output_path = Path(os.environ["OUTPUT_PATHS"])
  Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+ link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")

  all_predictions = []

@@ -30,17 +31,20 @@ for block_dir in blocks_dir.iterdir():
      comparisons.append(cl.NameComparison(column))
  elif method == "dob":
      comparisons.append(cl.DateOfBirthComparison(column))
+ elif method == "levenshtein":
+     comparisons.append(cl.LevenshteinAtThresholds(column))
  else:
      raise ValueError(f"Unknown comparison method {method}")

  # Create the Splink linker in dedupe mode
  settings = SettingsCreator(
-     link_type="link_and_dedupe",
+     link_type="link_only" if link_only else "link_and_dedupe",
      blocking_rules_to_generate_predictions=[],
      comparisons=comparisons,
      probability_two_random_records_match=float(
          os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"]
      ),
+     retain_intermediate_calculation_columns=True,
  )

  grouped = (
@@ -59,7 +63,7 @@ for block_dir in blocks_dir.iterdir():
      input_table_aliases=[name for name, _ in grouped],
  )

- linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
+ linker.training.estimate_u_using_random_sampling(max_pairs=5e6, seed=1234)

  blocking_rules_for_training = os.environ["BLOCKING_RULES_FOR_TRAINING"].split(",")

@@ -143,6 +147,12 @@ for block_dir in blocks_dir.iterdir():

  all_predictions.append(predictions.as_pandas_dataframe())

+ comparisons_path = diagnostics_dir / f"comparisons_chart_{block_dir}.html"
+ comparisons_path.parent.mkdir(exist_ok=True, parents=True)
+ linker.visualisations.comparison_viewer_dashboard(
+     predictions, comparisons_path, overwrite=True
+ )
+
  all_predictions = pd.concat(all_predictions, ignore_index=True)[
  [
  "source_dataset_l",
easylink/steps/splink/splink_links_to_clusters.py CHANGED
@@ -32,7 +32,7 @@ dummy_records_df = pd.DataFrame(
      )
  }
  )
- output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+ output_path = Path(os.environ["OUTPUT_PATHS"])

  db_api = DuckDBAPI()
{easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: easylink
- Version: 0.1.22
+ Version: 0.1.23
  Summary: Research repository for the EasyLink ER ecosystem project.
  Home-page: https://github.com/ihmeuw/easylink
  Author: The EasyLink developers
{easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/RECORD RENAMED
@@ -1,39 +1,39 @@
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
- easylink/_version.py,sha256=zmP2TRnzKPjZJ1eiBcT-cRInsji6FW-OVD3FafQFCc4,23
+ easylink/_version.py,sha256=0byemO6n6WCv41u9vBG2AIsOkVbxLvok7puvwy8EhfU,23
  easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
  easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
  easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
- easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
+ easylink/implementation_metadata.yaml,sha256=pKu_H9fLnTsS8E4wCnYRitumW1-zs7mfE3z66BAyO30,10848
  easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
  easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
  easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
- easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
+ easylink/rule.py,sha256=MM7WyW56J7zT2FVjHlFtjuz62PfdSBGTD3MNcpLfEZM,16598
  easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
- easylink/step.py,sha256=zQAoz4HlSVvgS7iMlfmCrXluOtPQxbSgPZOeyZwjdpo,91085
+ easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
  easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
  easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
  easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
  easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
- easylink/pipeline_schema_constants/testing.py,sha256=ZFD19CpcidZPVUYBvh8LAa5sZEERT2yfoFa-3xmskFs,24595
+ easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
  easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
- easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
+ easylink/steps/cascading/exclude_clustered.py,sha256=Bpsyf9vAZ431Fh96RVzHkF7fy77NQjo1Cl6bHCIy69c,2580
  easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
- easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
+ easylink/steps/cascading/exclude_none.py,sha256=5DK5bNG4TneMwUKE49Kmz7VDnKBNZWjOERkuSJU3BmA,2475
  easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
- easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=43D5GBmPXSgxcjgbJTvEoGFvPzBCGqYgBaT42pncNNw,3661
+ easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=sFZXMGXl17jcGt8Fu5hgQz1KW5bFvPYdCoQGZ9Erc0I,3629
  easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
- easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
+ easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
  easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
- easylink/steps/default/default_determining_exclusions.py,sha256=tF2lcga-6n99shgYEmhpNuqok33u7dcW9q5wV3xgp5w,2661
+ easylink/steps/default/default_determining_exclusions.py,sha256=4diLfuqYm_Koj7gwifjwe_7mLZ6xb6RQiEdk-RRtB94,2629
  easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
- easylink/steps/default/default_removing_records.py,sha256=LIlFS8EvJ6h5XqEfgWZYyIAjcKj7Oo8_I5a-vXHOozs,1938
+ easylink/steps/default/default_removing_records.py,sha256=P4mmX2D4mhSoWd_S5CaNT4hlHOMAeZiMhCScWQiR_fQ,1906
  easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
- easylink/steps/default/default_schema_alignment.py,sha256=Uxi6uTFveFKSiiRZG9MnTXOklQngSKGMafqnvKDc0rY,1459
+ easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
  easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
- easylink/steps/default/default_updating_clusters.py,sha256=A-lO3ussM1Ntffp-ZyPQGbbxZg4QNiZ8AvSOGVJDXnA,2139
- easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
+ easylink/steps/default/default_updating_clusters.py,sha256=uwblSM9w4PoqPgWA_5cwmnU7ARSruickWxTfr2EZCM0,2107
+ easylink/steps/dev/README.md,sha256=lEHtM48SkFTV1FL-B5bbrEGjSVa_mb690Ed08nnheww,4533
  easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
  easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
  easylink/steps/dev/test.py,sha256=4iudKBD6CFz2CxbjSBUkc8LCWlMl-Nmw_rB35ZN6TrQ,6835
@@ -43,36 +43,38 @@ easylink/steps/dev/input_data/input_file_1.parquet,sha256=Km8jRyfGNdq0MFdz_-bewl
  easylink/steps/dev/input_data/input_file_2.csv,sha256=YqKLZDC4d-aYN8Dh9OB6iQWWUKmvueu5CszckH1AApU,100016
  easylink/steps/dev/input_data/input_file_2.parquet,sha256=Vpo0sUqQ78qlWLRk8p303Nh89BVcK4uvXJljRGHmsWk,60392
  easylink/steps/dev/python_pandas/README.md,sha256=c_FbtkKKOTjt2R_LfHUo5lBga1qHiYkxLdQeewRr45g,977
- easylink/steps/dev/python_pandas/dummy_step.py,sha256=NvhLUZu40B3Xbj_S-chQ6IkYUPr6X2aGBxYUa3DqwmY,4362
- easylink/steps/dev/python_pandas/python_pandas.def,sha256=24cxwGF8Cqkv2a1zVsu94MfC_bAXBqAINLwfW2zyB_0,769
+ easylink/steps/dev/python_pandas/dummy_step.py,sha256=ASZIxk_d46zNz0xUtZ37OuTJUpzfi98EwQduZXDiwK0,4330
+ easylink/steps/dev/python_pandas/python_pandas.def,sha256=umAUJL3RQZsCLIvFvsaif_-kCGfWzsnIH8d6Zwx_qYQ,482
  easylink/steps/dev/python_pyspark/README.md,sha256=di29SAfcdTTpar7gdoJRLqKrL8DEfNeayYUyaywdhUg,1563
- easylink/steps/dev/python_pyspark/dummy_step.py,sha256=wxHHI3Uv8MTipKG2ffHbT_eL4JkoNpx49bJoErXumdc,5003
- easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=j_RmVjspmXGOhJTr10ED13RYfbimgxRU3WVTL7VOIUQ,915
+ easylink/steps/dev/python_pyspark/dummy_step.py,sha256=6BJi-L6tlXGc6GJnGTI06AnzuRYh3KYBgU09tvId954,4949
+ easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=e-OXax5t96OEgAJdsKDqjGXPuCk12GpPYbwIdrUpBOU,666
  easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
- easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
- easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
- easylink/steps/fastLink/fastLink_evaluating_pairs.R,sha256=fQRrTPrgb1t5hrQi0V5H55J-PHdWjsATrVRYdXNYtdU,4603
+ easylink/steps/dev/r/dummy_step.R,sha256=4eFZgmKaagydjYOVnrOB3W2vjHHcGDhJ8LXsWDrDNyI,4943
+ easylink/steps/dev/r/r-image.def,sha256=gBNCfMb_HtgrGPhTt8qEws5tWVfjQIS_GCIHRGAHG9c,391
+ easylink/steps/example/middle_name_to_initial.def,sha256=UmD3FCuK8CMD0gQRUqg1BFnGq5Mucu7x8eU19jq7pZ0,518
+ easylink/steps/example/middle_name_to_initial.py,sha256=1Q7xaXIxkIvPN6jW98WKTvVhWB9qMC23mRIoO7NYRa8,1901
+ easylink/steps/fastLink/fastLink_evaluating_pairs.R,sha256=ucbHibtoYJ4-GDg1mWv-dtv0r_1XomhdT-KC3Zkat2E,4539
  easylink/steps/fastLink/fastLink_evaluating_pairs.def,sha256=5rDi-cmWhyuFEsiGFPpTxtySMqq5TpgJG-y8g_MtEvA,509
- easylink/steps/fastLink/fastLink_links_to_clusters.R,sha256=exVzJl4r7k7cRlMCHSmigOqTlxShqzK-FO3EDhlPksg,4087
+ easylink/steps/fastLink/fastLink_links_to_clusters.R,sha256=iM6bi27bHNJRxfOIBOCvZcFUuKT3VauOWbM6d0Ws5dk,4055
  easylink/steps/fastLink/fastLink_links_to_clusters.def,sha256=1xYjOMsHtSS2-AI4EC2r6kL8ZX5F2JhmvESefEKeJVY,512
  easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def,sha256=CkQVG-uDRQ9spAavdkZbhx2GD_fRsKZGELPrr8yltsc,550
- easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=dI0OWugE35ABLcSwsI-T3C4dvuPTKXwjE52dtSsCo8Y,428
+ easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=sBmF-wMgTpcqeM9gVWbHZkcy_w0OzfDYo98-3P2WMaM,412
  easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def,sha256=9gShg1EDJEHZcz7Z5VfZ1A4Gpm9XQes8ezn6rAZDgDM,550
- easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=DMJW5TXjhELxhY4U9q2RpLjqxlS1YSosTGL2AfRnaZM,521
+ easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=ExFljptolMiidU7LiOfQtH13ChbDUGIF3r5qM5paKsA,489
  easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def,sha256=YOWtJZxoe-kHFeEyrgGcVGfdqcbD_Fg17A9shOaK-yc,584
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py,sha256=skZUiZWcSXAOqq8TAlN5I0wztXgCWHQYA_xkuiL5s28,1202
+ easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py,sha256=zctz6LktX1BsVbeSR2gWuNCzRU7WkmWDGg68wlzZw0Q,1170
  easylink/steps/rl-dummy/input_data/create_input_files.ipynb,sha256=uXvJ8zTTPg0077HgA7MhQ_9jD-aeISFLeMeEBbSnOu8,54498
  easylink/steps/rl-dummy/input_data/input_file_1.parquet,sha256=GQ_7v7ucwdJn-9mTgKVcvqkJ5gTkwb0B7y38mfOYbic,15200
  easylink/steps/rl-dummy/input_data/input_file_2.parquet,sha256=Y4eseBm0HmFroksQr_VApgozRL8h8u7nQO6x_Utyns8,14902
  easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4AgjVJsm0ueUxPIZAJjbtRm9SVM08JE,2598
  easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=aeDgn9z2um0oTPNSwPcTkBou3-1ajud_MWhkuRoHdOU,1884
- easylink/steps/splink/splink_blocking_and_filtering.def,sha256=foAQAPvhDEkXkevpghS-uftsTbIQnQy9PvTkyldQeAA,539
- easylink/steps/splink/splink_blocking_and_filtering.py,sha256=8-_a9PkOmKSa-8TJ9YMjqI7gLo-YD9JCAO1f8uBhdoE,4469
+ easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
+ easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
+ easylink/steps/splink/splink_blocking_and_filtering.py,sha256=FO8YJ2_KgCLpQoq5xsM339bTSN1DhCXCL8XT1pb5STY,5259
  easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
- easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr1mXOwWv69E6WP0pRlMw,5713
+ easylink/steps/splink/splink_evaluating_pairs.py,sha256=m-j1QMRSvPCiSoWVSV1kzzzsK1c_xG8nqYKMd3cj7kM,6195
  easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
- easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
+ easylink/steps/splink/splink_links_to_clusters.py,sha256=5Sw8yi0dVLuRB-trN2kXmxbHBR0VJBxYee6u4_usg2Y,1920
  easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
  easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
  easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
@@ -81,9 +83,9 @@ easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,9
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
  easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
  easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
- easylink-0.1.22.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
- easylink-0.1.22.dist-info/METADATA,sha256=hei9KKa0HUgy1Z4aU-nPEAs8KF2_TEe7J0-_esdCG40,3565
- easylink-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- easylink-0.1.22.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
- easylink-0.1.22.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
- easylink-0.1.22.dist-info/RECORD,,
+ easylink-0.1.23.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+ easylink-0.1.23.dist-info/METADATA,sha256=u-oRyBse4M0AsFkMjTuy0JCpul-BwHJ1JaD9fIALrHU,3565
+ easylink-0.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ easylink-0.1.23.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+ easylink-0.1.23.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+ easylink-0.1.23.dist-info/RECORD,,