easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (56)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +24 -3
  3. easylink/configuration.py +43 -36
  4. easylink/devtools/implementation_creator.py +71 -22
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -29
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema.py +12 -13
  9. easylink/pipeline_schema_constants/__init__.py +4 -5
  10. easylink/pipeline_schema_constants/main.py +489 -0
  11. easylink/runner.py +11 -7
  12. easylink/step.py +89 -0
  13. easylink/steps/cascading/exclude_clustered.def +22 -0
  14. easylink/steps/cascading/exclude_clustered.py +76 -0
  15. easylink/steps/cascading/exclude_none.def +22 -0
  16. easylink/steps/cascading/exclude_none.py +76 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  18. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  19. easylink/steps/default/default_clusters_to_links.def +22 -0
  20. easylink/steps/default/default_clusters_to_links.py +91 -0
  21. easylink/steps/default/default_determining_exclusions.def +22 -0
  22. easylink/steps/default/default_determining_exclusions.py +81 -0
  23. easylink/steps/default/default_removing_records.def +22 -0
  24. easylink/steps/default/default_removing_records.py +59 -0
  25. easylink/steps/default/default_schema_alignment.def +22 -0
  26. easylink/steps/default/default_schema_alignment.py +53 -0
  27. easylink/steps/default/default_updating_clusters.def +22 -0
  28. easylink/steps/default/default_updating_clusters.py +67 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  30. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  32. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  34. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  35. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  36. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  38. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  40. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  42. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  43. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  44. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  45. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  46. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  47. easylink/utilities/data_utils.py +72 -0
  48. easylink/utilities/paths.py +4 -3
  49. easylink/utilities/validation_utils.py +509 -11
  50. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  51. easylink-0.1.19.dist-info/RECORD +91 -0
  52. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  53. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  54. easylink-0.1.17.dist-info/RECORD +0 -55
  55. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  56. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+./dummy_pre-processing.py /dummy_pre-processing.py
+
+%post
+# Create directories
+mkdir -p /input_data
+mkdir -p /extra_implementation_specific_input_data
+mkdir -p /results
+mkdir -p /diagnostics
+
+# Install Python packages with specific versions
+pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+export LC_ALL=C
+
+%runscript
+python /dummy_pre-processing.py '$@'
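The pip install line in this definition file mirrors the "# REQUIREMENTS:" header comment of the script it wraps (see dummy_pre-processing.py below), and "# STEP_NAME:" names the pipeline step. Purely as an illustrative sketch (not necessarily how easylink/devtools/implementation_creator.py actually does it), such a header could be parsed like this:

from pathlib import Path


def parse_step_header(script_path: str) -> dict[str, str]:
    # Hypothetical helper: collect "# STEP_NAME:" and "# REQUIREMENTS:" header comments.
    metadata = {}
    for line in Path(script_path).read_text().splitlines():
        if line.startswith("# STEP_NAME:"):
            metadata["step_name"] = line.split(":", 1)[1].strip()
        elif line.startswith("# REQUIREMENTS:"):
            metadata["requirements"] = line.split(":", 1)[1].strip()
    return metadata


# parse_step_header("dummy_pre-processing.py")
# -> {"step_name": "pre-processing", "requirements": "pandas==2.1.2 pyarrow pyyaml"}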
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py
@@ -0,0 +1,59 @@
+# STEP_NAME: pre-processing
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# DATASET_DIR_PATHS is list of directories, each containing one file
+dataset_paths = os.environ["DATASET_DIR_PATHS"].split(",")
+logging.info(f"{dataset_paths=}")
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+logging.info(f"splitter_choice={splitter_choice}")
+dataset_path = None
+for path in dataset_paths:
+    path = Path(path)
+    # NOTE: We iterate the dir here, but it should only have one non-hidden
+    # file in it. We don't validate that here as it is checked in the validator.
+    for path_to_check in path.iterdir():
+        if path_to_check.stem == splitter_choice:
+            dataset_path = str(path_to_check)
+            break
+
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+results_dir.mkdir(exist_ok=True, parents=True)
+
+output_path = results_dir / Path(dataset_path).name
+
+dataset = load_file(dataset_path)
+
+# NOTE: No actual pre-processing here, we save as-is.
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+dataset.to_parquet(output_path)
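The script is driven entirely by environment variables, so it can also be exercised outside the container for debugging. A minimal sketch, assuming made-up local paths and a dataset named input_file_1 (the variable names are the ones read above):

import os
import subprocess

env = dict(
    os.environ,
    # comma-separated list of directories, each holding exactly one dataset file
    DATASET_DIR_PATHS="/tmp/datasets/a,/tmp/datasets/b",
    # must match the stem of a file in one of those directories
    INPUT_DATASET="input_file_1",
    # single output directory
    DUMMY_CONTAINER_OUTPUT_PATHS="/tmp/results/dataset",
)
subprocess.run(["python", "dummy_pre-processing.py"], env=env, check=True)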
easylink/steps/splink/splink_blocking_and_filtering.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+./splink_blocking_and_filtering.py /splink_blocking_and_filtering.py
+
+%post
+# Create directories
+mkdir -p /input_data
+mkdir -p /extra_implementation_specific_input_data
+mkdir -p /results
+mkdir -p /diagnostics
+
+# Install Python packages with specific versions
+pip install pandas pyarrow splink==4.0.7
+
+%environment
+export LC_ALL=C
+
+%runscript
+python /splink_blocking_and_filtering.py '$@'
easylink/steps/splink/splink_blocking_and_filtering.py
@@ -0,0 +1,130 @@
+# STEP_NAME: blocking_and_filtering
+# REQUIREMENTS: pandas pyarrow splink==4.0.7
+
+import os
+
+import pandas as pd
+
+records = pd.read_parquet(os.environ["RECORDS_FILE_PATH"])
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+import splink
+
+blocking_rules = os.environ["BLOCKING_RULES"].split(",")
+
+from splink import Linker, SettingsCreator
+
+# Create the Splink linker in dedupe mode
+settings = SettingsCreator(
+    link_type="link_and_dedupe",
+    blocking_rules_to_generate_predictions=blocking_rules,
+    comparisons=[],
+)
+from splink import DuckDBAPI
+
+grouped = records.rename(columns={"Input Record ID": "unique_id"}).groupby(
+    "Input Record Dataset"
+)
+
+db_api = DuckDBAPI()
+linker = Linker(
+    [df for _, df in grouped],
+    settings,
+    db_api=db_api,
+    input_table_aliases=[name for name, _ in grouped],
+)
+
+# Copied/adapted from https://github.com/moj-analytical-services/splink/blob/3eb1921eaff6b8471d3ebacd3238eb514f62c844/splink/internals/linker_components/inference.py#L86-L131
+from splink.internals.pipeline import CTEPipeline
+from splink.internals.vertically_concatenate import compute_df_concat_with_tf
+
+pipeline = CTEPipeline()
+
+# In duckdb, calls to random() in a CTE pipeline cause problems:
+# https://gist.github.com/RobinL/d329e7004998503ce91b68479aa41139
+df_concat_with_tf = compute_df_concat_with_tf(linker, pipeline)
+pipeline = CTEPipeline([df_concat_with_tf])
+
+blocking_input_tablename_l = "__splink__df_concat_with_tf"
+blocking_input_tablename_r = "__splink__df_concat_with_tf"
+
+link_type = linker._settings_obj._link_type
+
+# If exploded blocking rules exist, we need to materialise
+# the tables of ID pairs
+from splink.internals.blocking import materialise_exploded_id_tables
+
+exploding_br_with_id_tables = materialise_exploded_id_tables(
+    link_type=link_type,
+    blocking_rules=linker._settings_obj._blocking_rules_to_generate_predictions,
+    db_api=linker._db_api,
+    splink_df_dict=linker._input_tables_dict,
+    source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
+    unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
+)
+
+from splink.internals.blocking import block_using_rules_sqls
+
+sqls = block_using_rules_sqls(
+    input_tablename_l=blocking_input_tablename_l,
+    input_tablename_r=blocking_input_tablename_r,
+    blocking_rules=linker._settings_obj._blocking_rules_to_generate_predictions,
+    link_type=link_type,
+    source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
+    unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
+)
+
+pipeline.enqueue_list_of_sqls(sqls)
+
+blocked_pairs = (
+    linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    .as_pandas_dataframe()
+    .drop(columns=["match_key"])
+)
+
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = blocked_pairs.pop(
+    "join_key_l"
+).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = blocked_pairs.pop(
+    "join_key_r"
+).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+# Now ensure correct ordering
+wrong_order_dataset = (
+    blocked_pairs["Left Record Dataset"] > blocked_pairs["Right Record Dataset"]
+)
+id_cols = ["Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"]
+switched_id_cols = [
+    "Right Record Dataset",
+    "Right Record ID",
+    "Left Record Dataset",
+    "Left Record ID",
+]
+blocked_pairs.loc[wrong_order_dataset, id_cols] = blocked_pairs.loc[
+    wrong_order_dataset, switched_id_cols
+].values
+
+wrong_order_ids = (
+    blocked_pairs["Left Record Dataset"] == blocked_pairs["Right Record Dataset"]
+) & (blocked_pairs["Left Record ID"] > blocked_pairs["Right Record ID"])
+blocked_pairs.loc[wrong_order_ids, id_cols] = blocked_pairs.loc[
+    wrong_order_ids, switched_id_cols
+].values
+blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+print(blocked_pairs)
+
+from pathlib import Path
+
+output_path = Path(results_dir) / "block_0"
+output_path.mkdir(exist_ok=True, parents=True)
+
+records.to_parquet(output_path / "records.parquet", index=False)
+blocked_pairs.to_parquet(output_path / "pairs.parquet", index=False)
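The end of this script converts Splink's join_key_l/join_key_r strings back into dataset/ID columns and normalizes pair ordering so the lexicographically smaller dataset (and, within the same dataset, the smaller record ID) is always on the left. A small worked example of that logic with invented values:

import pandas as pd

pairs = pd.DataFrame(
    {"join_key_l": ["file_2-__-7", "file_1-__-9"], "join_key_r": ["file_1-__-3", "file_1-__-2"]}
)
pairs[["Left Record Dataset", "Left Record ID"]] = pairs.pop("join_key_l").str.split(
    "-__-", n=1, expand=True
)
pairs[["Right Record Dataset", "Right Record ID"]] = pairs.pop("join_key_r").str.split(
    "-__-", n=1, expand=True
)
pairs[["Left Record ID", "Right Record ID"]] = pairs[
    ["Left Record ID", "Right Record ID"]
].astype(int)

id_cols = ["Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"]
switched = ["Right Record Dataset", "Right Record ID", "Left Record Dataset", "Left Record ID"]
# Swap rows where the datasets are in the wrong order...
wrong_dataset_order = pairs["Left Record Dataset"] > pairs["Right Record Dataset"]
pairs.loc[wrong_dataset_order, id_cols] = pairs.loc[wrong_dataset_order, switched].values
# ...then rows within a single dataset where the IDs are in the wrong order.
wrong_id_order = (pairs["Left Record Dataset"] == pairs["Right Record Dataset"]) & (
    pairs["Left Record ID"] > pairs["Right Record ID"]
)
pairs.loc[wrong_id_order, id_cols] = pairs.loc[wrong_id_order, switched].values
print(pairs)  # every row now has (left dataset, left ID) ordered before (right dataset, right ID)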
easylink/steps/splink/splink_evaluating_pairs.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+./splink_evaluating_pairs.py /splink_evaluating_pairs.py
+
+%post
+# Create directories
+mkdir -p /input_data
+mkdir -p /extra_implementation_specific_input_data
+mkdir -p /results
+mkdir -p /diagnostics
+
+# Install Python packages with specific versions
+pip install pandas pyarrow splink==4.0.7
+
+%environment
+export LC_ALL=C
+
+%runscript
+python /splink_evaluating_pairs.py '$@'
easylink/steps/splink/splink_evaluating_pairs.py
@@ -0,0 +1,164 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow splink==4.0.7
+
+import os
+from pathlib import Path
+
+import pandas as pd
+import splink
+import splink.comparison_library as cl
+from splink import Linker, SettingsCreator
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+    encoded_comparisons = os.environ["COMPARISONS"].split(",")
+
+    comparisons = []
+    for encoded_comparison in encoded_comparisons:
+        column, method = encoded_comparison.split(":")
+        if method == "exact":
+            comparisons.append(cl.ExactMatch(column))
+        elif method == "name":
+            comparisons.append(cl.NameComparison(column))
+        elif method == "dob":
+            comparisons.append(cl.DateOfBirthComparison(column))
+        else:
+            raise ValueError(f"Unknown comparison method {method}")
+
+    # Create the Splink linker in dedupe mode
+    settings = SettingsCreator(
+        link_type="link_and_dedupe",
+        blocking_rules_to_generate_predictions=[],
+        comparisons=comparisons,
+        probability_two_random_records_match=float(
+            os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"]
+        ),
+    )
+
+    grouped = (
+        pd.read_parquet(block_dir / "records.parquet")
+        .rename(columns={"Input Record ID": "unique_id"})
+        .groupby("Input Record Dataset")
+    )
+
+    from splink import DuckDBAPI
+
+    db_api = DuckDBAPI()
+    linker = Linker(
+        [df for _, df in grouped],
+        settings,
+        db_api=db_api,
+        input_table_aliases=[name for name, _ in grouped],
+    )
+
+    linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
+
+    blocking_rules_for_training = os.environ["BLOCKING_RULES_FOR_TRAINING"].split(",")
+
+    for blocking_rule_for_training in blocking_rules_for_training:
+        linker.training.estimate_parameters_using_expectation_maximisation(
+            blocking_rule_for_training
+        )
+
+    chart_path = diagnostics_dir / f"match_weights_chart_{block_dir}.html"
+    chart_path.parent.mkdir(exist_ok=True, parents=True)
+    linker.visualisations.match_weights_chart().save(chart_path)
+
+    # Copied/adapted from https://github.com/moj-analytical-services/splink/blob/3eb1921eaff6b8471d3ebacd3238eb514f62c844/splink/internals/linker_components/inference.py#L264-L293
+    from splink.internals.pipeline import CTEPipeline
+    from splink.internals.vertically_concatenate import compute_df_concat_with_tf
+
+    pipeline = CTEPipeline()
+
+    # In duckdb, calls to random() in a CTE pipeline cause problems:
+    # https://gist.github.com/RobinL/d329e7004998503ce91b68479aa41139
+    pairs = (
+        pd.read_parquet(block_dir / "pairs.parquet")
+        .assign(
+            join_key_l=lambda df: df["Left Record Dataset"]
+            + "-__-"
+            + df["Left Record ID"].astype(int).astype(str),
+            join_key_r=lambda df: df["Right Record Dataset"]
+            + "-__-"
+            + df["Right Record ID"].astype(int).astype(str),
+        )
+        .drop(
+            columns=[
+                "Left Record Dataset",
+                "Left Record ID",
+                "Right Record Dataset",
+                "Right Record ID",
+            ]
+        )
+        .assign(match_key=0)
+    ) # What is this?
+    db_api._table_registration(pairs, "__splink__blocked_id_pairs")
+    df_concat_with_tf = compute_df_concat_with_tf(linker, pipeline)
+    pipeline = CTEPipeline(
+        [
+            db_api.table_to_splink_dataframe(
+                "__splink__blocked_id_pairs", "__splink__blocked_id_pairs"
+            ),
+            df_concat_with_tf,
+        ]
+    )
+
+    from splink.internals.comparison_vector_values import (
+        compute_comparison_vector_values_from_id_pairs_sqls,
+    )
+
+    sqls = compute_comparison_vector_values_from_id_pairs_sqls(
+        linker._settings_obj._columns_to_select_for_blocking,
+        linker._settings_obj._columns_to_select_for_comparison_vector_values,
+        input_tablename_l="__splink__df_concat_with_tf",
+        input_tablename_r="__splink__df_concat_with_tf",
+        source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
+        unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
+    )
+    pipeline.enqueue_list_of_sqls(sqls)
+
+    from splink.internals.predict import (
+        predict_from_comparison_vectors_sqls_using_settings,
+    )
+
+    sqls = predict_from_comparison_vectors_sqls_using_settings(
+        linker._settings_obj,
+        float(os.environ["THRESHOLD_MATCH_PROBABILITY"]),
+        threshold_match_weight=None,
+        sql_infinity_expression=linker._infinity_expression,
+    )
+    pipeline.enqueue_list_of_sqls(sqls)
+
+    predictions = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+    linker._predict_warning()
+
+    all_predictions.append(predictions.as_pandas_dataframe())
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)[
+    [
+        "source_dataset_l",
+        "unique_id_l",
+        "source_dataset_r",
+        "unique_id_r",
+        "match_probability",
+    ]
+].rename(
+    columns={
+        "source_dataset_l": "Left Record Dataset",
+        "unique_id_l": "Left Record ID",
+        "source_dataset_r": "Right Record Dataset",
+        "unique_id_r": "Right Record ID",
+        "match_probability": "Probability",
+    }
+)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
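The COMPARISONS environment variable read inside the loop above is a comma-separated list of column:method pairs, where method is one of "exact", "name", or "dob". Purely for illustration (the column names are invented), the encoding maps onto splink's comparison library like this:

import splink.comparison_library as cl

encoded = "first_name:name,last_name:name,date_of_birth:dob,zipcode:exact"
method_to_comparison = {
    "exact": cl.ExactMatch,
    "name": cl.NameComparison,
    "dob": cl.DateOfBirthComparison,
}
comparisons = []
for entry in encoded.split(","):
    column, method = entry.split(":")
    comparisons.append(method_to_comparison[method](column))
print([type(c).__name__ for c in comparisons])
# ['NameComparison', 'NameComparison', 'DateOfBirthComparison', 'ExactMatch']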
easylink/steps/splink/splink_links_to_clusters.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+./splink_links_to_clusters.py /splink_links_to_clusters.py
+
+%post
+# Create directories
+mkdir -p /input_data
+mkdir -p /extra_implementation_specific_input_data
+mkdir -p /results
+mkdir -p /diagnostics
+
+# Install Python packages with specific versions
+pip install pandas pyarrow splink==4.0.7
+
+%environment
+export LC_ALL=C
+
+%runscript
+python /splink_links_to_clusters.py '$@'
easylink/steps/splink/splink_links_to_clusters.py
@@ -0,0 +1,63 @@
+# STEP_NAME: links_to_clusters
+# REQUIREMENTS: pandas pyarrow splink==4.0.7
+
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+# Adapted from example on https://moj-analytical-services.github.io/splink/api_docs/clustering.html
+from splink import DuckDBAPI
+from splink.clustering import cluster_pairwise_predictions_at_threshold
+
+links = pd.read_parquet(os.environ["LINKS_FILE_PATH"]).rename(
+    columns={
+        "Probability": "match_probability",
+    }
+)
+
+# Create unique record keys by concatenating Input Record Dataset and Record ID for both left and right
+links["Left Record Key"] = (
+    links["Left Record Dataset"].astype(str) + "-__-" + links["Left Record ID"].astype(str)
+)
+links["Right Record Key"] = (
+    links["Right Record Dataset"].astype(str) + "-__-" + links["Right Record ID"].astype(str)
+)
+
+dummy_records_df = pd.DataFrame(
+    {
+        "Record Key": np.unique(
+            list(links["Left Record Key"]) + list(links["Right Record Key"])
+        )
+    }
+)
+output_path = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+
+db_api = DuckDBAPI()
+
+cc = (
+    cluster_pairwise_predictions_at_threshold(
+        dummy_records_df,
+        links,
+        node_id_column_name='"Record Key"',
+        edge_id_column_name_left='"Left Record Key"',
+        edge_id_column_name_right='"Right Record Key"',
+        db_api=db_api,
+        threshold_match_probability=float(os.environ["THRESHOLD_MATCH_PROBABILITY"]),
+    )
+    .as_pandas_dataframe()
+    .rename(columns={"cluster_id": "Cluster ID"})
+)
+
+# Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
+cc[["Input Record Dataset", "Input Record ID"]] = (
+    cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+)
+cc = cc.drop(columns=["Record Key"])
+cc["Input Record ID"] = cc["Input Record ID"].astype(int)
+cc = cc[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
+
+print(cc)
+
+cc.to_parquet(output_path)
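Because cluster_pairwise_predictions_at_threshold works on a single node ID column, records are keyed with the "<dataset>-__-<id>" convention and split back apart afterwards. A tiny round-trip example with invented values:

import pandas as pd

df = pd.DataFrame({"Record Key": ["input_file_1-__-42", "input_file_2-__-7"]})
# Split on the first "-__-" to recover the dataset name and the integer record ID.
df[["Input Record Dataset", "Input Record ID"]] = df["Record Key"].str.split(
    "-__-", n=1, expand=True
)
df["Input Record ID"] = df["Input Record ID"].astype(int)
print(df.drop(columns=["Record Key"]))  # -> columns: Input Record Dataset, Input Record ID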
easylink/utilities/data_utils.py
@@ -8,13 +8,17 @@ This module contains utility functions for handling data files and directories.
 
 """
 
+import hashlib
 import os
 import shutil
 from collections.abc import Callable
 from datetime import datetime
 from pathlib import Path
 
+import requests
 import yaml
+from loguru import logger
+from tqdm import tqdm
 
 
 def modify_umask(func: Callable) -> Callable:
@@ -152,3 +156,71 @@ def load_yaml(filepath: str | Path) -> dict:
     with open(filepath, "r") as file:
         data = yaml.safe_load(file)
     return data
+
+
+@modify_umask
+def download_image(
+    images_dir: str | Path, record_id: int, filename: str, md5_checksum: str
+) -> None:
+    """Downloads an image from zenodo.
+
+    Parameters
+    ----------
+    images_dir
+        The directory to download the image to.
+    record_id
+        The zenodo record ID that the image is a part of.
+    filename
+        The name of the image file to download.
+    md5_checksum
+        The expected MD5 checksum of the image file.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the image file was not downloaded.
+    ValueError
+        If the MD5 checksum of the downloaded file does not match the expected checksum.
+    """
+
+    images_dir = Path(images_dir).resolve()
+    if not images_dir.exists():
+        images_dir.mkdir(parents=True, exist_ok=True)
+
+    url = f"https://zenodo.org/record/{record_id}/files/{filename}?download=1"
+
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("Content-Length", 0))
+    output_path = images_dir / filename
+    logger.info(f"Downloading {filename} to {output_path}...")
+    with open(output_path, "wb") as file, tqdm(
+        total=total_size, unit="B", unit_scale=True, desc=filename
+    ) as progress_bar:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                file.write(chunk)
+                progress_bar.update(len(chunk))
+
+    if not output_path.exists():
+        raise FileNotFoundError(f"Failed to download the image: {filename}")
+
+    # Verify MD5 checksum
+    calculated_md5_checksum = calculate_md5_checksum(output_path)
+    if calculated_md5_checksum != md5_checksum:
+        raise ValueError(
+            f"MD5 checksum does not match for {filename}.\n"
+            f"Try manually downloading the image and then moving it to the {images_dir} directory.\n"
+            f"Download the image by visiting this link: {url}"
+        )
+
+
+def calculate_md5_checksum(output_path: Path) -> str:
+    md5_hash = hashlib.md5()
+    with open(output_path, "rb") as file:
+        while chunk := file.read(8192):
+            md5_hash.update(chunk)
+
+    calculated_md5_checksum = md5_hash.hexdigest()
+    return calculated_md5_checksum
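A hedged usage sketch for the new helpers (the record ID, filename, and checksum below are placeholders, not real Zenodo artifacts; DEFAULT_IMAGES_DIR comes from the paths.py change shown next):

from easylink.utilities.data_utils import calculate_md5_checksum, download_image
from easylink.utilities.paths import DEFAULT_IMAGES_DIR

download_image(
    images_dir=DEFAULT_IMAGES_DIR,
    record_id=1234567,  # placeholder Zenodo record ID
    filename="example_image.sif",  # placeholder image filename
    md5_checksum="0123456789abcdef0123456789abcdef",  # placeholder checksum
)
# calculate_md5_checksum can also be used on its own to spot-check a file already on disk:
print(calculate_md5_checksum(DEFAULT_IMAGES_DIR / "example_image.sif"))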
easylink/utilities/paths.py
@@ -9,9 +9,10 @@ This module contains commonly-used filepaths and directories.
 
 from pathlib import Path
 
-# TODO: We'll need to update this to be more generic for external users and have a way of configuring this
-CONTAINER_DIR = "/mnt/team/simulation_science/priv/engineering/er_ecosystem/images"
-"""Path to the directory where the container images are stored."""
+DEV_IMAGES_DIR = "/mnt/team/simulation_science/priv/engineering/er_ecosystem/images"
+"""Path to the directory where the development/dummy pipeline images are stored."""
+DEFAULT_IMAGES_DIR = Path.home() / ".easylink_images"
+"""Default subdirectory for storing downloaded images."""
 IMPLEMENTATION_METADATA = Path(__file__).parent.parent / "implementation_metadata.yaml"
 """Path to the implementation metadata file."""
 EASYLINK_TEMP = {"local": Path("/tmp/easylink"), "slurm": Path("/tmp")}
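With both constants exported, a caller could, for example, prefer a locally downloaded image and fall back to the shared development directory. This is a hypothetical sketch only, not the package's actual resolution logic:

from pathlib import Path

from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR


def resolve_image_path(image_name: str) -> Path:
    # "image_name" is an illustrative parameter; real callers may differ.
    local_candidate = Path(DEFAULT_IMAGES_DIR) / image_name
    if local_candidate.exists():
        return local_candidate
    return Path(DEV_IMAGES_DIR) / image_name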