easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +24 -3
  3. easylink/configuration.py +43 -36
  4. easylink/devtools/implementation_creator.py +71 -22
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -29
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema.py +12 -13
  9. easylink/pipeline_schema_constants/__init__.py +4 -5
  10. easylink/pipeline_schema_constants/main.py +489 -0
  11. easylink/runner.py +11 -7
  12. easylink/step.py +89 -0
  13. easylink/steps/cascading/exclude_clustered.def +22 -0
  14. easylink/steps/cascading/exclude_clustered.py +76 -0
  15. easylink/steps/cascading/exclude_none.def +22 -0
  16. easylink/steps/cascading/exclude_none.py +76 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  18. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  19. easylink/steps/default/default_clusters_to_links.def +22 -0
  20. easylink/steps/default/default_clusters_to_links.py +91 -0
  21. easylink/steps/default/default_determining_exclusions.def +22 -0
  22. easylink/steps/default/default_determining_exclusions.py +81 -0
  23. easylink/steps/default/default_removing_records.def +22 -0
  24. easylink/steps/default/default_removing_records.py +59 -0
  25. easylink/steps/default/default_schema_alignment.def +22 -0
  26. easylink/steps/default/default_schema_alignment.py +53 -0
  27. easylink/steps/default/default_updating_clusters.def +22 -0
  28. easylink/steps/default/default_updating_clusters.py +67 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  30. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  32. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  34. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  35. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  36. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  38. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  40. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  42. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  43. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  44. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  45. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  46. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  47. easylink/utilities/data_utils.py +72 -0
  48. easylink/utilities/paths.py +4 -3
  49. easylink/utilities/validation_utils.py +509 -11
  50. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  51. easylink-0.1.19.dist-info/RECORD +91 -0
  52. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  53. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  54. easylink-0.1.17.dist-info/RECORD +0 -55
  55. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  56. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/steps/cascading/exclude_clustered.py
@@ -0,0 +1,76 @@
+# STEP_NAME: determining_exclusions
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+# Exclude records that have been clustered
+clusters_df = load_file(clusters_filepath)
+dataset_df = load_file(dataset_path)
+clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
+    clusters_df["Input Record ID"].unique()
+)
+
+IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
+IDS_TO_REMOVE.to_parquet(results_filepath)
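
The comment blocks above document how this script receives its inputs and its output location through environment variables set by the pipeline. Purely as an illustration (not part of the package), a minimal sketch of exercising the script outside its container could look like the following, using hypothetical /tmp paths and toy data shaped like the columns the script expects, and assuming exclude_clustered.py sits in the working directory:

import os
import subprocess

import pandas as pd

# Toy stand-ins for one input dataset and the known-clusters file (hypothetical paths).
pd.DataFrame({"Record ID": [1, 2, 3]}).to_parquet("/tmp/my_dataset.parquet")
pd.DataFrame(
    {"Input Record Dataset": ["my_dataset"], "Input Record ID": [2], "Cluster ID": [1]}
).to_parquet("/tmp/known_clusters.parquet")

# Environment variable names mirror the ones read by exclude_clustered.py.
env = dict(
    os.environ,
    INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS="/tmp/my_dataset.parquet,/tmp/known_clusters.parquet",
    KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS="/tmp/known_clusters.parquet",
    INPUT_DATASET="my_dataset",  # must match the chosen dataset file's stem
    DUMMY_CONTAINER_OUTPUT_PATHS="/tmp/results.parquet",
)
subprocess.run(["python", "exclude_clustered.py"], env=env, check=True)
print(pd.read_parquet("/tmp/results.parquet"))  # expect a single row with Record ID 2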
easylink/steps/cascading/exclude_none.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./exclude_none.py /exclude_none.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /exclude_none.py '$@'
easylink/steps/cascading/exclude_none.py
@@ -0,0 +1,76 @@
+# STEP_NAME: determining_exclusions
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+clusters_df = load_file(clusters_filepath)
+
+# don't need to actually load the dataset,
+# since we will just save an empty ids_to_remove
+
+# SAVE OUTPUTS
+
+IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
+IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/update_clusters_by_connected_components.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./update_clusters_by_connected_components.py /update_clusters_by_connected_components.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /update_clusters_by_connected_components.py '$@'
easylink/steps/cascading/update_clusters_by_connected_components.py
@@ -0,0 +1,101 @@
+# STEP_NAME: updating_clusters
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml networkx
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import networkx as nx
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# NEW_CLUSTERS_FILE_PATH is a path to a single file
+new_clusters_filepath = os.environ["NEW_CLUSTERS_FILE_PATH"]
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+known_clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(known_clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(known_clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+known_clusters_filepath = known_clusters_filepaths[0]
+known_clusters_df = load_file(known_clusters_filepath)
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
+
+new_clusters_df = load_file(new_clusters_filepath)
+
+
+def merge_clusters(known_clusters_df, new_clusters_df):
+    # Combine both dataframes
+    combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
+
+    # Drop records with missing cluster IDs
+    combined_df = combined_df.dropna(subset=["Cluster ID"])
+
+    # Group by Cluster ID to get connected records
+    cluster_groups = combined_df.groupby("Cluster ID")["Input Record ID"].apply(list)
+
+    # Build a graph of all connections implied by cluster IDs
+    G = nx.Graph()
+    for group in cluster_groups:
+        for i in range(len(group)):
+            for j in range(i + 1, len(group)):
+                G.add_edge(group[i], group[j])
+
+    # Add isolated nodes (records with unique clusters)
+    all_ids = set(combined_df["Input Record ID"])
+    G.add_nodes_from(all_ids)
+
+    # Compute connected components
+    components = list(nx.connected_components(G))
+
+    # Assign new cluster IDs
+    merged_data = []
+    for cluster_id, records in enumerate(components, start=1):
+        for record_id in records:
+            merged_data.append((record_id, cluster_id))
+
+    # Build the final DataFrame
+    merged_df = pd.DataFrame(merged_data, columns=["Input Record ID", "Cluster ID"])
+
+    return merged_df
+
+
+output_df = merge_clusters(known_clusters_df, new_clusters_df)
+
+logging.info(
+    f"Writing output for dataset from input {new_clusters_filepath} to {results_filepath}"
+)
+output_df.to_parquet(results_filepath)
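
The merge_clusters function above treats shared cluster membership as graph connectivity: any record that appears in both the known and new clusters bridges them, and connected components become the merged clusters. A tiny standalone sketch of the same idea with made-up record IDs (not an import of the packaged script); the packaged function adds every pairwise edge within a cluster, while this sketch chains consecutive members, which yields the same connected components:

import networkx as nx
import pandas as pd

known = pd.DataFrame({"Input Record ID": ["A", "B"], "Cluster ID": [1, 1]})
new = pd.DataFrame({"Input Record ID": ["B", "C"], "Cluster ID": [7, 7]})

combined = pd.concat([known, new], ignore_index=True).dropna(subset=["Cluster ID"])
G = nx.Graph()
G.add_nodes_from(combined["Input Record ID"])
for _, members in combined.groupby("Cluster ID")["Input Record ID"]:
    members = list(members)
    G.add_edges_from(zip(members, members[1:]))  # chain the members of each cluster

for cluster_id, component in enumerate(nx.connected_components(G), start=1):
    print(cluster_id, sorted(component))
# Prints a single merged cluster ["A", "B", "C"], because record "B" links the two inputs.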
easylink/steps/default/default_clusters_to_links.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_clusters_to_links.py /default_clusters_to_links.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_clusters_to_links.py '$@'
easylink/steps/default/default_clusters_to_links.py
@@ -0,0 +1,91 @@
+# STEP_NAME: clusters_to_links
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# code example from pipeline schema docs
+def clusters_to_links(clusters_df):
+    # Merge the dataframe with itself on Cluster ID to get all pairs within each cluster
+    merged = clusters_df.merge(
+        clusters_df,
+        on="Cluster ID",
+        suffixes=("_left", "_right"),
+    )
+
+    # Compare tuples row-wise to keep only unique pairs (left < right)
+    mask = (merged["Input Record Dataset_left"] < merged["Input Record Dataset_right"]) | (
+        (merged["Input Record Dataset_left"] == merged["Input Record Dataset_right"])
+        & (merged["Input Record ID_left"] < merged["Input Record ID_right"])
+    )
+    filtered = merged[mask]
+
+    # Build the output DataFrame
+    links_df = filtered[
+        [
+            "Input Record Dataset_left",
+            "Input Record ID_left",
+            "Input Record Dataset_right",
+            "Input Record ID_right",
+        ]
+    ].copy()
+    links_df.columns = [
+        "Left Record Dataset",
+        "Left Record ID",
+        "Right Record Dataset",
+        "Right Record ID",
+    ]
+    links_df["Probability"] = 1.0
+    return links_df
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+clusters_df = load_file(clusters_filepath)
+links_df = clusters_to_links(clusters_df)
+logging.info(
+    f"Writing output for dataset from input {clusters_filepath} to {results_filepath}"
+)
+links_df.to_parquet(results_filepath)
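
As a worked example of the clusters_to_links transformation above (a standalone sketch with made-up records), a single cluster containing three records expands to the three unique within-cluster pairs:

import pandas as pd

clusters_df = pd.DataFrame(
    {
        "Input Record Dataset": ["ds_a", "ds_a", "ds_b"],
        "Input Record ID": [1, 2, 7],
        "Cluster ID": [1, 1, 1],
    }
)

# Self-merge on Cluster ID, then keep each unordered pair exactly once (left < right).
merged = clusters_df.merge(clusters_df, on="Cluster ID", suffixes=("_left", "_right"))
mask = (merged["Input Record Dataset_left"] < merged["Input Record Dataset_right"]) | (
    (merged["Input Record Dataset_left"] == merged["Input Record Dataset_right"])
    & (merged["Input Record ID_left"] < merged["Input Record ID_right"])
)
print(merged[mask])
# Three links: (ds_a, 1)-(ds_a, 2), (ds_a, 1)-(ds_b, 7), and (ds_a, 2)-(ds_b, 7),
# each of which the step would emit with Probability 1.0.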
easylink/steps/default/default_determining_exclusions.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_determining_exclusions.py /default_determining_exclusions.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_determining_exclusions.py '$@'
easylink/steps/default/default_determining_exclusions.py
@@ -0,0 +1,81 @@
+# STEP_NAME: determining_exclusions
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS is a list of file paths.
+# There is one item in it that is a file with "clusters" in the filename.
+# That's the only item we're interested in here.
+# The other items may be there if this is coming from the user's input,
+# due to our workaround for only having one slot of user input.
+clusters_filepaths = [
+    path
+    for path in os.environ["KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS"].split(",")
+    if "clusters" in Path(path).stem
+]
+if len(clusters_filepaths) > 1:
+    raise ValueError("Multiple known clusters files found")
+if len(clusters_filepaths) == 0:
+    raise ValueError("No known clusters file found")
+
+clusters_filepath = clusters_filepaths[0]
+
+clusters_df = load_file(clusters_filepath)
+
+if len(clusters_df) > 0:
+    raise ValueError(
+        "Default implementation of determining_exclusions passed a non-empty set of known clusters"
+    )
+
+# don't need to actually load the dataset,
+# since we will just save an empty ids_to_remove
+
+# SAVE OUTPUTS
+
+IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
+IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/default/default_removing_records.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_removing_records.py /default_removing_records.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_removing_records.py '$@'
easylink/steps/default/default_removing_records.py
@@ -0,0 +1,59 @@
+# STEP_NAME: removing_records
+
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+# PIPELINE_SCHEMA: main
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS is list of filepaths which includes
+# the known_clusters filepath due to workaround
+dataset_paths = os.environ["INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS"].split(",")
+dataset_paths = [path for path in dataset_paths if "clusters.parquet" not in Path(path).stem]
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+dataset_path = None
+for path in dataset_paths:
+    if splitter_choice == Path(path).stem:
+        dataset_path = path
+        break
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# IDS_TO_REMOVE_FILE_PATH is a single filepath (Cloneable section)
+ids_filepath = os.environ["IDS_TO_REMOVE_FILE_PATH"]
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"])
+results_dir.mkdir(exist_ok=True, parents=True)
+
+dataset = load_file(dataset_path)
+ids_to_remove = load_file(ids_filepath)
+
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+
+output_path = results_dir / Path(dataset_path).name
+logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+dataset.to_parquet(output_path)
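
For illustration only, a minimal sketch of the anti-join this step performs, on toy data; note that Series.isin() expects a list-like of values, so the sketch passes the "Record ID" column of the exclusion frame rather than the frame itself:

import pandas as pd

dataset = pd.DataFrame({"Record ID": [1, 2, 3], "name": ["ann", "bob", "cho"]})
ids_to_remove = pd.DataFrame({"Record ID": [2]})

# Keep only the records whose ID is not flagged for removal.
kept = dataset[~dataset["Record ID"].isin(ids_to_remove["Record ID"])]
print(kept)  # rows with Record ID 1 and 3 remain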
easylink/steps/default/default_schema_alignment.def
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./default_schema_alignment.py /default_schema_alignment.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /default_schema_alignment.py '$@'
easylink/steps/default/default_schema_alignment.py
@@ -0,0 +1,53 @@
+# STEP_NAME: schema_alignment
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# DATASETS_DIR_PATHS is list of directories
+dataset_dirs = os.environ["DATASETS_DIR_PATHS"].split(",")
+
+datasets = {}
+
+for dir in dataset_dirs:
+    for root, dirs, files in os.walk(dir):
+        for file in files:
+            if file.startswith("."):
+                continue
+            datasets[Path(file).stem] = load_file(os.path.join(root, file))
+
+records = pd.concat(
+    [df.assign(**{"Input Record Dataset": dataset}) for dataset, df in datasets.items()],
+    ignore_index=True,
+    sort=False,
+)
+
+records = records.rename(columns={"Record ID": "Input Record ID"})
+
+# DUMMY_CONTAINER_OUTPUT_PATHS is a single filepath
+output_path = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+logging.info(f"Writing output to {output_path}")
+records.to_parquet(output_path)
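
A small standalone sketch of the schema-alignment concatenation above, with made-up column names, showing how differing input schemas are unioned and each row is tagged with its source dataset:

import pandas as pd

datasets = {
    "ds_a": pd.DataFrame({"Record ID": [1, 2], "first_name": ["ann", "bob"]}),
    "ds_b": pd.DataFrame({"Record ID": [1], "last_name": ["cho"]}),
}

records = pd.concat(
    [df.assign(**{"Input Record Dataset": name}) for name, df in datasets.items()],
    ignore_index=True,
    sort=False,
).rename(columns={"Record ID": "Input Record ID"})

print(records)
# Three rows; columns from both inputs are unioned (missing values become NaN),
# and every row carries its source name in "Input Record Dataset".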