easylink 0.1.19__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.19"
1
+ __version__ = "0.1.20"
@@ -29,7 +29,7 @@ from easylink.step import (
29
29
  TemplatedStep,
30
30
  )
31
31
  from easylink.utilities.data_utils import load_yaml
32
- from easylink.utilities.paths import IMPLEMENTATION_METADATA
32
+ from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
33
33
 
34
34
 
35
35
  def main(script_path: Path, host: Path) -> None:
@@ -195,9 +195,24 @@ class ImplementationCreator:
195
195
  f"Implementation '{self.implementation_name}' already exists in the registry. "
196
196
  "Overwriting it with the latest data."
197
197
  )
198
+
199
+ # Handle the fact that developers might be saving to username subdirs
200
+ # If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
201
+ # host directory when calling `easylink devtools create-implementation`
202
+ # is DEV_IMAGES_DIR/<username>), we want to include the relative path
203
+ # to the DEV_IMAGES_DIR in the image name. This is required because ultimately
204
+ # when running a pipeline, all images are expected to be in a single directory.
205
+ image_name = (
206
+ self.hosted_container_path.name
207
+ # Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
208
+ if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
209
+ # Use the path relative to DEV_IMAGES_DIR as the image name
210
+ else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
211
+ )
212
+
198
213
  info[self.implementation_name] = {
199
214
  "steps": [self.step],
200
- "image_path": str(self.hosted_container_path),
215
+ "image_name": str(image_name),
201
216
  "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
202
217
  "outputs": {
203
218
  self.output_slot: "result.parquet",
@@ -60,12 +60,14 @@ new_clusters_df = load_file(new_clusters_filepath)
60
60
  def merge_clusters(known_clusters_df, new_clusters_df):
61
61
  # Combine both dataframes
62
62
  combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
63
-
64
- # Drop records with missing cluster IDs
65
- combined_df = combined_df.dropna(subset=["Cluster ID"])
63
+ combined_df["Input Record Key"] = (
64
+ combined_df["Input Record Dataset"]
65
+ + "-__-"
66
+ + combined_df["Input Record ID"].astype(int).astype(str)
67
+ )
66
68
 
67
69
  # Group by Cluster ID to get connected records
68
- cluster_groups = combined_df.groupby("Cluster ID")["Input Record ID"].apply(list)
70
+ cluster_groups = combined_df.groupby("Cluster ID")["Input Record Key"].apply(list)
69
71
 
70
72
  # Build a graph of all connections implied by cluster IDs
71
73
  G = nx.Graph()
@@ -75,8 +77,8 @@ def merge_clusters(known_clusters_df, new_clusters_df):
75
77
  G.add_edge(group[i], group[j])
76
78
 
77
79
  # Add isolated nodes (records with unique clusters)
78
- all_ids = set(combined_df["Input Record ID"])
79
- G.add_nodes_from(all_ids)
80
+ all_keys = set(combined_df["Input Record Key"])
81
+ G.add_nodes_from(all_keys)
80
82
 
81
83
  # Compute connected components
82
84
  components = list(nx.connected_components(G))
@@ -84,13 +86,19 @@ def merge_clusters(known_clusters_df, new_clusters_df):
84
86
  # Assign new cluster IDs
85
87
  merged_data = []
86
88
  for cluster_id, records in enumerate(components, start=1):
87
- for record_id in records:
88
- merged_data.append((record_id, cluster_id))
89
+ for record_key in records:
90
+ merged_data.append((record_key, cluster_id))
89
91
 
90
92
  # Build the final DataFrame
91
- merged_df = pd.DataFrame(merged_data, columns=["Input Record ID", "Cluster ID"])
93
+ merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
94
+
95
+ merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
96
+ "Input Record Key"
97
+ ].str.split("-__-", n=1, expand=True)
98
+
99
+ merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
92
100
 
93
- return merged_df
101
+ return merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
94
102
 
95
103
 
96
104
  output_df = merge_clusters(known_clusters_df, new_clusters_df)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easylink
3
- Version: 0.1.19
3
+ Version: 0.1.20
4
4
  Summary: Research repository for the EasyLink ER ecosystem project.
5
5
  Home-page: https://github.com/ihmeuw/easylink
6
6
  Author: The EasyLink developers
@@ -1,6 +1,6 @@
1
1
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
2
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
- easylink/_version.py,sha256=cAJAbAh288a9AL-3yxwFzEM1L26izSJ6wma5aiml_9Y,23
3
+ easylink/_version.py,sha256=8XalsVoLEfXslFvdtUEmkNOuYShzOzYOcFbgmOz1oSk,23
4
4
  easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
5
5
  easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
6
6
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
@@ -12,7 +12,7 @@ easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6
12
12
  easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
13
13
  easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
14
14
  easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
15
- easylink/devtools/implementation_creator.py,sha256=1WQOOrjQYOhjjp8MQM9j1xoeAp-SW51A1f1oW4G792I,18251
15
+ easylink/devtools/implementation_creator.py,sha256=RkwnI1T0aEquRPgGjPOGtJo_87tjoKvDAElRcf6Vqqk,19140
16
16
  easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
17
17
  easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
18
18
  easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
@@ -24,7 +24,7 @@ easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9s
24
24
  easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
25
25
  easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
26
26
  easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
27
- easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=w7tAOs2QtIIcpTDxw2P_dqMIR-BFa-wi-OmZwrKyhmg,3309
27
+ easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=43D5GBmPXSgxcjgbJTvEoGFvPzBCGqYgBaT42pncNNw,3661
28
28
  easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
29
29
  easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
30
30
  easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
@@ -83,9 +83,9 @@ easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,9
83
83
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
84
84
  easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
85
85
  easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
86
- easylink-0.1.19.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
87
- easylink-0.1.19.dist-info/METADATA,sha256=nFZA-jZKgZUG4DdiDqY-pNOTfdt1H3QeiwNzvo27vpg,3565
88
- easylink-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
- easylink-0.1.19.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
90
- easylink-0.1.19.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
91
- easylink-0.1.19.dist-info/RECORD,,
86
+ easylink-0.1.20.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
87
+ easylink-0.1.20.dist-info/METADATA,sha256=aGNai6P-z5BQcQ0XYFTBr9JmuZAFTpZJYouFRlTJCzk,3565
88
+ easylink-0.1.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
+ easylink-0.1.20.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
90
+ easylink-0.1.20.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
91
+ easylink-0.1.20.dist-info/RECORD,,