easylink 0.1.19__py3-none-any.whl → 0.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/devtools/implementation_creator.py +17 -2
- easylink/steps/cascading/update_clusters_by_connected_components.py +18 -10
- {easylink-0.1.19.dist-info → easylink-0.1.20.dist-info}/METADATA +1 -1
- {easylink-0.1.19.dist-info → easylink-0.1.20.dist-info}/RECORD +9 -9
- {easylink-0.1.19.dist-info → easylink-0.1.20.dist-info}/WHEEL +0 -0
- {easylink-0.1.19.dist-info → easylink-0.1.20.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.19.dist-info → easylink-0.1.20.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.19.dist-info → easylink-0.1.20.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.19"
|
1
|
+
__version__ = "0.1.20"
|
@@ -29,7 +29,7 @@ from easylink.step import (
|
|
29
29
|
TemplatedStep,
|
30
30
|
)
|
31
31
|
from easylink.utilities.data_utils import load_yaml
|
32
|
-
from easylink.utilities.paths import IMPLEMENTATION_METADATA
|
32
|
+
from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
|
33
33
|
|
34
34
|
|
35
35
|
def main(script_path: Path, host: Path) -> None:
|
@@ -195,9 +195,24 @@ class ImplementationCreator:
|
|
195
195
|
f"Implementation '{self.implementation_name}' already exists in the registry. "
|
196
196
|
"Overwriting it with the latest data."
|
197
197
|
)
|
198
|
+
|
199
|
+
# Handle the fact that developers might be saving to username subdirs
|
200
|
+
# If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
|
201
|
+
# host directory when calling `easylink devtools create-implementation`
|
202
|
+
# is DEV_IMAGES_DIR/<username>), we want to include the relative path
|
203
|
+
# to the DEV_IMAGES_DIR in the image name. This is required because ultimately
|
204
|
+
# when running a pipeline, all images are expected to be in a single directory.
|
205
|
+
image_name = (
|
206
|
+
self.hosted_container_path.name
|
207
|
+
# Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
|
208
|
+
if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
|
209
|
+
# Use the path relative to DEV_IMAGES_DIR as the image name
|
210
|
+
else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
|
211
|
+
)
|
212
|
+
|
198
213
|
info[self.implementation_name] = {
|
199
214
|
"steps": [self.step],
|
200
|
-
"
|
215
|
+
"image_name": str(image_name),
|
201
216
|
"script_cmd": f"{self.script_base_command} /{self.script_path.name}",
|
202
217
|
"outputs": {
|
203
218
|
self.output_slot: "result.parquet",
|
@@ -60,12 +60,14 @@ new_clusters_df = load_file(new_clusters_filepath)
|
|
60
60
|
def merge_clusters(known_clusters_df, new_clusters_df):
|
61
61
|
# Combine both dataframes
|
62
62
|
combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
|
63
|
-
|
64
|
-
|
65
|
-
|
63
|
+
combined_df["Input Record Key"] = (
|
64
|
+
combined_df["Input Record Dataset"]
|
65
|
+
+ "-__-"
|
66
|
+
+ combined_df["Input Record ID"].astype(int).astype(str)
|
67
|
+
)
|
66
68
|
|
67
69
|
# Group by Cluster ID to get connected records
|
68
|
-
cluster_groups = combined_df.groupby("Cluster ID")["Input Record
|
70
|
+
cluster_groups = combined_df.groupby("Cluster ID")["Input Record Key"].apply(list)
|
69
71
|
|
70
72
|
# Build a graph of all connections implied by cluster IDs
|
71
73
|
G = nx.Graph()
|
@@ -75,8 +77,8 @@ def merge_clusters(known_clusters_df, new_clusters_df):
|
|
75
77
|
G.add_edge(group[i], group[j])
|
76
78
|
|
77
79
|
# Add isolated nodes (records with unique clusters)
|
78
|
-
|
79
|
-
G.add_nodes_from(
|
80
|
+
all_keys = set(combined_df["Input Record Key"])
|
81
|
+
G.add_nodes_from(all_keys)
|
80
82
|
|
81
83
|
# Compute connected components
|
82
84
|
components = list(nx.connected_components(G))
|
@@ -84,13 +86,19 @@ def merge_clusters(known_clusters_df, new_clusters_df):
|
|
84
86
|
# Assign new cluster IDs
|
85
87
|
merged_data = []
|
86
88
|
for cluster_id, records in enumerate(components, start=1):
|
87
|
-
for
|
88
|
-
merged_data.append((
|
89
|
+
for record_key in records:
|
90
|
+
merged_data.append((record_key, cluster_id))
|
89
91
|
|
90
92
|
# Build the final DataFrame
|
91
|
-
merged_df = pd.DataFrame(merged_data, columns=["Input Record
|
93
|
+
merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
|
94
|
+
|
95
|
+
merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
|
96
|
+
"Input Record Key"
|
97
|
+
].str.split("-__-", n=1, expand=True)
|
98
|
+
|
99
|
+
merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
|
92
100
|
|
93
|
-
return merged_df
|
101
|
+
return merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
|
94
102
|
|
95
103
|
|
96
104
|
output_df = merge_clusters(known_clusters_df, new_clusters_df)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
3
|
+
easylink/_version.py,sha256=8XalsVoLEfXslFvdtUEmkNOuYShzOzYOcFbgmOz1oSk,23
|
4
4
|
easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
|
5
5
|
easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
|
6
6
|
easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
|
@@ -12,7 +12,7 @@ easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6
|
|
12
12
|
easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
|
13
13
|
easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
|
14
14
|
easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
|
15
|
-
easylink/devtools/implementation_creator.py,sha256=
|
15
|
+
easylink/devtools/implementation_creator.py,sha256=RkwnI1T0aEquRPgGjPOGtJo_87tjoKvDAElRcf6Vqqk,19140
|
16
16
|
easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
|
17
17
|
easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
|
18
18
|
easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
|
@@ -24,7 +24,7 @@ easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9s
|
|
24
24
|
easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
|
25
25
|
easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
|
26
26
|
easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
|
27
|
-
easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
|
27
|
+
easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=43D5GBmPXSgxcjgbJTvEoGFvPzBCGqYgBaT42pncNNw,3661
|
28
28
|
easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
|
29
29
|
easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
|
30
30
|
easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
|
@@ -83,9 +83,9 @@ easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,9
|
|
83
83
|
easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
|
84
84
|
easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
|
85
85
|
easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
|
86
|
-
easylink-0.1.
|
87
|
-
easylink-0.1.
|
88
|
-
easylink-0.1.
|
89
|
-
easylink-0.1.
|
90
|
-
easylink-0.1.
|
91
|
-
easylink-0.1.
|
86
|
+
easylink-0.1.20.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
|
87
|
+
easylink-0.1.20.dist-info/METADATA,sha256=aGNai6P-z5BQcQ0XYFTBr9JmuZAFTpZJYouFRlTJCzk,3565
|
88
|
+
easylink-0.1.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
89
|
+
easylink-0.1.20.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
90
|
+
easylink-0.1.20.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
91
|
+
easylink-0.1.20.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|