cdc-cluster 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cdc/__init__.py ADDED
@@ -0,0 +1,193 @@
1
+ import math
2
+ import numpy as np
3
+ from sklearn.neighbors import NearestNeighbors
4
+ from scipy.special import gamma
5
+ from scipy.spatial import ConvexHull
6
+
7
+ __all__ = ['cdc_cluster']
8
+
9
def _distances_from(X: np.ndarray, i: int) -> np.ndarray:
    """Return the Euclidean distance from row ``i`` of ``X`` to every row."""
    return np.sqrt(np.sum((X - X[i]) ** 2, axis=1))


def _dcm_2d(X: np.ndarray, get_knn: np.ndarray) -> np.ndarray:
    """Return the normalised angle-variance DCM of every point of 2D data."""
    k_num = get_knn.shape[1]
    full_turn = 2 * math.pi

    # Offsets from each point to its neighbours, shape (num, k_num, 2).
    delta = X[get_knn] - X[:, np.newaxis, :]
    # Direction of every neighbour as an angle measured from the +x axis and
    # wrapped to [0, 2*pi]; a coincident neighbour gets angle 0.
    angle = np.mod(np.arctan2(delta[..., 1], delta[..., 0]), full_turn)
    angle.sort(axis=1)

    # Angular gaps between consecutive neighbours; the last column closes the
    # circle from the largest angle back to the smallest one.
    gaps = np.empty_like(angle)
    gaps[:, :-1] = np.diff(angle, axis=1)
    gaps[:, -1] = angle[:, 0] - angle[:, -1] + full_turn

    # Mean squared deviation of the gaps from the uniform gap 2*pi / k_num.
    dcm = np.sum((gaps - full_turn / k_num) ** 2, axis=1) / k_num
    if k_num > 1:
        dcm = dcm / ((k_num - 1) * 4 * math.pi ** 2 / k_num ** 2)
    # With k_num == 1 the normalising constant is 0 and the deviation is 0 as
    # well, so the division is skipped instead of producing NaN.
    return dcm


def _dcm_nd(X: np.ndarray, get_knn: np.ndarray) -> np.ndarray:
    """Return the simplex-volume-variance DCM of every point of non-2D data.

    A point whose DCM cannot be computed (a neighbour coincides with the
    point, or the convex hull construction fails) gets the fallback value 1.
    """
    num, d = X.shape
    dcm = np.ones(num)
    for i in range(num):
        offsets = X[get_knn[i]] - X[i]
        lengths = np.sqrt(np.sum(offsets ** 2, axis=1))
        if not np.all(lengths > 0):
            # A zero-length offset has no direction; keep the fallback value.
            continue
        # Scale every neighbour offset to unit length.
        unit = offsets / lengths[:, np.newaxis]
        try:
            hull = ConvexHull(unit)
        except Exception:
            # Deliberate best-effort: any hull failure keeps the fallback
            # value instead of aborting the whole clustering run.
            continue
        facets = unit[hull.simplices]  # shape (n_facets, d, d)
        gram_det = np.linalg.det(facets @ facets.transpose(0, 2, 1))
        # Clip slightly negative determinants caused by round-off.
        volumes = np.sqrt(np.maximum(gram_det, 0)) / gamma(d - 1)
        dcm[i] = np.var(volumes)
    return dcm


def _reach_and_anchor(X: np.ndarray, get_knn: np.ndarray, internal: np.ndarray):
    """Return reach distances of internal points and anchors of boundary points.

    ``reach_dis[i]`` (internal ``i``) is the distance to the first boundary
    point among its k nearest neighbours, or to the closest boundary point
    overall when no neighbour is a boundary point.  ``anchor[i]`` (boundary
    ``i``) is the index of the first internal point among its k nearest
    neighbours, or of the closest internal point overall.
    """
    num = X.shape[0]
    reach_dis = np.zeros(num)
    anchor = np.zeros(num, dtype=np.intp)
    internal_idx = np.flatnonzero(internal)
    boundary_idx = np.flatnonzero(~internal)

    for i in range(num):
        knn = get_knn[i]
        # Neighbours of the other kind, still in nearest-first order.
        opposite = knn[internal[knn] != internal[i]]
        if internal[i]:
            if opposite.size:
                reach_dis[i] = np.sqrt(np.sum((X[i] - X[opposite[0]]) ** 2))
            else:
                reach_dis[i] = np.min(
                    _distances_from(X, i)[boundary_idx], initial=np.inf
                )
        elif opposite.size:
            anchor[i] = opposite[0]
        else:
            # argmin returns the first minimum, i.e. the lowest index on ties.
            nearest = np.argmin(_distances_from(X, i)[internal_idx])
            anchor[i] = internal_idx[nearest]
    return reach_dis, anchor


def _link_internal_points(
    X: np.ndarray, internal: np.ndarray, reach_dis: np.ndarray
) -> np.ndarray:
    """Label internal points, joining those whose reach distances overlap.

    Two internal points ``i`` and ``j`` are linked when their distance is at
    most ``reach_dis[i] + reach_dis[j]``.  Boundary points keep label 0.
    """
    labels = np.zeros(X.shape[0])
    mark = 1
    for i in np.flatnonzero(internal):
        if labels[i] != 0:
            continue
        labels[i] = mark
        reachable = internal & (
            _distances_from(X, i) <= reach_dis[i] + reach_dis
        )
        # Clusters that already own a reachable point are merged as a whole.
        absorbed = np.unique(labels[reachable])
        absorbed = absorbed[absorbed != 0]
        labels[np.isin(labels, absorbed)] = mark
        labels[reachable] = mark
        mark += 1
    return labels


def cdc_cluster(X: np.ndarray, k_num: int, ratio: float) -> np.ndarray:
    """Clustering by measuring local Direction Centrality (CDC) algorithm.

    This function implements the CDC clustering algorithm, which is a
    connectivity-based clustering method that identifies boundary points using
    a directional centrality metric (DCM) and connects internal points to
    generate cluster labels. DCM is defined as angle variance in 2D space and
    simplex volume variance in higher dimensions.

    paper reference: Peng, D., Gui, Z.*, Wang, D. et al. Clustering by
    measuring local direction centrality for data with heterogeneous density
    and weak connectivity. Nat. Commun. 13, 5455 (2022).
    https://www.nature.com/articles/s41467-022-33136-9

    The algorithm works in several steps:
    1. For each point, find k-nearest neighbors
    2. For each point, calculate its DCM
    3. Identify boundary and internal points based on the DCM threshold
    4. Calculate reachable distances of the internal points
    5. Form clusters by connecting nearby internal points
    6. Assign boundary points to nearest clusters

    Args:
        X (np.ndarray): Input data matrix of shape (n_samples, n_features).
            Each row represents a data point and each column represents a
            feature. The data is converted to float64 before processing.
        k_num (int): Number of nearest neighbors to consider. Must be greater
            than 0 and smaller than n_samples.
        ratio (float): Ratio for determining the DCM threshold. Must be
            between 0 and 1 (exclusive). Lower values result in fewer internal
            points and more boundary points.

    Returns:
        np.ndarray: Cluster labels for each data point, shape (n_samples,),
        stored as float64. Labels are whole numbers starting from 1, numbered
        in order of first appearance; points with the same label belong to the
        same cluster. If no point qualifies as an internal point, every label
        is 0.

    Raises:
        AssertionError: If k_num <= 0 or ratio is not in (0, 1).
        ValueError: If X is not a 2D array or has no more than k_num points.

    Example:
        >>> import numpy as np
        >>> from cdc import cdc_cluster
        >>>
        >>> # Generate sample 2D data
        >>> X = np.random.rand(100, 2)
        >>>
        >>> # Apply CDC clustering
        >>> labels = cdc_cluster(X=X, k_num=20, ratio=0.9)
        >>>
        >>> # Get number of clusters
        >>> n_clusters = len(np.unique(labels))

    Note:
        - For 2D data, the DCM is the variance of the angular gaps between the
          k nearest neighbors.
        - For data of any other dimensionality, the DCM is the variance of the
          convex hull simplex volumes of the unit-normalised neighbor offsets;
          points for which this cannot be computed get a DCM of 1.
    """
    # Raised explicitly (rather than with ``assert``) so that the checks are
    # still performed when Python runs with -O.
    if k_num <= 0:
        raise AssertionError("k_num must be greater than 0")
    if not 0 < ratio < 1:
        raise AssertionError("ratio must be between 0 and 1")

    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(
            f"X must be a 2D array of shape (n_samples, n_features), "
            f"got {X.ndim} dimension(s)"
        )
    num, d = X.shape
    if num <= k_num:
        raise ValueError(
            f"X must contain more than k_num={k_num} points, got {num}"
        )

    # Step 1: k nearest neighbours; the first column returned for each point
    # is dropped because it is expected to be the point itself.
    nbrs = NearestNeighbors(n_neighbors=k_num + 1, algorithm='ball_tree').fit(X)
    indices = nbrs.kneighbors(X, return_distance=False)
    get_knn = indices[:, 1:]

    # Step 2: direction centrality metric of every point.
    dcm = _dcm_2d(X, get_knn) if d == 2 else _dcm_nd(X, get_knn)

    # Step 3: points whose DCM lies strictly below the threshold are internal.
    # The index is clamped because ceil(num * ratio) equals num for ratios
    # close to 1, which would otherwise index past the end.
    threshold = np.sort(dcm)[min(math.ceil(num * ratio), num - 1)]
    internal = dcm < threshold
    if not internal.any():
        # Nothing to connect: every point stays unlabelled.
        return np.zeros(num)

    # Steps 4-6: reach distances, linking of internal points, and assignment
    # of each boundary point to the cluster of its anchor.
    reach_dis, anchor = _reach_and_anchor(X, get_knn, internal)
    cluster = _link_internal_points(X, internal, reach_dis)
    cluster[~internal] = cluster[anchor[~internal]]

    # Renumber the labels consecutively from 1 in order of first appearance.
    new_ids = {}
    for i in range(num):
        cluster[i] = new_ids.setdefault(cluster[i], len(new_ids) + 1)

    return cluster
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.1
2
+ Name: cdc-cluster
3
+ Version: 0.1.0
4
+ Summary: A novel Clustering algorithm by measuring Direction Centrality (CDC) locally. It adopts a density-independent metric based on the distribution of K-nearest neighbors (KNNs) to distinguish between internal and boundary points. The boundary points generate enclosed cages to bind the connections of internal points.
5
+ Author-email: pdh <pengdh@whu.edu.cn>
6
+ Project-URL: Homepage, https://github.com/ZPGuiGroupWhu/CDC-pkg
7
+ Project-URL: Repository, https://github.com/ZPGuiGroupWhu/CDC-pkg.git
8
+ Project-URL: Bug Tracker, https://github.com/ZPGuiGroupWhu/CDC-pkg/issues
9
+ Keywords: clustering,centrality,boundary detection
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: scikit-learn>=1.3.2
24
+
25
+ # Clustering by measuring local direction centrality for data with heterogeneous density and weak connectivity (CDC)
26
+
27
+
28
+ We propose a novel Clustering algorithm by measuring Direction Centrality (CDC) locally. It adopts a density-independent metric based on the distribution of K-nearest neighbors (KNNs) to distinguish between internal and boundary points. The boundary points generate enclosed cages to bind the connections of internal points, thereby preventing cross-cluster connections and separating weakly-connected clusters. We present an interactive ***Demo*** and a brief introduction to the algorithm at ***https://zpguigroupwhu.github.io/CDC-Introduction-Website/***, and provide a CDC toolkit at ***https://github.com/ZPGuiGroupWhu/ClusteringDirectionCentrality***. This paper has been published in ***Nature Communications***; more details can be found at https://www.nature.com/articles/s41467-022-33136-9.
29
+
30
+ ![image](https://github.com/ZPGuiGroupWhu/ClusteringDirectionCentrality/blob/master/pics/cdc_algorithm.png)
31
+
32
+ # Installation
33
+ Supported `python` versions are `3.8` and above.
34
+
35
+ This project has been uploaded to [PyPI](https://pypi.org/project/cdc-cluster/), so it can be downloaded and installed directly with `pip`:
36
+
37
+ ```
38
+ pip install cdc-cluster
39
+ ```
40
+
41
+ ## Manual Installation
42
+
43
+ ```
44
+ git clone https://github.com/ZPGuiGroupWhu/CDC-pkg.git
45
+ cd CDC-pkg
46
+ pip install -e .
47
+ ```
48
+
49
+ # How To Run
50
+ The CDC algorithm package provides the `cdc_cluster` function for clustering.
51
+
52
+ The hyperparameters available for user configuration are described below:
53
+ ```python
54
+ def cdc_cluster(X: np.ndarray, k_num: int, ratio: float) -> np.ndarray:
55
+ """Clustering by measuring local Direction Centrality (CDC) algorithm.
56
+
57
+ This function implements the CDC clustering algorithm, which is a connectivity-based
58
+ clustering method that identifies boundary points using a directional centrality
59
+ metric (DCM) and connects internal points to generate cluster labels. DCM is defined
60
+ as angle variance in 2D space and simplex volume variance in higher dimensions.
61
+
62
+ The algorithm works in several steps:
63
+ 1. For each point, find k-nearest neighbors
64
+ 2. For each point, calculate its DCM
65
+ 3. Identify boundary and internal points based on the DCM threshold
66
+ 4. Calculate reachable distances of the internal points
67
+ 5. Form clusters by connecting nearby internal points
68
+ 6. Assign boundary points to nearest clusters
69
+
70
+ Args:
71
+ X (np.ndarray): Input data matrix of shape (n_samples, n_features).
72
+ Each row represents a data point and each column represents a feature.
73
+ k_num (int): Number of nearest neighbors to consider. Must be greater than 0.
74
+ This parameter controls the local neighborhood size.
75
+ ratio (float): Ratio for determining the DCM threshold. Must be between 0 and 1.
76
+ Lower values result in fewer internal points and more boundary points.
77
+
78
+ Returns:
79
+ np.ndarray: Cluster labels for each data point. Shape (n_samples,).
80
+ Labels are integers starting from 1, where points with the same label
81
+ belong to the same cluster.
82
+
83
+ Raises:
84
+ AssertionError: If k_num <= 0 or ratio is not in (0, 1).
85
+ ValueError: If X is not a 2D array or has insufficient data points.
86
+
87
+ Note:
88
+ - For 2D data, the algorithm uses angle variance between k-nearest neighbors
89
+ - For higher dimensional data, it uses convex hull simplex volume variance
90
+ - The algorithm automatically handles edge cases and numerical instabilities
91
+ """
92
+ ```
93
+ After installing the CDC library, you can use this function as follows:
94
+ ```python
95
+ from cdc import cdc_cluster
96
+ import numpy as np
97
+ import pandas as pd
98
+ import matplotlib.pyplot as plt
99
+ import time
100
+ import math
101
+ # DS1.txt link: https://github.com/ZPGuiGroupWhu/ClusteringDirectionCentrality/blob/master/Toolkit/Python/DS1.txt
102
+ raw_data = pd.read_table('DS1.txt', header=None)
103
+ X = np.array(raw_data)
104
+ [n, d] = X.shape
105
+ data = X[:, :d-1]
106
+ ref = X[:, d-1]
107
+ time_start = time.time()
108
+ res = cdc_cluster(X=data, k_num=30, ratio=0.72)
109
+ time_end = time.time()
110
+ print(time_end-time_start)
111
+
112
+ plt.scatter(data[:, 0], data[:, 1], c=res, s=10, cmap='hsv', marker='o')
113
+ plt.show()
114
+ ```
115
+ # Citation Request:
116
+ Peng, D., Gui, Z.*, Wang, D. et al. Clustering by measuring local direction centrality for data with heterogeneous density and weak connectivity. Nat. Commun. 13, 5455 (2022).
117
+ https://www.nature.com/articles/s41467-022-33136-9
118
+
119
+ # License
120
+
121
+ This project is covered under the MIT License.
@@ -0,0 +1,5 @@
1
+ cdc/__init__.py,sha256=25if62gK_9uHHpJ96Uv2Rq-nMwb5t3gh51o_j9IAP68,7959
2
+ cdc_cluster-0.1.0.dist-info/METADATA,sha256=aKfSTDgCuC7DpG78BVhlErj-fgHI9bwSBTLHyyA_kF0,5964
3
+ cdc_cluster-0.1.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
4
+ cdc_cluster-0.1.0.dist-info/top_level.txt,sha256=v6FEwUWlqiNBwmV7tCXDaBp-LmuRmw309T6GQ7Vd5XQ,4
5
+ cdc_cluster-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.3.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ cdc