graphical-sampling 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphical_sampling/__init__.py +14 -0
- graphical_sampling/clustering/__init__.py +13 -0
- graphical_sampling/clustering/aggregate.py +213 -0
- graphical_sampling/clustering/dubly_balanced_clustering.py +209 -0
- graphical_sampling/clustering/one_boundary.py +233 -0
- graphical_sampling/clustering/soft_balanced_kmeans.py +161 -0
- graphical_sampling/criteria/__init__.py +4 -0
- graphical_sampling/criteria/criteria.py +15 -0
- graphical_sampling/criteria/var_nht.py +26 -0
- graphical_sampling/design.py +128 -0
- graphical_sampling/measure/__init__.py +4 -0
- graphical_sampling/measure/density.py +94 -0
- graphical_sampling/random/__init__.py +4 -0
- graphical_sampling/random/generator.py +251 -0
- graphical_sampling/red_black_tree.py +475 -0
- graphical_sampling/sampling/__init__.py +6 -0
- graphical_sampling/sampling/kmeans_spatial_sampling.py +61 -0
- graphical_sampling/sampling/population.py +234 -0
- graphical_sampling/sampling/random_sampling.py +21 -0
- graphical_sampling/search/__init__.py +4 -0
- graphical_sampling/search/astar.py +119 -0
- graphical_sampling/structs.py +94 -0
- graphical_sampling/type.py +17 -0
- graphical_sampling-0.1.0.dist-info/METADATA +85 -0
- graphical_sampling-0.1.0.dist-info/RECORD +27 -0
- graphical_sampling-0.1.0.dist-info/WHEEL +4 -0
- graphical_sampling-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from importlib import metadata
|
|
2
|
+
|
|
3
|
+
from .design import Design
|
|
4
|
+
from . import criteria
|
|
5
|
+
from . import search
|
|
6
|
+
from . import sampling
|
|
7
|
+
from . import clustering
|
|
8
|
+
from . import random
|
|
9
|
+
from . import measure
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Resolve the installed version from distribution metadata.  The wheel is
# published as "graphical_sampling"; the older "geometric_sampling" name is
# still tried so an environment that only has that distribution keeps working.
# If neither distribution is installed (e.g. running from a source checkout),
# fall back to a placeholder instead of failing at import time.
__version__ = "0.0.0"
for _dist_name in ("graphical_sampling", "geometric_sampling"):
    try:
        __version__ = metadata.version(_dist_name)
        break
    except metadata.PackageNotFoundError:
        continue
|
|
13
|
+
|
|
14
|
+
__all__ = ["Design", "criteria", "search", "sampling", "clustering", "random", "measure"]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .soft_balanced_kmeans import SoftBalancedKMeans
|
|
2
|
+
from .aggregate import AggregateBalancedKMeans
|
|
3
|
+
from .one_boundary import OneBoundaryBalancedKMeans
|
|
4
|
+
from .dubly_balanced_clustering import DublyBalancedKMeans
|
|
5
|
+
# from .agg import Agg
|
|
6
|
+
# from .agg_one import AggOne
|
|
7
|
+
# from .aggregate import FinalAgg
|
|
8
|
+
# from .final2 import FinalAgg2
|
|
9
|
+
# from .swap import SwapAgg
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
__all__ = ["SoftBalancedKMeans", "AggregateBalancedKMeans", "OneBoundaryBalancedKMeans", "DublyBalancedKMeans"]
|
|
13
|
+
# __all__ = ["SoftBalancedKMeans", "AggregateBalancingKmeans", "Agg", "AggOne", "FinalAgg", "FinalAgg2", "SwapAgg"]
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from numpy.typing import NDArray
|
|
3
|
+
from sklearn.cluster import KMeans
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AggregateBalancedKMeans:
    """K-means clustering followed by fractional rebalancing of cluster totals.

    ``fit`` first runs scikit-learn ``KMeans`` on ``Y_features`` and then moves
    (fractions of) units between clusters until every cluster's total of
    ``X_features`` (summed over all columns) is within ``10**-tolerance`` of
    ``X_features.sum() / k``, no transfer is possible any more, or the
    iteration cap is reached.

    Args:
        k: Number of clusters.
        initial_centroids: Optional starting centroids handed to ``KMeans``;
            ``None`` selects ``"k-means++"`` initialisation.
        tolerance: Exponent of the numerical tolerance ``10**-tolerance`` used
            by ``KMeans``, the transfer test and the stop condition.
    """

    def __init__(
        self, k: int, *, initial_centroids: NDArray = None, tolerance: int = 5
    ) -> None:
        self.k = k
        self.tolerance = tolerance
        self.Y_features = None
        self.X_features = None
        self.weights = None
        self.m = None
        self.N = None
        self.centroids = initial_centroids
        self.labels: NDArray = None
        self.membership: NDArray = None
        self.Ti: NDArray = None
        self.Tij: NDArray = None
        self.goal: float = None
        # Populated by fit(); declared here so the attributes always exist.
        self.goal_j: NDArray = None
        self.Tij_cost: float = None
        self.rng = np.random.default_rng()

    def _unit_totals(self) -> NDArray:
        """Return, for every unit, the sum of its ``X_features`` columns."""
        x = np.asarray(self.X_features)
        return x.reshape(x.shape[0], -1).sum(axis=1)

    def _generate_goal_j(self) -> NDArray:
        """Return the per-column target total: column sums divided by ``k``."""
        return self.X_features.sum(axis=0) / self.k

    def _generate_goal(self) -> float:
        """Return the per-cluster target total summed over all columns."""
        return np.sum(self._generate_goal_j())

    def _generate_membership(self) -> NDArray:
        """Return the ``(N, k)`` one-hot membership matrix built from ``labels``."""
        membership = np.zeros((self.N, self.k))
        membership[np.arange(self.N), np.asarray(self.labels)] = 1
        return membership

    def _generate_Tij(self) -> NDArray:
        """Return the ``(k, m)`` matrix of membership-weighted column totals."""
        return np.sum(
            self.X_features[:, :, np.newaxis] * self.membership[:, np.newaxis, :],
            axis=0,
        ).T

    def _generate_Ti(self) -> NDArray:
        """Return the length-``k`` vector of cluster totals (``Tij`` row sums)."""
        return np.sum(self._generate_Tij(), axis=1)

    def _generate_Tij_cost(self) -> float:
        """Return the weighted squared deviation of ``Tij`` from ``goal_j``."""
        return np.sum((self.weights * (self.Tij - self.goal_j)) ** 2)

    def _transfer_score(
        self,
        data_index: int,
        old_cluster: int,
        new_cluster: int,
    ) -> float:
        """Score moving a unit from ``old_cluster`` to ``new_cluster``.

        The score is the change in squared distance to the centroid divided by
        the difference between the two cluster totals; lower is better.
        ``np.inf`` is returned when the old cluster's total does not exceed the
        new cluster's total by more than ``10**-tolerance``.
        """
        gap = self.Ti[old_cluster] - self.Ti[new_cluster]
        if gap <= 10**-self.tolerance:
            return np.inf

        unit = self.Y_features[data_index]
        dist_new = np.linalg.norm(unit - self.centroids[new_cluster]) ** 2
        dist_old = np.linalg.norm(unit - self.centroids[old_cluster]) ** 2
        # The small constant keeps the division well-defined for tiny gaps.
        return (dist_new - dist_old) / (gap + 1e-9)

    def _get_transfer_records(self, top_m: int):
        """Return the best score and the ``top_m`` cheapest transfers.

        For every (unit, current cluster) pair with non-zero membership the
        destination with the lowest transfer score is recorded.

        Returns:
            ``(best_cost, records)`` where ``records`` is an integer array of
            rows ``(data_index, old_cluster, new_cluster)`` sorted by score.
        """
        records = []
        for i in range(self.N):
            for j_old in np.nonzero(self.membership[i])[0]:
                scores = [
                    self._transfer_score(i, j_old, j_new) for j_new in range(self.k)
                ]
                j_best = int(np.argmin(scores))
                # Reuse the score computed above rather than evaluating it again.
                records.append((scores[j_best], i, j_old, j_best))

        if not records:
            # Nothing is assigned anywhere, so nothing can be transferred.
            return np.inf, np.empty((0, 3), dtype=int)

        records = np.array(records)
        ordered = records[np.argsort(records[:, 0])]
        return ordered[0, 0], ordered[:top_m, 1:].astype(int)

    def _transfer_percent(self, data_index: int, old_cluster: int, new_cluster: int):
        """Return the membership fraction of the unit to move between clusters.

        When both clusters lie on the same side of the goal, the amount that
        would equalise the two totals is used; otherwise the amount is limited
        so that neither cluster crosses the goal.  The result never exceeds the
        unit's current membership in ``old_cluster``.
        """
        unit_total = np.sum(self.X_features[data_index])
        t_old = self.Ti[old_cluster]
        t_new = self.Ti[new_cluster]
        held = self.membership[data_index, old_cluster]

        same_side = (t_old >= self.goal and t_new >= self.goal) or (
            t_old <= self.goal and t_new <= self.goal
        )
        if same_side:
            return min(held, (t_old - t_new) / (2 * unit_total))
        return min(
            held,
            (t_old - self.goal) / unit_total,
            (self.goal - t_new) / unit_total,
        )

    def _transfer(self, data_index: int, old_cluster: int, new_cluster: int) -> None:
        """Move the computed membership fraction from one cluster to the other."""
        transfer_percent = self._transfer_percent(data_index, old_cluster, new_cluster)
        self.membership[data_index, old_cluster] -= transfer_percent
        self.membership[data_index, new_cluster] += transfer_percent

    def _no_transfer_possible(self, best_cost: float) -> bool:
        """Return True when even the best candidate transfer is infeasible."""
        return best_cost == np.inf

    def _is_transfer_possible(self, old_cluster: int, new_cluster: int) -> bool:
        """Return True when the old cluster's total still exceeds the new one's."""
        return self.Ti[old_cluster] - self.Ti[new_cluster] > 10**-self.tolerance

    def _stop_codition(self, tol) -> bool:
        """Return True when every cluster total is within ``10**-tol`` of the goal."""
        return np.all(np.abs(self.Ti - self.goal) < 10**-tol)

    def _expected_num_transfers(self) -> int:
        """Estimate how many transfers to attempt in one iteration (at least 1).

        The largest gap between two cluster totals is divided by twice the mean
        non-zero amount a unit currently contributes to a cluster.
        """
        max_diff_sum = np.max(self.Ti - self.Ti[:, None])
        possible_transfers = self._unit_totals()[:, np.newaxis] * self.membership
        mean_nonzero_transfers = np.mean(
            possible_transfers[np.nonzero(possible_transfers)]
        )
        return max(int(np.floor(max_diff_sum / (2 * mean_nonzero_transfers))), 1)

    def _update_centroids(self) -> None:
        """Set each centroid to the unweighted mean of units with any membership.

        A cluster without members keeps its previous centroid; averaging an
        empty selection would otherwise store NaN and poison later scores.
        """
        for i in range(self.k):
            members = self.membership[:, i] > 0
            if members.any():
                self.centroids[i] = np.mean(self.Y_features[members], axis=0)

    def fit(
        self,
        Y_features: NDArray,
        X_features: NDArray,
        weights: NDArray,
        max_iteration: int = 1000,
    ) -> None:
        """Cluster ``Y_features`` and rebalance the clusters on ``X_features``.

        Args:
            Y_features: Array clustered by ``KMeans`` and used for distances.
            X_features: Two-dimensional ``(N, m)`` array whose totals are
                balanced across clusters.
            weights: Weights applied to the per-column deviations when
                computing ``Tij_cost``.
            max_iteration: Upper bound on rebalancing iterations.
        """
        self.Y_features = Y_features
        self.X_features = X_features
        self.weights = weights
        self.m = X_features.shape[1]
        self.N = X_features.shape[0]
        self.goal_j = self._generate_goal_j()
        self.goal = self._generate_goal()

        kmeans = KMeans(
            n_clusters=self.k,
            init=self.centroids if self.centroids is not None else "k-means++",
            n_init=10,
            tol=10**-self.tolerance,
        )
        kmeans.fit(self.Y_features)

        self.centroids = kmeans.cluster_centers_
        self.labels = kmeans.labels_
        self.membership = self._generate_membership()
        self.Tij = self._generate_Tij()
        self.Ti = self._generate_Ti()
        self.Tij_cost = self._generate_Tij_cost()
        iter_ = 0

        while not self._stop_codition(self.tolerance) and iter_ < max_iteration:
            best_cost, transfer_records = self._get_transfer_records(
                top_m=self._expected_num_transfers()
            )
            if self._no_transfer_possible(best_cost):
                break
            for data_index, old_cluster, new_cluster in transfer_records:
                # Totals change after every transfer, so feasibility is re-checked.
                if self._is_transfer_possible(old_cluster, new_cluster):
                    self._transfer(data_index, old_cluster, new_cluster)
                    self.Tij = self._generate_Tij()
                    self.Ti = self._generate_Ti()
                    self.Tij_cost = self._generate_Tij_cost()
            self._update_centroids()
            iter_ += 1

    def get_clusters(self) -> list:
        """Return one array per cluster describing its (partial) members.

        Each row is ``[unit index, *Y_features[unit], share]`` where ``share``
        is the unit's membership in the cluster times the sum of its
        ``X_features`` columns.  Units with a zero share are omitted.
        """
        # Use the per-unit row sum so that X_features with several columns
        # lines up with the length-N membership column (flattening the matrix
        # only works when there is exactly one column).
        unit_totals = self._unit_totals()
        clusters = []

        for i in range(self.k):
            probs = self.membership[:, i] * unit_totals
            ids = np.nonzero(probs)[0]
            units = np.concatenate(
                [ids.reshape(-1, 1), self.Y_features[ids], probs[ids].reshape(-1, 1)],
                axis=1,
            )
            clusters.append(units)

        return clusters
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from numpy.typing import NDArray
|
|
3
|
+
from k_means_constrained import KMeansConstrained
|
|
4
|
+
from scipy.stats import mode
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DublyBalancedKMeans:
    """Size-constrained k-means on replicated points, then continuous balancing.

    Every point is repeated ``round(prob / split_size)`` times, the replicas
    are clustered with ``KMeansConstrained`` into near-equal-sized groups, each
    original point receives the most common label among its replicas, and the
    result is refined by ``ContinuesBalancing``.
    """

    def __init__(self, k, split_size=0.01):
        self.split_size = split_size
        self.k = k

    def _generate_expanded_coords(self, coords, probs):
        """Replicate each point in proportion to its probability.

        Returns:
            ``(expanded_coords, expanded_idx)``: the repeated coordinate rows
            and, for every replica, the index of the point it came from.
        """
        repeats = np.round(probs / self.split_size).astype(int)
        point_ids = np.arange(self.N)
        replicated = np.repeat(coords, repeats, axis=0)
        origin = np.repeat(point_ids, repeats)
        return replicated, origin

    def _generate_labels(self, extended_labels, expanded_idx, coords):
        """Collapse replica labels to a single label per original point.

        A point with replicas takes the modal replica label; a point that was
        replicated zero times takes the label of the nearest centroid.
        """
        labels = np.zeros(self.N, dtype=int)
        for unit in range(self.N):
            votes = extended_labels[expanded_idx == unit]
            if len(votes) != 0:
                labels[unit] = mode(votes, keepdims=True)[0][0]
                continue
            gaps = np.linalg.norm(self.centroids - coords[unit], axis=1)
            labels[unit] = np.argmin(gaps)
        return labels

    def fit(self, coords, probs):
        """Cluster ``coords`` so that clusters carry similar totals of ``probs``.

        The refined centroids, labels, membership matrix, cluster totals, goal
        and per-cluster unit lists are copied from the balancer onto ``self``.
        """
        self.N = coords.shape[0]
        expanded_coords, expanded_idx = self._generate_expanded_coords(coords, probs)

        # Replica count per cluster; one extra slot absorbs the remainder.
        per_cluster = len(expanded_idx) // self.k
        constrained = KMeansConstrained(
            n_clusters=self.k,
            size_min=per_cluster,
            size_max=per_cluster + 1,
        )
        replica_labels = constrained.fit_predict(expanded_coords)
        self.centroids = constrained.cluster_centers_
        self.labels = self._generate_labels(replica_labels, expanded_idx, coords)

        balancer = ContinuesBalancing(self.k)
        balancer.fit(coords, probs, self.centroids, self.labels)

        self.centroids = balancer.centroids
        self.labels = balancer.labels
        self.membership = balancer.membership
        self.Ti = balancer.Ti
        self.goal = balancer.goal
        self.clusters = balancer.get_clusters()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ContinuesBalancing:
    """Rebalance an existing hard clustering by moving fractions of units.

    Starting from given labels and centroids, (fractions of) units are moved
    between clusters until every cluster's total of ``X_feature`` is within
    ``10**-tolerance`` of ``X_feature.sum() / k``, no transfer is possible, or
    the iteration cap is reached.

    Args:
        k: Number of clusters.
        tolerance: Exponent of the numerical tolerance ``10**-tolerance``.
    """

    def __init__(
        self, k: int, *, tolerance: int = 3
    ) -> None:
        self.k = k
        self.tolerance = tolerance
        self.Y_features = None
        self.X_feature = None
        # Populated by fit(); declared here so the attributes always exist.
        self.N = None
        self.centroids = None
        self.labels: NDArray = None
        self.membership: NDArray = None
        self.Ti: NDArray = None
        self.goal: float = None

    def _generate_goal(self) -> float:
        """Return the target total per cluster: ``X_feature.sum() / k``."""
        return self.X_feature.sum() / self.k

    def _generate_membership(self) -> NDArray:
        """Return the ``(N, k)`` one-hot membership matrix built from ``labels``."""
        membership = np.zeros((self.N, self.k))
        membership[np.arange(self.N), np.asarray(self.labels)] = 1
        return membership

    def _generate_Ti(self) -> NDArray:
        """Return the length-``k`` vector of membership-weighted cluster totals."""
        return np.sum(
            self.X_feature[:, np.newaxis] * self.membership,
            axis=0,
        )

    def _transfer_score(
        self,
        data_index: int,
        old_cluster: int,
        new_cluster: int,
    ) -> float:
        """Score moving a unit from ``old_cluster`` to ``new_cluster``.

        The score is the change in squared distance to the centroid divided by
        the difference between the two cluster totals; lower is better.
        ``np.inf`` is returned when the old cluster's total does not exceed the
        new cluster's total by more than ``10**-tolerance``.
        """
        gap = self.Ti[old_cluster] - self.Ti[new_cluster]
        if gap <= 10**-self.tolerance:
            return np.inf

        unit = self.Y_features[data_index]
        dist_new = np.linalg.norm(unit - self.centroids[new_cluster]) ** 2
        dist_old = np.linalg.norm(unit - self.centroids[old_cluster]) ** 2
        # The small constant keeps the division well-defined for tiny gaps.
        return (dist_new - dist_old) / (gap + 1e-9)

    def _get_transfer_records(self, top_m: int):
        """Return the best score and the ``top_m`` cheapest transfers.

        For every (unit, current cluster) pair with non-zero membership the
        destination with the lowest transfer score is recorded.

        Returns:
            ``(best_cost, records)`` where ``records`` is an integer array of
            rows ``(data_index, old_cluster, new_cluster)`` sorted by score.
        """
        records = []
        for i in range(self.N):
            for j_old in np.nonzero(self.membership[i])[0]:
                scores = [
                    self._transfer_score(i, j_old, j_new) for j_new in range(self.k)
                ]
                j_best = int(np.argmin(scores))
                # Reuse the score computed above rather than evaluating it again.
                records.append((scores[j_best], i, j_old, j_best))

        if not records:
            # Nothing is assigned anywhere, so nothing can be transferred.
            return np.inf, np.empty((0, 3), dtype=int)

        records = np.array(records)
        ordered = records[np.argsort(records[:, 0])]
        return ordered[0, 0], ordered[:top_m, 1:].astype(int)

    def _transfer_percent(self, data_index: int, old_cluster: int, new_cluster: int):
        """Return the membership fraction of the unit to move between clusters.

        When both clusters lie on the same side of the goal, the amount that
        would equalise the two totals is used; otherwise the amount is limited
        so that neither cluster crosses the goal.  The result never exceeds the
        unit's current membership in ``old_cluster``.
        """
        unit_total = np.sum(self.X_feature[data_index])
        t_old = self.Ti[old_cluster]
        t_new = self.Ti[new_cluster]
        held = self.membership[data_index, old_cluster]

        same_side = (t_old >= self.goal and t_new >= self.goal) or (
            t_old <= self.goal and t_new <= self.goal
        )
        if same_side:
            return min(held, (t_old - t_new) / (2 * unit_total))
        return min(
            held,
            (t_old - self.goal) / unit_total,
            (self.goal - t_new) / unit_total,
        )

    def _transfer(self, data_index: int, old_cluster: int, new_cluster: int) -> None:
        """Move the computed membership fraction from one cluster to the other."""
        transfer_percent = self._transfer_percent(data_index, old_cluster, new_cluster)
        self.membership[data_index, old_cluster] -= transfer_percent
        self.membership[data_index, new_cluster] += transfer_percent

    def _no_transfer_possible(self, best_cost: float) -> bool:
        """Return True when even the best candidate transfer is infeasible."""
        return best_cost == np.inf

    def _is_transfer_possible(self, old_cluster: int, new_cluster: int) -> bool:
        """Return True when the old cluster's total still exceeds the new one's."""
        return self.Ti[old_cluster] - self.Ti[new_cluster] > 10**-self.tolerance

    def _stop_codition(self, tol) -> bool:
        """Return True when every cluster total is within ``10**-tol`` of the goal."""
        return np.all(np.abs(self.Ti - self.goal) < 10**-tol)

    def _expected_num_transfers(self) -> int:
        """Estimate how many transfers to attempt in one iteration (at least 1).

        The largest gap between two cluster totals is divided by twice the mean
        non-zero amount a unit currently contributes to a cluster.
        """
        max_diff_sum = np.max(self.Ti - self.Ti[:, None])
        possible_transfers = self.X_feature[:, np.newaxis] * self.membership
        mean_nonzero_transfers = np.mean(
            possible_transfers[np.nonzero(possible_transfers)]
        )
        return max(int(np.floor(max_diff_sum / (2 * mean_nonzero_transfers))), 1)

    def _update_centroids(self) -> None:
        """Set each centroid to the unweighted mean of units with any membership.

        A cluster without members keeps its previous centroid; averaging an
        empty selection would otherwise store NaN and poison later scores.
        """
        for i in range(self.k):
            members = self.membership[:, i] > 0
            if members.any():
                self.centroids[i] = np.mean(self.Y_features[members], axis=0)

    def fit(
        self,
        Y_features: NDArray,
        X_feature: NDArray,
        centroids: NDArray,
        labels: NDArray,
        max_iteration: int = 1000,
        *,
        verbose: bool = True,
    ) -> None:
        """Rebalance the clustering described by ``labels`` and ``centroids``.

        Args:
            Y_features: Array of unit coordinates used for distances.
            X_feature: One-dimensional array of the quantity to balance.
            centroids: Starting centroids; copied, never modified.
            labels: Integer cluster label per unit.
            max_iteration: Upper bound on rebalancing iterations.
            verbose: When True (the default), print the cluster totals at the
                start of every iteration.
        """
        self.N = len(X_feature)
        self.Y_features = Y_features
        self.X_feature = X_feature
        # Work on a private float copy: _update_centroids assigns rows in
        # place and must not overwrite the array owned by the caller.
        self.centroids = np.array(centroids, dtype=float)
        self.labels = labels
        self.goal = self._generate_goal()
        self.membership = self._generate_membership()
        self.Ti = self._generate_Ti()
        self._update_centroids()
        iter_ = 0

        while not self._stop_codition(self.tolerance) and iter_ < max_iteration:
            if verbose:
                print(f"\nIteration {iter_}")
                print(f"Ti: {self.Ti}")
                print(f"Sum Ti: {np.sum(self.Ti)}")
            best_cost, transfer_records = self._get_transfer_records(
                top_m=self._expected_num_transfers()
            )
            if self._no_transfer_possible(best_cost):
                break
            for data_index, old_cluster, new_cluster in transfer_records:
                # Totals change after every transfer, so feasibility is re-checked.
                if self._is_transfer_possible(old_cluster, new_cluster):
                    self._transfer(data_index, old_cluster, new_cluster)
                    self.Ti = self._generate_Ti()
            self._update_centroids()
            iter_ += 1

    def get_clusters(self) -> list:
        """Return one array per cluster describing its (partial) members.

        Each row is ``[unit index, *Y_features[unit], share]`` where ``share``
        is the unit's membership in the cluster times its ``X_feature`` value.
        Units with a zero share are omitted.
        """
        clusters = []

        for i in range(self.k):
            x = self.membership[:, i] * self.X_feature
            ids = np.nonzero(x)[0]
            units = np.concatenate(
                [ids.reshape(-1, 1), self.Y_features[ids], x[ids].reshape(-1, 1)],
                axis=1,
            )
            clusters.append(units)

        return clusters
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from numpy.typing import NDArray
|
|
3
|
+
from sklearn.cluster import KMeans
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class OneBoundaryBalancedKMeans:
    """K-means followed by filling the deficit of one cluster per iteration.

    ``fit`` runs scikit-learn ``KMeans`` on ``Y_features`` and then, for a
    bounded number of iterations, picks one cluster whose total of
    ``x_feature`` is below ``x_feature.sum() / k`` and assigns it (fractions
    of) the nearest available points.

    Args:
        k: Number of clusters.
        initial_centroids: Optional starting centroids handed to ``KMeans``;
            ``None`` selects ``"k-means++"`` initialisation.
        tolerance: Exponent of the numerical tolerance ``10**-tolerance``.
    """

    def __init__(
        self, k: int, *, initial_centroids: NDArray = None, tolerance: int = 5
    ) -> None:
        self.k = k
        self.tolerance = tolerance
        self.Y_features = None
        self.x_feature = None
        self.N = None
        self.centroids = initial_centroids
        self.labels: NDArray = None
        self.membership: NDArray = None
        self.Ti: NDArray = None
        self.goal: float = None
        # Fraction of each point already earmarked; reset at the start of fit().
        self.locked_frac: dict[int, float] = {}
        self.rng = np.random.default_rng()

    def _generate_goal(self) -> float:
        """Return the target total per cluster: ``x_feature.sum() / k``."""
        return self.x_feature.sum() / self.k

    def _generate_membership(self) -> NDArray:
        """Return the ``(N, k)`` one-hot membership matrix built from ``labels``."""
        membership = np.zeros((self.N, self.k))
        membership[np.arange(self.N), np.asarray(self.labels)] = 1
        return membership

    def _generate_Ti(self) -> NDArray:
        """Return the length-``k`` vector of membership-weighted cluster totals."""
        return np.sum(
            self.x_feature[:, np.newaxis] * self.membership,
            axis=0,
        ).T

    def _determine_points_to_gain(self) -> dict[int, list[tuple[int, float]]]:
        """
        For each cluster below goal, compute the points (and fractions) needed to fill its deficit,
        then score each cluster by (mean distance of those points to its centroid) / deficit,
        and finally return the gains for the single cluster with the lowest score.
        Restriction: once any fraction of a point is earmarked for a cluster, that fraction is
        'locked' and cannot be reused for another cluster. If 100% is locked,
        the point is out entirely; if 50% is locked, only the remaining 50% can be used elsewhere.

        Locks are recorded in ``self.locked_frac`` for every cluster that is evaluated,
        including clusters that are not the one finally returned.

        Returns:
            ``{cluster_idx: [(point_idx, fraction), ...]}`` with a single entry, or an
            empty dict when no cluster below goal can gain anything.
        """
        deficits = self.goal - self.Ti
        cluster_gains: dict[int, list[tuple[int, float]]] = {}
        cluster_scores: dict[int, float] = {}

        for cluster_idx, deficit in enumerate(deficits):
            if deficit <= 0:
                continue

            # Points not yet fully in this cluster and not fully locked.
            candidates = [
                j
                for j in range(self.N)
                if (1 - self.membership[j, cluster_idx]) - self.locked_frac.get(j, 0.0) > 0
            ]
            if not candidates:
                continue

            # Visit candidates from nearest to farthest from the centroid.
            dists = np.linalg.norm(
                self.Y_features[candidates] - self.centroids[cluster_idx], axis=1
            )

            gains: list[tuple[int, float]] = []
            dists_for_gains: list[float] = []
            accum = 0.0

            for pos in np.argsort(dists):
                pt_idx = candidates[pos]
                locked = self.locked_frac.get(pt_idx, 0.0)
                # Fraction of the whole point still free to move into this cluster.
                avail_frac = (1 - self.membership[pt_idx, cluster_idx]) - locked
                if avail_frac <= 0:
                    continue
                avail_mass = self.x_feature[pt_idx] * avail_frac

                if accum + avail_mass <= deficit:
                    # The whole available fraction fits inside the deficit.
                    gains.append((pt_idx, avail_frac))
                    dists_for_gains.append(dists[pos])
                    accum += avail_mass
                    self.locked_frac[pt_idx] = locked + avail_frac
                    continue

                # Only part of this point is needed; it is the last one taken.
                needed = deficit - accum
                frac_move = min(needed / self.x_feature[pt_idx], avail_frac)
                if frac_move > 0:
                    gains.append((pt_idx, frac_move))
                    dists_for_gains.append(dists[pos])
                    self.locked_frac[pt_idx] = locked + frac_move
                break

            if gains:
                mean_dist = sum(dists_for_gains) / len(dists_for_gains)
                cluster_gains[cluster_idx] = gains
                cluster_scores[cluster_idx] = mean_dist / deficit

        if not cluster_scores:
            return {}

        # Choose the cluster with the lowest score.
        best = min(cluster_scores, key=cluster_scores.get)
        return {best: cluster_gains[best]}

    def _assign_points_to_clusters(self, points_to_gain: dict[int, list[tuple[int, float]]]) -> None:
        """
        Assigns the computed gains for the single target cluster:
        - `frac` is the additional fraction of the point to assign to the cluster.
        - New cluster membership = previous membership + frac (capped at 1.0).
        - The remaining mass (1 − new_membership) is redistributed among the other clusters
          in proportion to their previous memberships.

        Cluster totals ``self.Ti`` are recomputed afterwards.
        """
        if not points_to_gain:
            return

        # Only the first (single) entry is used.
        cluster_idx, gains = next(iter(points_to_gain.items()))
        for pt_idx, add_frac in gains:
            prev = self.membership[pt_idx].copy()

            new_cluster_mem = min(prev[cluster_idx] + add_frac, 1.0)
            rem_mass = 1.0 - new_cluster_mem

            # Previous memberships in all other clusters.
            others = prev.copy()
            others[cluster_idx] = 0.0
            total_others = others.sum()

            new_mem = np.zeros_like(prev)
            new_mem[cluster_idx] = new_cluster_mem

            if rem_mass > 0:
                if total_others > 0:
                    # Scale the other clusters proportionally.
                    new_mem += (others / total_others) * rem_mass
                else:
                    # No other membership existed: keep the remainder here.
                    new_mem[cluster_idx] += rem_mass

            self.membership[pt_idx] = new_mem

        self.Ti = self._generate_Ti()

    def _update_centroids(self) -> None:
        """Set each centroid to the unweighted mean of points with any membership.

        A cluster without members keeps its previous centroid; averaging an
        empty selection would otherwise store NaN.
        """
        for i in range(self.k):
            members = self.membership[:, i] > 0
            if members.any():
                self.centroids[i] = np.mean(self.Y_features[members], axis=0)

    def _stop_codition(self) -> bool:
        """Return True when every cluster total is within tolerance of the goal."""
        return np.all(np.abs(self.Ti - self.goal) < 10**-self.tolerance)

    def fit(
        self,
        Y_features: NDArray,
        x_feature: NDArray,
        *,
        max_iter: int = 5,
        verbose: bool = True,
        plot: bool = True,
    ) -> None:
        """Cluster ``Y_features`` and rebalance the clusters on ``x_feature``.

        Args:
            Y_features: Array clustered by ``KMeans``; plotting unpacks its
                transpose into ``plt.scatter`` and each row into ``x, y``.
            x_feature: One-dimensional array of the quantity to balance.
            max_iter: Upper bound on rebalancing iterations.
            verbose: When True (the default), print progress every iteration.
            plot: When True (the default), create one matplotlib figure per
                iteration showing the clusters and the points just moved.
                Figures are left open for the caller to show or close.
        """
        self.Y_features = Y_features
        self.x_feature = x_feature
        self.N = x_feature.size
        self.goal = self._generate_goal()

        kmeans = KMeans(
            n_clusters=self.k,
            init=self.centroids if self.centroids is not None else "k-means++",
            n_init=10,
            tol=10**-self.tolerance,
        )
        kmeans.fit(self.Y_features)

        self.centroids = kmeans.cluster_centers_
        self.labels = kmeans.labels_
        self.membership = self._generate_membership()
        self.Ti = self._generate_Ti()

        iter_ = 0
        self.locked_frac: dict[int, float] = {}

        while not self._stop_codition() and iter_ < max_iter:
            if verbose:
                print("\nIteration:", iter_)
                print("(BEF) Ti:", self.Ti)
            points_to_gain = self._determine_points_to_gain()
            if not points_to_gain:
                break
            self._assign_points_to_clusters(points_to_gain)
            iter_ += 1

            if plot:
                points = np.array(list(points_to_gain.values())[0])
                points_id = points[:, 0].astype(int)
                points_frac = points[:, 1].astype(float)

                mem = np.argmax(self.membership, axis=1)
                plt.figure(figsize=(5, 5), dpi=200)
                plt.scatter(*Y_features.T, c=mem)
                plt.scatter(*Y_features[points_id].T, c='red', marker='*', s=10)
                plt.scatter(*self.centroids.T, c='black', marker='x', s=100)

                for i, point_id in enumerate(points_id):
                    x, y = Y_features[point_id]
                    # Label slightly below the marker: "<fraction>, <point id>".
                    plt.text(x, y - 0.05, f"{points_frac[i]:.2f}, {point_id}", color='black', fontsize=8, ha='center')

            if verbose:
                print("(AFT) Ti:", self.Ti)
                print("Diff Ti:", np.sum(np.abs(self.Ti - self.goal)))
                print("Points to gain:", points_to_gain)

            # NOTE(review): indentation was lost in the extracted source; the
            # centroid refresh is assumed to run once per iteration - confirm.
            self._update_centroids()
|