graphical-sampling 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ from importlib import metadata
2
+
3
+ from .design import Design
4
+ from . import criteria
5
+ from . import search
6
+ from . import sampling
7
+ from . import clustering
8
+ from . import random
9
+ from . import measure
10
+
11
+
12
# Resolve the installed distribution's version.  The wheel is published as
# "graphical-sampling", while the original lookup used "geometric_sampling";
# try the original name first (unchanged behaviour when it exists), then the
# published name, and finally fall back to a placeholder so that importing the
# package from an uninstalled source tree does not raise.
__version__ = "0.0.0+unknown"
for _dist_name in ("geometric_sampling", "graphical-sampling"):
    try:
        __version__ = metadata.version(_dist_name)
        break
    except metadata.PackageNotFoundError:
        continue
13
+
14
+ __all__ = ["Design", "criteria", "search", "sampling", "clustering", "random", "measure"]
@@ -0,0 +1,13 @@
1
+ from .soft_balanced_kmeans import SoftBalancedKMeans
2
+ from .aggregate import AggregateBalancedKMeans
3
+ from .one_boundary import OneBoundaryBalancedKMeans
4
+ from .dubly_balanced_clustering import DublyBalancedKMeans
5
+ # from .agg import Agg
6
+ # from .agg_one import AggOne
7
+ # from .aggregate import FinalAgg
8
+ # from .final2 import FinalAgg2
9
+ # from .swap import SwapAgg
10
+
11
+
12
+ __all__ = ["SoftBalancedKMeans", "AggregateBalancedKMeans", "OneBoundaryBalancedKMeans", "DublyBalancedKMeans"]
13
+ # __all__ = ["SoftBalancedKMeans", "AggregateBalancingKmeans", "Agg", "AggOne", "FinalAgg", "FinalAgg2", "SwapAgg"]
@@ -0,0 +1,213 @@
1
+ import numpy as np
2
+ from numpy.typing import NDArray
3
+ from sklearn.cluster import KMeans
4
+
5
+
6
class AggregateBalancedKMeans:
    """K-means clustering followed by fractional rebalancing of cluster totals.

    ``fit`` first runs scikit-learn ``KMeans`` on ``Y_features`` and then moves
    (fractions of) units between clusters until every cluster total ``Ti``
    (sum over units of ``membership * sum(X_features row)``) is within
    ``10**-tolerance`` of ``goal`` (the grand total divided by ``k``), no
    admissible transfer is left, or the iteration cap is reached.
    """

    def __init__(
        self, k: int, *, initial_centroids: NDArray = None, tolerance: int = 5
    ) -> None:
        """Store the configuration; all fitted state starts as ``None``.

        Args:
            k: Number of clusters.
            initial_centroids: Optional initial centres passed to ``KMeans``.
            tolerance: Exponent ``t`` of the ``10**-t`` threshold used both as
                the ``KMeans`` tolerance and as the balance tolerance.
        """
        self.k = k
        self.tolerance = tolerance
        self.Y_features = None
        self.X_features = None
        self.weights = None
        self.m = None
        self.N = None
        self.centroids = initial_centroids
        self.labels: NDArray = None
        self.membership: NDArray = None
        self.Ti: NDArray = None
        self.Tij: NDArray = None
        self.goal: float = None
        # Declared here so the attributes exist before ``fit`` is called.
        self.goal_j: NDArray = None
        self.Tij_cost: float = None
        self.rng = np.random.default_rng()

    def _generate_goal_j(self) -> NDArray:
        """Return the per-column target: column sums of ``X_features`` / ``k``."""
        return self.X_features.sum(axis=0) / self.k

    def _generate_goal(self) -> float:
        """Return the scalar target: the sum of the per-column targets."""
        return np.sum(self._generate_goal_j())

    def _generate_membership(self) -> NDArray:
        """Return the ``(N, k)`` one-hot membership matrix built from ``labels``."""
        membership = np.zeros((self.N, self.k))
        membership[np.arange(self.N), self.labels] = 1
        return membership

    def _generate_Tij(self) -> NDArray:
        """Return the ``(k, m)`` membership-weighted column totals per cluster."""
        return np.sum(
            self.X_features[:, :, np.newaxis] * self.membership[:, np.newaxis, :],
            axis=0,
        ).T

    def _generate_Ti(self) -> NDArray:
        """Return the length-``k`` cluster totals (row sums of ``Tij``)."""
        return np.sum(self._generate_Tij(), axis=1)

    def _generate_Tij_cost(self) -> float:
        """Return the sum of squared weighted deviations of ``Tij`` from ``goal_j``."""
        return np.sum((self.weights * (self.Tij - self.goal_j)) ** 2)

    def _transfer_score(
        self,
        data_index: int,
        old_cluster: int,
        new_cluster: int,
    ) -> float:
        """Score moving a unit from ``old_cluster`` to ``new_cluster``.

        Returns the increase in squared distance to the centroid divided by
        the gap ``Ti[old] - Ti[new]``, or ``inf`` when that gap does not
        exceed ``10**-tolerance`` (the move would not reduce imbalance).
        """
        gap = self.Ti[old_cluster] - self.Ti[new_cluster]
        if gap > 10**-self.tolerance:
            dist_new = np.linalg.norm(
                self.Y_features[data_index] - self.centroids[new_cluster]
            )
            dist_old = np.linalg.norm(
                self.Y_features[data_index] - self.centroids[old_cluster]
            )
            # The 1e-9 keeps the denominator away from zero.
            return (dist_new**2 - dist_old**2) / (gap + 1e-9)
        return np.inf

    def _get_transfer_records(self, top_m: int):
        """Return the best score and the ``top_m`` cheapest candidate transfers.

        For every (unit, cluster) pair with non-zero membership the cheapest
        destination cluster is selected.  The second return value is an int
        array whose rows are ``(unit, old_cluster, new_cluster)``, ordered by
        ascending score.
        """
        transfer_records = []

        for i in range(self.N):
            for j_old in np.nonzero(self.membership[i])[0]:
                # Score each destination once and reuse the minimum.
                scores = [
                    self._transfer_score(i, j_old, j_new) for j_new in range(self.k)
                ]
                j_new_min = np.argmin(scores)
                transfer_records.append((scores[j_new_min], i, j_old, j_new_min))

        transfer_records = np.array(transfer_records)
        sorted_transfer_records = transfer_records[np.argsort(transfer_records[:, 0])]
        best_cost = sorted_transfer_records[0, 0]
        top_m_transfer_records = sorted_transfer_records[:top_m, 1:].astype(int)

        return best_cost, top_m_transfer_records

    def _transfer_percent(self, data_index: int, old_cluster: int, new_cluster: int):
        """Return the membership fraction of the unit to move between clusters.

        The fraction never exceeds the unit's membership in ``old_cluster``.
        When both clusters lie on the same side of ``goal`` it is limited to
        half of the gap between them; otherwise it is limited so that neither
        cluster crosses ``goal``.
        """
        t_old = self.Ti[old_cluster]
        t_new = self.Ti[new_cluster]
        available = self.membership[data_index, old_cluster]
        unit_mass = np.sum(self.X_features[data_index])

        same_side = (t_old >= self.goal and t_new >= self.goal) or (
            t_old <= self.goal and t_new <= self.goal
        )
        if same_side:
            return min(available, (t_old - t_new) / (2 * unit_mass))
        return min(
            available,
            (t_old - self.goal) / unit_mass,
            (self.goal - t_new) / unit_mass,
        )

    def _transfer(self, data_index: int, old_cluster: int, new_cluster: int) -> None:
        """Move the computed membership fraction of one unit between clusters."""
        transfer_percent = self._transfer_percent(data_index, old_cluster, new_cluster)
        self.membership[data_index, old_cluster] -= transfer_percent
        self.membership[data_index, new_cluster] += transfer_percent

    def _no_transfer_possible(self, best_cost: float) -> bool:
        """Return True when even the best candidate transfer scored ``inf``."""
        return best_cost == np.inf

    def _is_transfer_possible(self, old_cluster: int, new_cluster: int) -> bool:
        """Return True while ``old_cluster`` exceeds ``new_cluster`` by more than the tolerance."""
        return self.Ti[old_cluster] - self.Ti[new_cluster] > 10**-self.tolerance

    def _stop_codition(self, tol) -> bool:
        """Return True when every cluster total is within ``10**-tol`` of ``goal``."""
        return np.all(np.abs(self.Ti - self.goal) < 10**-tol)

    def _expected_num_transfers(self) -> int:
        """Estimate how many transfers to apply in one iteration (at least 1).

        Half of the largest gap between two cluster totals divided by the mean
        non-zero per-unit mass currently assigned to a cluster.
        """
        # max over all pairwise differences equals max(Ti) - min(Ti).
        max_diff_sum = np.max(self.Ti) - np.min(self.Ti)
        possible_transfers = (
            self.X_features.sum(axis=1)[:, np.newaxis] * self.membership
        )
        nonzero_transfers = possible_transfers[np.nonzero(possible_transfers)]
        if nonzero_transfers.size == 0:
            # No mass assigned anywhere: the mean would be NaN and int() would raise.
            return 1
        mean_nonzero_transfers = np.mean(nonzero_transfers)
        return max(int(np.floor(max_diff_sum / (2 * mean_nonzero_transfers))), 1)

    def _update_centroids(self) -> None:
        """Set each centroid to the unweighted mean of the units with positive membership.

        A cluster with no member keeps its previous centroid; averaging an
        empty selection would otherwise write NaN and poison later scores.
        """
        for i in range(self.k):
            members = self.Y_features[self.membership[:, i] > 0]
            if len(members) > 0:
                self.centroids[i] = np.mean(members, axis=0)

    def fit(
        self,
        Y_features: NDArray,
        X_features: NDArray,
        weights: NDArray,
        max_iteration: int = 1000,
    ) -> None:
        """Cluster ``Y_features`` and rebalance the cluster totals of ``X_features``.

        Args:
            Y_features: Array of per-unit features used for distances.
            X_features: ``(N, m)`` array whose totals are balanced.
            weights: Weights applied to ``Tij - goal_j`` in ``Tij_cost``.
            max_iteration: Upper bound on rebalancing iterations
                (default 1000, the previously hard-coded value).
        """
        self.Y_features = Y_features
        self.X_features = X_features
        self.weights = weights
        self.m = X_features.shape[1]
        self.N = X_features.shape[0]
        self.goal_j = self._generate_goal_j()
        self.goal = self._generate_goal()

        has_initial_centroids = self.centroids is not None
        kmeans = KMeans(
            n_clusters=self.k,
            init=self.centroids if has_initial_centroids else "k-means++",
            # With explicit initial centres scikit-learn performs one run only
            # and warns when n_init != 1, so request exactly one in that case.
            n_init=1 if has_initial_centroids else 10,
            tol=10**-self.tolerance,
        )
        kmeans.fit(self.Y_features)

        self.centroids = kmeans.cluster_centers_
        self.labels = kmeans.labels_
        self.membership = self._generate_membership()
        self.Tij = self._generate_Tij()
        self.Ti = np.sum(self.Tij, axis=1)
        self.Tij_cost = self._generate_Tij_cost()
        iter_ = 0

        while not self._stop_codition(self.tolerance) and iter_ < max_iteration:
            best_cost, transfer_records = self._get_transfer_records(
                top_m=self._expected_num_transfers()
            )
            if self._no_transfer_possible(best_cost):
                break
            for data_index, old_cluster, new_cluster in transfer_records:
                # Totals change after every transfer, so re-check admissibility.
                if self._is_transfer_possible(old_cluster, new_cluster):
                    self._transfer(data_index, old_cluster, new_cluster)
                    self.Tij = self._generate_Tij()
                    self.Ti = np.sum(self.Tij, axis=1)
                    self.Tij_cost = self._generate_Tij_cost()
            self._update_centroids()
            iter_ += 1

    def get_clusters(self) -> list:
        """Return one array per cluster with rows ``[unit id, *Y_features, mass]``.

        ``mass`` is the unit's membership in the cluster times the sum of its
        ``X_features`` row (the same per-unit mass that ``Ti`` aggregates).
        Only units with non-zero mass are listed.
        """
        # Row sums work for any number of columns; flattening X_features only
        # matched the membership length when there was a single column.
        unit_mass = self.X_features.sum(axis=1)
        clusters = []

        for i in range(self.k):
            probs = self.membership[:, i] * unit_mass
            ids = np.nonzero(probs)[0]
            units = np.concatenate(
                [ids.reshape(-1, 1), self.Y_features[ids], probs[ids].reshape(-1, 1)],
                axis=1,
            )
            clusters.append(units)

        return clusters
@@ -0,0 +1,209 @@
1
+ import numpy as np
2
+ from numpy.typing import NDArray
3
+ from k_means_constrained import KMeansConstrained
4
+ from scipy.stats import mode
5
+
6
+
7
class DublyBalancedKMeans:
    """Size-constrained k-means on a replicated sample, refined by ``ContinuesBalancing``.

    Each unit is repeated ``round(prob / split_size)`` times, the replicated
    points are clustered with ``KMeansConstrained`` into clusters of nearly
    equal size, every unit receives the most frequent label among its
    replicas, and the result is passed to ``ContinuesBalancing`` for
    fractional rebalancing.
    """

    def __init__(self, k, split_size=0.01):
        """Store the number of clusters ``k`` and the probability mass per replica."""
        self.k = k
        self.split_size = split_size

    def _generate_expanded_coords(self, coords, probs):
        """Replicate every unit in proportion to its probability.

        Returns:
            ``(expanded_coords, expanded_idx)``: the repeated coordinate rows
            and, for each repeated row, the index of the unit it came from.
        """
        counts = (probs / self.split_size).round().astype(int)
        expanded_coords = np.repeat(coords, counts, axis=0)
        expanded_idx = np.repeat(np.arange(self.N), counts)
        return expanded_coords, expanded_idx

    def _generate_labels(self, extended_labels, expanded_idx, coords):
        """Collapse replica labels into one label per unit.

        A unit receives the most frequent label among its replicas (the
        smallest label wins ties).  A unit without replicas is assigned to the
        nearest centroid.

        Args:
            extended_labels: Cluster label of every replicated row, expected
                to lie in ``range(k)``.
            expanded_idx: Originating unit index of every replicated row.
            coords: Coordinates of the original units.
        """
        owner = np.asarray(expanded_idx, dtype=np.intp)
        assigned = np.asarray(extended_labels, dtype=np.intp)
        # One pass builds the (N, k) vote table instead of rescanning the
        # whole replicated sample once per unit.
        votes = np.bincount(
            owner * self.k + assigned, minlength=self.N * self.k
        ).reshape(self.N, self.k)

        # argmax returns the first maximum, i.e. the smallest label on ties.
        labels = votes.argmax(axis=1).astype(int)
        for i in np.flatnonzero(votes.sum(axis=1) == 0):
            labels[i] = np.argmin(np.linalg.norm(self.centroids - coords[i], axis=1))
        return labels

    def fit(self, coords, probs):
        """Cluster the units and expose the balanced result as attributes.

        After the call the instance carries ``centroids``, ``labels``,
        ``membership``, ``Ti``, ``goal`` and ``clusters`` as produced by
        ``ContinuesBalancing``.

        Args:
            coords: Array with one coordinate row per unit.
            probs: Array with one probability per unit.
        """
        self.N = coords.shape[0]
        expanded_coords, expanded_idx = self._generate_expanded_coords(coords, probs)
        cluster_size = len(expanded_idx) // self.k
        kmeans = KMeansConstrained(
            n_clusters=self.k,
            size_min=cluster_size,
            # +1 leaves room for the remainder when the replica count is not
            # divisible by k.
            size_max=cluster_size + 1,
        )
        labels = kmeans.fit_predict(expanded_coords)
        self.centroids = kmeans.cluster_centers_
        self.labels = self._generate_labels(labels, expanded_idx, coords)

        cb = ContinuesBalancing(self.k)
        cb.fit(coords, probs, self.centroids, self.labels)

        self.centroids = cb.centroids
        self.labels = cb.labels
        self.membership = cb.membership
        self.Ti = cb.Ti
        self.goal = cb.goal
        self.clusters = cb.get_clusters()
50
+
51
+
52
+
53
class ContinuesBalancing:
    """Fractionally rebalance an existing hard clustering.

    Starting from given labels and centroids, (fractions of) units are moved
    from heavier to lighter clusters until every cluster total ``Ti`` (sum of
    ``membership * X_feature``) is within ``10**-tolerance`` of ``goal``
    (``sum(X_feature) / k``), no admissible transfer is left, or the iteration
    cap is reached.
    """

    def __init__(
        self, k: int, *, tolerance: int = 3
    ) -> None:
        """Store ``k`` and the tolerance exponent; fitted state starts as ``None``."""
        self.k = k
        self.tolerance = tolerance
        self.N = None
        self.Y_features = None
        self.X_feature = None
        self.centroids = None
        self.labels: NDArray = None
        self.membership: NDArray = None
        self.Ti: NDArray = None
        self.goal: float = None

    def _generate_goal(self) -> float:
        """Return the target total per cluster: ``sum(X_feature) / k``."""
        return self.X_feature.sum() / self.k

    def _generate_membership(self) -> NDArray:
        """Return the ``(N, k)`` one-hot membership matrix built from ``labels``."""
        membership = np.zeros((self.N, self.k))
        membership[np.arange(self.N), self.labels] = 1
        return membership

    def _generate_Ti(self) -> NDArray:
        """Return the length-``k`` membership-weighted totals of ``X_feature``."""
        return np.sum(
            self.X_feature[:, np.newaxis] * self.membership,
            axis=0,
        )

    def _transfer_score(
        self,
        data_index: int,
        old_cluster: int,
        new_cluster: int,
    ) -> float:
        """Score moving a unit from ``old_cluster`` to ``new_cluster``.

        Returns the increase in squared distance to the centroid divided by
        the gap ``Ti[old] - Ti[new]``, or ``inf`` when that gap does not
        exceed ``10**-tolerance``.
        """
        gap = self.Ti[old_cluster] - self.Ti[new_cluster]
        if gap > 10**-self.tolerance:
            dist_new = np.linalg.norm(
                self.Y_features[data_index] - self.centroids[new_cluster]
            )
            dist_old = np.linalg.norm(
                self.Y_features[data_index] - self.centroids[old_cluster]
            )
            # The 1e-9 keeps the denominator away from zero.
            return (dist_new**2 - dist_old**2) / (gap + 1e-9)
        return np.inf

    def _get_transfer_records(self, top_m: int):
        """Return the best score and the ``top_m`` cheapest candidate transfers.

        The second return value is an int array whose rows are
        ``(unit, old_cluster, new_cluster)``, ordered by ascending score.
        """
        transfer_records = []

        for i in range(self.N):
            for j_old in np.nonzero(self.membership[i])[0]:
                # Score each destination once and reuse the minimum.
                scores = [
                    self._transfer_score(i, j_old, j_new) for j_new in range(self.k)
                ]
                j_new_min = np.argmin(scores)
                transfer_records.append((scores[j_new_min], i, j_old, j_new_min))

        transfer_records = np.array(transfer_records)
        sorted_transfer_records = transfer_records[np.argsort(transfer_records[:, 0])]
        best_cost = sorted_transfer_records[0, 0]
        top_m_transfer_records = sorted_transfer_records[:top_m, 1:].astype(int)

        return best_cost, top_m_transfer_records

    def _transfer_percent(self, data_index: int, old_cluster: int, new_cluster: int):
        """Return the membership fraction of the unit to move between clusters.

        The fraction never exceeds the unit's membership in ``old_cluster``.
        When both clusters lie on the same side of ``goal`` it is limited to
        half of the gap between them; otherwise it is limited so that neither
        cluster crosses ``goal``.
        """
        t_old = self.Ti[old_cluster]
        t_new = self.Ti[new_cluster]
        available = self.membership[data_index, old_cluster]
        unit_mass = np.sum(self.X_feature[data_index])

        same_side = (t_old >= self.goal and t_new >= self.goal) or (
            t_old <= self.goal and t_new <= self.goal
        )
        if same_side:
            return min(available, (t_old - t_new) / (2 * unit_mass))
        return min(
            available,
            (t_old - self.goal) / unit_mass,
            (self.goal - t_new) / unit_mass,
        )

    def _transfer(self, data_index: int, old_cluster: int, new_cluster: int) -> None:
        """Move the computed membership fraction of one unit between clusters."""
        transfer_percent = self._transfer_percent(data_index, old_cluster, new_cluster)
        self.membership[data_index, old_cluster] -= transfer_percent
        self.membership[data_index, new_cluster] += transfer_percent

    def _no_transfer_possible(self, best_cost: float) -> bool:
        """Return True when even the best candidate transfer scored ``inf``."""
        return best_cost == np.inf

    def _is_transfer_possible(self, old_cluster: int, new_cluster: int) -> bool:
        """Return True while ``old_cluster`` exceeds ``new_cluster`` by more than the tolerance."""
        return self.Ti[old_cluster] - self.Ti[new_cluster] > 10**-self.tolerance

    def _stop_codition(self, tol) -> bool:
        """Return True when every cluster total is within ``10**-tol`` of ``goal``."""
        return np.all(np.abs(self.Ti - self.goal) < 10**-tol)

    def _expected_num_transfers(self) -> int:
        """Estimate how many transfers to apply in one iteration (at least 1)."""
        # max over all pairwise differences equals max(Ti) - min(Ti).
        max_diff_sum = np.max(self.Ti) - np.min(self.Ti)
        possible_transfers = self.X_feature[:, np.newaxis] * self.membership
        nonzero_transfers = possible_transfers[np.nonzero(possible_transfers)]
        if nonzero_transfers.size == 0:
            # No mass assigned anywhere: the mean would be NaN and int() would raise.
            return 1
        mean_nonzero_transfers = np.mean(nonzero_transfers)
        return max(int(np.floor(max_diff_sum / (2 * mean_nonzero_transfers))), 1)

    def _update_centroids(self) -> None:
        """Set each centroid to the unweighted mean of the units with positive membership.

        A cluster with no member keeps its previous centroid; averaging an
        empty selection would otherwise write NaN and poison later scores.
        """
        for i in range(self.k):
            members = self.Y_features[self.membership[:, i] > 0]
            if len(members) > 0:
                self.centroids[i] = np.mean(members, axis=0)

    def fit(
        self,
        Y_features: NDArray,
        X_feature: NDArray,
        centroids: NDArray,
        labels: NDArray,
        max_iteration: int = 1000,
        *,
        verbose: bool = True,
    ) -> None:
        """Rebalance the clustering described by ``labels`` and ``centroids``.

        Args:
            Y_features: Array of per-unit features used for distances.
            X_feature: One-dimensional array whose cluster totals are balanced.
            centroids: Initial centroids; copied, never modified.
            labels: Initial hard cluster label of every unit.
            max_iteration: Upper bound on rebalancing iterations.
            verbose: When True (default, the historical behaviour) print the
                cluster totals at the start of every iteration.
        """
        self.N = len(X_feature)
        self.Y_features = Y_features
        self.X_feature = X_feature
        # Work on a private float copy: _update_centroids assigns in place,
        # which would otherwise overwrite the caller's array (and truncate the
        # means if that array had an integer dtype).
        self.centroids = np.array(centroids, dtype=float)
        self.labels = labels
        self.goal = self._generate_goal()
        self.membership = self._generate_membership()
        self.Ti = self._generate_Ti()
        self._update_centroids()
        iter_ = 0

        while not self._stop_codition(self.tolerance) and iter_ < max_iteration:
            if verbose:
                print(f"\nIteration {iter_}")
                print(f"Ti: {self.Ti}")
                print(f"Sum Ti: {np.sum(self.Ti)}")
            best_cost, transfer_records = self._get_transfer_records(
                top_m=self._expected_num_transfers()
            )
            if self._no_transfer_possible(best_cost):
                break
            for data_index, old_cluster, new_cluster in transfer_records:
                # Totals change after every transfer, so re-check admissibility.
                if self._is_transfer_possible(old_cluster, new_cluster):
                    self._transfer(data_index, old_cluster, new_cluster)
                    self.Ti = self._generate_Ti()
            self._update_centroids()
            iter_ += 1

    def get_clusters(self) -> list:
        """Return one array per cluster with rows ``[unit id, *Y_features, mass]``.

        ``mass`` is the unit's membership in the cluster times its
        ``X_feature`` value.  Only units with non-zero mass are listed.
        """
        clusters = []

        for i in range(self.k):
            x = self.membership[:, i] * self.X_feature
            ids = np.nonzero(x)[0]
            units = np.concatenate(
                [ids.reshape(-1, 1), self.Y_features[ids], x[ids].reshape(-1, 1)],
                axis=1,
            )
            clusters.append(units)

        return clusters
@@ -0,0 +1,233 @@
1
+ import numpy as np
2
+ from numpy.typing import NDArray
3
+ from sklearn.cluster import KMeans
4
+ import matplotlib.pyplot as plt
5
+
6
+
7
class OneBoundaryBalancedKMeans:
    """K-means followed by rounds that fill one under-target cluster at a time.

    After an initial scikit-learn ``KMeans`` run, each round picks one cluster
    whose total ``Ti`` (sum of ``membership * x_feature``) is below ``goal``
    (``sum(x_feature) / k``) and moves the nearest available (fractions of)
    units into it.
    """

    def __init__(
        self, k: int, *, initial_centroids: NDArray = None, tolerance: int = 5
    ) -> None:
        """Store the configuration; all fitted state starts empty.

        Args:
            k: Number of clusters.
            initial_centroids: Optional initial centres passed to ``KMeans``.
            tolerance: Exponent ``t`` of the ``10**-t`` threshold used both as
                the ``KMeans`` tolerance and as the balance tolerance.
        """
        self.k = k
        self.tolerance = tolerance
        self.Y_features = None
        self.x_feature = None
        self.N = None
        self.centroids = initial_centroids
        self.labels: NDArray = None
        self.membership: NDArray = None
        self.Ti: NDArray = None
        self.goal: float = None
        # Fraction of each point already earmarked; created here so the helper
        # methods can run on a freshly constructed instance (fit resets it).
        self.locked_frac: dict[int, float] = {}
        self.rng = np.random.default_rng()

    def _generate_goal(self) -> float:
        """Return the target total per cluster: ``sum(x_feature) / k``."""
        return self.x_feature.sum() / self.k

    def _generate_membership(self) -> NDArray:
        """Return the ``(N, k)`` one-hot membership matrix built from ``labels``."""
        membership = np.zeros((self.N, self.k))
        membership[np.arange(self.N), self.labels] = 1
        return membership

    def _generate_Ti(self) -> NDArray:
        """Return the length-``k`` membership-weighted totals of ``x_feature``."""
        return np.sum(self.x_feature[:, np.newaxis] * self.membership, axis=0)

    def _determine_points_to_gain(self) -> dict[int, list[tuple[int, float]]]:
        """Choose one under-target cluster and the point fractions it should gain.

        For each cluster below ``goal``, walk the candidate points from nearest
        to farthest and collect ``(point index, fraction)`` pairs until the
        deficit is covered.  Each such cluster is scored by
        ``mean distance of the collected points / deficit``; the gains of the
        lowest-scoring cluster are returned as ``{cluster: gains}`` (an empty
        dict when no cluster can gain anything).

        Every fraction collected is recorded in ``self.locked_frac`` and
        reduces what later clusters may take from the same point.

        NOTE(review): fractions are locked for every cluster evaluated, not
        only for the returned one, and ``fit`` initialises ``locked_frac`` once
        before its loop rather than per round - confirm this is intended.
        """
        deficits = self.goal - self.Ti
        cluster_gains: dict[int, list[tuple[int, float]]] = {}
        cluster_scores: dict[int, float] = {}

        for cluster_idx, deficit in enumerate(deficits):
            if deficit <= 0:
                continue

            # Points that are not fully in this cluster and not fully locked.
            candidates = [
                j
                for j in range(self.N)
                if (1 - self.membership[j, cluster_idx]) - self.locked_frac.get(j, 0.0)
                > 0
            ]
            if not candidates:
                continue

            centroid = self.centroids[cluster_idx]
            dists = np.linalg.norm(self.Y_features[candidates] - centroid, axis=1)
            sorted_candidates = [candidates[i] for i in np.argsort(dists)]

            gains: list[tuple[int, float]] = []
            dists_for_gains: list[float] = []
            accum = 0.0

            for pt_idx in sorted_candidates:
                locked = self.locked_frac.get(pt_idx, 0.0)
                # Fraction of the whole point still free to move into this cluster.
                avail_frac = (1 - self.membership[pt_idx, cluster_idx]) - locked
                if avail_frac <= 0:
                    continue
                avail_mass = self.x_feature[pt_idx] * avail_frac

                if accum + avail_mass <= deficit:
                    # The whole available fraction fits into the deficit.
                    frac_move = avail_frac
                    accum += avail_mass
                    deficit_covered = False
                else:
                    # Only part of this point is needed; it is the last one.
                    needed = deficit - accum
                    frac_move = min(needed / self.x_feature[pt_idx], avail_frac)
                    deficit_covered = True

                if not deficit_covered or frac_move > 0:
                    gains.append((pt_idx, frac_move))
                    dists_for_gains.append(
                        np.linalg.norm(self.Y_features[pt_idx] - centroid)
                    )
                    self.locked_frac[pt_idx] = locked + frac_move
                if deficit_covered:
                    break

            if gains:
                cluster_gains[cluster_idx] = gains
                cluster_scores[cluster_idx] = (
                    sum(dists_for_gains) / len(dists_for_gains)
                ) / deficit

        if not cluster_scores:
            return {}

        best = min(cluster_scores, key=cluster_scores.get)
        return {best: cluster_gains[best]}

    def _assign_points_to_clusters(self, points_to_gain: dict[int, list[tuple[int, float]]]) -> None:
        """Apply the gains computed for the single target cluster.

        For every ``(point, frac)`` pair the point's membership in the target
        cluster becomes ``previous + frac`` (capped at 1.0) and the remaining
        mass is spread over the other clusters in proportion to their previous
        memberships.  ``Ti`` is recomputed afterwards.
        """
        if not points_to_gain:
            return

        # Only the first (and, as produced by this class, only) entry is used.
        cluster_idx, gains = next(iter(points_to_gain.items()))
        for pt_idx, add_frac in gains:
            prev = self.membership[pt_idx].copy()

            new_cluster_mem = min(prev[cluster_idx] + add_frac, 1.0)
            rem_mass = 1.0 - new_cluster_mem

            prev_except = prev.copy()
            prev_except[cluster_idx] = 0.0
            total_prev_except = prev_except.sum()

            new_mem = np.zeros_like(prev)
            new_mem[cluster_idx] = new_cluster_mem

            if total_prev_except > 0 and rem_mass > 0:
                # Scale the other clusters proportionally.
                new_mem += (prev_except / total_prev_except) * rem_mass
            elif rem_mass > 0:
                # No other membership existed: the remainder stays here.
                new_mem[cluster_idx] += rem_mass

            self.membership[pt_idx] = new_mem

        self.Ti = self._generate_Ti()

    def _update_centroids(self) -> None:
        """Set each centroid to the unweighted mean of the units with positive membership.

        A cluster with no member keeps its previous centroid; averaging an
        empty selection would otherwise write NaN.
        """
        for i in range(self.k):
            points = self.Y_features[self.membership[:, i] > 0]
            if len(points) > 0:
                self.centroids[i] = np.mean(points, axis=0)

    def _stop_codition(self) -> bool:
        """Return True when every cluster total is within ``10**-tolerance`` of ``goal``."""
        return np.all(np.abs(self.Ti - self.goal) < 10**-self.tolerance)

    def _plot_round(self, points_to_gain: dict[int, list[tuple[int, float]]]) -> None:
        """Draw the current assignment and annotate the points gained this round.

        Creates a new matplotlib figure; it is neither shown nor closed here.
        # assumes Y_features has exactly two columns - TODO confirm
        """
        points = np.array(list(points_to_gain.values())[0])
        points_id, points_frac = points[:, 0], points[:, 1]
        points_id = points_id.astype(int)
        points_frac = points_frac.astype(float)

        mem = np.argmax(self.membership, axis=1)
        plt.figure(figsize=(5, 5), dpi=200)
        plt.scatter(*self.Y_features.T, c=mem)
        plt.scatter(*self.Y_features[points_id].T, c='red', marker='*', s=10)
        plt.scatter(*self.centroids.T, c='black', marker='x', s=100)

        for i, point_id in enumerate(points_id):
            x, y = self.Y_features[point_id]
            plt.text(x, y - 0.05, f"{points_frac[i]:.2f}, {point_id}", color='black', fontsize=8, ha='center')

    def fit(
        self,
        Y_features: NDArray,
        x_feature: NDArray,
        max_iteration: int = 5,
        *,
        diagnostics: bool = True,
    ) -> None:
        """Cluster ``Y_features`` and rebalance the cluster totals of ``x_feature``.

        Args:
            Y_features: Array of per-unit features used for distances.
            x_feature: One-dimensional array whose cluster totals are balanced.
            max_iteration: Upper bound on rebalancing rounds (default 5, the
                previously hard-coded value).
            diagnostics: When True (default, the historical behaviour) print
                progress and draw a matplotlib figure for every round.
        """
        self.Y_features = Y_features
        self.x_feature = x_feature
        self.N = x_feature.size
        self.goal = self._generate_goal()

        kmeans = KMeans(
            n_clusters=self.k,
            init=self.centroids if self.centroids is not None else "k-means++",
            n_init=10,
            tol=10**-self.tolerance,
        )
        kmeans.fit(self.Y_features)

        self.centroids = kmeans.cluster_centers_
        self.labels = kmeans.labels_
        self.membership = self._generate_membership()
        self.Ti = self._generate_Ti()

        iter_ = 0
        self.locked_frac = {}

        while not self._stop_codition() and iter_ < max_iteration:
            if diagnostics:
                print("\nIteration:", iter_)
                print("(BEF) Ti:", self.Ti)
            points_to_gain = self._determine_points_to_gain()
            if not points_to_gain:
                break
            self._assign_points_to_clusters(points_to_gain)
            iter_ += 1

            if diagnostics:
                # Plotted before the centroid update, so the figure shows the
                # centroids that were used to pick the points.
                self._plot_round(points_to_gain)
                print("(AFT) Ti:", self.Ti)
                print("Diff Ti:", np.sum(np.abs(self.Ti - self.goal)))
                print("Points to gain:", points_to_gain)

            self._update_centroids()