nystrom-ncut 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,371 @@
+ import logging
+ import math
+ from typing import Literal
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+
+
+ @torch.no_grad()
+ def run_subgraph_sampling(
+     features: torch.Tensor,
+     num_sample: int = 300,
+     max_draw: int = 1000000,
+     sample_method: Literal["farthest", "random"] = "farthest",
+ ):
+     if num_sample >= features.shape[0]:
+         # if too many samples, use all samples and bypass Nystrom-like approximation
+         logging.info(
+             "num_sample is larger than total, bypass Nystrom-like approximation"
+         )
+         sampled_indices = torch.arange(features.shape[0])
+     else:
+         # sample subgraph
+         if sample_method == "farthest":  # default
+             if num_sample > max_draw:
+                 logging.warning(
+                     f"num_sample is larger than max_draw, apply farthest point sampling on random sampled {max_draw} samples"
+                 )
+                 draw_indices = torch.randperm(features.shape[0])[:max_draw]
+                 sampled_indices = farthest_point_sampling(
+                     features[draw_indices].detach(),
+                     num_sample=num_sample,
+                 )
+                 sampled_indices = draw_indices[sampled_indices]
+             else:
+                 sampled_indices = farthest_point_sampling(
+                     features.detach(),
+                     num_sample=num_sample,
+                 )
+         elif sample_method == "random":  # not recommended
+             sampled_indices = torch.randperm(features.shape[0])[:num_sample]
+         else:
+             raise ValueError("sample_method should be 'farthest' or 'random'")
+     return sampled_indices
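+
+ # Usage sketch (illustrative only; the tensor shapes below are assumptions, not from this file):
+ #   feats = torch.randn(10000, 768)
+ #   idx = run_subgraph_sampling(feats, num_sample=300, sample_method="farthest")
+ #   landmark_feats = feats[idx]   # landmark set used for the Nystrom-like approximation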
+
+
+ def farthest_point_sampling(
+     features: torch.Tensor,
+     num_sample: int = 300,
+     h: int = 9,
+ ):
+     try:
+         import fpsample
+     except ImportError:
+         raise ImportError(
+             "fpsample import failed, please install `pip install fpsample`"
+         )
+
+     # PCA to reduce the dimension
+     if features.shape[1] > 8:
+         u, s, v = torch.pca_lowrank(features, q=8)
+         _n = features.shape[0]
+         s /= math.sqrt(_n)
+         features = u @ torch.diag(s)
+
+     h = min(h, int(np.log2(features.shape[0])))
+
+     kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(
+         features.cpu().numpy(), num_sample, h
+     ).astype(np.int64)
+     return torch.from_numpy(kdline_fps_samples_idx)
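+
+ # Usage sketch (illustrative; the input below is an assumption):
+ #   idx = farthest_point_sampling(torch.randn(50000, 512), num_sample=300)
+ # Features are first reduced to at most 8 dimensions with torch.pca_lowrank so that
+ # fpsample.bucket_fps_kdline_sampling runs on a low-dimensional array.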
+
+
+ def distance_from_features(
+     features: torch.Tensor,
+     features_B: torch.Tensor,
+     distance: Literal["cosine", "euclidean", "rbf"],
+     fill_diagonal: bool,
+ ):
+     """Compute distance matrix from input features.
+     Args:
+         features (torch.Tensor): input features, shape (n_samples, n_features)
+         features_B (torch.Tensor): second set of features, shape (n_samples_B, n_features)
+         distance (str): distance metric, one of 'cosine', 'euclidean', or 'rbf'
+         fill_diagonal (bool): if True, set the diagonal of the distance matrix to 0
+
+     Returns:
+         (torch.Tensor): distance matrix, shape (n_samples, n_samples_B)
+     """
+     # compute distance matrix from input features
+     if distance == "cosine":
+         if not check_if_normalized(features):
+             features = F.normalize(features, dim=-1)
+         if not check_if_normalized(features_B):
+             features_B = F.normalize(features_B, dim=-1)
+         D = 1 - features @ features_B.T
+     elif distance == "euclidean":
+         D = torch.cdist(features, features_B, p=2)
+     elif distance == "rbf":
+         D = torch.cdist(features, features_B, p=2) ** 2
+         D = D / (2 * features.var(dim=0).sum())
+     else:
+         raise ValueError("distance should be 'cosine' or 'euclidean', 'rbf'")
+
+     if fill_diagonal:
+         D[torch.arange(D.shape[0]), torch.arange(D.shape[0])] = 0
+     return D
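+
+ # Example (illustrative; inputs are assumptions):
+ #   A = torch.randn(100, 64); B = torch.randn(50, 64)
+ #   D = distance_from_features(A, B, distance="euclidean", fill_diagonal=False)   # shape (100, 50)
+ # Note: fill_diagonal assumes a square matrix (features and features_B index the same nodes);
+ # with 'rbf', squared distances are scaled by 2 * features.var(dim=0).sum() as the bandwidth.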
+
+
+ def affinity_from_features(
+     features: torch.Tensor,
+     features_B: torch.Tensor = None,
+     affinity_focal_gamma: float = 1.0,
+     distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
+     fill_diagonal: bool = True,
+ ):
+     """Compute affinity matrix from input features.
+
+     Args:
+         features (torch.Tensor): input features, shape (n_samples, n_features)
+         features_B (torch.Tensor, optional): if not None, compute affinity between features and features_B
+         affinity_focal_gamma (float): temperature of the affinity matrix; lower values reduce the edge weights
+             on weak connections, default 1.0
+         distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
+         fill_diagonal (bool): set the diagonal of the distance matrix to 0; must be False when features_B is given
+
+     Returns:
+         (torch.Tensor): affinity matrix, shape (n_samples, n_samples), or (n_samples, n_samples_B) if features_B is given
+     """
+     # compute affinity matrix from input features
+
+     # if features_B is not provided, compute affinity matrix on features x features
+     # if features_B is provided, compute affinity matrix on features x features_B
+     if features_B is not None:
+         assert not fill_diagonal, "fill_diagonal should be False when features_B is not None"
+     features_B = features if features_B is None else features_B
+
+     # compute distance matrix from input features
+     D = distance_from_features(features, features_B, distance, fill_diagonal)
+
+     # torch.exp makes the affinity matrix positive definite,
+     # lower affinity_focal_gamma reduces the weak edge weights
+     A = torch.exp(-D / affinity_focal_gamma)
+     return A
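+
+ # The affinity is A[i, j] = exp(-D[i, j] / affinity_focal_gamma), so affinity_focal_gamma acts as a
+ # temperature: smaller values suppress weak (large-distance) edges more aggressively.
+ # Usage sketch (illustrative; the input is an assumption):
+ #   A = affinity_from_features(torch.randn(300, 64), affinity_focal_gamma=0.5)   # (300, 300), cosine by default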
+
+
+ def propagate_knn(
+     subgraph_output: torch.Tensor,
+     inp_features: torch.Tensor,
+     subgraph_features: torch.Tensor,
+     knn: int = 10,
+     distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
+     affinity_focal_gamma: float = 1.0,
+     chunk_size: int = 8096,
+     device: str = None,
+     use_tqdm: bool = False,
+     move_output_to_cpu: bool = False,
+ ):
+     """A generic function to propagate outputs to new nodes using KNN.
+
+     Args:
+         subgraph_output (torch.Tensor): output from subgraph, shape (num_sample, D)
+         inp_features (torch.Tensor): features of the nodes to propagate to, shape (new_num_samples, n_features)
+         subgraph_features (torch.Tensor): features from subgraph, shape (num_sample, n_features)
+         knn (int): number of nearest neighbors used to propagate the eigenvectors
+         distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
+         affinity_focal_gamma (float): temperature of the affinity matrix, default 1.0
+         chunk_size (int): chunk size for matrix multiplication
+         device (str): device to use for computation, if None, will not change device
+         use_tqdm (bool): show progress bar when propagating eigenvectors from subgraph to full graph
+         move_output_to_cpu (bool): move each propagated chunk to CPU to save device memory
+
+     Returns:
+         torch.Tensor: propagated eigenvectors, shape (new_num_samples, D)
+
+     Examples:
+         >>> old_eigenvectors = torch.randn(3000, 20)
+         >>> old_features = torch.randn(3000, 100)
+         >>> new_features = torch.randn(200, 100)
+         >>> new_eigenvectors = propagate_knn(old_eigenvectors, new_features, old_features, knn=3)
+         >>> # new_eigenvectors.shape = (200, 20)
+
+     """
+     device = subgraph_output.device if device is None else device
+
+     if knn == 1:
+         return propagate_nearest(
+             subgraph_output,
+             inp_features,
+             subgraph_features,
+             chunk_size=chunk_size,
+             device=device,
+             move_output_to_cpu=move_output_to_cpu,
+         )
+
+     # used in nystrom_ncut
+     # propagate eigenvectors from subgraph to full graph
+     subgraph_output = subgraph_output.to(device)
+     V_list = []
+     iterator = range(0, inp_features.shape[0], chunk_size)
+     try:
+         assert use_tqdm
+         from tqdm import tqdm
+         iterator = tqdm(iterator, "propagate by KNN")
+     except (AssertionError, ImportError):
+         pass
+
+     subgraph_features = subgraph_features.to(device)
+     for i in iterator:
+         end = min(i + chunk_size, inp_features.shape[0])
+         _v = inp_features[i:end].to(device)
+         _A = affinity_from_features(subgraph_features, _v, affinity_focal_gamma, distance, False).mT
+
+         if knn is not None:
+             mask = torch.full_like(_A, True, dtype=torch.bool)
+             mask[torch.arange(end - i)[:, None], _A.topk(knn, dim=-1, largest=True).indices] = False
+             _A[mask] = 0.0
+             _A = F.normalize(_A, p=1, dim=-1)
+
+         _V = _A @ subgraph_output
+         if move_output_to_cpu:
+             _V = _V.cpu()
+         V_list.append(_V)
+
+     subgraph_output = torch.cat(V_list, dim=0)
+     return subgraph_output
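+
+ # Each propagated row is a weighted average of subgraph_output, with weights given by the
+ # top-`knn` affinities of that node to the subgraph, L1-normalized per row.
+ # Usage sketch (illustrative; sizes are assumptions):
+ #   sub_out = torch.randn(300, 20)     # e.g. eigenvectors on 300 landmark nodes
+ #   sub_feat = torch.randn(300, 100)
+ #   new_feat = torch.randn(5000, 100)
+ #   out = propagate_knn(sub_out, new_feat, sub_feat, knn=10)   # (5000, 20)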
+
+
+ def propagate_nearest(
+     subgraph_output: torch.Tensor,
+     inp_features: torch.Tensor,
+     subgraph_features: torch.Tensor,
+     distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
+     chunk_size: int = 8096,
+     device: str = None,
+     move_output_to_cpu: bool = False,
+ ):
+     device = subgraph_output.device if device is None else device
+     if distance == 'cosine':
+         if not check_if_normalized(inp_features):
+             inp_features = F.normalize(inp_features, dim=-1)
+         if not check_if_normalized(subgraph_features):
+             subgraph_features = F.normalize(subgraph_features, dim=-1)
+
+     # used in nystrom_tsne, equivalent to propagate_knn with knn=1
+     # propagate t-SNE from subgraph to full graph
+     V_list = []
+     subgraph_features = subgraph_features.to(device)
+     for i in range(0, inp_features.shape[0], chunk_size):
+         end = min(i + chunk_size, inp_features.shape[0])
+         _v = inp_features[i:end].to(device)
+         _A = -distance_from_features(subgraph_features, _v, distance, False).mT
+
+         # keep top1 for each row
+         top_idx = _A.argmax(dim=-1).cpu()
+         _V = subgraph_output[top_idx]
+         if move_output_to_cpu:
+             _V = _V.cpu()
+         V_list.append(_V)
+
+     subgraph_output = torch.cat(V_list, dim=0)
+     return subgraph_output
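+
+ # Each new node copies the output of its single nearest subgraph node (negated distance + argmax),
+ # so propagate_nearest(sub_out, new_feat, sub_feat) matches propagate_knn(..., knn=1) in the sketch above.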
+
+
+ # wrapper function for adding new nodes to an existing graph
+ def propagate_eigenvectors(
+     eigenvectors: torch.Tensor,
+     features: torch.Tensor,
+     new_features: torch.Tensor,
+     knn: int,
+     num_sample: int,
+     sample_method: Literal["farthest", "random"],
+     chunk_size: int,
+     device: str,
+     use_tqdm: bool,
+ ):
+     """Propagate eigenvectors to new nodes using KNN. Note: this is equivalent to the class API `NCUT.transform(new_features)`, except that the subgraph sampling is re-done in this function.
+     Args:
+         eigenvectors (torch.Tensor): eigenvectors from existing nodes, shape (n_samples, num_eig)
+         features (torch.Tensor): features from existing nodes, shape (n_samples, n_features)
+         new_features (torch.Tensor): features from new nodes, shape (n_new_samples, n_features)
+         knn (int): number of nearest neighbors used to propagate eigenvectors, default 3
+         num_sample (int): number of samples for subgraph sampling, default 50000
+         sample_method (str): sample method, 'farthest' (default) or 'random'
+         chunk_size (int): chunk size for matrix multiplication, default 8096
+         device (str): device to use for computation, if None, will not change device
+         use_tqdm (bool): show progress bar when propagating eigenvectors from subgraph to full graph
+
+     Returns:
+         torch.Tensor: propagated eigenvectors, shape (n_new_samples, num_eig)
+
+     Examples:
+         >>> old_eigenvectors = torch.randn(3000, 20)
+         >>> old_features = torch.randn(3000, 100)
+         >>> new_features = torch.randn(200, 100)
+         >>> new_eigenvectors = propagate_eigenvectors(old_eigenvectors, old_features, new_features,
+         ...     knn=3, num_sample=300, sample_method="farthest", chunk_size=8096, device=None, use_tqdm=False)
+         >>> # new_eigenvectors.shape = (200, 20)
+     """
+
+     device = eigenvectors.device if device is None else device
+
+     # sample subgraph
+     subgraph_indices = run_subgraph_sampling(
+         features,
+         num_sample=num_sample,
+         sample_method=sample_method,
+     )
+
+     subgraph_eigenvectors = eigenvectors[subgraph_indices].to(device)
+     subgraph_features = features[subgraph_indices].to(device)
+     new_features = new_features.to(device)
+
+     # propagate eigenvectors from subgraph to new nodes
+     new_eigenvectors = propagate_knn(
+         subgraph_eigenvectors,
+         new_features,
+         subgraph_features,
+         knn=knn,
+         chunk_size=chunk_size,
+         device=device,
+         use_tqdm=use_tqdm,
+     )
+
+     return new_eigenvectors
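+
+ # This wrapper re-samples a landmark subgraph from `features` and then calls propagate_knn;
+ # per the docstring it mirrors NCUT.transform(new_features), except that sampling is redone here.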
+
+
+ def check_if_normalized(x, n=1000):
+     """check if the input tensor is normalized (unit norm)"""
+     n = min(n, x.shape[0])
+     random_indices = torch.randperm(x.shape[0])[:n]
+     _x = x[random_indices]
+     flag = torch.allclose(torch.norm(_x, dim=-1), torch.ones(n, device=x.device))
+     return flag
+
+
+ def quantile_min_max(x, q1=0.01, q2=0.99, n_sample=10000):
+     if x.shape[0] > n_sample:
+         np.random.seed(0)
+         random_idx = np.random.choice(x.shape[0], n_sample, replace=False)
+         vmin, vmax = x[random_idx].quantile(q1), x[random_idx].quantile(q2)
+     else:
+         vmin, vmax = x.quantile(q1), x.quantile(q2)
+     return vmin, vmax
+
+
+ def quantile_normalize(x, q=0.95):
+     """normalize x to the [0, 1] range using the q-th quantile as the maximum; this is robust to outliers
+     1. take the (1-q)-th and q-th quantiles of x
+     2. min_value -> (1-q)-th quantile, max_value -> q-th quantile
+     3. normalize: x = (x - min_value) / (max_value - min_value), then clamp to [0, 1]
+
+     Args:
+         x (torch.Tensor): input tensor, shape (n_samples, n_features)
+         q (float): quantile, default 0.95
+
+     Returns:
+         torch.Tensor: quantile normalized tensor
+     """
+     # normalize x to 0-1 range, max value is q-th quantile
+     # quantile makes the normalization robust to outliers
+     if isinstance(x, np.ndarray):
+         x = torch.tensor(x)
+     vmin, vmax = quantile_min_max(x, 1 - q, q)
+     x = (x - vmin) / (vmax - vmin)
+     x = x.clamp(0, 1)
+     return x
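+
+ # Example (illustrative): quantile_normalize(torch.randn(10000, 3), q=0.95) maps the
+ # 5th..95th percentile range of all entries onto [0, 1] and clamps values outside it.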