nystrom-ncut 0.0.1__py3-none-any.whl

import logging
import math
from typing import Literal

import numpy as np
import torch
import torch.nn.functional as F


@torch.no_grad()
def run_subgraph_sampling(
    features: torch.Tensor,
    num_sample: int = 300,
    max_draw: int = 1000000,
    sample_method: Literal["farthest", "random"] = "farthest",
):
    if num_sample >= features.shape[0]:
        # the requested sample covers the whole graph: use all nodes and
        # bypass the Nystrom-like approximation
        logging.info(
            "num_sample is larger than the total number of samples, bypassing Nystrom-like approximation"
        )
        sampled_indices = torch.arange(features.shape[0])
    else:
        # sample subgraph
        if sample_method == "farthest":  # default
            if features.shape[0] > max_draw:
                # cap the cost of farthest point sampling: draw max_draw points
                # at random first, then run FPS on that subset
                logging.warning(
                    f"the total number of samples is larger than max_draw, applying farthest point sampling on {max_draw} randomly drawn samples"
                )
                draw_indices = torch.randperm(features.shape[0])[:max_draw]
                sampled_indices = farthest_point_sampling(
                    features[draw_indices].detach(),
                    num_sample=num_sample,
                )
                sampled_indices = draw_indices[sampled_indices]
            else:
                sampled_indices = farthest_point_sampling(
                    features.detach(),
                    num_sample=num_sample,
                )
        elif sample_method == "random":  # not recommended
            sampled_indices = torch.randperm(features.shape[0])[:num_sample]
        else:
            raise ValueError("sample_method should be 'farthest' or 'random'")
    return sampled_indices
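
# Illustrative usage (a sketch, not part of the module API; the random features
# and shapes below are assumptions, and "farthest" requires the optional
# `fpsample` dependency):
# >>> feats = torch.randn(5000, 64)
# >>> indices = run_subgraph_sampling(feats, num_sample=300, sample_method="farthest")
# >>> indices.shape  # torch.Size([300])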


def farthest_point_sampling(
    features: torch.Tensor,
    num_sample: int = 300,
    h: int = 9,
):
    try:
        import fpsample
    except ImportError:
        raise ImportError(
            "fpsample import failed, please install it with `pip install fpsample`"
        )

    # PCA to reduce the dimension before farthest point sampling
    if features.shape[1] > 8:
        u, s, v = torch.pca_lowrank(features, q=8)
        _n = features.shape[0]
        s /= math.sqrt(_n)
        features = u @ torch.diag(s)

    # the kd-tree height h must not exceed log2 of the number of points
    h = min(h, int(np.log2(features.shape[0])))

    kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(
        features.cpu().numpy(), num_sample, h
    ).astype(np.int64)
    return torch.from_numpy(kdline_fps_samples_idx)
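
# Illustrative usage (a sketch; shapes are assumptions for the example, and the
# call requires `fpsample` to be installed):
# >>> feats = torch.randn(4096, 128)
# >>> idx = farthest_point_sampling(feats, num_sample=100)
# >>> idx.shape  # torch.Size([100])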


def distance_from_features(
    features: torch.Tensor,
    features_B: torch.Tensor,
    distance: Literal["cosine", "euclidean", "rbf"],
    fill_diagonal: bool,
):
    """Compute distance matrix from input features.

    Args:
        features (torch.Tensor): input features, shape (n_samples, n_features)
        features_B (torch.Tensor): second set of features, shape (m_samples, n_features)
        distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
        fill_diagonal (bool): if True, set the diagonal of the distance matrix to 0

    Returns:
        (torch.Tensor): distance matrix, shape (n_samples, m_samples)
    """
    # compute distance matrix from input features
    if distance == "cosine":
        if not check_if_normalized(features):
            features = F.normalize(features, dim=-1)
        if not check_if_normalized(features_B):
            features_B = F.normalize(features_B, dim=-1)
        D = 1 - features @ features_B.T
    elif distance == "euclidean":
        D = torch.cdist(features, features_B, p=2)
    elif distance == "rbf":
        # squared euclidean distance, scaled by the total feature variance
        D = torch.cdist(features, features_B, p=2) ** 2
        D = D / (2 * features.var(dim=0).sum())
    else:
        raise ValueError("distance should be 'cosine', 'euclidean', or 'rbf'")

    if fill_diagonal:
        D[torch.arange(D.shape[0]), torch.arange(D.shape[0])] = 0
    return D


def affinity_from_features(
    features: torch.Tensor,
    features_B: torch.Tensor = None,
    affinity_focal_gamma: float = 1.0,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    fill_diagonal: bool = True,
):
    """Compute affinity matrix from input features.

    Args:
        features (torch.Tensor): input features, shape (n_samples, n_features)
        features_B (torch.Tensor, optional): if not None, compute affinity between features and features_B
        affinity_focal_gamma (float): affinity matrix temperature, lower values reduce the edge weights
            on weak connections, default 1.0
        distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
        fill_diagonal (bool): if True, zero the diagonal of the distance matrix before exponentiation

    Returns:
        (torch.Tensor): affinity matrix, shape (n_samples, n_samples)
    """
    # if features_B is not provided, compute the affinity matrix on features x features;
    # if features_B is provided, compute it on features x features_B
    if features_B is not None:
        assert not fill_diagonal, "fill_diagonal should be False when features_B is not None"
    features_B = features if features_B is None else features_B

    # compute distance matrix from input features
    D = distance_from_features(features, features_B, distance, fill_diagonal)

    # torch.exp makes the affinity matrix positive definite;
    # lower affinity_focal_gamma shrinks the weak edge weights
    A = torch.exp(-D / affinity_focal_gamma)
    return A
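
# Illustrative usage (a sketch; random features stand in for real data):
# >>> feats = torch.randn(10, 32)
# >>> A = affinity_from_features(feats, affinity_focal_gamma=0.5, distance="cosine")
# >>> A.shape  # torch.Size([10, 10])
# >>> A.diagonal()  # all ones: the zeroed diagonal distance maps to exp(0) = 1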


def propagate_knn(
    subgraph_output: torch.Tensor,
    inp_features: torch.Tensor,
    subgraph_features: torch.Tensor,
    knn: int = 10,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    affinity_focal_gamma: float = 1.0,
    chunk_size: int = 8096,
    device: str = None,
    use_tqdm: bool = False,
    move_output_to_cpu: bool = False,
):
    """A generic function to propagate outputs to new nodes using KNN.

    Args:
        subgraph_output (torch.Tensor): output from subgraph, shape (num_sample, D)
        inp_features (torch.Tensor): features from new nodes, shape (new_num_samples, n_features)
        subgraph_features (torch.Tensor): features from subgraph, shape (num_sample, n_features)
        knn (int): number of nearest neighbors used to propagate eigenvectors
        distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
        affinity_focal_gamma (float): affinity matrix temperature, lower values reduce weak edge weights
        chunk_size (int): chunk size for matrix multiplication
        device (str): device to use for computation, if None, will not change device
        use_tqdm (bool): show progress bar when propagating eigenvectors from subgraph to full graph
        move_output_to_cpu (bool): move each computed chunk to CPU to save device memory

    Returns:
        torch.Tensor: propagated eigenvectors, shape (new_num_samples, D)

    Examples:
        >>> old_eigenvectors = torch.randn(3000, 20)
        >>> old_features = torch.randn(3000, 100)
        >>> new_features = torch.randn(200, 100)
        >>> new_eigenvectors = propagate_knn(old_eigenvectors, new_features, old_features, knn=3)
        >>> # new_eigenvectors.shape = (200, 20)
    """
    device = subgraph_output.device if device is None else device

    if knn == 1:
        return propagate_nearest(
            subgraph_output,
            inp_features,
            subgraph_features,
            chunk_size=chunk_size,
            device=device,
            move_output_to_cpu=move_output_to_cpu,
        )

    # used in nystrom_ncut:
    # propagate eigenvectors from the subgraph to the full graph
    subgraph_output = subgraph_output.to(device)
    V_list = []
    iterator = range(0, inp_features.shape[0], chunk_size)
    try:
        # show a progress bar if requested and tqdm is available
        assert use_tqdm
        from tqdm import tqdm
        iterator = tqdm(iterator, "propagate by KNN")
    except (AssertionError, ImportError):
        pass

    subgraph_features = subgraph_features.to(device)
    for i in iterator:
        end = min(i + chunk_size, inp_features.shape[0])
        _v = inp_features[i:end].to(device)
        _A = affinity_from_features(subgraph_features, _v, affinity_focal_gamma, distance, False).mT

        if knn is not None:
            # keep only the top-knn affinities per row, then renormalize rows to sum to 1
            mask = torch.full_like(_A, True, dtype=torch.bool)
            mask[torch.arange(end - i)[:, None], _A.topk(knn, dim=-1, largest=True).indices] = False
            _A[mask] = 0.0
            _A = F.normalize(_A, p=1, dim=-1)

        _V = _A @ subgraph_output
        if move_output_to_cpu:
            _V = _V.cpu()
        V_list.append(_V)

    subgraph_output = torch.cat(V_list, dim=0)
    return subgraph_output


def propagate_nearest(
    subgraph_output: torch.Tensor,
    inp_features: torch.Tensor,
    subgraph_features: torch.Tensor,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    chunk_size: int = 8096,
    device: str = None,
    move_output_to_cpu: bool = False,
):
    """Propagate outputs to new nodes by copying from the single nearest subgraph node."""
    device = subgraph_output.device if device is None else device
    if distance == "cosine":
        if not check_if_normalized(inp_features):
            inp_features = F.normalize(inp_features, dim=-1)
        if not check_if_normalized(subgraph_features):
            subgraph_features = F.normalize(subgraph_features, dim=-1)

    # used in nystrom_tsne, equivalent to propagate_knn with knn=1:
    # propagate t-SNE from the subgraph to the full graph
    V_list = []
    subgraph_features = subgraph_features.to(device)
    for i in range(0, inp_features.shape[0], chunk_size):
        end = min(i + chunk_size, inp_features.shape[0])
        _v = inp_features[i:end].to(device)
        # negate the distance so that argmax picks the nearest subgraph node
        _A = -distance_from_features(subgraph_features, _v, distance, False).mT

        # keep top-1 for each row
        top_idx = _A.argmax(dim=-1).cpu()
        _V = subgraph_output[top_idx]
        if move_output_to_cpu:
            _V = _V.cpu()
        V_list.append(_V)

    subgraph_output = torch.cat(V_list, dim=0)
    return subgraph_output
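
# Illustrative sketch: with knn=1, propagation reduces to copying each new
# node's output from its single nearest subgraph node (shapes below are
# assumptions for the example only):
# >>> sub_out = torch.randn(300, 20)
# >>> sub_feats = torch.randn(300, 64)
# >>> new_feats = torch.randn(50, 64)
# >>> propagate_nearest(sub_out, new_feats, sub_feats).shape  # torch.Size([50, 20])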


# wrapper functions for adding new nodes to an existing graph
def propagate_eigenvectors(
    eigenvectors: torch.Tensor,
    features: torch.Tensor,
    new_features: torch.Tensor,
    knn: int,
    num_sample: int,
    sample_method: Literal["farthest", "random"],
    chunk_size: int,
    device: str,
    use_tqdm: bool,
):
    """Propagate eigenvectors to new nodes using KNN.

    Note: this is equivalent to the class API `NCUT.transform(new_features)`, except that
    the subgraph sampling is re-done inside this function.

    Args:
        eigenvectors (torch.Tensor): eigenvectors from existing nodes, shape (n_samples, num_eig)
        features (torch.Tensor): features from existing nodes, shape (n_samples, n_features)
        new_features (torch.Tensor): features from new nodes, shape (n_new_samples, n_features)
        knn (int): number of nearest neighbors used to propagate eigenvectors, e.g. 3
        num_sample (int): number of samples for subgraph sampling, e.g. 50000
        sample_method (str): sample method, 'farthest' or 'random'
        chunk_size (int): chunk size for matrix multiplication, e.g. 8096
        device (str): device to use for computation, if None, will not change device
        use_tqdm (bool): show progress bar when propagating eigenvectors from subgraph to full graph

    Returns:
        torch.Tensor: propagated eigenvectors, shape (n_new_samples, num_eig)

    Examples:
        >>> old_eigenvectors = torch.randn(3000, 20)
        >>> old_features = torch.randn(3000, 100)
        >>> new_features = torch.randn(200, 100)
        >>> new_eigenvectors = propagate_eigenvectors(
        ...     old_eigenvectors, old_features, new_features, knn=3,
        ...     num_sample=300, sample_method="farthest",
        ...     chunk_size=8096, device=None, use_tqdm=False,
        ... )
        >>> # new_eigenvectors.shape = (200, 20)
    """
    device = eigenvectors.device if device is None else device

    # sample subgraph
    subgraph_indices = run_subgraph_sampling(
        features,
        num_sample=num_sample,
        sample_method=sample_method,
    )

    subgraph_eigenvectors = eigenvectors[subgraph_indices].to(device)
    subgraph_features = features[subgraph_indices].to(device)
    new_features = new_features.to(device)

    # propagate eigenvectors from subgraph to new nodes
    new_eigenvectors = propagate_knn(
        subgraph_eigenvectors,
        new_features,
        subgraph_features,
        knn=knn,
        chunk_size=chunk_size,
        device=device,
        use_tqdm=use_tqdm,
    )

    return new_eigenvectors


def check_if_normalized(x, n=1000):
    """check if the input tensor is normalized (unit norm)"""
    n = min(n, x.shape[0])
    random_indices = torch.randperm(x.shape[0])[:n]
    _x = x[random_indices]
    flag = torch.allclose(torch.norm(_x, dim=-1), torch.ones(n, device=x.device))
    return flag
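
# Illustrative check (a sketch; `torch.randn` rows are almost surely not
# unit-norm, while `F.normalize` makes them so):
# >>> check_if_normalized(torch.randn(100, 8))                       # False
# >>> check_if_normalized(F.normalize(torch.randn(100, 8), dim=-1))  # True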


def quantile_min_max(x, q1=0.01, q2=0.99, n_sample=10000):
    if x.shape[0] > n_sample:
        # subsample (with a fixed seed) to bound the cost of the quantile computation
        np.random.seed(0)
        random_idx = np.random.choice(x.shape[0], n_sample, replace=False)
        vmin, vmax = x[random_idx].quantile(q1), x[random_idx].quantile(q2)
    else:
        vmin, vmax = x.quantile(q1), x.quantile(q2)
    return vmin, vmax


def quantile_normalize(x, q=0.95):
    """Normalize x to the [0, 1] range using quantiles, which is robust to outliers.

    1. sort x
    2. take the (1-q)-th quantile as min_value and the q-th quantile as max_value
    3. normalize: x = (x - min_value) / (max_value - min_value)

    Args:
        x (torch.Tensor): input tensor, shape (n_samples, n_features)
        q (float): quantile, default 0.95

    Returns:
        torch.Tensor: quantile-normalized tensor
    """
    # normalize x to the 0-1 range, where the max value is the q-th quantile;
    # quantiles make the normalization robust to outliers
    if isinstance(x, np.ndarray):
        x = torch.tensor(x)
    vmin, vmax = quantile_min_max(x, 1 - q, q)
    x = (x - vmin) / (vmax - vmin)
    x = x.clamp(0, 1)
    return x
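
# Illustrative usage (a sketch; random data stands in for real eigenvectors):
# >>> x = torch.randn(10000, 3) * 5
# >>> y = quantile_normalize(x, q=0.95)
# >>> float(y.min()), float(y.max())  # (0.0, 1.0): values outside the quantiles are clamped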