nystrom_ncut-0.0.1-py3-none-any.whl
- nystrom_ncut/__init__.py +22 -0
- nystrom_ncut/ncut_pytorch.py +561 -0
- nystrom_ncut/new_ncut_pytorch.py +241 -0
- nystrom_ncut/nystrom.py +170 -0
- nystrom_ncut/propagation_utils.py +371 -0
- nystrom_ncut/visualize_utils.py +655 -0
- nystrom_ncut-0.0.1.dist-info/LICENSE +19 -0
- nystrom_ncut-0.0.1.dist-info/METADATA +164 -0
- nystrom_ncut-0.0.1.dist-info/RECORD +11 -0
- nystrom_ncut-0.0.1.dist-info/WHEEL +5 -0
- nystrom_ncut-0.0.1.dist-info/top_level.txt +1 -0
nystrom_ncut/propagation_utils.py
@@ -0,0 +1,371 @@
import logging
import math
from typing import Literal

import numpy as np
import torch
import torch.nn.functional as F

@torch.no_grad()
def run_subgraph_sampling(
    features: torch.Tensor,
    num_sample: int = 300,
    max_draw: int = 1000000,
    sample_method: Literal["farthest", "random"] = "farthest",
):
    if num_sample >= features.shape[0]:
        # if num_sample covers all points, use every sample and bypass the Nystrom-like approximation
        logging.info(
            "num_sample is larger than total, bypass Nystrom-like approximation"
        )
        sampled_indices = torch.arange(features.shape[0])
    else:
        # sample subgraph
        if sample_method == "farthest":  # default
            if features.shape[0] > max_draw:
                # cap the candidate pool at max_draw points to bound FPS runtime
                logging.warning(
                    f"number of points ({features.shape[0]}) is larger than max_draw ({max_draw}), "
                    f"apply farthest point sampling on {max_draw} randomly drawn points"
                )
                draw_indices = torch.randperm(features.shape[0])[:max_draw]
                sampled_indices = farthest_point_sampling(
                    features[draw_indices].detach(),
                    num_sample=num_sample,
                )
                sampled_indices = draw_indices[sampled_indices]
            else:
                sampled_indices = farthest_point_sampling(
                    features.detach(),
                    num_sample=num_sample,
                )
        elif sample_method == "random":  # not recommended
            sampled_indices = torch.randperm(features.shape[0])[:num_sample]
        else:
            raise ValueError("sample_method should be 'farthest' or 'random'")
    return sampled_indices

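# Usage sketch (illustrative, with random stand-in features): pick a small
# landmark set for the Nystrom-style approximation.
# >>> feats = torch.randn(10000, 64)
# >>> idx = run_subgraph_sampling(feats, num_sample=300, sample_method="farthest")
# >>> idx.shape        # torch.Size([300]); indices into the rows of feats
# >>> landmarks = feats[idx]
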
def farthest_point_sampling(
    features: torch.Tensor,
    num_sample: int = 300,
    h: int = 9,
):
    try:
        import fpsample
    except ImportError:
        raise ImportError(
            "fpsample import failed, please install it: `pip install fpsample`"
        )

    # PCA to reduce the dimension before running FPS
    if features.shape[1] > 8:
        u, s, v = torch.pca_lowrank(features, q=8)
        _n = features.shape[0]
        s /= math.sqrt(_n)
        features = u @ torch.diag(s)

    # cap the kd-tree height at log2 of the number of points
    h = min(h, int(np.log2(features.shape[0])))

    kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(
        features.cpu().numpy(), num_sample, h
    ).astype(np.int64)
    return torch.from_numpy(kdline_fps_samples_idx)

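# Usage sketch: farthest_point_sampling can also be called directly; inputs
# with more than 8 dimensions are first projected to 8 by PCA, and fpsample
# runs on a CPU copy of the features.
# >>> feats = torch.randn(4096, 768)
# >>> idx = farthest_point_sampling(feats, num_sample=256)
# >>> idx.dtype, idx.shape        # (torch.int64, torch.Size([256]))
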
def distance_from_features(
    features: torch.Tensor,
    features_B: torch.Tensor,
    distance: Literal["cosine", "euclidean", "rbf"],
    fill_diagonal: bool,
):
    """Compute distance matrix from input features.

    Args:
        features (torch.Tensor): input features, shape (n_samples, n_features)
        features_B (torch.Tensor): second set of features, shape (m_samples, n_features)
        distance (str): distance metric, 'cosine' (default) or 'euclidean', 'rbf'
        fill_diagonal (bool): if True, set the diagonal of the distance matrix to 0

    Returns:
        (torch.Tensor): distance matrix, shape (n_samples, m_samples)
    """
    # compute distance matrix from input features
    if distance == "cosine":
        if not check_if_normalized(features):
            features = F.normalize(features, dim=-1)
        if not check_if_normalized(features_B):
            features_B = F.normalize(features_B, dim=-1)
        D = 1 - features @ features_B.T
    elif distance == "euclidean":
        D = torch.cdist(features, features_B, p=2)
    elif distance == "rbf":
        D = torch.cdist(features, features_B, p=2) ** 2
        D = D / (2 * features.var(dim=0).sum())
    else:
        raise ValueError("distance should be 'cosine', 'euclidean' or 'rbf'")

    if fill_diagonal:
        D[torch.arange(D.shape[0]), torch.arange(D.shape[0])] = 0
    return D

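# Usage sketch: the three metrics on the same input.
# >>> X = torch.randn(5, 3)
# >>> D_cos = distance_from_features(X, X, "cosine", fill_diagonal=True)     # 1 - cosine similarity, in [0, 2]
# >>> D_euc = distance_from_features(X, X, "euclidean", fill_diagonal=True)  # pairwise L2 distance
# >>> D_rbf = distance_from_features(X, X, "rbf", fill_diagonal=True)        # squared L2 / (2 * total variance)
# >>> D_cos.diag()        # all zeros, because fill_diagonal=True
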
def affinity_from_features(
    features: torch.Tensor,
    features_B: torch.Tensor = None,
    affinity_focal_gamma: float = 1.0,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    fill_diagonal: bool = True,
):
    """Compute affinity matrix from input features.

    Args:
        features (torch.Tensor): input features, shape (n_samples, n_features)
        features_B (torch.Tensor, optional): if not None, compute affinity between features and features_B
        affinity_focal_gamma (float): affinity matrix temperature; a lower gamma shrinks
            the edge weights of weak connections, default 1.0
        distance (str): distance metric, 'cosine' (default) or 'euclidean', 'rbf'
        fill_diagonal (bool): if True, zero the diagonal of the distance matrix before exponentiation

    Returns:
        (torch.Tensor): affinity matrix, shape (n_samples, n_samples)
    """
    # if features_B is not provided, compute the affinity matrix on features x features
    # if features_B is provided, compute the affinity matrix on features x features_B
    if features_B is not None:
        assert not fill_diagonal, "fill_diagonal should be False when features_B is not None"
    features_B = features if features_B is None else features_B

    # compute distance matrix from input features
    D = distance_from_features(features, features_B, distance, fill_diagonal)

    # torch.exp makes the affinity matrix positive definite;
    # a lower affinity_focal_gamma shrinks the weak edge weights
    A = torch.exp(-D / affinity_focal_gamma)
    return A

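# Usage sketch: affinity is exp(-D / gamma), so a smaller affinity_focal_gamma
# suppresses weak (large-distance) edges more sharply.
# >>> X = torch.randn(100, 16)
# >>> A_soft = affinity_from_features(X, affinity_focal_gamma=1.0)
# >>> A_hard = affinity_from_features(X, affinity_focal_gamma=0.1)
# >>> cross = affinity_from_features(X[:80], X[80:], fill_diagonal=False)    # 80 x 20 block
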
def propagate_knn(
    subgraph_output: torch.Tensor,
    inp_features: torch.Tensor,
    subgraph_features: torch.Tensor,
    knn: int = 10,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    affinity_focal_gamma: float = 1.0,
    chunk_size: int = 8096,
    device: str = None,
    use_tqdm: bool = False,
    move_output_to_cpu: bool = False,
):
    """A generic function to propagate new nodes using KNN.

    Args:
        subgraph_output (torch.Tensor): output from subgraph, shape (num_sample, D)
        inp_features (torch.Tensor): features from existing nodes, shape (new_num_samples, n_features)
        subgraph_features (torch.Tensor): features from subgraph, shape (num_sample, n_features)
        knn (int): number of nearest neighbors used to propagate eigenvectors
        distance (str): distance metric, 'cosine' (default) or 'euclidean', 'rbf'
        affinity_focal_gamma (float): affinity matrix temperature, passed to affinity_from_features
        chunk_size (int): chunk size for matrix multiplication
        device (str): device to use for computation, if None, will not change device
        use_tqdm (bool): show progress bar when propagating eigenvectors from subgraph to full graph
        move_output_to_cpu (bool): if True, move each propagated chunk to the CPU

    Returns:
        torch.Tensor: propagated eigenvectors, shape (new_num_samples, D)

    Examples:
        >>> old_eigenvectors = torch.randn(3000, 20)
        >>> old_features = torch.randn(3000, 100)
        >>> new_features = torch.randn(200, 100)
        >>> new_eigenvectors = propagate_knn(old_eigenvectors, new_features, old_features, knn=3)
        >>> # new_eigenvectors.shape = (200, 20)
    """
    device = subgraph_output.device if device is None else device

    if knn == 1:
        return propagate_nearest(
            subgraph_output,
            inp_features,
            subgraph_features,
            distance=distance,
            chunk_size=chunk_size,
            device=device,
            move_output_to_cpu=move_output_to_cpu,
        )

    # used in nystrom_ncut
    # propagate eigenvectors from subgraph to full graph
    subgraph_output = subgraph_output.to(device)
    V_list = []
    iterator = range(0, inp_features.shape[0], chunk_size)
    try:
        assert use_tqdm
        from tqdm import tqdm
        iterator = tqdm(iterator, "propagate by KNN")
    except (AssertionError, ImportError):
        pass

    subgraph_features = subgraph_features.to(device)
    for i in iterator:
        end = min(i + chunk_size, inp_features.shape[0])
        _v = inp_features[i:end].to(device)
        _A = affinity_from_features(subgraph_features, _v, affinity_focal_gamma, distance, False).mT

        if knn is not None:
            # keep only each row's knn largest affinities, then row-normalize
            mask = torch.full_like(_A, True, dtype=torch.bool)
            mask[torch.arange(end - i)[:, None], _A.topk(knn, dim=-1, largest=True).indices] = False
            _A[mask] = 0.0
        _A = F.normalize(_A, p=1, dim=-1)

        _V = _A @ subgraph_output
        if move_output_to_cpu:
            _V = _V.cpu()
        V_list.append(_V)

    subgraph_output = torch.cat(V_list, dim=0)
    return subgraph_output

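# Usage sketch: extend an output computed on a 300-point subgraph to the full
# graph; each new row becomes a KNN-affinity-weighted average of subgraph rows.
# >>> feats = torch.randn(10000, 64)
# >>> sub_idx = run_subgraph_sampling(feats, num_sample=300)
# >>> sub_eigvecs = torch.randn(300, 20)   # stand-in for an NCUT result on the subgraph
# >>> full = propagate_knn(sub_eigvecs, feats, feats[sub_idx], knn=10)
# >>> full.shape          # torch.Size([10000, 20])
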
def propagate_nearest(
    subgraph_output: torch.Tensor,
    inp_features: torch.Tensor,
    subgraph_features: torch.Tensor,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    chunk_size: int = 8096,
    device: str = None,
    move_output_to_cpu: bool = False,
):
    device = subgraph_output.device if device is None else device
    if distance == 'cosine':
        if not check_if_normalized(inp_features):
            inp_features = F.normalize(inp_features, dim=-1)
        if not check_if_normalized(subgraph_features):
            subgraph_features = F.normalize(subgraph_features, dim=-1)

    # used in nystrom_tsne, equivalent to propagate_knn with knn=1
    # propagate tSNE from subgraph to full graph
    V_list = []
    subgraph_features = subgraph_features.to(device)
    for i in range(0, inp_features.shape[0], chunk_size):
        end = min(i + chunk_size, inp_features.shape[0])
        _v = inp_features[i:end].to(device)
        _A = -distance_from_features(subgraph_features, _v, distance, False).mT

        # keep top1 for each row
        top_idx = _A.argmax(dim=-1).cpu()
        _V = subgraph_output[top_idx]
        if move_output_to_cpu:
            _V = _V.cpu()
        V_list.append(_V)

    subgraph_output = torch.cat(V_list, dim=0)
    return subgraph_output

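# Usage sketch, continuing the propagate_knn example above: with knn=1 the two
# paths agree, since propagate_knn dispatches to propagate_nearest.
# >>> a = propagate_knn(sub_eigvecs, feats, feats[sub_idx], knn=1)
# >>> b = propagate_nearest(sub_eigvecs, feats, feats[sub_idx])
# >>> torch.allclose(a, b)   # True
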
# wrapper functions for adding new nodes to existing graph
def propagate_eigenvectors(
    eigenvectors: torch.Tensor,
    features: torch.Tensor,
    new_features: torch.Tensor,
    knn: int,
    num_sample: int,
    sample_method: Literal["farthest", "random"],
    chunk_size: int,
    device: str,
    use_tqdm: bool,
):
    """Propagate eigenvectors to new nodes using KNN. Note: this is equivalent to the class API `NCUT.transform(new_features)`, except that the subgraph sampling is re-done in this function.

    Args:
        eigenvectors (torch.Tensor): eigenvectors from existing nodes, shape (n_samples, num_eig)
        features (torch.Tensor): features from existing nodes, shape (n_samples, n_features)
        new_features (torch.Tensor): features from new nodes, shape (n_new_samples, n_features)
        knn (int): number of nearest neighbors used to propagate eigenvectors, e.g. 3
        num_sample (int): number of samples for subgraph sampling, e.g. 50000
        sample_method (str): sample method, 'farthest' (default) or 'random'
        chunk_size (int): chunk size for matrix multiplication, e.g. 8096
        device (str): device to use for computation, if None, will not change device
        use_tqdm (bool): show progress bar when propagating eigenvectors from subgraph to full graph

    Returns:
        torch.Tensor: propagated eigenvectors, shape (n_new_samples, num_eig)

    Examples:
        >>> old_eigenvectors = torch.randn(3000, 20)
        >>> old_features = torch.randn(3000, 100)
        >>> new_features = torch.randn(200, 100)
        >>> new_eigenvectors = propagate_eigenvectors(
        ...     old_eigenvectors, old_features, new_features, knn=3,
        ...     num_sample=300, sample_method="farthest", chunk_size=8096,
        ...     device=None, use_tqdm=False,
        ... )
        >>> # new_eigenvectors.shape = (200, 20)
    """

    device = eigenvectors.device if device is None else device

    # sample subgraph
    subgraph_indices = run_subgraph_sampling(
        features,
        num_sample=num_sample,
        sample_method=sample_method,
    )

    subgraph_eigenvectors = eigenvectors[subgraph_indices].to(device)
    subgraph_features = features[subgraph_indices].to(device)
    new_features = new_features.to(device)

    # propagate eigenvectors from subgraph to new nodes
    new_eigenvectors = propagate_knn(
        subgraph_eigenvectors,
        new_features,
        subgraph_features,
        knn=knn,
        chunk_size=chunk_size,
        device=device,
        use_tqdm=use_tqdm,
    )

    return new_eigenvectors

def check_if_normalized(x, n=1000):
    """check if the input tensor is normalized (unit norm)"""
    n = min(n, x.shape[0])
    random_indices = torch.randperm(x.shape[0])[:n]
    _x = x[random_indices]
    flag = torch.allclose(torch.norm(_x, dim=-1), torch.ones(n, device=x.device))
    return flag

def quantile_min_max(x, q1=0.01, q2=0.99, n_sample=10000):
    if x.shape[0] > n_sample:
        np.random.seed(0)
        random_idx = np.random.choice(x.shape[0], n_sample, replace=False)
        vmin, vmax = x[random_idx].quantile(q1), x[random_idx].quantile(q2)
    else:
        vmin, vmax = x.quantile(q1), x.quantile(q2)
    return vmin, vmax

def quantile_normalize(x, q=0.95):
    """normalize x to [0, 1] using quantiles; taking the 95th percentile as the maximum is robust to outliers

    1. sort x
    2. take the q-th quantile
       min_value -> (1-q)-th quantile
       max_value -> q-th quantile
    3. normalize
       x = (x - min_value) / (max_value - min_value)

    Args:
        x (torch.Tensor): input tensor, shape (n_samples, n_features)
        q (float): quantile, default 0.95

    Returns:
        torch.Tensor: quantile normalized tensor
    """
    # normalize x to 0-1 range, max value is q-th quantile
    # quantile makes the normalization robust to outliers
    if isinstance(x, np.ndarray):
        x = torch.tensor(x)
    vmin, vmax = quantile_min_max(x, 1 - q, q)
    x = (x - vmin) / (vmax - vmin)
    x = x.clamp(0, 1)
    return x
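# Usage sketch: quantile normalization clamps outliers that would squash a
# plain min-max rescale.
# >>> x = torch.randn(10000, 3)
# >>> x[0, 0] = 1e6                        # a single extreme outlier
# >>> y = quantile_normalize(x, q=0.95)    # maps the 5th..95th percentile span to [0, 1]
# >>> float(y.min()) >= 0 and float(y.max()) <= 1   # True; the outlier is clamped to 1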