nystrom-ncut 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nystrom_ncut/__init__.py +22 -0
- nystrom_ncut/ncut_pytorch.py +561 -0
- nystrom_ncut/new_ncut_pytorch.py +241 -0
- nystrom_ncut/nystrom.py +170 -0
- nystrom_ncut/propagation_utils.py +371 -0
- nystrom_ncut/visualize_utils.py +655 -0
- nystrom_ncut-0.0.1.dist-info/LICENSE +19 -0
- nystrom_ncut-0.0.1.dist-info/METADATA +164 -0
- nystrom_ncut-0.0.1.dist-info/RECORD +11 -0
- nystrom_ncut-0.0.1.dist-info/WHEEL +5 -0
- nystrom_ncut-0.0.1.dist-info/top_level.txt +1 -0
+++ nystrom_ncut/propagation_utils.py
@@ -0,0 +1,371 @@
import logging
import math
from typing import Literal

import numpy as np
import torch
import torch.nn.functional as F


@torch.no_grad()
def run_subgraph_sampling(
    features: torch.Tensor,
    num_sample: int = 300,
    max_draw: int = 1000000,
    sample_method: Literal["farthest", "random"] = "farthest",
):
    if num_sample >= features.shape[0]:
        # if the dataset is small, use all samples and bypass the Nystrom-like approximation
        logging.info(
            "num_sample is larger than total, bypass Nystrom-like approximation"
        )
        sampled_indices = torch.arange(features.shape[0])
    else:
        # sample subgraph
        if sample_method == "farthest":  # default
            if features.shape[0] > max_draw:
                logging.warning(
                    f"total number of samples is larger than max_draw, apply farthest point sampling on {max_draw} randomly drawn samples"
                )
                draw_indices = torch.randperm(features.shape[0])[:max_draw]
                sampled_indices = farthest_point_sampling(
                    features[draw_indices].detach(),
                    num_sample=num_sample,
                )
                sampled_indices = draw_indices[sampled_indices]
            else:
                sampled_indices = farthest_point_sampling(
                    features.detach(),
                    num_sample=num_sample,
                )
        elif sample_method == "random":  # not recommended
            sampled_indices = torch.randperm(features.shape[0])[:num_sample]
        else:
            raise ValueError("sample_method should be 'farthest' or 'random'")
    return sampled_indices

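
# --- Editor's note: the sketch below is not part of the package; it is a
# hypothetical, minimal usage example of `run_subgraph_sampling`. It assumes
# the optional `fpsample` dependency is installed (used by the default
# "farthest" sampling method).
def _demo_run_subgraph_sampling():
    features = torch.randn(5000, 64)
    # a representative subset of 300 indices, chosen by farthest point sampling
    indices = run_subgraph_sampling(features, num_sample=300)
    assert indices.shape[0] == 300
    # when num_sample >= n, sampling is bypassed and all indices are returned
    indices_all = run_subgraph_sampling(features, num_sample=10000)
    assert indices_all.shape[0] == 5000
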
def farthest_point_sampling(
    features: torch.Tensor,
    num_sample: int = 300,
    h: int = 9,
):
    try:
        import fpsample
    except ImportError:
        raise ImportError(
            "fpsample import failed, please install with `pip install fpsample`"
        )

    # PCA to reduce the dimension before sampling
    if features.shape[1] > 8:
        u, s, v = torch.pca_lowrank(features, q=8)
        _n = features.shape[0]
        s /= math.sqrt(_n)
        features = u @ torch.diag(s)

    h = min(h, int(np.log2(features.shape[0])))

    kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(
        features.cpu().numpy(), num_sample, h
    ).astype(np.int64)
    return torch.from_numpy(kdline_fps_samples_idx)

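
# --- Editor's note: hypothetical sketch, not part of the package. It shows
# the contract of `farthest_point_sampling`: features are first reduced to
# at most 8 PCA dimensions (u @ diag(s) are the principal-component scores),
# then bucket-based kd-line FPS from `fpsample` picks indices on the CPU.
def _demo_farthest_point_sampling():
    features = torch.randn(2048, 128)
    idx = farthest_point_sampling(features, num_sample=16)
    assert idx.shape == (16,) and idx.dtype == torch.int64
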
def distance_from_features(
    features: torch.Tensor,
    features_B: torch.Tensor,
    distance: Literal["cosine", "euclidean", "rbf"],
    fill_diagonal: bool,
):
    """Compute distance matrix from input features.

    Args:
        features (torch.Tensor): input features, shape (n_samples, n_features)
        features_B (torch.Tensor): second set of features, shape (m_samples, n_features)
        distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
        fill_diagonal (bool): if True, set the diagonal of the distance matrix to 0

    Returns:
        (torch.Tensor): distance matrix, shape (n_samples, m_samples)
    """
    # compute distance matrix from input features
    if distance == "cosine":
        if not check_if_normalized(features):
            features = F.normalize(features, dim=-1)
        if not check_if_normalized(features_B):
            features_B = F.normalize(features_B, dim=-1)
        D = 1 - features @ features_B.T
    elif distance == "euclidean":
        D = torch.cdist(features, features_B, p=2)
    elif distance == "rbf":
        D = torch.cdist(features, features_B, p=2) ** 2
        D = D / (2 * features.var(dim=0).sum())
    else:
        raise ValueError("distance should be 'cosine', 'euclidean', or 'rbf'")

    if fill_diagonal:
        D[torch.arange(D.shape[0]), torch.arange(D.shape[0])] = 0
    return D

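
# --- Editor's note: hypothetical sketch, not part of the package. For
# 'cosine', rows are L2-normalized first, so D = 1 - cos(a, b) lies in
# [0, 2]; 'rbf' squares the euclidean distances and rescales them by twice
# the total feature variance, a common bandwidth heuristic.
def _demo_distance_from_features():
    a, b = torch.randn(10, 32), torch.randn(20, 32)
    D = distance_from_features(a, b, distance="cosine", fill_diagonal=False)
    assert D.shape == (10, 20)
    assert (D >= -1e-6).all() and (D <= 2 + 1e-6).all()
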
def affinity_from_features(
    features: torch.Tensor,
    features_B: torch.Tensor = None,
    affinity_focal_gamma: float = 1.0,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    fill_diagonal: bool = True,
):
    """Compute affinity matrix from input features.

    Args:
        features (torch.Tensor): input features, shape (n_samples, n_features)
        features_B (torch.Tensor, optional): if not None, compute affinity between features and features_B
        affinity_focal_gamma (float): affinity matrix temperature; a lower value reduces the edge weights
            of weak connections, default 1.0
        distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
        fill_diagonal (bool): if True, zero the diagonal of the distance matrix, so the affinity diagonal is 1

    Returns:
        (torch.Tensor): affinity matrix, shape (n_samples, n_samples)
    """
    # if features_B is not provided, compute the affinity matrix on features x features
    # if features_B is provided, compute the affinity matrix on features x features_B
    if features_B is not None:
        assert not fill_diagonal, "fill_diagonal should be False when features_B is not None"
    features_B = features if features_B is None else features_B

    # compute distance matrix from input features
    D = distance_from_features(features, features_B, distance, fill_diagonal)

    # torch.exp makes the affinity matrix positive definite,
    # a lower affinity_focal_gamma reduces the weak edge weights
    A = torch.exp(-D / affinity_focal_gamma)
    return A

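
# --- Editor's note: hypothetical sketch, not part of the package. The
# affinity is A = exp(-D / gamma): a small affinity_focal_gamma concentrates
# weight on near neighbors, a large one pushes all edge weights toward 1.
def _demo_affinity_from_features():
    x = torch.randn(50, 16)
    A = affinity_from_features(x, affinity_focal_gamma=0.5)
    assert A.shape == (50, 50)
    # the diagonal is exp(0) = 1 because fill_diagonal=True zeroes D's diagonal
    assert torch.allclose(torch.diagonal(A), torch.ones(50))
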
def propagate_knn(
    subgraph_output: torch.Tensor,
    inp_features: torch.Tensor,
    subgraph_features: torch.Tensor,
    knn: int = 10,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    affinity_focal_gamma: float = 1.0,
    chunk_size: int = 8096,
    device: str = None,
    use_tqdm: bool = False,
    move_output_to_cpu: bool = False,
):
    """A generic function to propagate output values to new nodes using KNN.

    Args:
        subgraph_output (torch.Tensor): output from the subgraph, shape (num_sample, D)
        inp_features (torch.Tensor): features of the nodes to propagate to, shape (new_num_samples, n_features)
        subgraph_features (torch.Tensor): features of the subgraph, shape (num_sample, n_features)
        knn (int): number of nearest neighbors used to propagate the eigenvectors
        distance (str): distance metric, 'cosine' (default), 'euclidean', or 'rbf'
        affinity_focal_gamma (float): affinity matrix temperature, default 1.0
        chunk_size (int): chunk size for matrix multiplication
        device (str): device to use for computation; if None, keep the current device
        use_tqdm (bool): show a progress bar when propagating from subgraph to full graph
        move_output_to_cpu (bool): move each propagated chunk to CPU to save device memory

    Returns:
        torch.Tensor: propagated eigenvectors, shape (new_num_samples, D)

    Examples:
        >>> old_eigenvectors = torch.randn(3000, 20)
        >>> old_features = torch.randn(3000, 100)
        >>> new_features = torch.randn(200, 100)
        >>> new_eigenvectors = propagate_knn(old_eigenvectors, new_features, old_features, knn=3)
        >>> # new_eigenvectors.shape = (200, 20)
    """
    device = subgraph_output.device if device is None else device

    if knn == 1:
        return propagate_nearest(
            subgraph_output,
            inp_features,
            subgraph_features,
            chunk_size=chunk_size,
            device=device,
            move_output_to_cpu=move_output_to_cpu,
        )

    # used in nystrom_ncut
    # propagate eigenvectors from subgraph to full graph
    subgraph_output = subgraph_output.to(device)
    V_list = []
    iterator = range(0, inp_features.shape[0], chunk_size)
    try:
        assert use_tqdm
        from tqdm import tqdm
        iterator = tqdm(iterator, "propagate by KNN")
    except (AssertionError, ImportError):
        pass

    subgraph_features = subgraph_features.to(device)
    for i in iterator:
        end = min(i + chunk_size, inp_features.shape[0])
        _v = inp_features[i:end].to(device)
        # affinity between this chunk of new nodes (rows) and the subgraph (columns)
        _A = affinity_from_features(subgraph_features, _v, affinity_focal_gamma, distance, False).mT

        if knn is not None:
            # keep only the knn largest affinities per row, zero out the rest
            mask = torch.full_like(_A, True, dtype=torch.bool)
            mask[torch.arange(end - i)[:, None], _A.topk(knn, dim=-1, largest=True).indices] = False
            _A[mask] = 0.0
        # row-normalize so each new node gets a convex combination of subgraph outputs
        _A = F.normalize(_A, p=1, dim=-1)

        _V = _A @ subgraph_output
        if move_output_to_cpu:
            _V = _V.cpu()
        V_list.append(_V)

    subgraph_output = torch.cat(V_list, dim=0)
    return subgraph_output

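
# --- Editor's note: hypothetical sketch, not part of the package. After the
# top-k mask and L1 normalization, each affinity row is a probability vector,
# so every propagated row is a weighted average of the k nearest subgraph
# outputs.
def _demo_propagate_knn():
    sub_feats = torch.randn(300, 64)
    sub_out = torch.randn(300, 10)
    new_feats = torch.randn(1000, 64)
    out = propagate_knn(sub_out, new_feats, sub_feats, knn=5, chunk_size=256)
    assert out.shape == (1000, 10)
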
def propagate_nearest(
    subgraph_output: torch.Tensor,
    inp_features: torch.Tensor,
    subgraph_features: torch.Tensor,
    distance: Literal["cosine", "euclidean", "rbf"] = "cosine",
    chunk_size: int = 8096,
    device: str = None,
    move_output_to_cpu: bool = False,
):
    device = subgraph_output.device if device is None else device
    if distance == 'cosine':
        if not check_if_normalized(inp_features):
            inp_features = F.normalize(inp_features, dim=-1)
        if not check_if_normalized(subgraph_features):
            subgraph_features = F.normalize(subgraph_features, dim=-1)

    # used in nystrom_tsne, equivalent to propagate_knn with knn=1
    # propagate output from subgraph to full graph by copying the nearest neighbor
    V_list = []
    subgraph_features = subgraph_features.to(device)
    for i in range(0, inp_features.shape[0], chunk_size):
        end = min(i + chunk_size, inp_features.shape[0])
        _v = inp_features[i:end].to(device)
        # negate distances so that argmax picks the nearest subgraph node
        _A = -distance_from_features(subgraph_features, _v, distance, False).mT

        # keep top1 for each row
        top_idx = _A.argmax(dim=-1).cpu()
        _V = subgraph_output[top_idx]
        if move_output_to_cpu:
            _V = _V.cpu()
        V_list.append(_V)

    subgraph_output = torch.cat(V_list, dim=0)
    return subgraph_output

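
# --- Editor's note: hypothetical sketch, not part of the package.
# Nearest-neighbor propagation copies the output row of the single closest
# subgraph node, so every returned row already occurs in subgraph_output.
def _demo_propagate_nearest():
    sub_feats = torch.randn(100, 32)
    sub_out = torch.randn(100, 8)
    new_out = propagate_nearest(sub_out, torch.randn(500, 32), sub_feats)
    assert new_out.shape == (500, 8)
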
# wrapper functions for adding new nodes to an existing graph
def propagate_eigenvectors(
    eigenvectors: torch.Tensor,
    features: torch.Tensor,
    new_features: torch.Tensor,
    knn: int,
    num_sample: int,
    sample_method: Literal["farthest", "random"],
    chunk_size: int,
    device: str,
    use_tqdm: bool,
):
    """Propagate eigenvectors to new nodes using KNN. Note: this is equivalent to the class API `NCUT.transform(new_features)`, except that the subgraph sampling is re-done in this function.

    Args:
        eigenvectors (torch.Tensor): eigenvectors from existing nodes, shape (n_samples, num_eig)
        features (torch.Tensor): features from existing nodes, shape (n_samples, n_features)
        new_features (torch.Tensor): features from new nodes, shape (n_new_samples, n_features)
        knn (int): number of KNN to propagate eigenvectors, default 3
        num_sample (int): number of samples for subgraph sampling, default 50000
        sample_method (str): sample method, 'farthest' (default) or 'random'
        chunk_size (int): chunk size for matrix multiplication, default 8096
        device (str): device to use for computation; if None, keep the current device
        use_tqdm (bool): show a progress bar when propagating eigenvectors from subgraph to full graph

    Returns:
        torch.Tensor: propagated eigenvectors, shape (n_new_samples, num_eig)

    Examples:
        >>> old_eigenvectors = torch.randn(3000, 20)
        >>> old_features = torch.randn(3000, 100)
        >>> new_features = torch.randn(200, 100)
        >>> new_eigenvectors = propagate_eigenvectors(old_eigenvectors, old_features, new_features, knn=3)
        >>> # new_eigenvectors.shape = (200, 20)
    """
    device = eigenvectors.device if device is None else device

    # sample subgraph
    subgraph_indices = run_subgraph_sampling(
        features,
        num_sample=num_sample,
        sample_method=sample_method,
    )

    subgraph_eigenvectors = eigenvectors[subgraph_indices].to(device)
    subgraph_features = features[subgraph_indices].to(device)
    new_features = new_features.to(device)

    # propagate eigenvectors from subgraph to new nodes
    new_eigenvectors = propagate_knn(
        subgraph_eigenvectors,
        new_features,
        subgraph_features,
        knn=knn,
        chunk_size=chunk_size,
        device=device,
        use_tqdm=use_tqdm,
    )

    return new_eigenvectors

def check_if_normalized(x, n=1000):
    """check if the rows of the input tensor have unit norm, using a random subset of up to n rows"""
    n = min(n, x.shape[0])
    random_indices = torch.randperm(x.shape[0])[:n]
    _x = x[random_indices]
    flag = torch.allclose(torch.norm(_x, dim=-1), torch.ones(n, device=x.device))
    return flag

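
# --- Editor's note: hypothetical sketch, not part of the package.
# `check_if_normalized` only inspects a random subset of rows, which keeps
# the check cheap on large tensors.
def _demo_check_if_normalized():
    x = torch.randn(5000, 32)
    assert not check_if_normalized(x)
    assert check_if_normalized(F.normalize(x, dim=-1))
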
def quantile_min_max(x, q1=0.01, q2=0.99, n_sample=10000):
    if x.shape[0] > n_sample:
        np.random.seed(0)
        random_idx = np.random.choice(x.shape[0], n_sample, replace=False)
        vmin, vmax = x[random_idx].quantile(q1), x[random_idx].quantile(q2)
    else:
        vmin, vmax = x.quantile(q1), x.quantile(q2)
    return vmin, vmax


def quantile_normalize(x, q=0.95):
    """normalize x to [0, 1]; using the 95-th percentile instead of the maximum makes this robust to outliers

    1. sort x
    2. take the quantiles
       min_value -> (1-q)-th quantile
       max_value -> q-th quantile
    3. normalize
       x = (x - min_value) / (max_value - min_value)

    Args:
        x (torch.Tensor): input tensor, shape (n_samples, n_features)
            normalized to the 0-1 range
        q (float): quantile, default 0.95

    Returns:
        torch.Tensor: quantile normalized tensor
    """
    # normalize x to the 0-1 range, where the max value is the q-th quantile
    # using quantiles makes the normalization robust to outliers
    if isinstance(x, np.ndarray):
        x = torch.tensor(x)
    vmin, vmax = quantile_min_max(x, 1 - q, q)
    x = (x - vmin) / (vmax - vmin)
    x = x.clamp(0, 1)
    return x
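

# --- Editor's note: hypothetical sketch, not part of the package. Quantile
# normalization maps the (1-q)-th..q-th percentile range onto [0, 1] and
# clamps everything outside, so extreme outliers cannot stretch the scale.
def _demo_quantile_normalize():
    x = torch.randn(10000, 3)
    x[0, 0] = 1e6  # a huge outlier
    y = quantile_normalize(x, q=0.95)
    assert y.min() >= 0 and y.max() <= 1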