scikit-network 0.28.3__cp39-cp39-macosx_12_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of scikit-network might be problematic. Click here for more details.
- scikit_network-0.28.3.dist-info/AUTHORS.rst +41 -0
- scikit_network-0.28.3.dist-info/LICENSE +34 -0
- scikit_network-0.28.3.dist-info/METADATA +457 -0
- scikit_network-0.28.3.dist-info/RECORD +240 -0
- scikit_network-0.28.3.dist-info/WHEEL +5 -0
- scikit_network-0.28.3.dist-info/top_level.txt +1 -0
- sknetwork/__init__.py +21 -0
- sknetwork/classification/__init__.py +8 -0
- sknetwork/classification/base.py +84 -0
- sknetwork/classification/base_rank.py +143 -0
- sknetwork/classification/diffusion.py +134 -0
- sknetwork/classification/knn.py +162 -0
- sknetwork/classification/metrics.py +205 -0
- sknetwork/classification/pagerank.py +66 -0
- sknetwork/classification/propagation.py +152 -0
- sknetwork/classification/tests/__init__.py +1 -0
- sknetwork/classification/tests/test_API.py +35 -0
- sknetwork/classification/tests/test_diffusion.py +37 -0
- sknetwork/classification/tests/test_knn.py +24 -0
- sknetwork/classification/tests/test_metrics.py +53 -0
- sknetwork/classification/tests/test_pagerank.py +20 -0
- sknetwork/classification/tests/test_propagation.py +24 -0
- sknetwork/classification/vote.cpython-39-darwin.so +0 -0
- sknetwork/classification/vote.pyx +58 -0
- sknetwork/clustering/__init__.py +7 -0
- sknetwork/clustering/base.py +102 -0
- sknetwork/clustering/kmeans.py +142 -0
- sknetwork/clustering/louvain.py +255 -0
- sknetwork/clustering/louvain_core.cpython-39-darwin.so +0 -0
- sknetwork/clustering/louvain_core.pyx +134 -0
- sknetwork/clustering/metrics.py +91 -0
- sknetwork/clustering/postprocess.py +66 -0
- sknetwork/clustering/propagation_clustering.py +108 -0
- sknetwork/clustering/tests/__init__.py +1 -0
- sknetwork/clustering/tests/test_API.py +37 -0
- sknetwork/clustering/tests/test_kmeans.py +47 -0
- sknetwork/clustering/tests/test_louvain.py +104 -0
- sknetwork/clustering/tests/test_metrics.py +50 -0
- sknetwork/clustering/tests/test_post_processing.py +23 -0
- sknetwork/clustering/tests/test_postprocess.py +39 -0
- sknetwork/data/__init__.py +5 -0
- sknetwork/data/load.py +408 -0
- sknetwork/data/models.py +459 -0
- sknetwork/data/parse.py +621 -0
- sknetwork/data/test_graphs.py +84 -0
- sknetwork/data/tests/__init__.py +1 -0
- sknetwork/data/tests/test_API.py +30 -0
- sknetwork/data/tests/test_load.py +95 -0
- sknetwork/data/tests/test_models.py +52 -0
- sknetwork/data/tests/test_parse.py +253 -0
- sknetwork/data/tests/test_test_graphs.py +30 -0
- sknetwork/data/tests/test_toy_graphs.py +68 -0
- sknetwork/data/toy_graphs.py +619 -0
- sknetwork/embedding/__init__.py +10 -0
- sknetwork/embedding/base.py +90 -0
- sknetwork/embedding/force_atlas.py +197 -0
- sknetwork/embedding/louvain_embedding.py +174 -0
- sknetwork/embedding/louvain_hierarchy.py +142 -0
- sknetwork/embedding/metrics.py +66 -0
- sknetwork/embedding/random_projection.py +133 -0
- sknetwork/embedding/spectral.py +214 -0
- sknetwork/embedding/spring.py +198 -0
- sknetwork/embedding/svd.py +363 -0
- sknetwork/embedding/tests/__init__.py +1 -0
- sknetwork/embedding/tests/test_API.py +73 -0
- sknetwork/embedding/tests/test_force_atlas.py +35 -0
- sknetwork/embedding/tests/test_louvain_embedding.py +33 -0
- sknetwork/embedding/tests/test_louvain_hierarchy.py +19 -0
- sknetwork/embedding/tests/test_metrics.py +29 -0
- sknetwork/embedding/tests/test_random_projection.py +28 -0
- sknetwork/embedding/tests/test_spectral.py +84 -0
- sknetwork/embedding/tests/test_spring.py +50 -0
- sknetwork/embedding/tests/test_svd.py +37 -0
- sknetwork/flow/__init__.py +3 -0
- sknetwork/flow/flow.py +73 -0
- sknetwork/flow/tests/__init__.py +1 -0
- sknetwork/flow/tests/test_flow.py +17 -0
- sknetwork/flow/tests/test_utils.py +69 -0
- sknetwork/flow/utils.py +91 -0
- sknetwork/gnn/__init__.py +10 -0
- sknetwork/gnn/activation.py +117 -0
- sknetwork/gnn/base.py +155 -0
- sknetwork/gnn/base_activation.py +89 -0
- sknetwork/gnn/base_layer.py +109 -0
- sknetwork/gnn/gnn_classifier.py +381 -0
- sknetwork/gnn/layer.py +153 -0
- sknetwork/gnn/layers.py +127 -0
- sknetwork/gnn/loss.py +180 -0
- sknetwork/gnn/neighbor_sampler.py +65 -0
- sknetwork/gnn/optimizer.py +163 -0
- sknetwork/gnn/tests/__init__.py +1 -0
- sknetwork/gnn/tests/test_activation.py +56 -0
- sknetwork/gnn/tests/test_base.py +79 -0
- sknetwork/gnn/tests/test_base_layer.py +37 -0
- sknetwork/gnn/tests/test_gnn_classifier.py +192 -0
- sknetwork/gnn/tests/test_layers.py +80 -0
- sknetwork/gnn/tests/test_loss.py +33 -0
- sknetwork/gnn/tests/test_neigh_sampler.py +23 -0
- sknetwork/gnn/tests/test_optimizer.py +43 -0
- sknetwork/gnn/tests/test_utils.py +93 -0
- sknetwork/gnn/utils.py +219 -0
- sknetwork/hierarchy/__init__.py +7 -0
- sknetwork/hierarchy/base.py +69 -0
- sknetwork/hierarchy/louvain_hierarchy.py +264 -0
- sknetwork/hierarchy/metrics.py +234 -0
- sknetwork/hierarchy/paris.cpython-39-darwin.so +0 -0
- sknetwork/hierarchy/paris.pyx +317 -0
- sknetwork/hierarchy/postprocess.py +350 -0
- sknetwork/hierarchy/tests/__init__.py +1 -0
- sknetwork/hierarchy/tests/test_API.py +25 -0
- sknetwork/hierarchy/tests/test_algos.py +29 -0
- sknetwork/hierarchy/tests/test_metrics.py +62 -0
- sknetwork/hierarchy/tests/test_postprocess.py +57 -0
- sknetwork/hierarchy/tests/test_ward.py +25 -0
- sknetwork/hierarchy/ward.py +94 -0
- sknetwork/linalg/__init__.py +9 -0
- sknetwork/linalg/basics.py +37 -0
- sknetwork/linalg/diteration.cpython-39-darwin.so +0 -0
- sknetwork/linalg/diteration.pyx +49 -0
- sknetwork/linalg/eig_solver.py +93 -0
- sknetwork/linalg/laplacian.py +15 -0
- sknetwork/linalg/normalization.py +66 -0
- sknetwork/linalg/operators.py +225 -0
- sknetwork/linalg/polynome.py +76 -0
- sknetwork/linalg/ppr_solver.py +170 -0
- sknetwork/linalg/push.cpython-39-darwin.so +0 -0
- sknetwork/linalg/push.pyx +73 -0
- sknetwork/linalg/sparse_lowrank.py +142 -0
- sknetwork/linalg/svd_solver.py +91 -0
- sknetwork/linalg/tests/__init__.py +1 -0
- sknetwork/linalg/tests/test_eig.py +44 -0
- sknetwork/linalg/tests/test_laplacian.py +18 -0
- sknetwork/linalg/tests/test_normalization.py +38 -0
- sknetwork/linalg/tests/test_operators.py +70 -0
- sknetwork/linalg/tests/test_polynome.py +38 -0
- sknetwork/linalg/tests/test_ppr.py +50 -0
- sknetwork/linalg/tests/test_sparse_lowrank.py +61 -0
- sknetwork/linalg/tests/test_svd.py +38 -0
- sknetwork/linkpred/__init__.py +4 -0
- sknetwork/linkpred/base.py +80 -0
- sknetwork/linkpred/first_order.py +508 -0
- sknetwork/linkpred/first_order_core.cpython-39-darwin.so +0 -0
- sknetwork/linkpred/first_order_core.pyx +315 -0
- sknetwork/linkpred/postprocessing.py +98 -0
- sknetwork/linkpred/tests/__init__.py +1 -0
- sknetwork/linkpred/tests/test_API.py +49 -0
- sknetwork/linkpred/tests/test_postprocessing.py +21 -0
- sknetwork/path/__init__.py +4 -0
- sknetwork/path/metrics.py +148 -0
- sknetwork/path/search.py +65 -0
- sknetwork/path/shortest_path.py +186 -0
- sknetwork/path/tests/__init__.py +1 -0
- sknetwork/path/tests/test_metrics.py +29 -0
- sknetwork/path/tests/test_search.py +25 -0
- sknetwork/path/tests/test_shortest_path.py +45 -0
- sknetwork/ranking/__init__.py +9 -0
- sknetwork/ranking/base.py +56 -0
- sknetwork/ranking/betweenness.cpython-39-darwin.so +0 -0
- sknetwork/ranking/betweenness.pyx +99 -0
- sknetwork/ranking/closeness.py +95 -0
- sknetwork/ranking/harmonic.py +82 -0
- sknetwork/ranking/hits.py +94 -0
- sknetwork/ranking/katz.py +81 -0
- sknetwork/ranking/pagerank.py +107 -0
- sknetwork/ranking/postprocess.py +25 -0
- sknetwork/ranking/tests/__init__.py +1 -0
- sknetwork/ranking/tests/test_API.py +34 -0
- sknetwork/ranking/tests/test_betweenness.py +38 -0
- sknetwork/ranking/tests/test_closeness.py +34 -0
- sknetwork/ranking/tests/test_hits.py +20 -0
- sknetwork/ranking/tests/test_pagerank.py +69 -0
- sknetwork/regression/__init__.py +4 -0
- sknetwork/regression/base.py +56 -0
- sknetwork/regression/diffusion.py +190 -0
- sknetwork/regression/tests/__init__.py +1 -0
- sknetwork/regression/tests/test_API.py +34 -0
- sknetwork/regression/tests/test_diffusion.py +48 -0
- sknetwork/sknetwork.py +3 -0
- sknetwork/topology/__init__.py +9 -0
- sknetwork/topology/dag.py +74 -0
- sknetwork/topology/dag_core.cpython-39-darwin.so +0 -0
- sknetwork/topology/dag_core.pyx +38 -0
- sknetwork/topology/kcliques.cpython-39-darwin.so +0 -0
- sknetwork/topology/kcliques.pyx +193 -0
- sknetwork/topology/kcore.cpython-39-darwin.so +0 -0
- sknetwork/topology/kcore.pyx +120 -0
- sknetwork/topology/structure.py +234 -0
- sknetwork/topology/tests/__init__.py +1 -0
- sknetwork/topology/tests/test_cliques.py +28 -0
- sknetwork/topology/tests/test_cores.py +21 -0
- sknetwork/topology/tests/test_dag.py +26 -0
- sknetwork/topology/tests/test_structure.py +99 -0
- sknetwork/topology/tests/test_triangles.py +42 -0
- sknetwork/topology/tests/test_wl_coloring.py +49 -0
- sknetwork/topology/tests/test_wl_kernel.py +31 -0
- sknetwork/topology/triangles.cpython-39-darwin.so +0 -0
- sknetwork/topology/triangles.pyx +166 -0
- sknetwork/topology/weisfeiler_lehman.py +163 -0
- sknetwork/topology/weisfeiler_lehman_core.cpython-39-darwin.so +0 -0
- sknetwork/topology/weisfeiler_lehman_core.pyx +116 -0
- sknetwork/utils/__init__.py +40 -0
- sknetwork/utils/base.py +35 -0
- sknetwork/utils/check.py +354 -0
- sknetwork/utils/co_neighbor.py +71 -0
- sknetwork/utils/format.py +219 -0
- sknetwork/utils/kmeans.py +89 -0
- sknetwork/utils/knn.py +166 -0
- sknetwork/utils/knn1d.cpython-39-darwin.so +0 -0
- sknetwork/utils/knn1d.pyx +80 -0
- sknetwork/utils/membership.py +82 -0
- sknetwork/utils/minheap.cpython-39-darwin.so +0 -0
- sknetwork/utils/minheap.pxd +22 -0
- sknetwork/utils/minheap.pyx +111 -0
- sknetwork/utils/neighbors.py +115 -0
- sknetwork/utils/seeds.py +75 -0
- sknetwork/utils/simplex.py +140 -0
- sknetwork/utils/tests/__init__.py +1 -0
- sknetwork/utils/tests/test_base.py +28 -0
- sknetwork/utils/tests/test_bunch.py +16 -0
- sknetwork/utils/tests/test_check.py +190 -0
- sknetwork/utils/tests/test_co_neighbor.py +43 -0
- sknetwork/utils/tests/test_format.py +61 -0
- sknetwork/utils/tests/test_kmeans.py +21 -0
- sknetwork/utils/tests/test_knn.py +32 -0
- sknetwork/utils/tests/test_membership.py +24 -0
- sknetwork/utils/tests/test_neighbors.py +41 -0
- sknetwork/utils/tests/test_projection_simplex.py +33 -0
- sknetwork/utils/tests/test_seeds.py +67 -0
- sknetwork/utils/tests/test_verbose.py +15 -0
- sknetwork/utils/tests/test_ward.py +20 -0
- sknetwork/utils/timeout.py +38 -0
- sknetwork/utils/verbose.py +37 -0
- sknetwork/utils/ward.py +60 -0
- sknetwork/visualization/__init__.py +4 -0
- sknetwork/visualization/colors.py +34 -0
- sknetwork/visualization/dendrograms.py +229 -0
- sknetwork/visualization/graphs.py +819 -0
- sknetwork/visualization/tests/__init__.py +1 -0
- sknetwork/visualization/tests/test_dendrograms.py +53 -0
- sknetwork/visualization/tests/test_graphs.py +167 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on March 2019
|
|
5
|
+
@author: Thomas Bonald <bonald@enst.fr>
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
from scipy import sparse
|
|
9
|
+
from sknetwork.hierarchy.paris import AggregateGraph
|
|
10
|
+
|
|
11
|
+
from sknetwork.utils.check import check_format, get_probs, check_square
|
|
12
|
+
from sknetwork.utils.check import check_min_size, check_min_nnz
|
|
13
|
+
from sknetwork.utils.format import directed2undirected
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _instantiate_vars(adjacency: sparse.csr_matrix, weights: str = 'uniform'):
    """Build the aggregate graph and the node weight vectors used by the metrics.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    weights :
        Weights of nodes, forwarded unchanged to ``get_probs``.

    Returns
    -------
    aggregate_graph : AggregateGraph
        Aggregate graph built from the output of ``directed2undirected(adjacency)``.
    weights_row :
        Output of ``get_probs`` for the adjacency matrix.
    weights_col :
        Output of ``get_probs`` for the transposed adjacency matrix.
    """
    out_probs = get_probs(weights, adjacency)
    in_probs = get_probs(weights, adjacency.T)
    undirected = directed2undirected(adjacency)
    # AggregateGraph takes the CSR arrays directly; edge weights are passed as floats.
    edge_weights = undirected.data.astype(float)
    graph = AggregateGraph(out_probs, in_probs, edge_weights, undirected.indices, undirected.indptr)
    return graph, out_probs, in_probs
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_sampling_distributions(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform'):
    """Get sampling distributions over each internal node of the tree.

    The first ``n - 1`` rows of the dendrogram are replayed on an aggregate graph
    (``n`` being the number of nodes); entry ``t`` of each returned array refers to
    the cluster created by the merge in row ``t``.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram. Only the first two columns (the merged clusters) are read.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    edge_sampling: np.ndarray
        Edge sampling distribution.
    node_sampling: np.ndarray
        Node sampling distribution.
    cluster_weights: np.ndarray
        Cluster weights (half the sum of the out- and in-weights of the two merged clusters).
    """
    n = adjacency.shape[0]
    graph, _, _ = _instantiate_vars(adjacency, weights)

    n_merges = n - 1
    edge_sampling = np.zeros(n_merges)
    node_sampling = np.zeros(n_merges)
    cluster_weight = np.zeros(n_merges)

    for t in range(n_merges):
        row = dendrogram[t]
        i, j = int(row[0]), int(row[1])

        # Re-read the dictionaries at every step: the merge below updates them.
        neighbors = graph.neighbors
        out_w = graph.cluster_out_weights
        in_w = graph.cluster_in_weights

        # Weight between the two merged clusters, counted in both directions.
        if j in neighbors[i]:
            edge_sampling[t] += 2 * neighbors[i][j]
        node_sampling[t] += out_w[i] * in_w[j] + out_w[j] * in_w[i]
        cluster_weight[t] = out_w[i] + out_w[j] + in_w[i] + in_w[j]

        for node in {i, j}:
            if node >= n:
                # Only original nodes (indices below n) add a self-loop term.
                continue
            # self-loop
            node_sampling[t] += out_w[node] * in_w[node]
            if node in neighbors[node]:
                edge_sampling[t] += neighbors[node][node]

        graph.merge(i, j)

    return edge_sampling, node_sampling, cluster_weight / 2
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def dasgupta_cost(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform',
                  normalized: bool = False) -> float:
    """Dasgupta's cost of a hierarchy.

    Expected size (weights = ``'uniform'``) or expected volume (weights = ``'degree'``) of the cluster induced by
    random edge sampling (closest ancestor of the two nodes in the hierarchy).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).
    normalized :
        If ``True``, normalized cost (between 0 and 1).
        If ``False`` (default), the normalized cost is multiplied by the number of nodes
        (weights = ``'uniform'``) or by the total weight of the adjacency matrix (weights = ``'degree'``).

    Returns
    -------
    cost : float
        Cost.

    Example
    -------
    >>> from sknetwork.hierarchy import dasgupta_cost, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> cost = dasgupta_cost(adjacency, dendrogram)
    >>> np.round(cost, 2)
    3.33

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    n = adjacency.shape[0]
    check_min_size(n, 2)

    # Normalized cost: edge sampling probability of each merge times the weight of the merged cluster.
    edge_sampling, _, cluster_weight = get_sampling_distributions(adjacency, dendrogram, weights)
    cost = edge_sampling.dot(cluster_weight)

    if not normalized:
        # Rescale from node weights to sizes (uniform) or volumes (degree).
        if weights == 'degree':
            cost *= adjacency.data.sum()
        else:
            cost *= n

    return cost
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def dasgupta_score(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform') -> float:
    """Dasgupta's score of a hierarchy (quality metric, between 0 and 1).

    Computed as 1 minus the normalized Dasgupta's cost returned by :func:`dasgupta_cost`.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import dasgupta_score, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = dasgupta_score(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.33

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    normalized_cost = dasgupta_cost(adjacency, dendrogram, weights, normalized=True)
    return 1 - normalized_cost
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Sum, over the merges with a non-zero edge sampling probability, of the edge sampling
    probability times the log-ratio of the edge and node sampling probabilities.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).
        The score is divided by a mutual-information term computed on the graph itself,
        whenever that term is positive.

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    # Work on a float copy so that the in-place normalization below is well defined.
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    # Scale the edge weights so that they sum to 1.
    adjacency.data /= adjacency.data.sum()
    edge_sampling, node_sampling, _ = get_sampling_distributions(adjacency, dendrogram, weights)

    # Restrict to merges with non-zero edge sampling probability (log undefined otherwise).
    support = np.where(edge_sampling)[0]
    edge_probs = edge_sampling[support]
    node_probs = node_sampling[support]
    score = edge_probs.dot(np.log(edge_probs / node_probs))

    if normalized:
        out_probs = get_probs(weights, adjacency)
        in_probs = get_probs(weights, adjacency.T)
        out_diag = sparse.diags(out_probs, shape=(n, n), format='csr')
        in_diag = sparse.diags(in_probs, shape=(n, n), format='csr')

        # Edge weights divided by the product of the end-node weights.
        out_diag.data = 1 / out_diag.data
        in_diag.data = 1 / in_diag.data
        ratio_matrix = out_diag.dot(adjacency.dot(in_diag))

        # Same products with unit diagonal entries, so that the result has the same
        # sparsity pattern as the ratio matrix and both data arrays are aligned.
        out_diag.data = np.ones(len(out_diag.data))
        in_diag.data = np.ones(len(in_diag.data))
        edge_matrix = out_diag.dot(adjacency.dot(in_diag))

        mutual_information = edge_matrix.data.dot(np.log(ratio_matrix.data))
        if mutual_information > 0:
            score /= mutual_information
    return score
|
|
Binary file
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
# distutils: language = c++
|
|
2
|
+
# cython: language_level=3
|
|
3
|
+
# cython: linetrace=True
|
|
4
|
+
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
|
|
5
|
+
"""
|
|
6
|
+
Created on March 2019
|
|
7
|
+
@author: Thomas Bonald <bonald@enst.fr>
|
|
8
|
+
@author: Bertrand Charpentier <bertrand.charpentier@live.fr>
|
|
9
|
+
@author: Quentin Lutz <qlutz@enst.fr>
|
|
10
|
+
"""
|
|
11
|
+
import numpy as np
|
|
12
|
+
cimport numpy as np
|
|
13
|
+
|
|
14
|
+
cimport cython
|
|
15
|
+
|
|
16
|
+
from libcpp.vector cimport vector
|
|
17
|
+
|
|
18
|
+
from typing import Union
|
|
19
|
+
|
|
20
|
+
from scipy import sparse
|
|
21
|
+
|
|
22
|
+
from sknetwork.hierarchy.base import BaseHierarchy
|
|
23
|
+
from sknetwork.hierarchy.postprocess import reorder_dendrogram
|
|
24
|
+
from sknetwork.utils.format import check_format, get_adjacency, directed2undirected
|
|
25
|
+
from sknetwork.utils.check import get_probs, is_symmetric
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
cdef class AggregateGraph:
    """A class of graphs suitable for aggregation. Each node represents a cluster.

    Edge weights are stored as a dictionary of dictionaries and divided by their total
    at construction, so that they sum to 1. Merging two clusters creates a new cluster
    whose index is ``next_cluster``.

    Parameters
    ----------
    out_weights :
        Out-weights (sums to 1).
    in_weights :
        In-weights (sums to 1).
    data :
        CSR format data array of the normalized adjacency matrix.
    indices :
        CSR format index array of the normalized adjacency matrix.
    indptr :
        CSR format index pointer array of the normalized adjacency matrix.

    Attributes
    ----------
    neighbors : dict[dict]
        Dictionary of dictionary of edge weights.
    next_cluster : int
        Index of the next cluster (resulting from aggregation).
    cluster_sizes : dict
        Dictionary of cluster sizes.
    cluster_out_weights : dict
        Dictionary of cluster out-weights (sums to 1).
    cluster_in_weights : dict
        Dictionary of cluster in-weights (sums to 1).
    """
    cdef public int next_cluster
    cdef public dict neighbors
    # NOTE(review): `tmp` is never assigned or read in this class -- possibly a leftover.
    cdef public dict tmp
    # Not declared public: only reachable through typed (C-level) access.
    cdef dict cluster_sizes
    cdef public dict cluster_out_weights
    cdef public dict cluster_in_weights

    def __init__(self, double[:] out_weights, double[:] in_weights, double[:] data, int[:] indices,
                 int[:] indptr):
        cdef int n = indptr.shape[0] - 1
        # NOTE(review): C `float` is single precision while `data` holds doubles;
        # the total is therefore rounded before being used as a divisor -- confirm this is intended.
        cdef float total_weight = np.sum(data)
        cdef int i
        cdef int j

        # The first merged cluster gets index n, right after the original nodes.
        self.next_cluster = n
        self.neighbors = {}
        for i in range(n):
            # normalize so that the sum of edge weights is equal to 1
            self.neighbors[i] = {}
            for j in range(indptr[i], indptr[i + 1]):
                self.neighbors[i][indices[j]] = data[j] / total_weight

        # Every original node starts as a cluster of size 1 with its own weights.
        cluster_sizes = {}
        cluster_out_weights = {}
        cluster_in_weights = {}
        for i in range(n):
            cluster_sizes[i] = 1
            cluster_out_weights[i] = out_weights[i]
            cluster_in_weights[i] = in_weights[i]
        self.cluster_sizes = cluster_sizes
        self.cluster_out_weights = cluster_out_weights
        self.cluster_in_weights = cluster_in_weights

    cdef float similarity(self, int node1, int node2):
        """Similarity of two nodes.

        Twice the weight of the edge between the two nodes, divided by the sum of the
        cross products of their out- and in-weights. The nodes must be neighbors.

        Parameters
        ----------
        node1, node2 :
            Nodes.

        Returns
        -------
        sim: float
            Similarity (minus infinity if the denominator is not positive).
        """
        cdef float sim = -float("inf")
        cdef float a = self.cluster_out_weights[node1] * self.cluster_in_weights[node2]
        cdef float b = self.cluster_out_weights[node2] * self.cluster_in_weights[node1]
        cdef float den = a + b

        if den > 0:
            sim = 2 * self.neighbors[node1][node2] / den
        return sim

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef AggregateGraph merge(self, int node1, int node2):
        """Merges two nodes.

        The two nodes are removed and replaced by a new node of index ``next_cluster``,
        which inherits their edges, sizes and weights. The weight internal to the new
        cluster is kept as a self-loop of the new node.

        Parameters
        ----------
        node1, node2 :
            The two nodes to merge.

        Returns
        -------
        self: :class:`AggregateGraph`
            The aggregate graph, updated in place.
        """
        cdef int new_node = self.next_cluster
        self.neighbors[new_node] = {}
        # Self-loop of the new cluster, filled below with the internal edge weights.
        self.neighbors[new_node][new_node] = 0
        # `-` binds tighter than `&`: neighbors of both nodes, excluding the merged nodes themselves.
        cdef set common_neighbors = set(self.neighbors[node1].keys()) & set(self.neighbors[node2].keys()) - {node1, node2}
        for node in common_neighbors:
            # A common neighbor ends up with a single edge carrying both weights.
            self.neighbors[new_node][node] = self.neighbors[node1].pop(node) + self.neighbors[node2].pop(node)
            self.neighbors[node][new_node] = self.neighbors[node].pop(node1) + self.neighbors[node].pop(node2)
        for node in {node1, node2}:
            # Remaining neighbors (common ones were popped above) belong to one node only.
            for neighbor in set(self.neighbors[node].keys()) - {node1, node2}:
                self.neighbors[new_node][neighbor] = self.neighbors[node].pop(neighbor)
                self.neighbors[neighbor][new_node] = self.neighbors[neighbor].pop(node)
            # What is left are edges among node1 and node2: they become the self-loop.
            for other_node in {node1, node2}:
                if other_node in self.neighbors[node]:
                    self.neighbors[new_node][new_node] += self.neighbors[node][other_node]
            del self.neighbors[node]
        # Sizes and weights of the new cluster are the sums over the merged clusters.
        self.cluster_sizes[new_node] = self.cluster_sizes.pop(node1) + self.cluster_sizes.pop(node2)
        self.cluster_out_weights[new_node] = self.cluster_out_weights.pop(node1) + self.cluster_out_weights.pop(node2)
        self.cluster_in_weights[new_node] = self.cluster_in_weights.pop(node1) + self.cluster_in_weights.pop(node2)
        self.next_cluster += 1
        return self
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class Paris(BaseHierarchy):
    """Agglomerative clustering algorithm that performs greedy merge of nodes based on their similarity.

    The similarity between nodes :math:`i,j` is :math:`\\dfrac{A_{ij}}{w_i w_j}` where

    * :math:`A_{ij}` is the weight of edge :math:`i,j`,
    * :math:`w_i, w_j` are the weights of nodes :math:`i,j`

    If the input matrix :math:`B` is a biadjacency matrix (i.e., rectangular), the algorithm is applied
    to the corresponding adjacency matrix :math:`A  = \\begin{bmatrix} 0 & B \\\\ B^T & 0 \\end{bmatrix}`

    Parameters
    ----------
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    reorder :
        If ``True`` (default), reorder the dendrogram in non-decreasing order of height.

    Attributes
    ----------
    dendrogram_ :
        Dendrogram of the graph.
    dendrogram_row_ :
        Dendrogram for the rows, for bipartite graphs.
    dendrogram_col_ :
        Dendrogram for the columns, for bipartite graphs.
    dendrogram_full_ :
        Dendrogram for both rows and columns, indexed in this order, for bipartite graphs.

    Examples
    --------
    >>> from sknetwork.hierarchy import Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_predict(adjacency)
    >>> np.round(dendrogram, 2)
    array([[3.  , 2.  , 0.17, 2.  ],
           [1.  , 0.  , 0.25, 2.  ],
           [6.  , 4.  , 0.31, 3.  ],
           [7.  , 5.  , 0.67, 5.  ]])

    Notes
    -----
    Each row of the dendrogram = :math:`i, j`, distance, size of cluster :math:`i + j`.


    See Also
    --------
    scipy.cluster.hierarchy.linkage

    References
    ----------
    T. Bonald, B. Charpentier, A. Galland, A. Hollocou (2018).
    `Hierarchical Graph Clustering using Node Pair Sampling.
    <https://arxiv.org/abs/1806.01664>`_
    Workshop on Mining and Learning with Graphs.
    """
    def __init__(self, weights: str = 'degree', reorder: bool = True):
        super(Paris, self).__init__()
        self.dendrogram_ = None
        self.weights = weights
        self.reorder = reorder
        self.bipartite = None

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'Paris':
        """Agglomerative clustering using the nearest neighbor chain.

        Parameters
        ----------
        input_matrix :
            Adjacency matrix or biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`Paris`

        Raises
        ------
        ValueError
            If the graph has fewer than two nodes.
        """
        self._init_vars()

        # input
        input_matrix = check_format(input_matrix)
        adjacency, self.bipartite = get_adjacency(input_matrix)

        weights = self.weights
        out_weights = get_probs(weights, adjacency)
        in_weights = get_probs(weights, adjacency.T)

        if not is_symmetric(adjacency):
            adjacency = directed2undirected(adjacency)

        # Nodes whose out- and in-weights are both zero get a unit self-loop.
        null_weights = (out_weights + in_weights) == 0
        if any(null_weights):
            adjacency += sparse.diags(null_weights.astype(int))

        if adjacency.shape[0] <= 1:
            raise ValueError('The graph must contain at least two nodes.')

        # agglomerative clustering
        aggregate_graph = AggregateGraph(out_weights, in_weights, adjacency.data.astype(float),
                                         adjacency.indices, adjacency.indptr)

        cdef vector[(int, int)] connected_components
        dendrogram = []
        cdef int node
        cdef int next_node
        cdef int cluster_size
        cdef int next_cluster_size
        cdef int neighbor
        cdef int nearest_neighbor
        cdef int nearest_neighbor_last
        cdef vector[int] chain
        cdef float sim
        cdef float max_sim

        # Loop until every remaining cluster has been recorded as a connected component.
        while len(aggregate_graph.cluster_sizes):
            # Take the first remaining cluster as the start of a new chain.
            for node in aggregate_graph.cluster_sizes:
                break
            chain.clear()
            chain.push_back(node)
            while chain.size():
                node = chain[chain.size() - 1]
                chain.pop_back()
                if set(aggregate_graph.neighbors[node].keys()) - {node}:
                    # Find the most similar neighbor; ties go to the smallest index.
                    max_sim = -float("inf")
                    for neighbor in set(aggregate_graph.neighbors[node].keys()) - {node}:
                        sim = aggregate_graph.similarity(node, neighbor)
                        if sim > max_sim:
                            nearest_neighbor = neighbor
                            max_sim = sim
                        elif sim == max_sim:
                            nearest_neighbor = min(neighbor, nearest_neighbor)
                    if chain.size():
                        nearest_neighbor_last = chain[chain.size() - 1]
                        chain.pop_back()
                        if nearest_neighbor_last == nearest_neighbor:
                            # The nearest neighbor is the previous element of the chain: merge the pair.
                            # The distance stored in the dendrogram is the inverse similarity.
                            size = aggregate_graph.cluster_sizes[node] + aggregate_graph.cluster_sizes[nearest_neighbor]
                            dendrogram.append([node, nearest_neighbor, 1. / max_sim, size])
                            aggregate_graph.merge(node, nearest_neighbor)
                        else:
                            # Otherwise restore the chain and extend it with the nearest neighbor.
                            chain.push_back(nearest_neighbor_last)
                            chain.push_back(node)
                            chain.push_back(nearest_neighbor)
                    else:
                        chain.push_back(node)
                        chain.push_back(nearest_neighbor)
                else:
                    # No neighbor other than itself: this cluster is a whole connected component.
                    connected_components.push_back((node, aggregate_graph.cluster_sizes[node]))
                    del aggregate_graph.cluster_sizes[node]

        # Join the connected components one by one, at infinite distance.
        node, cluster_size = connected_components[connected_components.size() - 1]
        connected_components.pop_back()
        for next_node, next_cluster_size in connected_components:
            cluster_size += next_cluster_size
            dendrogram.append([node, next_node, float("inf"), cluster_size])
            # The cluster just created becomes the left operand of the next join.
            node = aggregate_graph.next_cluster
            aggregate_graph.next_cluster += 1

        dendrogram = np.array(dendrogram)
        if self.reorder:
            dendrogram = reorder_dendrogram(dendrogram)

        self.dendrogram_ = dendrogram
        if self.bipartite:
            self._split_vars(input_matrix.shape)

        return self
|