egosplit-sknetwork 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- egosplit_sknetwork-0.0.4/LICENSE +21 -0
- egosplit_sknetwork-0.0.4/PKG-INFO +58 -0
- egosplit_sknetwork-0.0.4/README.md +42 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork/__init__.py +4 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork/egosplit_sknetwork_.py +279 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork.egg-info/PKG-INFO +58 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork.egg-info/SOURCES.txt +11 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork.egg-info/dependency_links.txt +1 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork.egg-info/requires.txt +3 -0
- egosplit_sknetwork-0.0.4/egosplit_sknetwork.egg-info/top_level.txt +1 -0
- egosplit_sknetwork-0.0.4/pyproject.toml +25 -0
- egosplit_sknetwork-0.0.4/setup.cfg +4 -0
- egosplit_sknetwork-0.0.4/tests/test_egosplit_sknetwork.py +101 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ryan DeWolfe
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: egosplit_sknetwork
|
|
3
|
+
Version: 0.0.4
|
|
4
|
+
Summary: Fast python implementation of the egosplitting framework for overlapping clustering using sknetwork.
|
|
5
|
+
Author-email: Ryan DeWolfe <ryandewolfe33@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=2.0
|
|
13
|
+
Requires-Dist: scikit-network>=0.33
|
|
14
|
+
Requires-Dist: numba>=0.60.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# EgoSplit-sknetwork
|
|
18
|
+
|
|
19
|
+
This package provides a fast and flexible implementation of the egosplitting community detection paradigm for detecting overlapping communities.
|
|
20
|
+
For details and motivation of the algorithm, please see the paper below.
|
|
21
|
+
The reference implementation is available [here](https://github.com/google-research/google-research/blob/master/graph_embedding/persona/persona.py).
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
> Alessandro Epasto, Silvio Lattanzi, and Renato Paes Leme. 2017. Ego-Splitting Framework: from Non-Overlapping to Overlapping Clusters. In Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '17). Association for Computing Machinery, New York, NY, USA, 145-154. https://doi.org/10.1145/3097983.3098054
|
|
25
|
+
|
|
26
|
+
# Installation
|
|
27
|
+
|
|
28
|
+
Currently you can install this package by cloning this repository and installing locally.
|
|
29
|
+
```sh
|
|
30
|
+
git clone https://github.com/ryandewolfe33/egosplit-sknetwork.git
|
|
31
|
+
cd egosplit-sknetwork
|
|
32
|
+
pip install .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Example
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import sknetwork as sn
|
|
40
|
+
from egosplit_sknetwork import EgoSplit
|
|
41
|
+
|
|
42
|
+
g = sn.data.toy_graphs.karate_club()
|
|
43
|
+
egosplit = EgoSplit()
|
|
44
|
+
labels = egosplit.fit_predict(g)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
By default the algorithm uses [Propagation Clustering](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html#sknetwork.clustering.PropagationClustering) for local clustering and [Leiden](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html#sknetwork.clustering.Leiden) for global clustering.
|
|
48
|
+
To pass other clustering algorithms to egosplit, they must be initialized in advance and passed as parameters.
|
|
49
|
+
The algorithm accepts any subclass of [sknetwork.clustering.BaseClustering](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html) for either local_clustering (used to cluster the egonets) or global_clustering (used to cluster the persona graph).
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
high_res_clusterer = sn.clustering.Louvain(resolution=5, random_state=42)
|
|
53
|
+
egosplit = EgoSplit(local_clustering='PC', global_clustering=high_res_clusterer)
|
|
54
|
+
egosplit.fit(g)
|
|
55
|
+
labels = egosplit.labels_
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Labels is a sparse matrix with dimensions (n_labels, n_vertices), where `labels[i,j] = True` if vertex j is in cluster i.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# EgoSplit-sknetwork
|
|
2
|
+
|
|
3
|
+
This package provides a fast and flexible implementation of the egosplitting community detection paradigm for detecting overlapping communities.
|
|
4
|
+
For details and motivation of the algorithm, please see the paper below.
|
|
5
|
+
The reference implementation is available [here](https://github.com/google-research/google-research/blob/master/graph_embedding/persona/persona.py).
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
> Alessandro Epasto, Silvio Lattanzi, and Renato Paes Leme. 2017. Ego-Splitting Framework: from Non-Overlapping to Overlapping Clusters. In Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '17). Association for Computing Machinery, New York, NY, USA, 145-154. https://doi.org/10.1145/3097983.3098054
|
|
9
|
+
|
|
10
|
+
# Installation
|
|
11
|
+
|
|
12
|
+
Currently you can install this package by cloning this repository and installing locally.
|
|
13
|
+
```sh
|
|
14
|
+
git clone https://github.com/ryandewolfe33/egosplit-sknetwork.git
|
|
15
|
+
cd egosplit-sknetwork
|
|
16
|
+
pip install .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Example
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
import sknetwork as sn
|
|
24
|
+
from egosplit_sknetwork import EgoSplit
|
|
25
|
+
|
|
26
|
+
g = sn.data.toy_graphs.karate_club()
|
|
27
|
+
egosplit = EgoSplit()
|
|
28
|
+
labels = egosplit.fit_predict(g)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
By default the algorithm uses [Propagation Clustering](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html#sknetwork.clustering.PropagationClustering) for local clustering and [Leiden](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html#sknetwork.clustering.Leiden) for global clustering.
|
|
32
|
+
To pass other clustering algorithms to egosplit, they must be initialized in advance and passed as parameters.
|
|
33
|
+
The algorithm accepts any subclass of [sknetwork.clustering.BaseClustering](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html) for either local_clustering (used to cluster the egonets) or global_clustering (used to cluster the persona graph).
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
high_res_clusterer = sn.clustering.Louvain(resolution=5, random_state=42)
|
|
37
|
+
egosplit = EgoSplit(local_clustering='PC', global_clustering=high_res_clusterer)
|
|
38
|
+
egosplit.fit(g)
|
|
39
|
+
labels = egosplit.labels_
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Labels is a sparse matrix with dimensions (n_labels, n_vertices), where `labels[i,j] = True` if vertex j is in cluster i.
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import sknetwork as sn
|
|
2
|
+
import numpy as np
|
|
3
|
+
import scipy.sparse as sp
|
|
4
|
+
import numba
|
|
5
|
+
from numba.typed import List
|
|
6
|
+
from numba.types import int32
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConnectedComponents(sn.clustering.BaseClustering):
    """
    Adapter that exposes connected components through the sknetwork clustering interface.
    Each connected component of the input graph is reported as one cluster.

    Parameters
    ----------
    sort_clusters, return_probs, return_aggregate: Forwarded unchanged to the
        sknetwork.clustering.BaseClustering constructor.
    """

    def __init__(
        self,
        sort_clusters: bool = True,
        return_probs: bool = False,
        return_aggregate: bool = False,
    ):
        base_options = dict(
            sort_clusters=sort_clusters,
            return_probs=return_probs,
            return_aggregate=return_aggregate,
        )
        super().__init__(**base_options)

    def fit(self, g):
        """Store the connected-component index of every node of ``g`` in ``labels_``; return ``self``."""
        # connected_components returns (n_components, labels); only the labels are kept.
        _, component_of = sp.csgraph.connected_components(g)
        self.labels_ = component_of
        return self
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
#################################
|
|
34
|
+
# Helper Functions for EgoSplit #
|
|
35
|
+
#################################
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@numba.njit(nogil=True)
def _get_data(indptr, indices, data, i, j):
    """
    Look up the stored value of entry (i, j) in a CSR matrix.

    Row ``i`` is scanned linearly between ``indptr[i]`` and ``indptr[i + 1]``.
    Returns ``data[k]`` for the first position ``k`` with ``indices[k] == j``,
    or -1 when row ``i`` has no stored entry in column ``j``.
    """
    row_start = indptr[i]
    row_stop = indptr[i + 1]
    for position in range(row_start, row_stop):
        if indices[position] == j:
            return data[position]
    # Sentinel for "no such entry"; callers cannot tell it apart from a stored value of -1.
    return -1
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@numba.njit
def _make_neighbor_sets(indptr, indices):
    """
    Convert CSR adjacency arrays into a numba typed List of neighbor sets.

    Entry ``v`` of the result is the set of column indices stored in row ``v``,
    i.e. ``indices[indptr[v]:indptr[v + 1]]``.
    """
    n_rows = len(indptr) - 1
    per_row = [set(indices[indptr[row] : indptr[row + 1]]) for row in range(n_rows)]
    return List(per_row)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@numba.njit
def _make_egonet(i, neighbor_sets, global_indptr, global_indices, global_data):
    """
    Build the CSR arrays of the subgraph induced on the neighbors of node ``i``.

    The ego node ``i`` itself is not part of the result. Egonet nodes are renumbered
    0..k-1 in the iteration order of ``neighbor_sets[i]``.

    Parameters
    ----------
    i: Id of the ego node in the full graph.
    neighbor_sets: Per-node neighbor sets, as built by ``_make_neighbor_sets``.
    global_indptr, global_indices, global_data: CSR arrays of the full graph, used to
        look up the weight of every egonet edge.

    Returns
    -------
    tuple: ``(indptr, indices, data)`` CSR arrays of the egonet. ``indptr`` and ``indices``
        are int32, ``data`` has the dtype of ``global_data``, and column indices are
        sorted within each row.
    """
    members = neighbor_sets[i]

    # egonet_old_ids[k] is the original id of egonet node k; new_ids is the inverse map.
    egonet_old_ids = np.empty(len(members), dtype="int32")
    for slot, member in enumerate(members):
        egonet_old_ids[slot] = member
    new_ids = {old_id: int32(slot) for slot, old_id in enumerate(egonet_old_ids)}

    # One sorted array of egonet-local neighbor ids per egonet node.
    rows = List.empty_list(int32[:])
    for old_id in egonet_old_ids:
        shared = neighbor_sets[old_id].intersection(members)
        local_neighbors = np.empty(len(shared), dtype="int32")
        for slot, shared_old_id in enumerate(shared):
            local_neighbors[slot] = new_ids[shared_old_id]
        local_neighbors.sort()
        rows.append(local_neighbors)

    indptr = np.empty(len(rows) + 1, dtype="int32")
    indptr[0] = 0
    n_edges = 0
    for row in rows:
        n_edges += len(row)
    indices = np.empty(n_edges, dtype="int32")
    data = np.empty_like(indices, dtype=global_data.dtype)

    for r, row in enumerate(rows):
        indptr[r + 1] = indptr[r] + len(row)
        indices[indptr[r] : indptr[r + 1]] = row

        # Copy each edge weight from the full graph, addressed by original ids.
        source_old_id = egonet_old_ids[r]
        for offset, target_new_id in enumerate(row):
            data[indptr[r] + offset] = _get_data(
                global_indptr,
                global_indices,
                global_data,
                source_old_id,
                egonet_old_ids[target_new_id],
            )
    return indptr, indices, data
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@numba.njit
def make_persona_graph(
    g_indptr, g_indices, g_data, egonet_indices, egonet_community, first_personae_index
):
    """
    Build the CSR arrays of the persona graph.

    Node ``v`` of the original graph is split into ``max(egonet_community[v]) + 1``
    personae, stored in consecutive rows starting at ``first_personae_index[v]``.
    Every original edge (n1, n2) is attached to the persona of n1 given by the egonet
    community of n2 in n1's egonet, and points to the persona of n2 given by the egonet
    community of n1 in n2's egonet. The edge keeps its original weight.

    Parameters
    ----------
    g_indptr, g_indices, g_data: CSR arrays of the original graph.
    egonet_indices: Per-node arrays with the original ids of the node's neighbors.
    egonet_community: Per-node arrays with the egonet community label of each neighbor.
        Assumes ``egonet_community[v][k]`` is the label of ``egonet_indices[v][k]`` and
        that both follow the order of row ``v`` in ``g_indices`` -- TODO confirm against
        the caller.
    first_personae_index: Array of length n_nodes + 1 with the first persona row of
        every node; the last entry is the total number of personae.

    Returns
    -------
    tuple: ``(persona_data, persona_indices, persona_indptr)``, the order expected by
        ``scipy.sparse.csr_matrix``.
    """
    persona_indptr = np.empty(first_personae_index[-1] + 1, dtype="int32")
    # The persona graph has exactly as many stored entries as the original graph.
    persona_indptr[-1] = len(g_indices)
    persona_indices = np.empty_like(g_indices)
    persona_data = np.empty_like(g_data)

    next_index = 0
    for og_n1 in range(len(g_indptr) - 1):
        og_neighbors = egonet_indices[og_n1]
        communities = egonet_community[og_n1]

        for c in range(np.max(communities) + 1):
            new_n1 = first_personae_index[og_n1] + c
            persona_indptr[new_n1] = next_index
            for i in range(len(communities)):
                if communities[i] != c:
                    continue
                og_n2 = og_neighbors[i]
                # Get new id of the other end of the edge (og_n1, og_n2)
                og_n2_neighbors = egonet_indices[og_n2]
                # search for og_n1
                for j in range(len(og_n2_neighbors)):
                    if og_n2_neighbors[j] != og_n1:
                        continue
                    # Get the egonet community of og_n1
                    n2_persona_for_n1 = egonet_community[og_n2][j]
                    new_n2 = first_personae_index[og_n2] + n2_persona_for_n1
                    # write new n2_persona and data into persona graph
                    persona_indices[next_index] = new_n2
                    # Weight of the i-th stored entry of row og_n1. The original code
                    # indexed the first entry of the row and added i to its value.
                    persona_data[next_index] = g_data[g_indptr[og_n1] + i]
                    next_index += 1
                    # The reverse edge was found; no need to scan the rest of the row.
                    break
    return persona_data, persona_indices, persona_indptr
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class EgoSplit:
    """
    Implementation of the Egosplitting framework method for overlapping clustering using
    sknetwork. Since sknetwork does not allow overlapping clusterings, this is not a
    subclass of the sknetwork.clustering.BaseClustering, but it is built to behave similarly.

    Parameters
    ----------
    local_clustering: The clustering method used for the egonet. Should be either "CC"
        (ConnectedComponents), "PC" (PropagationClustering), or an instance of a subclass of
        sknetwork.clustering.BaseClustering.
    global_clustering: The clustering method used for the persona graph. Should
        be either "Louvain", "Leiden", "PC" (PropagationClustering), or an instance of a
        subclass of sknetwork.clustering.BaseClustering.
    min_cluster_size: Clusters with fewer nodes than this are dropped from the result.
        Must be a non-negative whole number; 0 disables the filter.
    random_state: The random state to pass to the default "Leiden" and "Louvain"
        clustering algorithms.
    verbose: If True, print progress messages and show progress bars while fitting.

    Returns
    -------
    scipy.sparse.csr_matrix: An overlapping clustering of the nodes. Rows correspond to clusters
        and columns to nodes.

    Example
    -------
    >>> g = sn.data.karate_club()
    >>> part1 = EgoSplit().fit_predict(g)

    Reference
    ---------
    Alessandro Epasto, Silvio Lattanzi, and Renato Paes Leme. 2017. Ego-Splitting Framework:
    from Non-Overlapping to Overlapping Clusters. In Proceedings of the 23rd ACM SIGKDD
    International Conference on Knowledge Discovery and Data Mining (KDD '17). Association
    for Computing Machinery, New York, NY, USA, 145-154. https://doi.org/10.1145/3097983.3098054
    """

    def __init__(
        self,
        local_clustering="PC",
        global_clustering="Leiden",
        min_cluster_size=5,
        random_state=None,
        verbose=False,
    ):
        if local_clustering == "CC":
            self.local_clustering_ = ConnectedComponents()
        elif local_clustering == "PC":
            self.local_clustering_ = sn.clustering.PropagationClustering()
        elif isinstance(local_clustering, sn.clustering.BaseClustering):
            self.local_clustering_ = local_clustering
        else:
            raise ValueError(
                f"local_clustering should be either 'CC' or 'PC', or a subclass of sknetwork.clustering.BaseClustering. Got {type(local_clustering)}"
            )

        if global_clustering == "Leiden":
            self.global_clustering_ = sn.clustering.Leiden(random_state=random_state)
        elif global_clustering == "Louvain":
            self.global_clustering_ = sn.clustering.Louvain(random_state=random_state)
        elif global_clustering == "PC":
            self.global_clustering_ = sn.clustering.PropagationClustering()
        elif isinstance(global_clustering, sn.clustering.BaseClustering):
            self.global_clustering_ = global_clustering
        else:
            raise ValueError(
                f"global_clustering should be in ['Louvain', 'Leiden', 'PC'] or a subclass of sknetwork.clustering.BaseClustering. Got {type(global_clustering)}"
            )

        self.min_cluster_size = min_cluster_size
        if not isinstance(self.min_cluster_size, int):
            # The original checked a non-existent ``self.max_rounds`` here, so every
            # non-int value (e.g. numpy integers) failed with AttributeError.
            if self.min_cluster_size % 1 != 0:
                raise ValueError("min_cluster_size must be a whole number")
            try:
                # convert other types of int to python int
                self.min_cluster_size = int(self.min_cluster_size)
            except ValueError as err:
                raise ValueError("min_cluster_size must be an int") from err
        if self.min_cluster_size < 0:
            raise ValueError("min_cluster_size must be non-negative")
        self.verbose = verbose

    def fit(self, g):
        """
        Compute the overlapping clustering of ``g`` and store it in ``labels_``.

        Steps: cluster the egonet of every node with the local algorithm, split every
        node into one persona per egonet cluster, cluster the persona graph with the
        global algorithm, and give every node the clusters of all of its personae.

        Parameters
        ----------
        g: Adjacency matrix of the graph; its ``indptr``, ``indices``, ``data`` and
            ``shape`` attributes are read, so it is presumably a scipy CSR matrix.

        Returns
        -------
        EgoSplit: ``self``, with ``first_personae_index_``, ``persona_graph_``,
            ``persona_clusters_`` and ``labels_`` set.
        """
        n_nodes = g.shape[0]
        egonet_indices = []  # Store the original indices of the egonet
        egonet_community = []  # Store the community labels of the ego nets
        # Store the first index for a node's new personae.
        # The new personae of node i will be stored in rows
        # first_personae_index[i], first_personae_index[i]+1, ... , first_personae_index[i+1]-1.
        self.first_personae_index_ = np.empty(n_nodes + 1, dtype="int32")
        next_index = 0
        neighbor_sets = _make_neighbor_sets(g.indptr, g.indices)
        if self.verbose:
            print("Making Egonets")
        for node in tqdm(range(n_nodes), disable=not self.verbose):
            neighbors = g.indices[g.indptr[node] : g.indptr[node + 1]]
            egonet_indices.append(neighbors)
            indptr, indices, data = _make_egonet(
                node, neighbor_sets, g.indptr, g.indices, g.data
            )
            egonet = sp.csr_matrix(
                (data, indices, indptr), shape=(len(neighbors), len(neighbors))
            )
            if len(egonet.data) == 0:
                # egonet has no edges, each node is its own cluster
                persona_map = sp.csgraph.connected_components(egonet)[1]
            else:
                persona_map = self.local_clustering_.fit_predict(egonet).astype("int32")
            # NOTE(review): persona_map[k] is paired below with neighbors[k] (CSR order),
            # while _make_egonet numbers egonet nodes in set-iteration order -- confirm
            # that these two orders always agree.
            egonet_community.append(persona_map)
            self.first_personae_index_[node] = next_index
            # NOTE(review): a node without neighbors gives an empty persona_map, and
            # np.max raises ValueError on an empty array -- confirm isolated nodes are
            # removed before calling fit.
            next_index += np.max(persona_map) + 1

        self.first_personae_index_[-1] = next_index
        ei = List(egonet_indices)
        ec = List(egonet_community)
        if self.verbose:
            print("Making Persona Graph")
        persona_graph_data = make_persona_graph(
            g.indptr, g.indices, g.data, ei, ec, self.first_personae_index_
        )
        n_personae = self.first_personae_index_[-1]
        self.persona_graph_ = sp.csr_matrix(
            persona_graph_data,
            shape=(n_personae, n_personae),
        )
        if self.verbose:
            print("Clustering Persona Graph")
        self.persona_clusters_ = self.global_clustering_.fit_predict(
            self.persona_graph_
        )
        if self.verbose:
            print("Mapping Clusters")
        n_clusters = np.max(self.persona_clusters_) + 1
        clusters = sp.lil_matrix((n_nodes, n_clusters), dtype="bool")
        for node in tqdm(range(n_nodes), disable=not self.verbose):
            first = self.first_personae_index_[node]
            last = self.first_personae_index_[node + 1]
            # A node belongs to every cluster that contains one of its personae.
            node_clusters = np.unique(self.persona_clusters_[first:last])
            clusters[node, node_clusters] = True
        # Transpose so that rows are clusters and columns are nodes.
        clusters = clusters.tocsc().transpose()
        if self.min_cluster_size > 0:
            clusters = clusters[clusters.getnnz(1) >= self.min_cluster_size]

        self.labels_ = clusters
        return self

    def fit_predict(self, g):
        """Fit on ``g`` and return ``labels_`` (rows are clusters, columns are nodes)."""
        self.fit(g)
        return self.labels_
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: egosplit_sknetwork
|
|
3
|
+
Version: 0.0.4
|
|
4
|
+
Summary: Fast python implementation of the egosplitting framework for overlapping clustering using sknetwork.
|
|
5
|
+
Author-email: Ryan DeWolfe <ryandewolfe33@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=2.0
|
|
13
|
+
Requires-Dist: scikit-network>=0.33
|
|
14
|
+
Requires-Dist: numba>=0.60.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# EgoSplit-sknetwork
|
|
18
|
+
|
|
19
|
+
This package provides a fast and flexible implementation of the egosplitting community detection paradigm for detecting overlapping communities.
|
|
20
|
+
For details and motivation of the algorithm, please see the paper below.
|
|
21
|
+
The reference implementation is available [here](https://github.com/google-research/google-research/blob/master/graph_embedding/persona/persona.py).
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
> Alessandro Epasto, Silvio Lattanzi, and Renato Paes Leme. 2017. Ego-Splitting Framework: from Non-Overlapping to Overlapping Clusters. In Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '17). Association for Computing Machinery, New York, NY, USA, 145-154. https://doi.org/10.1145/3097983.3098054
|
|
25
|
+
|
|
26
|
+
# Installation
|
|
27
|
+
|
|
28
|
+
Currently you can install this package by cloning this repository and installing locally.
|
|
29
|
+
```sh
|
|
30
|
+
git clone https://github.com/ryandewolfe33/egosplit-sknetwork.git
|
|
31
|
+
cd egosplit-sknetwork
|
|
32
|
+
pip install .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Example
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import sknetwork as sn
|
|
40
|
+
from egosplit_sknetwork import EgoSplit
|
|
41
|
+
|
|
42
|
+
g = sn.data.toy_graphs.karate_club()
|
|
43
|
+
egosplit = EgoSplit()
|
|
44
|
+
labels = egosplit.fit_predict(g)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
By default the algorithm uses [Propagation Clustering](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html#sknetwork.clustering.PropagationClustering) for local clustering and [Leiden](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html#sknetwork.clustering.Leiden) for global clustering.
|
|
48
|
+
To pass other clustering algorithms to egosplit, they must be initialized in advance and passed as parameters.
|
|
49
|
+
The algorithm accepts any subclass of [sknetwork.clustering.BaseClustering](https://scikit-network.readthedocs.io/en/latest/reference/clustering.html) for either local_clustering (used to cluster the egonets) or global_clustering (used to cluster the persona graph).
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
high_res_clusterer = sn.clustering.Louvain(resolution=5, random_state=42)
|
|
53
|
+
egosplit = EgoSplit(local_clustering='PC', global_clustering=high_res_clusterer)
|
|
54
|
+
egosplit.fit(g)
|
|
55
|
+
labels = egosplit.labels_
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Labels is a sparse matrix with dimensions (n_labels, n_vertices), where `labels[i,j] = True` if vertex j is in cluster i.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
egosplit_sknetwork/__init__.py
|
|
5
|
+
egosplit_sknetwork/egosplit_sknetwork_.py
|
|
6
|
+
egosplit_sknetwork.egg-info/PKG-INFO
|
|
7
|
+
egosplit_sknetwork.egg-info/SOURCES.txt
|
|
8
|
+
egosplit_sknetwork.egg-info/dependency_links.txt
|
|
9
|
+
egosplit_sknetwork.egg-info/requires.txt
|
|
10
|
+
egosplit_sknetwork.egg-info/top_level.txt
|
|
11
|
+
tests/test_egosplit_sknetwork.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
egosplit_sknetwork
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools >= 77.0.3"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "egosplit_sknetwork"
|
|
7
|
+
version = "0.0.4"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Ryan DeWolfe", email="ryandewolfe33@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "Fast python implementation of the egosplitting framework for overlapping clustering using sknetwork."
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
]
|
|
16
|
+
readme = "README.md"
|
|
17
|
+
license = "MIT"
|
|
18
|
+
license-files = ["LICEN[CS]E*"]
|
|
19
|
+
|
|
20
|
+
requires-python = ">=3.10"
|
|
21
|
+
dependencies = [
|
|
22
|
+
"numpy >= 2.0",
|
|
23
|
+
"scikit-network >= 0.33",
|
|
24
|
+
"numba >= 0.60.0",
|
|
25
|
+
]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import scipy.sparse as sp
|
|
3
|
+
import sknetwork as sn
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
import egosplit_sknetwork as esn
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@pytest.fixture
def karate():
    """Provide the karate club toy graph shipped with sknetwork."""
    graph = sn.data.toy_graphs.karate_club()
    return graph
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_make_egonet():
    """The egonet of node 0 is the subgraph induced on its neighbors 1, 2 and 3."""
    adjacency = [
        [0, 1, 1, 1, 0],
        [1, 0, 1, 1, 1],
        [1, 1, 0, 0, 0],
        [0, 1, 0, 0, 1],
        [0, 1, 0, 1, 0],
    ]
    g = sp.csr_matrix(adjacency)

    expected = sp.csr_matrix(
        [
            [0, 1, 1],
            [1, 0, 0],
            [1, 0, 0],
        ]
    )

    helpers = esn.egosplit_sknetwork_
    neighbor_sets = helpers._make_neighbor_sets(g.indptr, g.indices)
    indptr, indices, data = helpers._make_egonet(
        0, neighbor_sets, g.indptr, g.indices, g.data
    )
    n_neighbors = len(neighbor_sets[0])
    egonet = sp.csr_matrix((data, indices, indptr), shape=(n_neighbors, n_neighbors))

    assert egonet.shape == expected.shape
    # An all-zero difference means both matrices store the same entries.
    assert (egonet - expected).nnz == 0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_make_egonet_weighted():
    """Edge weights of the full graph are carried over into the egonet of node 0."""
    adjacency = [
        [0, 0.5, 0.5, 0.5, 0],
        [0.5, 0, 0.5, 0.5, 0.5],
        [0.5, 0.5, 0, 0, 0],
        [0, 0.5, 0, 0, 0.5],
        [0, 0.5, 0, 0.5, 0],
    ]
    g = sp.csr_matrix(adjacency)

    expected = sp.csr_matrix(
        [
            [0, 0.5, 0.5],
            [0.5, 0, 0],
            [0.5, 0, 0],
        ]
    )

    helpers = esn.egosplit_sknetwork_
    neighbor_sets = helpers._make_neighbor_sets(g.indptr, g.indices)
    indptr, indices, data = helpers._make_egonet(
        0, neighbor_sets, g.indptr, g.indices, g.data
    )
    n_neighbors = len(neighbor_sets[0])
    egonet = sp.csr_matrix((data, indices, indptr), shape=(n_neighbors, n_neighbors))

    assert egonet.shape == expected.shape
    # An all-zero difference means both matrices store the same weights.
    assert (egonet - expected).nnz == 0
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@pytest.mark.parametrize(
    "local_alg", ["CC", "PC", sn.clustering.Louvain(random_state=42)]
)
@pytest.mark.parametrize(
    "global_alg", ["Louvain", "Leiden", "PC", sn.clustering.KCenters(3)]
)
def test_egosplit(karate, local_alg, global_alg):
    """Every local/global algorithm pairing yields a non-empty CSR matrix with one column per node."""
    model = esn.EgoSplit(
        local_clustering=local_alg,
        global_clustering=global_alg,
        random_state=42,
    )
    labels = model.fit_predict(karate)
    n_clusters, n_columns = labels.shape
    assert isinstance(labels, sp.csr_matrix)
    assert n_columns == karate.shape[0]
    assert n_clusters > 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_egosplit_weighted(karate):
    """The default EgoSplit also runs on a graph with random positive edge weights."""
    rng = np.random.default_rng(seed=42)
    # Shift by 0.1 so that no weight is 0.
    weights = rng.random(len(karate.data)) + 0.1
    karate.data = weights
    model = esn.EgoSplit()
    labels = model.fit_predict(karate)
    n_clusters, n_columns = labels.shape
    assert isinstance(labels, sp.csr_matrix)
    assert n_columns == karate.shape[0]
    assert n_clusters > 0
|