scikit-network 0.31.0-cp38-cp38-win_amd64.whl → 0.32.1-cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of scikit-network has been flagged as potentially problematic.

Files changed (114)
  1. {scikit_network-0.31.0.dist-info → scikit_network-0.32.1.dist-info}/AUTHORS.rst +3 -0
  2. {scikit_network-0.31.0.dist-info → scikit_network-0.32.1.dist-info}/METADATA +19 -3
  3. {scikit_network-0.31.0.dist-info → scikit_network-0.32.1.dist-info}/RECORD +112 -105
  4. {scikit_network-0.31.0.dist-info → scikit_network-0.32.1.dist-info}/WHEEL +1 -1
  5. sknetwork/__init__.py +1 -1
  6. sknetwork/classification/base.py +1 -1
  7. sknetwork/classification/base_rank.py +3 -3
  8. sknetwork/classification/diffusion.py +21 -13
  9. sknetwork/classification/knn.py +19 -13
  10. sknetwork/classification/metrics.py +1 -1
  11. sknetwork/classification/pagerank.py +12 -8
  12. sknetwork/classification/propagation.py +22 -15
  13. sknetwork/classification/tests/test_diffusion.py +10 -0
  14. sknetwork/classification/vote.cp38-win_amd64.pyd +0 -0
  15. sknetwork/classification/vote.cpp +14549 -8668
  16. sknetwork/clustering/__init__.py +3 -1
  17. sknetwork/clustering/base.py +1 -1
  18. sknetwork/clustering/kcenters.py +253 -0
  19. sknetwork/clustering/leiden.py +241 -0
  20. sknetwork/clustering/leiden_core.cp38-win_amd64.pyd +0 -0
  21. sknetwork/clustering/leiden_core.cpp +31564 -0
  22. sknetwork/clustering/leiden_core.pyx +124 -0
  23. sknetwork/clustering/louvain.py +118 -83
  24. sknetwork/clustering/louvain_core.cp38-win_amd64.pyd +0 -0
  25. sknetwork/clustering/louvain_core.cpp +21876 -16332
  26. sknetwork/clustering/louvain_core.pyx +86 -94
  27. sknetwork/clustering/postprocess.py +2 -2
  28. sknetwork/clustering/propagation_clustering.py +4 -4
  29. sknetwork/clustering/tests/test_API.py +7 -3
  30. sknetwork/clustering/tests/test_kcenters.py +92 -0
  31. sknetwork/clustering/tests/test_leiden.py +34 -0
  32. sknetwork/clustering/tests/test_louvain.py +2 -3
  33. sknetwork/data/load.py +2 -4
  34. sknetwork/data/parse.py +41 -20
  35. sknetwork/data/tests/test_parse.py +9 -12
  36. sknetwork/embedding/__init__.py +0 -1
  37. sknetwork/embedding/base.py +20 -19
  38. sknetwork/embedding/force_atlas.py +3 -2
  39. sknetwork/embedding/louvain_embedding.py +1 -1
  40. sknetwork/embedding/random_projection.py +5 -3
  41. sknetwork/embedding/spectral.py +0 -73
  42. sknetwork/embedding/tests/test_API.py +4 -28
  43. sknetwork/embedding/tests/test_louvain_embedding.py +4 -9
  44. sknetwork/embedding/tests/test_spectral.py +2 -5
  45. sknetwork/embedding/tests/test_svd.py +1 -1
  46. sknetwork/gnn/base_layer.py +3 -3
  47. sknetwork/gnn/gnn_classifier.py +40 -86
  48. sknetwork/gnn/layer.py +1 -1
  49. sknetwork/gnn/loss.py +1 -1
  50. sknetwork/gnn/optimizer.py +4 -3
  51. sknetwork/gnn/tests/test_base_layer.py +4 -4
  52. sknetwork/gnn/tests/test_gnn_classifier.py +12 -39
  53. sknetwork/gnn/utils.py +8 -8
  54. sknetwork/hierarchy/base.py +27 -0
  55. sknetwork/hierarchy/louvain_hierarchy.py +45 -41
  56. sknetwork/hierarchy/paris.cp38-win_amd64.pyd +0 -0
  57. sknetwork/hierarchy/paris.cpp +27719 -20959
  58. sknetwork/hierarchy/paris.pyx +7 -7
  59. sknetwork/hierarchy/postprocess.py +16 -16
  60. sknetwork/hierarchy/tests/test_algos.py +5 -0
  61. sknetwork/linalg/__init__.py +1 -1
  62. sknetwork/linalg/diteration.cp38-win_amd64.pyd +0 -0
  63. sknetwork/linalg/diteration.cpp +13916 -8050
  64. sknetwork/linalg/{normalization.py → normalizer.py} +17 -14
  65. sknetwork/linalg/operators.py +1 -1
  66. sknetwork/linalg/ppr_solver.py +1 -1
  67. sknetwork/linalg/push.cp38-win_amd64.pyd +0 -0
  68. sknetwork/linalg/push.cpp +23144 -16920
  69. sknetwork/linalg/tests/test_normalization.py +3 -7
  70. sknetwork/linalg/tests/test_operators.py +2 -6
  71. sknetwork/linalg/tests/test_ppr.py +1 -1
  72. sknetwork/linkpred/base.py +12 -1
  73. sknetwork/linkpred/nn.py +6 -6
  74. sknetwork/path/distances.py +11 -4
  75. sknetwork/path/shortest_path.py +1 -1
  76. sknetwork/path/tests/test_distances.py +7 -0
  77. sknetwork/path/tests/test_search.py +2 -2
  78. sknetwork/ranking/base.py +11 -6
  79. sknetwork/ranking/betweenness.cp38-win_amd64.pyd +0 -0
  80. sknetwork/ranking/betweenness.cpp +5256 -2190
  81. sknetwork/ranking/pagerank.py +13 -12
  82. sknetwork/ranking/tests/test_API.py +0 -2
  83. sknetwork/ranking/tests/test_betweenness.py +1 -1
  84. sknetwork/ranking/tests/test_pagerank.py +11 -5
  85. sknetwork/regression/base.py +18 -1
  86. sknetwork/regression/diffusion.py +24 -10
  87. sknetwork/regression/tests/test_diffusion.py +8 -0
  88. sknetwork/topology/__init__.py +3 -1
  89. sknetwork/topology/cliques.cp38-win_amd64.pyd +0 -0
  90. sknetwork/topology/cliques.cpp +23147 -16457
  91. sknetwork/topology/core.cp38-win_amd64.pyd +0 -0
  92. sknetwork/topology/core.cpp +22854 -16576
  93. sknetwork/topology/cycles.py +243 -0
  94. sknetwork/topology/minheap.cp38-win_amd64.pyd +0 -0
  95. sknetwork/topology/minheap.cpp +19495 -13469
  96. sknetwork/topology/structure.py +2 -42
  97. sknetwork/topology/tests/test_cycles.py +65 -0
  98. sknetwork/topology/tests/test_structure.py +2 -16
  99. sknetwork/topology/triangles.cp38-win_amd64.pyd +0 -0
  100. sknetwork/topology/triangles.cpp +5283 -1397
  101. sknetwork/topology/triangles.pyx +7 -4
  102. sknetwork/topology/weisfeiler_lehman_core.cp38-win_amd64.pyd +0 -0
  103. sknetwork/topology/weisfeiler_lehman_core.cpp +14781 -8915
  104. sknetwork/utils/format.py +1 -1
  105. sknetwork/utils/membership.py +2 -2
  106. sknetwork/visualization/__init__.py +2 -2
  107. sknetwork/visualization/dendrograms.py +55 -7
  108. sknetwork/visualization/graphs.py +261 -44
  109. sknetwork/visualization/tests/test_dendrograms.py +9 -9
  110. sknetwork/visualization/tests/test_graphs.py +63 -57
  111. sknetwork/embedding/louvain_hierarchy.py +0 -142
  112. sknetwork/embedding/tests/test_louvain_hierarchy.py +0 -19
  113. {scikit_network-0.31.0.dist-info → scikit_network-0.32.1.dist-info}/LICENSE +0 -0
  114. {scikit_network-0.31.0.dist-info → scikit_network-0.32.1.dist-info}/top_level.txt +0 -0
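
The headline additions in 0.32.1 are two new clustering algorithms, Leiden and KCenters, plus a new shape parameter for the graph parsers (see the diffs below). As a quick orientation, here is a minimal Python usage sketch; it assumes the new classes follow the same fit_predict API as Louvain, which is what the new tests test_leiden.py and test_kcenters.py exercise:

from sknetwork.clustering import Leiden, KCenters
from sknetwork.data import karate_club

adjacency = karate_club()

# Leiden: same estimator interface as Louvain (see test_leiden.py below)
labels_leiden = Leiden().fit_predict(adjacency)

# KCenters: the number of clusters is chosen upfront (see test_kcenters.py below)
labels_kcenters = KCenters(n_clusters=2).fit_predict(adjacency)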
sknetwork/clustering/louvain_core.pyx CHANGED
@@ -1,7 +1,6 @@
-# distutils: language = c++
+# distutils: language=c++
 # cython: language_level=3
 from libcpp.set cimport set
-from libcpp.vector cimport vector
 cimport cython
 
 ctypedef fused int_or_long:
@@ -10,123 +9,116 @@ ctypedef fused int_or_long:
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def fit_core(float resolution, float tol, float[:] ou_node_probs, float[:] in_node_probs, float[:] self_loops,
-             float[:] data, int_or_long[:] indices, int_or_long[:] indptr):  # pragma: no cover
-    """Fit the clusters to the objective function.
+def optimize_core(int_or_long[:] labels, int_or_long[:] indices, int_or_long[:] indptr, float[:] data,
+                  float[:] out_weights, float[:] in_weights, float[:] out_cluster_weights, float[:] in_cluster_weights,
+                  float[:] cluster_weights, float[:] self_loops, float resolution, float tol_optimization):  # pragma: no cover
+    """Find clusters maximizing modularity.
 
     Parameters
     ----------
-    resolution :
-        Resolution parameter (positive).
-    tol :
-        Minimum increase in modularity to enter a new optimization pass.
-    ou_node_probs :
-        Distribution of node weights based on their out-edges (sums to 1).
-    in_node_probs :
-        Distribution of node weights based on their in-edges (sums to 1).
-    self_loops :
-        Weights of self loops.
-    data :
-        CSR format data array of the normalized adjacency matrix.
+    labels :
+        Initial labels.
     indices :
         CSR format index array of the normalized adjacency matrix.
     indptr :
        CSR format index pointer array of the normalized adjacency matrix.
+    data :
+        CSR format data array of the normalized adjacency matrix.
+    out_weights :
+        Out-weights of nodes (sum to 1).
+    in_weights :
+        In-weights of nodes (sum to 1).
+    out_cluster_weights :
+        Out-weights of clusters (sum to 1).
+    in_cluster_weights :
+        In-weights of clusters (sum to 1).
+    cluster_weights :
+        Weights of clusters (initialized to 0).
+    self_loops :
+        Weights of self loops.
+    resolution :
+        Resolution parameter (positive).
+    tol_optimization :
+        Minimum increase in modularity to enter a new optimization pass.
 
     Returns
     -------
     labels :
-        Cluster index of each node.
-    total_increase :
-        Score of the clustering (total increase in modularity).
+        Labels of nodes.
+    increase :
+        Increase in modularity.
     """
-    cdef int_or_long n = indptr.shape[0] - 1
-    cdef int_or_long increase = 1
-    cdef int_or_long cluster
-    cdef int_or_long cluster_best
-    cdef int_or_long cluster_node
+    cdef int_or_long n
+    cdef int_or_long stop = 0
+    cdef int_or_long label
+    cdef int_or_long label_target
+    cdef int_or_long label_best
     cdef int_or_long i
     cdef int_or_long j
-    cdef int_or_long j1
-    cdef int_or_long j2
-    cdef int_or_long label
+    cdef int_or_long start
+    cdef int_or_long end
 
-    cdef float increase_total = 0
+    cdef float increase = 0
     cdef float increase_pass
     cdef float delta
-    cdef float delta_best
-    cdef float delta_exit
     cdef float delta_local
-    cdef float node_prob_in
-    cdef float node_prob_ou
-    cdef float ratio_in
-    cdef float ratio_ou
-
-    cdef vector[int_or_long] labels
-    cdef vector[float] neighbor_clusters_weights
-    cdef vector[float] ou_clusters_weights
-    cdef vector[float] in_clusters_weights
-    cdef set[int_or_long] unique_clusters = ()
-
-    for i in range(n):
-        labels.push_back(i)
-        neighbor_clusters_weights.push_back(0.)
-        ou_clusters_weights.push_back(ou_node_probs[i])
-        in_clusters_weights.push_back(in_node_probs[i])
-
-    while increase == 1:
-        increase = 0
-        increase_pass = 0
-
-        for i in range(n):
-            unique_clusters.clear()
-            cluster_node = labels[i]
-            j1 = indptr[i]
-            j2 = indptr[i + 1]
-
-            for j in range(j1, j2):
-                label = labels[indices[j]]
-                neighbor_clusters_weights[label] += data[j]
-                unique_clusters.insert(label)
+    cdef float delta_best
+    cdef float in_weight
+    cdef float out_weight
 
-            unique_clusters.erase(cluster_node)
+    cdef set[int_or_long] label_set = ()
 
-            if not unique_clusters.empty():
-                node_prob_ou = ou_node_probs[i]
-                node_prob_in = in_node_probs[i]
-                ratio_ou = resolution * node_prob_ou
-                ratio_in = resolution * node_prob_in
+    n = labels.shape[0]
+    while not stop:
+        increase_pass = 0
 
-                delta_exit = 2 * (neighbor_clusters_weights[cluster_node] - self_loops[i])
-                delta_exit -= ratio_ou * (in_clusters_weights[cluster_node] - node_prob_in)
-                delta_exit -= ratio_in * (ou_clusters_weights[cluster_node] - node_prob_ou)
+        for i in range(n):
+            label_set.clear()
+            label = labels[i]
+            start = indptr[i]
+            end = indptr[i+1]
+
+            # neighboring clusters
+            for j in range(start, end):
+                label_target = labels[indices[j]]
+                label_set.insert(label_target)
+                cluster_weights[label_target] += data[j]
+            label_set.erase(label)
+
+            if not label_set.empty():
+                out_weight = out_weights[i]
+                in_weight = in_weights[i]
+
+                # node leaving the current cluster
+                delta = 2 * (cluster_weights[label] - self_loops[i])
+                delta -= resolution * out_weight * (in_cluster_weights[label] - in_weight)
+                delta -= resolution * in_weight * (out_cluster_weights[label] - out_weight)
 
                 delta_best = 0
-                cluster_best = cluster_node
+                label_best = label
 
-                for cluster in unique_clusters:
-                    delta = 2 * neighbor_clusters_weights[cluster]
-                    delta -= ratio_ou * in_clusters_weights[cluster]
-                    delta -= ratio_in * ou_clusters_weights[cluster]
-
-                    delta_local = delta - delta_exit
+                for label_target in label_set:
+                    delta_local = 2 * cluster_weights[label_target]
+                    delta_local -= resolution * out_weight * in_cluster_weights[label_target]
+                    delta_local -= resolution * in_weight * out_cluster_weights[label_target]
+                    delta_local -= delta
                     if delta_local > delta_best:
                         delta_best = delta_local
-                        cluster_best = cluster
-
-                    neighbor_clusters_weights[cluster] = 0
+                        label_best = label_target
+                    cluster_weights[label_target] = 0
 
-                if delta_best > 0:
+                if label_best != label:
                     increase_pass += delta_best
-                    ou_clusters_weights[cluster_node] -= node_prob_ou
-                    in_clusters_weights[cluster_node] -= node_prob_in
-                    ou_clusters_weights[cluster_best] += node_prob_ou
-                    in_clusters_weights[cluster_best] += node_prob_in
-                    labels[i] = cluster_best
-
-            neighbor_clusters_weights[cluster_node] = 0
-
-        increase_total += increase_pass
-        if increase_pass > tol:
-            increase = 1
-    return labels, increase_total
+                    labels[i] = label_best
+                    # update weights
+                    out_cluster_weights[label] -= out_weight
+                    in_cluster_weights[label] -= in_weight
+                    out_cluster_weights[label_best] += out_weight
+                    in_cluster_weights[label_best] += in_weight
+
+            cluster_weights[label] = 0
+
+        increase += increase_pass
+        stop = increase_pass <= tol_optimization
+
+    return labels, increase
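
For orientation, optimize_core is the local-move step of Louvain-style modularity optimization: each node compares its current cluster against the clusters of its neighbors and takes the move with the largest modularity gain. A compact Python sketch of the gain computed in the loop above; this is an illustration only, with variable names mirroring the Cython code and the arrays assumed to hold the normalized weights described in the docstring:

def modularity_gain(i, label, label_target, cluster_weights, out_cluster_weights,
                    in_cluster_weights, out_weights, in_weights, self_loops, resolution):
    # Gain of moving node i from its current cluster `label` to `label_target`,
    # where cluster_weights[c] holds the total edge weight from i to cluster c.
    out_w, in_w = out_weights[i], in_weights[i]
    # cost of leaving the current cluster (the `delta` term in the Cython code)
    delta_exit = 2 * (cluster_weights[label] - self_loops[i])
    delta_exit -= resolution * out_w * (in_cluster_weights[label] - in_w)
    delta_exit -= resolution * in_w * (out_cluster_weights[label] - out_w)
    # benefit of joining the target cluster (the `delta_local` term)
    delta_enter = 2 * cluster_weights[label_target]
    delta_enter -= resolution * out_w * in_cluster_weights[label_target]
    delta_enter -= resolution * in_w * out_cluster_weights[label_target]
    return delta_enter - delta_exit

A move is kept only when this gain is positive for some neighboring cluster, and passes repeat until the total gain of a pass drops to tol_optimization or below.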
sknetwork/clustering/postprocess.py CHANGED
@@ -41,7 +41,7 @@ def aggregate_graph(input_matrix: sparse.csr_matrix, labels: Optional[np.ndarray
                     labels_row: Optional[np.ndarray] = None, labels_col: Optional[np.ndarray] = None) \
         -> sparse.csr_matrix:
     """Aggregate graph per label. All nodes with the same label become a single node.
-    Negative labels are ignored (corresponding nodes are not discarded).
+    Negative labels are ignored (corresponding nodes are discarded).
 
     Parameters
     ----------
@@ -63,4 +63,4 @@ def aggregate_graph(input_matrix: sparse.csr_matrix, labels: Optional[np.ndarray
     else:
         membership_col = membership_row
     aggregate_matrix = membership_row.T.dot(input_matrix).dot(membership_col)
-    return aggregate_matrix
+    return aggregate_matrix.tocsr()
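
The docstring fix is substantive: as the corrected text says, nodes with a negative label are discarded from the aggregate graph, and the result is now explicitly CSR. A small sketch of the behavior, assuming aggregate_graph is importable from sknetwork.clustering (where postprocess.py lives):

import numpy as np
from scipy import sparse
from sknetwork.clustering import aggregate_graph  # assumed import path

adjacency = sparse.csr_matrix(np.array([[0, 1, 1],
                                        [1, 0, 0],
                                        [1, 0, 0]]))
labels = np.array([0, 0, -1])  # node 2 carries a negative label
aggregate = aggregate_graph(adjacency, labels=labels)
print(aggregate.shape)  # expected (1, 1): node 2 is discarded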
sknetwork/clustering/propagation_clustering.py CHANGED
@@ -29,11 +29,11 @@ class PropagationClustering(BaseClustering, Propagation):
     weighted : bool
         If ``True``, the vote of each neighbor is proportional to the edge weight.
         Otherwise, all votes have weight 1.
-    sort_clusters :
+    sort_clusters : bool
         If ``True``, sort labels in decreasing order of cluster size.
-    return_probs :
+    return_probs : bool
         If ``True``, return the probability distribution over clusters (soft clustering).
-    return_aggregate :
+    return_aggregate : bool
         If ``True``, return the aggregate adjacency matrix or biadjacency matrix between clusters.
 
     Attributes
@@ -78,7 +78,7 @@ class PropagationClustering(BaseClustering, Propagation):
 
     Parameters
     ----------
-    input_matrix :
+    input_matrix : sparse.csr_matrix, np.ndarray
         Adjacency matrix or biadjacency matrix of the graph.
 
     Returns
sknetwork/clustering/tests/test_API.py CHANGED
@@ -9,8 +9,12 @@ from sknetwork.data.test_graphs import *
 
 class TestClusteringAPI(unittest.TestCase):
 
+    def setUp(self):
+        self.algos = [Louvain(return_aggregate=True), Leiden(return_aggregate=True),
+                      PropagationClustering(return_aggregate=True)]
+
     def test_regular(self):
-        for algo in [Louvain(return_aggregate=True), PropagationClustering(return_aggregate=True)]:
+        for algo in self.algos:
             for adjacency in [test_graph(), test_digraph(), test_disconnected_graph()]:
                 n = adjacency.shape[0]
                 labels = algo.fit_predict(adjacency)
@@ -22,13 +26,13 @@ class TestClusteringAPI(unittest.TestCase):
             n_labels = len(set(labels))
             self.assertEqual(labels.shape, (n,))
             self.assertEqual(algo.aggregate_.shape, (n_labels, n_labels))
-            membership = algo.fit_transform(adjacency)
+            membership = algo.fit_transform(adjacency_bool)
             self.assertEqual(membership.shape, (n, n_labels))
 
     def test_bipartite(self):
         biadjacency = test_bigraph()
         n_row, n_col = biadjacency.shape
-        for algo in [Louvain(return_aggregate=True), PropagationClustering(return_aggregate=True)]:
+        for algo in self.algos:
             algo.fit(biadjacency)
             self.assertEqual(algo.labels_row_.shape, (n_row,))
             self.assertEqual(algo.labels_col_.shape, (n_col,))
sknetwork/clustering/tests/test_kcenters.py ADDED
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Tests for KCenters"""
+import unittest
+
+from sknetwork.clustering import KCenters
+from sknetwork.data import karate_club, painters, star_wars
+from sknetwork.data.test_graphs import *
+
+
+class TestKCentersClustering(unittest.TestCase):
+
+    def test_kcenters(self):
+        # Test undirected graph
+        n_clusters = 2
+        adjacency = karate_club()
+        n_row = adjacency.shape[0]
+        kcenters = KCenters(n_clusters=n_clusters)
+        labels = kcenters.fit_predict(adjacency)
+        self.assertEqual(len(labels), n_row)
+        self.assertEqual(len(set(labels)), n_clusters)
+
+        # Test directed graph
+        n_clusters = 3
+        adjacency = painters()
+        n_row = adjacency.shape[0]
+        kcenters = KCenters(n_clusters=n_clusters, directed=True)
+        labels = kcenters.fit_predict(adjacency)
+        self.assertEqual(len(labels), n_row)
+        self.assertEqual(len(set(labels)), n_clusters)
+
+        # Test bipartite graph
+        n_clusters = 2
+        biadjacency = star_wars()
+        n_row, n_col = biadjacency.shape
+        kcenters = KCenters(n_clusters=n_clusters)
+        kcenters.fit(biadjacency)
+        labels = kcenters.labels_
+        self.assertEqual(len(kcenters.labels_row_), n_row)
+        self.assertEqual(len(kcenters.labels_col_), n_col)
+        self.assertEqual(len(set(labels)), n_clusters)
+
+    def test_kcenters_centers(self):
+        # Test centers for undirected graphs
+        n_clusters = 2
+        adjacency = karate_club()
+        kcenters = KCenters(n_clusters=n_clusters)
+        kcenters.fit(adjacency)
+        centers = kcenters.centers_
+        self.assertEqual(n_clusters, len(set(centers)))
+
+        # Test centers for bipartite graphs
+        n_clusters = 2
+        biadjacency = star_wars()
+        n_row, n_col = biadjacency.shape
+        for position in ["row", "col", "both"]:
+            kcenters = KCenters(n_clusters=n_clusters, center_position=position)
+            kcenters.fit(biadjacency)
+            centers_row = kcenters.centers_row_
+            centers_col = kcenters.centers_col_
+            if position == "row":
+                self.assertEqual(n_clusters, len(set(centers_row)))
+                self.assertTrue(np.all(centers_row < n_row))
+                self.assertTrue(centers_col is None)
+            if position == "col":
+                self.assertEqual(n_clusters, len(set(centers_col)))
+                self.assertTrue(np.all((centers_col < n_col) & (0 <= centers_col)))
+                self.assertTrue(centers_row is None)
+            if position == "both":
+                self.assertEqual(n_clusters, len(set(centers_row)) + len(set(centers_col)))
+                self.assertTrue(np.all(centers_row < n_row))
+                self.assertTrue(np.all((centers_col < n_col) & (0 <= centers_col)))
+
+    def test_kcenters_error(self):
+        # Test value errors
+        adjacency = karate_club()
+        biadjacency = star_wars()
+
+        # test n_clusters error
+        kcenters = KCenters(n_clusters=1)
+        with self.assertRaises(ValueError):
+            kcenters.fit(adjacency)
+
+        # test n_init error
+        kcenters = KCenters(n_clusters=2, n_init=0)
+        with self.assertRaises(ValueError):
+            kcenters.fit(adjacency)
+
+        # test center_position error
+        kcenters = KCenters(n_clusters=2, center_position="other")
+        with self.assertRaises(ValueError):
+            kcenters.fit(biadjacency)
sknetwork/clustering/tests/test_leiden.py ADDED
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Tests for Leiden"""
+import unittest
+
+from sknetwork.clustering import Leiden
+from sknetwork.data.test_graphs import *
+from sknetwork.utils import bipartite2undirected
+
+
+class TestLeidenClustering(unittest.TestCase):
+
+    def test_disconnected(self):
+        adjacency = test_disconnected_graph()
+        n = adjacency.shape[0]
+        labels = Leiden().fit_predict(adjacency)
+        self.assertEqual(len(labels), n)
+
+    def test_modularity(self):
+        adjacency = test_graph()
+        leiden_d = Leiden(modularity='dugue')
+        leiden_n = Leiden(modularity='newman')
+        labels_d = leiden_d.fit_predict(adjacency)
+        labels_n = leiden_n.fit_predict(adjacency)
+        self.assertTrue((labels_d == labels_n).all())
+
+    def test_bipartite(self):
+        biadjacency = test_bigraph()
+        adjacency = bipartite2undirected(biadjacency)
+        leiden = Leiden(modularity='newman')
+        labels1 = leiden.fit_predict(adjacency)
+        leiden.fit(biadjacency)
+        labels2 = np.concatenate((leiden.labels_row_, leiden.labels_col_))
+        self.assertTrue((labels1 == labels2).all())
sknetwork/clustering/tests/test_louvain.py CHANGED
@@ -24,7 +24,6 @@ class TestLouvainClustering(unittest.TestCase):
         labels_d = louvain_d.fit_predict(adjacency)
         labels_n = louvain_n.fit_predict(adjacency)
         self.assertTrue((labels_d == labels_n).all())
-
         louvain_p = Louvain(modularity='potts')
         louvain_p.fit_predict(adjacency)
 
@@ -48,7 +47,7 @@ class TestLouvainClustering(unittest.TestCase):
         # tolerance
         louvain = Louvain(resolution=2, tol_aggregation=0.1)
         labels = louvain.fit_predict(adjacency)
-        self.assertEqual(len(set(labels)), 12)
+        self.assertEqual(len(set(labels)), 7)
 
         # shuffling
         louvain = Louvain(resolution=2, shuffle_nodes=True, random_state=42)
@@ -78,7 +77,7 @@ class TestLouvainClustering(unittest.TestCase):
         # tolerance
         louvain = Louvain(resolution=2, tol_aggregation=0.1)
         labels = louvain.fit_predict(adjacency)
-        self.assertEqual(len(set(labels)), 12)
+        self.assertEqual(len(set(labels)), 7)
 
         # shuffling
         louvain = Louvain(resolution=2, shuffle_nodes=True, random_state=42)
sknetwork/data/load.py CHANGED
@@ -250,7 +250,7 @@ def load_konect(name: str, data_home: Optional[Union[str, Path]] = None, auto_nu
     if matrix:
         file = matrix[0]
         directed, bipartite, weighted = load_header(path / file)
-        dataset = from_csv(path / file, directed=directed, bipartite=bipartite, weighted=weighted)
+        dataset = from_csv(path / file, directed=directed, bipartite=bipartite, weighted=weighted, reindex=True)
 
     metadata = [file for file in files if 'meta.' in file]
     if metadata:
@@ -300,11 +300,9 @@ def save_to_numpy_bundle(data: Bunch, bundle_name: str, data_home: Optional[Unio
             sparse.save_npz(data_path / attribute, data[attribute])
         elif type(data[attribute]) == np.ndarray:
             np.save(data_path / attribute, data[attribute])
-        elif type(data[attribute]) == Bunch or type(data[attribute]) == str:
+        else:
            with open(data_path / (attribute + '.p'), 'wb') as file:
                pickle.dump(data[attribute], file)
-        else:
-            raise TypeError('Unsupported data attribute type '+str(type(data[attribute])) + '.')
 
 
 def load_from_numpy_bundle(bundle_name: str, data_home: Optional[Union[str, Path]] = None):
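
The second hunk relaxes save_to_numpy_bundle: attributes that are neither CSR matrices nor NumPy arrays are now pickled instead of raising TypeError. A self-contained sketch of the new dispatch logic (save_attribute is a hypothetical helper for illustration, not part of sknetwork):

import pickle
from pathlib import Path

import numpy as np
from scipy import sparse


def save_attribute(data_path: Path, name: str, value) -> None:
    # CSR matrices and arrays keep their native on-disk formats;
    # everything else (Bunch, str, ...) falls back to pickle.
    if type(value) == sparse.csr_matrix:
        sparse.save_npz(data_path / name, value)
    elif type(value) == np.ndarray:
        np.save(data_path / name, value)
    else:
        with open(data_path / (name + '.p'), 'wb') as file:
            pickle.dump(value, file)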
sknetwork/data/parse.py CHANGED
@@ -8,7 +8,7 @@ Created in December 2018
 """
 
 from csv import reader
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple, Union, Optional
 from xml.etree import ElementTree
 
 import numpy as np
@@ -19,7 +19,7 @@ from sknetwork.utils.format import directed2undirected
 
 
 def from_edge_list(edge_list: Union[np.ndarray, List[Tuple]], directed: bool = False,
-                   bipartite: bool = False, weighted: bool = True, reindex: bool = True,
+                   bipartite: bool = False, weighted: bool = True, reindex: bool = False, shape: Optional[tuple] = None,
                    sum_duplicates: bool = True, matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
     """Load a graph from an edge list.
 
@@ -37,6 +37,9 @@ def from_edge_list(edge_list: Union[np.ndarray, List[Tuple]], directed: bool = F
     reindex : bool
         If ``True``, reindex nodes and returns the original node indices as names.
         Reindexing is enforced if nodes are not integers.
+    shape : tuple
+        Shape of the adjacency or biadjacency matrix.
+        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
     sum_duplicates : bool
         If ``True`` (default), sums weights of duplicate edges.
         Otherwise, the weight of each edge is that of the first occurrence of this edge.
@@ -83,12 +86,14 @@ def from_edge_list(edge_list: Union[np.ndarray, List[Tuple]], directed: bool = F
     else:
         raise TypeError('The edge list must be given as a NumPy array or a list of tuples.')
     return from_edge_array(edge_array=edge_array, weights=weights, directed=directed, bipartite=bipartite,
-                           weighted=weighted, reindex=reindex, sum_duplicates=sum_duplicates, matrix_only=matrix_only)
+                           weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
+                           matrix_only=matrix_only)
 
 
 def from_adjacency_list(adjacency_list: Union[List[List], Dict[str, List]], directed: bool = False,
-                        bipartite: bool = False, weighted: bool = True, reindex: bool = True,
-                        sum_duplicates: bool = True, matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
+                        bipartite: bool = False, weighted: bool = True, reindex: bool = False,
+                        shape: Optional[tuple] = None, sum_duplicates: bool = True, matrix_only: bool = None) \
+        -> Union[Bunch, sparse.csr_matrix]:
     """Load a graph from an adjacency list.
 
     Parameters
@@ -104,6 +109,9 @@ def from_adjacency_list(adjacency_list: Union[List[List], Dict[str, List]], dire
     reindex : bool
         If ``True``, reindex nodes and returns the original node indices as names.
         Reindexing is enforced if nodes are not integers.
+    shape : tuple
+        Shape of the adjacency or biadjacency matrix.
+        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
     sum_duplicates : bool
         If ``True`` (default), sums weights of duplicate edges.
         Otherwise, the weight of each edge is that of the first occurrence of this edge.
@@ -134,12 +142,12 @@ def from_adjacency_list(adjacency_list: Union[List[List], Dict[str, List]], dire
     else:
         raise TypeError('The adjacency list must be given as a list of lists or a dict of lists.')
     return from_edge_list(edge_list=edge_list, directed=directed, bipartite=bipartite, weighted=weighted,
-                          reindex=reindex, sum_duplicates=sum_duplicates, matrix_only=matrix_only)
+                          reindex=reindex, shape=shape, sum_duplicates=sum_duplicates, matrix_only=matrix_only)
 
 
 def from_edge_array(edge_array: np.ndarray, weights: np.ndarray = None, directed: bool = False, bipartite: bool = False,
-                    weighted: bool = True, reindex: bool = True, sum_duplicates: bool = True,
-                    matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
+                    weighted: bool = True, reindex: bool = False, shape: Optional[tuple] = None,
+                    sum_duplicates: bool = True, matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
     """Load a graph from an edge array of shape (n_edges, 2) and weights (optional).
 
     Parameters
@@ -157,6 +165,9 @@ def from_edge_array(edge_array: np.ndarray, weights: np.ndarray = None, directed
     reindex : bool
         If ``True``, reindex nodes and returns the original node indices as names.
         Reindexing is enforced if nodes are not integers.
+    shape : tuple
+        Shape of the adjacency or biadjacency matrix.
+        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
     sum_duplicates : bool
         If ``True`` (default), sums weights of duplicate edges.
         Otherwise, the weight of each edge is that of the first occurrence of this edge.
@@ -195,28 +206,34 @@ def from_edge_array(edge_array: np.ndarray, weights: np.ndarray = None, directed
     if bipartite:
         row = edge_array[:, 0]
         col = edge_array[:, 1]
-        if row.dtype != int or (reindex and len(set(row)) < max(row) + 1):
+        if row.dtype != int or reindex:
             names_row, row = np.unique(row, return_inverse=True)
             graph.names_row = names_row
             graph.names = names_row
             n_row = len(names_row)
+        elif shape is not None:
+            n_row = max(shape[0], max(row) + 1)
         else:
             n_row = max(row) + 1
-        if col.dtype != int or (reindex and len(set(col)) < max(col) + 1):
+        if col.dtype != int or reindex:
             names_col, col = np.unique(col, return_inverse=True)
             graph.names_col = names_col
             n_col = len(names_col)
+        elif shape is not None:
+            n_col = max(shape[1], max(col) + 1)
         else:
             n_col = max(col) + 1
         matrix = sparse.csr_matrix((weights, (row, col)), shape=(n_row, n_col))
         graph.biadjacency = matrix
     else:
         nodes = edge_array.ravel()
-        if nodes.dtype != int or (reindex and len(set(nodes)) < max(nodes) + 1):
+        if nodes.dtype != int or reindex:
             names, nodes = np.unique(nodes, return_inverse=True)
             graph.names = names
             n = len(names)
             edge_array = nodes.reshape(-1, 2)
+        elif shape is not None:
+            n = max(shape[0], max(nodes) + 1)
         else:
             n = max(nodes) + 1
         row = edge_array[:, 0]
@@ -233,8 +250,8 @@
 
 def from_csv(file_path: str, delimiter: str = None, sep: str = None, comments: str = '#%',
              data_structure: str = None, directed: bool = False, bipartite: bool = False, weighted: bool = True,
-             reindex: bool = True, sum_duplicates: bool = True, matrix_only: bool = None) \
-        -> Union[Bunch, sparse.csr_matrix]:
+             reindex: bool = False, shape: Optional[tuple] = None, sum_duplicates: bool = True,
+             matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
     """Load a graph from a CSV or TSV file.
     The delimiter can be specified (e.g., ' ' for space-separated values).
 
@@ -249,9 +266,10 @@ def from_csv(file_path: str, delimiter: str = None, sep: str = None, comments: s
     comments : str
         Characters for comment lines.
     data_structure : str
-        If 'edge_list', considers each row of the file as an edge (tuple of size 2 or 3).
-        If 'adjacency_list', considers each row of the file as an adjacency list (list of neighbors).
-        If 'adjacency_dict', considers each row of the file as an adjacency dictionary with key
+        If 'edge_list', consider each row of the file as an edge (tuple of size 2 or 3).
+        If 'adjacency_list', consider each row of the file as an adjacency list (list of neighbors,
+        in the order of node indices; an empty line means no neighbor).
+        If 'adjacency_dict', consider each row of the file as an adjacency dictionary with key
         given by the first column (node: list of neighbors).
         If ``None`` (default), data_structure is guessed from the first rows of the file.
     directed : bool
@@ -263,6 +281,9 @@ def from_csv(file_path: str, delimiter: str = None, sep: str = None, comments: s
     reindex : bool
         If ``True``, reindex nodes and returns the original node indices as names.
         Reindexing is enforced if nodes are not integers.
+    shape : tuple
+        Shape of the adjacency or biadjacency matrix.
+        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
     sum_duplicates : bool
         If ``True`` (default), sums weights of duplicate edges.
         Otherwise, the weight of each edge is that of the first occurrence of this edge.
@@ -295,7 +316,7 @@ def from_csv(file_path: str, delimiter: str = None, sep: str = None, comments: s
            else:
                weights = None
            return from_edge_array(edge_array=edge_array, weights=weights, directed=directed, bipartite=bipartite,
-                                  weighted=weighted, reindex=reindex, sum_duplicates=sum_duplicates,
+                                  weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                   matrix_only=matrix_only)
     except TypeError:
         pass
@@ -306,17 +327,17 @@ def from_csv(file_path: str, delimiter: str = None, sep: str = None, comments: s
     if data_structure == 'edge_list':
         edge_list = [tuple(row) for row in csv_reader]
         return from_edge_list(edge_list=edge_list, directed=directed, bipartite=bipartite,
-                              weighted=weighted, reindex=reindex, sum_duplicates=sum_duplicates,
+                              weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                               matrix_only=matrix_only)
     elif data_structure == 'adjacency_list':
         adjacency_list = [row for row in csv_reader]
         return from_adjacency_list(adjacency_list=adjacency_list, directed=directed, bipartite=bipartite,
-                                   weighted=weighted, reindex=reindex, sum_duplicates=sum_duplicates,
+                                   weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                    matrix_only=matrix_only)
     elif data_structure == 'adjacency_dict':
         adjacency_list = {row[0]: row[1:] for row in csv_reader}
         return from_adjacency_list(adjacency_list=adjacency_list, directed=directed, bipartite=bipartite,
-                                   weighted=weighted, reindex=reindex, sum_duplicates=sum_duplicates,
+                                   weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                    matrix_only=matrix_only)
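
To summarize the parser changes: reindex now defaults to False everywhere, and the new shape parameter lets callers force a matrix larger than the highest node index. A quick sketch using from_edge_list and matrix_only as defined in the signatures above:

from sknetwork.data import from_edge_list

edges = [(0, 2), (1, 2)]

adjacency = from_edge_list(edges, matrix_only=True)
print(adjacency.shape)  # (3, 3): smallest shape compatible with the node indices

adjacency = from_edge_list(edges, matrix_only=True, shape=(5, 5))
print(adjacency.shape)  # (5, 5): the extra rows/columns are isolated nodes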