muxpack 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
muxpack-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,36 @@
1
+ Metadata-Version: 2.3
2
+ Name: muxpack
3
+ Version: 0.1.0
4
+ Summary: Tools to handle multiplex network data more easily
5
+ Author: Edwin de Jonge, Jan van der Laan
6
+ Author-email: Edwin de Jonge <edwindjonge@gmail.com>, Jan van der Laan <djvanderlaan@gmail.com>
7
+ Requires-Dist: duckdb>=1.4.4
8
+ Requires-Dist: ibis-framework[duckdb]>=12.0.0
9
+ Requires-Dist: networkx>=3.6.1
10
+ Requires-Dist: pandas>=3.0.1
11
+ Requires-Dist: pyarrow>=23.0.1
12
+ Requires-Dist: pyarrow-hotfix>=0.7
13
+ Requires-Dist: scipy>=1.17.1
14
+ Requires-Python: >=3.13
15
+ Project-URL: Homepage, https://codeberg.org/CBS-Networktools/muxpack.py
16
+ Project-URL: Documentation, https://readthedocs.org
17
+ Project-URL: Repository, https://codeberg.org/CBS-Networktools/muxpack.py
18
+ Project-URL: Bug Tracker, https://codeberg.org/CBS-Networktools/muxpack.py/issues
19
+ Description-Content-Type: text/markdown
20
+
21
+ **Under heavy construction, do not use for serious work!**
22
+
23
+ ## Muxpack
24
+
25
+ Muxpack is a Python implementation for working with multiplex network files.
26
+
27
+ ## Documentation
28
+
29
+ Build docs locally using the same dependency path as Read the Docs:
30
+
31
+ ```bash
32
+ uv sync --group docs
33
+ uv run sphinx-build -b html docs docs/_build/html
34
+ ```
35
+
36
+ The generated HTML is available in `docs/_build/html/index.html`.
@@ -0,0 +1,16 @@
1
+ **Under heavy construction, do not use for serious work!**
2
+
3
+ ## Muxpack
4
+
5
+ Muxpack is a Python implementation for working with multiplex network files.
6
+
7
+ ## Documentation
8
+
9
+ Build docs locally using the same dependency path as Read the Docs:
10
+
11
+ ```bash
12
+ uv sync --group docs
13
+ uv run sphinx-build -b html docs docs/_build/html
14
+ ```
15
+
16
+ The generated HTML is available in `docs/_build/html/index.html`.
@@ -0,0 +1,45 @@
1
+ [project]
2
+ name = "muxpack"
3
+ version = "0.1.0"
4
+ description = "Tools to handle multiplex network data more easily"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Edwin de Jonge", email = "edwindjonge@gmail.com" },
8
+ { name = "Jan van der Laan", email = "djvanderlaan@gmail.com" }
9
+ ]
10
+ requires-python = ">=3.13"
11
+ dependencies = [
12
+ "duckdb>=1.4.4",
13
+ "ibis-framework[duckdb]>=12.0.0",
14
+ "networkx>=3.6.1",
15
+ "pandas>=3.0.1",
16
+ "pyarrow>=23.0.1",
17
+ "pyarrow-hotfix>=0.7",
18
+ "scipy>=1.17.1",
19
+ ]
20
+
21
+ [project.urls]
22
+ Homepage = "https://codeberg.org/CBS-Networktools/muxpack.py"
23
+ Documentation = "https://readthedocs.org"
24
+ Repository = "https://codeberg.org/CBS-Networktools/muxpack.py"
25
+ "Bug Tracker" = "https://codeberg.org/CBS-Networktools/muxpack.py/issues"
26
+
27
+ [project.scripts]
28
+ muxpack = "muxpack:main"
29
+
30
+ [build-system]
31
+ requires = ["uv_build>=0.10.6,<0.11.0"]
32
+ build-backend = "uv_build"
33
+
34
+ [dependency-groups]
35
+ dev = [
36
+ "pytest>=9.0.2",
37
+ "ruff>=0.15.4",
38
+ ]
39
+ docs = [
40
+ "sphinx>=9.1.0",
41
+ "sphinx-rtd-theme>=3.0.0",
42
+ ]
43
+
44
+ [tool.ruff.lint]
45
+ extend-select = ["B"]
@@ -0,0 +1,17 @@
1
+ from .check import check_edges, check_vertices
2
+ from .io import load_network, save_network
3
+ from .multiplexseries import MultiplexSeries
4
+ from .multiplex import Multiplex
5
+ from .to_csr_matrix import to_csr_matrix
6
+ from .bipartite import Bipartite
7
+
8
+ __all__ = [
9
+ "check_edges",
10
+ "check_vertices",
11
+ "load_network",
12
+ "Multiplex",
13
+ "MultiplexSeries",
14
+ "save_network",
15
+ "to_csr_matrix",
16
+ "Bipartite",
17
+ ]
@@ -0,0 +1,99 @@
1
+ from pathlib import Path
2
+ from ibis import Table
3
+ from . import io
4
+
5
+
6
class Bipartite:
    """
    Lazily evaluated storage for a bipartite edge list.

    The edges live in an ibis table. ``role_src``, ``role_dst`` and
    ``relationtype`` hold the *names of the columns* in that table that carry
    the source role, the destination role and the relation type.

    NOTE(review): the original note on this class reads "sort on role_src,
    role_dst"; nothing in this class enforces such an ordering.
    """

    edges: Table
    role_src: str
    role_dst: str
    relationtype: str

    def __init__(
        self,
        edges: Table,
        role_src: str = "src",
        role_dst: str = "dst",
        relationtype: str = "relationtype",
    ):
        """
        Store the edges table together with the column names describing it.

        Args:
        - edges: table containing the bipartite edges.
        - role_src: column name for the source role.
        - role_dst: column name for the destination role.
        - relationtype: column name for the relation type.
        """
        self.edges = edges
        self.role_src = role_src
        self.role_dst = role_dst
        self.relationtype = relationtype

    def _project(self, onto: str, via: str) -> Table:
        """
        Self-join the edges on the ``via`` column and pair up the ``onto`` values.

        Args:
        - onto: column whose values become the ``src``/``dst`` of the result.
        - via: column whose shared value links two ``onto`` values.

        Returns:
        - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        # TODO this is an explicit choice: relationtypes could be more complex,
        # i.e. when two different nodes have a different relation with the same
        # shared node. Only the relation type of the left-hand side is kept.
        left = self.edges.select(src=onto, p=via, relationtype=self.relationtype)
        right = self.edges.select(dst=onto, p=via)

        paired = left.inner_join(right, left.p == right.p)
        # a node sharing a neighbour with itself is not an edge
        without_loops = paired.filter(paired.src != paired.dst)
        return without_loops.select(["src", "dst", "relationtype"])

    def project_to_src(self) -> Table:
        """
        Project the bipartite graph onto the source role.
        Two source nodes are connected if they share a common destination node.

        Returns:
        - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        return self._project(onto=self.role_src, via=self.role_dst)

    def project_to_dst(self) -> Table:
        """
        Project the bipartite graph onto the destination role.
        Two destination nodes are connected if they share a common source node.

        Returns:
        - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        # Open question from the authors: the storage sort order (role_src
        # first or role_dst first) does not change the projection result, but
        # it may make one of the two projections less efficient than the other.
        return self._project(onto=self.role_dst, via=self.role_src)

    def save(self, dir: Path | str) -> None:
        """
        Save the bipartite graph to disk and re-point ``edges`` at the saved file.

        Writing is delegated to ``io.save_bipartite``; afterwards the graph is
        read back with ``io.read_bipartite`` and ``self.edges`` is replaced by
        the reloaded table.

        Args:
        - dir: path to the directory where the bipartite graph will be saved.
        """
        io.save_bipartite(
            edges=self.edges,
            role_src=self.role_src,
            role_dst=self.role_dst,
            relationtype=self.relationtype,
            dir=dir,
        )
        reloaded = io.read_bipartite(dir=dir)
        self.edges = reloaded.edges
@@ -0,0 +1,97 @@
1
+ from ibis.expr.types import Table
2
+ from ibis import dtype
3
+
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
def check_edges(edges: Table, check_period=True) -> bool:
    """
    Validate the schema of an edges table.

    ``src``, ``dst`` and ``relationtype`` must be integer columns (any width)
    and ``layer`` must be a string column. ``period`` must be an integer
    column when ``check_period`` is true.

    Args:
    - edges: the edges table to check.
    - check_period: whether to require a ``period`` column.

    Returns:
    - ``True`` if the edges table is valid, ``False`` otherwise.
    """
    # Built incrementally so the columns are checked in the same order
    # whether or not ``period`` is required.
    expect_types = {"src": "integer", "dst": "integer"}
    if check_period:
        expect_types["period"] = "integer"
    expect_types["layer"] = "string"
    expect_types["relationtype"] = "integer"

    return bool(check_column_type(edges, expect_types))
37
+
38
+
39
def check_vertices(vertices: Table, check_period=True) -> bool:
    """
    Validate the schema of a vertices table.

    The table needs an integer ``id`` column and, when ``check_period`` is
    true, an integer ``period`` column. Additional columns are allowed.

    Args:
    - vertices: the vertices table to check.
    - check_period: whether to require a ``period`` column.

    Returns:
    - ``True`` if the vertices table is valid, ``False`` otherwise.
    """
    required_columns = {"id", "period"} if check_period else {"id"}

    # Report every absent column at once before looking at types.
    absent = required_columns - set(vertices.columns)
    if absent:
        logger.warning(f"Missing columns: {absent}")
        return False

    expect_types = {"id": "integer"}
    if check_period:
        expect_types["period"] = "integer"

    return bool(check_column_type(vertices, expect_types))
65
+
66
+
67
def check_column_type(t: Table, expected_types: dict[str, str]) -> bool:
    """
    Check that the columns in a table exist and have the expected types.

    The type strings ``"integer"`` and ``"string"`` are treated as type
    *families* (any integer width / any string type). Every other value is
    parsed with ``ibis.dtype`` and must match the column type exactly.

    Args:
    - t: the table to check.
    - expected_types: dictionary mapping column names to expected type strings
      (e.g., ``"integer"``, ``"string"``, or a concrete ibis type name).

    Returns:
    - ``True`` if all specified columns exist and have the expected types,
      ``False`` otherwise. A warning is logged for the first offending column.
    """
    available = set(t.columns)
    for column, expected_type in expected_types.items():
        # Indexing a table with an unknown column raises instead of returning
        # None, so presence has to be tested against the column list.
        if column not in available:
            logger.warning(f"Column '{column}' is missing.")
            return False

        coltype = t[column].type()
        if expected_type == "integer":
            matches = coltype.is_integer()
        elif expected_type == "string":
            matches = coltype.is_string()
        else:
            # most specific check: the expected type must equal the column type
            matches = dtype(expected_type) == coltype

        if not matches:
            logger.warning(
                f"Incorrect type for column '{column}': '{coltype}', expected {expected_type}"
            )
            return False
    return True
@@ -0,0 +1,271 @@
1
+ import ibis
2
+
3
+ from muxpack.bipartite import Bipartite
4
+ from .multiplexseries import MultiplexSeries
5
+ from pathlib import Path
6
+ import os
7
+ import logging
8
+ from typing import Tuple
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def load_network(dir: Path | str) -> MultiplexSeries:
    """
    Load a multiplex network from a directory containing Parquet files.

    The expected directory structure is::

        dir/
            <period>/
                edges/
                    <layer>/
                        *.parquet
                vertices.parquet
                relationtypes.csv   (optional)

    Edges are mandatory. Vertices and relationtypes are optional: when they
    cannot be read, this is logged and the corresponding attribute of the
    result is ``None``.

    Args:
    - dir: path to the root directory containing the Parquet files.

    Returns:
    - MultiplexSeries loaded from the directory.
    """
    logger.info(f"Loading data from {dir}...")
    # All tables are registered on the same connection so that they can be
    # combined (e.g. joined) in a single query later on.
    con = ibis.duckdb.connect()

    logger.info("Loading edges...")
    edges = con.read_parquet(f"{dir}/*/edges/**/*.parquet", table_name="edges")

    logger.info("Loading vertices")
    try:
        vertices = con.read_parquet(f"{dir}/*/vertices.parquet", table_name="vertices")
    except Exception as e:
        # best effort: a network without a vertices file is still usable
        logger.info(f"No vertices found: {e}")
        vertices = None

    logger.info("Loading relationtypes")
    try:
        # the file is a CSV, so it has to be read with the CSV reader
        relationtypes = con.read_csv(
            f"{dir}/*/relationtypes.csv", table_name="relationtypes"
        )
    except Exception as e:
        # best effort: relationtypes are optional metadata
        logger.info(f"No relationtypes found: {e}")
        relationtypes = None

    return MultiplexSeries(edges=edges, vertices=vertices, relationtypes=relationtypes)
53
+
54
+
55
def save_network(
    edges: ibis.Table,
    vertices: ibis.Table,
    dir: Path | str,
    existing_data_behavior="delete_matching",
    **kwargs,
) -> Tuple[ibis.Table, ibis.Table]:
    """
    Write edges and vertices to disk in the muxpack directory layout.

    For every period found in ``edges`` a ``<period>/`` directory is created
    holding ``vertices.parquet`` and one ``edges/<layer>/`` Parquet directory
    per layer. Missing directories are created. Edges and vertices are not
    validated for consistency.

    Args:
    - edges: edge table to save.
    - vertices: vertex table to save.
    - dir: root path where the network will be saved.
    - existing_data_behavior: passed through to ``to_parquet_dir``.
    - **kwargs: additional keyword arguments forwarded to ``to_parquet_dir``.

    Returns:
    - Tuple of ``(edges, vertices)`` table objects pointing to the saved files.
    """
    root = Path(dir)
    logger.info(f"Saving network to {root}...")

    # Partitioning is done by hand rather than through DuckDB's Hive
    # partitioning, because Hive directory names do not follow the muxpack
    # layout and Hive partitioning drops the partition columns from the files.
    period_values = edges[["period"]].distinct().to_pandas().period

    for period in period_values:
        period_root = root / f"{period}"
        period_root.mkdir(parents=True, exist_ok=True)

        # vertices of this period
        vertices.filter(vertices.period == period).to_parquet(
            period_root / "vertices.parquet"
        )

        # edges of this period, one directory per layer
        edge_root = period_root / "edges"
        edge_root.mkdir(parents=True, exist_ok=True)
        period_edges = edges.filter(edges.period == period)
        layers = period_edges[["layer"]].distinct().to_pandas().layer
        logger.info(f"layers: {layers}")
        for layer in layers:
            # TODO further partition?
            target = edge_root / f"{layer}"
            target.mkdir(parents=True, exist_ok=True)
            ordered = period_edges.filter(period_edges.layer == layer).order_by(
                ["src", "relationtype", "dst"]
            )
            ordered.to_parquet_dir(
                target, existing_data_behavior=existing_data_behavior, **kwargs
            )
            logger.info(f"\t\tSaved layer {layer}")
        logger.info(f"\tFinished saving period {period}")
    logger.info(f"Finished saving network to {root}.")

    # Hand back tables that read from the files that were just written.
    con = ibis.duckdb.connect()
    saved_edges = con.read_parquet(
        f"{root}/*/edges/**/*.parquet", table_name="edges"
    )
    saved_vertices = con.read_parquet(
        f"{root}/*/vertices.parquet", table_name="vertices"
    )
    return saved_edges, saved_vertices
123
+
124
+
125
def save_multiplex(
    edges: ibis.Table,
    vertices: ibis.Table,
    dir: Path | str,
    period: int | None = None,
    existing_data_behavior="delete_matching",
    **kwargs,
) -> Tuple[ibis.Table, ibis.Table]:
    """
    Save a single-period multiplex to disk following the muxpack directory structure.
    The directory and all sub-directories are created if they do not exist.
    Edges and vertices are not validated for consistency.

    Args:
    - edges: edge table to save.
    - vertices: vertex table to save.
    - dir: root path where the multiplex will be saved.
    - period: the period for this multiplex, or ``None`` to skip period filtering.
      Filtering is also skipped for a table that has no ``period`` column.
    - existing_data_behavior: passed through to ``to_parquet_dir``.
    - **kwargs: additional keyword arguments forwarded to ``to_parquet_dir``.

    Returns:
    - Tuple of ``(edges, vertices)`` table objects pointing to the saved files.
    """
    E = edges
    V = vertices
    dir = Path(dir)

    logger.info(f"Saving multiplex to {dir}...")

    # We do a manual partitioning to have maximum control.
    # An alternative and potentially more efficient approach would be
    # partitioning using duckdb, however, that would pose some problems:
    # - Hive naming convention does not follow the muxpack specification
    # - Hive partitioning removes columns that are partitioned.
    os.makedirs(dir, exist_ok=True)

    # A multiplex may legitimately carry no period (e.g. a collapsed series)
    # and its tables may lack a ``period`` column, so only filter when both
    # the value and the column are present.
    if period is not None:
        if "period" in V.columns:
            V = V.filter(V.period == period)
        if "period" in E.columns:
            E = E.filter(E.period == period)

    # writing vertices
    vertices_file = dir / "vertices.parquet"
    V.to_parquet(vertices_file)

    # writing edges, one directory per layer
    edges_dir = dir / "edges"
    os.makedirs(edges_dir, exist_ok=True)
    layers = E[["layer"]].distinct().to_pandas().layer
    logger.info(f"layers: {layers}")
    for layer in layers:
        layer_dir = edges_dir / f"{layer}"
        # TODO further partition?
        os.makedirs(layer_dir, exist_ok=True)
        E_layer = E.filter(E.layer == layer).order_by(["src", "relationtype", "dst"])
        E_layer.to_parquet_dir(
            layer_dir, existing_data_behavior=existing_data_behavior, **kwargs
        )
        logger.info(f"\t\tSaved layer {layer}")
    logger.info("\tFinished saving")

    # Return tables that read from the files that were just written.
    con = ibis.duckdb.connect()
    edges = con.read_parquet(f"{dir}/edges/**/*.parquet", table_name="edges")
    vertices = con.read_parquet(f"{dir}/vertices.parquet", table_name="vertices")
    return edges, vertices
193
+
194
+
195
def save_multiplexseries(
    edges: ibis.Table, vertices: ibis.Table, dir: Path | str
) -> None:
    """
    Save a multiplex series to disk by writing each period as a separate sub-directory.

    The periods are taken from the ``edges`` table; each one is written to
    ``dir/<period>`` with ``save_multiplex``.

    Args:
    - edges: edge table with a ``period`` column.
    - vertices: vertex table with a ``period`` column.
    - dir: root path where the multiplex series will be saved.
    """
    dir = Path(dir)
    periods = (
        edges.select("period").distinct().to_pyarrow().column("period").to_pylist()
    )
    for period in periods:
        E = edges.filter(edges.period == period)
        V = vertices.filter(vertices.period == period)
        # periods are integers: they must be converted before being used as a
        # path component, and save_multiplex needs the period passed explicitly
        save_multiplex(edges=E, vertices=V, dir=dir / str(period), period=period)
214
+
215
+
216
def save_bipartite(
    edges: ibis.Table, role_src: str, role_dst: str, relationtype: str, dir: Path | str
) -> None:
    """
    Write a bipartite graph to ``dir`` as ``edges.parquet`` plus ``metadata.json``.

    The JSON file records the three column names (``role_src``, ``role_dst``,
    ``relationtype``) so the graph can be reconstructed when loading.

    Args:
    - edges: edge table to save.
    - role_src: column name used for the source role.
    - role_dst: column name used for the destination role.
    - relationtype: column name used for the relation type.
    - dir: path to the directory where the files will be saved.
    """
    target = Path(dir)
    os.makedirs(target, exist_ok=True)
    edges.to_parquet(target / "edges.parquet")

    metadata = dict(role_src=role_src, role_dst=role_dst, relationtype=relationtype)
    with open(target / "metadata.json", "w") as handle:
        import json

        json.dump(metadata, handle)
241
+
242
+
243
def read_bipartite(dir: Path | str) -> Bipartite:
    """
    Load a bipartite graph from disk.

    Args:
    - dir: path to the directory containing ``edges.parquet`` and ``metadata.json``.

    Returns:
    - Bipartite object with edges and metadata loaded from disk.

    Raises:
    - KeyError: if ``metadata.json`` lacks one of ``role_src``, ``role_dst``
      or ``relationtype``.
    """
    import json

    dir = Path(dir)
    edges = ibis.read_parquet(dir / "edges.parquet")
    with open(dir / "metadata.json", "r") as f:
        metadata = json.load(f)

    # The class imported by this module is ``Bipartite``; the previous
    # spelling ``BiPartite`` is not defined and raised NameError on every call.
    return Bipartite(
        edges=edges,
        role_src=metadata["role_src"],
        role_dst=metadata["role_dst"],
        relationtype=metadata["relationtype"],
    )
265
+
266
+
267
if __name__ == "__main__":
    # Manual smoke test: read the network stored under ./data and write a
    # copy of it to ./data2.
    logging.basicConfig(level=logging.INFO)
    network = load_network("data")
    save_network(edges=network.edges, vertices=network.vertices, dir="data2")
@@ -0,0 +1,141 @@
1
+ import ibis
2
+
3
+ from .check import check_edges, check_vertices
4
+ from pathlib import Path
5
+ from . import io
6
+ import logging
7
+ from scipy.sparse import csr_matrix
8
+ import networkx as nx
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Multiplex:
    """
    A multiplex is a graph with multiple layers.
    Each layer represents a different type of relationship between the same set of
    vertices, during one period. For example, in a social network, one layer could
    represent friendships, while another layer could represent professional
    connections.
    For multiple periods, use MultiplexSeries.
    """

    #: The edges of the multiplex. This is a table with columns "src", "dst", "layer" and "relationtype".
    edges: ibis.Table

    #: The vertices of the multiplex. This is a table with a column "id" and optional
    #: additional columns. ``None`` until provided or derived with ``update_vertices``.
    vertices: ibis.Table | None

    #: The period this multiplex belongs to, or ``None`` if not applicable.
    period: int | None

    def __init__(
        self, edges: ibis.Table, vertices: ibis.Table = None, period: int | None = None
    ) -> None:
        """
        Initialize a multiplex with the given edges and vertices tables.

        Args:
        - edges: table with columns ``src``, ``dst``, ``layer``, and ``relationtype``.
        - vertices: table with column ``id`` and optional additional columns.
        - period: the period this multiplex belongs to, or ``None`` if not applicable.

        Raises:
        - ValueError: if the edges table does not satisfy the required schema.
        - ValueError: if the vertices table does not satisfy the required schema.
        """
        if not check_edges(edges, check_period=False):
            raise ValueError("Invalid edges table")

        if vertices is not None and not check_vertices(vertices, check_period=False):
            raise ValueError("Invalid vertices table")

        self.period = period
        self.edges = edges
        # vertices are derived from the edges on demand, see _require_vertices
        self.vertices = vertices

    def _require_vertices(self) -> ibis.Table:
        """Return the vertices table, deriving it from the edges when it is missing."""
        if self.vertices is None:
            self.update_vertices()
        return self.vertices

    def layers(self) -> list[str]:
        """
        Get the list of layers present in the multiplex.

        Returns:
        - List of layer names.
        """
        return self.edges[["layer"]].distinct().to_pandas().layer.tolist()

    def update_vertices(self) -> None:
        """
        Update the vertices table by deriving it from the edges table.
        This is useful when the vertices table was not provided at initialization.
        ``self.vertices`` is updated in place and holds every id that occurs as
        ``src`` or ``dst`` of an edge.
        """
        src = self.edges.select(id="src").distinct()
        dst = self.edges.select(id="dst").distinct()

        # materialise the result so later queries do not recompute the union
        V = src.union(dst, distinct=True).to_pyarrow()
        self.vertices = ibis.memtable(V)

    def to_csr_matrix(self) -> csr_matrix[bool]:
        """
        Transform the multiplex into a sparse matrix, collapsing all layers into one.
        To keep layers separate, use ``to_csr_matrices`` instead.
        If no vertices table is present it is derived from the edges first.

        Returns:
        - Sparse boolean matrix of shape ``(n_vertices, n_vertices)``.
        """
        # imported lazily: to_csr_matrix imports this module at import time
        from .to_csr_matrix import to_row_col_idx, idx_to_csr_matrix

        vertices = self._require_vertices()
        idx = to_row_col_idx(self.edges, vertices)
        return idx_to_csr_matrix(idx, vertices)

    def to_csr_matrices(self) -> dict[str, csr_matrix]:
        """
        Transform the multiplex into a dictionary of sparse matrices, one per layer.
        If no vertices table is present it is derived from the edges first.

        Returns:
        - Dictionary mapping layer name to a sparse boolean matrix of shape
          ``(n_vertices, n_vertices)``.
        """
        # imported lazily: to_csr_matrix imports this module at import time
        from .to_csr_matrix import to_row_col_idx, idx_to_csr_matrix

        vertices = self._require_vertices()
        matrices = {}
        for layer in self.layers():
            idx = to_row_col_idx(self.edges.filter(self.edges.layer == layer), vertices)
            matrices[layer] = idx_to_csr_matrix(idx, vertices)
        return matrices

    def to_networkx(self) -> nx.MultiDiGraph:
        """
        Convert the multiplex to a NetworkX MultiDiGraph.
        If no vertices table is present it is derived from the edges first.

        Returns:
        - NetworkX MultiDiGraph built from the CSR matrix representation of the edges.
        """
        from .networkx import to_MultiDiGraph

        return to_MultiDiGraph(self.edges, self._require_vertices())

    def save(self, dir: Path | str, **kw_args) -> None:
        """
        Save the multiplex to disk.
        The directory is created if it does not exist; existing files are overwritten.
        Saving also evaluates the lazy ``edges`` and ``vertices`` expressions and
        updates them to point at the saved files, which can improve subsequent performance.

        Args:
        - dir: path to the directory where the Multiplex will be saved.
        - **kw_args: additional keyword arguments forwarded to ``io.save_multiplex``.
        """
        vertices = self._require_vertices()
        # ``dir`` and ``period`` are passed by keyword: passing ``period``
        # positionally would land in the ``dir`` slot of save_multiplex.
        edges, vertices = io.save_multiplex(
            self.edges, vertices, dir=dir, period=self.period, **kw_args
        )
        self.edges = edges
        self.vertices = vertices
@@ -0,0 +1,273 @@
1
+ import ibis
2
+
3
+ from .check import check_edges, check_vertices
4
+ from pathlib import Path
5
+ from . import io
6
+ from .multiplex import Multiplex
7
+ import logging
8
+ from typing import Tuple
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class MultiplexSeries:
    """
    A multiplexseries is a series of Multiplex graphs with multiple layers, spanning multiple periods.
    """

    #: The edges of the multiplex. This is a table with columns "src", "dst", "period", "layer" and "relationtype".
    edges: ibis.Table

    #: The vertices of the multiplex. This is a table with a column "id","period" and optional additional columns.
    vertices: ibis.Table | None

    #: The distinct vertex ids over all periods (single column "id"), or ``None``
    #: while no vertices table is available.
    vertex_ids: ibis.Table | None

    #: Optional table describing the relation types ("relationtype", "layer", "label").
    relationtypes: ibis.Table | None

    def __init__(
        self,
        edges: ibis.Table,
        vertices: ibis.Table = None,
        relationtypes: ibis.Table = None,
    ) -> None:
        """
        Initialize a multiplex series with the given edges and vertices tables.

        Args:
        - edges: table with columns ``src``, ``dst``, ``period``, ``layer``, and ``relationtype``.
        - vertices: table with column ``id``, ``period``, and optional additional columns.
          Must have a ``period`` column because the edges table has one.
        - relationtypes: table with columns ``relationtype``, ``layer``, ``label``,
          and optional additional columns.

        Raises:
        - ValueError: if the edges table does not satisfy the required schema.
        - ValueError: if the vertices table does not satisfy the required schema.
        """
        if not check_edges(edges):
            raise ValueError("Invalid edges table")

        if vertices is not None and not check_vertices(vertices):
            raise ValueError("Invalid vertices table")

        self.edges = edges
        # TODO derive vertices from edges if not provided
        self.vertices = vertices
        self.relationtypes = relationtypes

        # Always define vertex_ids so reading it never raises AttributeError.
        if vertices is not None:
            logger.info("Vertices table provided, using it as is.")
            self.vertex_ids = vertices[["id"]].distinct()
        else:
            self.vertex_ids = None

    def periods(self) -> list[int]:
        """
        Get the list of periods present in the multiplex series.

        Returns:
        - Sorted list of period values.
        """
        return (
            self.edges.select(self.edges.period)
            .distinct()
            .order_by("period")
            .to_pyarrow()
            .column("period")
            .to_pylist()
        )

    def layers(self) -> list[str]:
        """
        Get the list of layers present in the multiplex series.

        Returns:
        - Sorted list of layer names.
        """
        return (
            self.edges.select(self.edges.layer)
            .distinct()
            .order_by("layer")
            .to_pyarrow()
            .column("layer")
            .to_pylist()
        )

    def update_vertices(self) -> None:
        """
        Update the vertices table by deriving it from the edges table.
        This is useful when the vertices table was not provided at initialization.
        Both ``self.vertices`` (distinct ``id``/``period`` pairs) and
        ``self.vertex_ids`` (distinct ``id`` values) are updated in place.
        """
        src = self.edges.select(id="src", period="period").distinct()
        dst = self.edges.select(id="dst", period="period").distinct()

        V = src.union(dst, distinct=True)
        # V is distinct on (id, period); an id occurring in several periods
        # must still appear only once in vertex_ids, as in __init__.
        V_all = V.select(V.id).distinct()
        self.vertices = ibis.memtable(V.to_pyarrow())
        self.vertex_ids = ibis.memtable(V_all.to_pyarrow())

    def update_relationtypes(self) -> None:
        """
        Update the relationtypes table by deriving it from the edges table.
        This is useful when the relationtypes table was not provided at initialization.
        A ``label`` column is constructed as ``"<layer>_<relationtype>"``.
        ``self.relationtypes`` is updated in place.
        """
        relationtypes = (
            self.edges.select(self.edges.relationtype, self.edges.layer)
            .distinct()
            .order_by("layer", "relationtype")
            .to_pandas()
            .assign(
                label=lambda df: (
                    df["layer"].astype(str) + "_" + df["relationtype"].astype(str)
                )
            )
        )
        logger.debug(
            f"Updated relationtypes table with {len(relationtypes)} unique relationtypes."
        )
        self.relationtypes = ibis.memtable(relationtypes)

    def get_multiplex(self, period: int) -> Multiplex:
        """
        Return the multiplex for a specific period.

        Args:
        - period: the period to retrieve.

        Returns:
        - Multiplex object containing only the edges and vertices for the given period.
        """
        E_y = self.edges.filter(self.edges.period == period)
        if self.vertices is not None:
            V_y = self.vertices.filter(self.vertices.period == period)
        else:
            V_y = None
        return Multiplex(edges=E_y, vertices=V_y, period=period)

    def multiplexes(self) -> list[Tuple[int, Multiplex]]:
        """
        Return all multiplexes in the series, one per period.

        Returns:
        - List of ``(period, Multiplex)`` tuples, ordered by period.
        """
        return [(period, self.get_multiplex(period)) for period in self.periods()]

    def add_filter(
        self,
        periods: list[int] = None,
        layers: list[str] = None,
        relationtypes: list[int] = None,
        src: list[int] = None,
        dst: list[int] = None,
    ) -> None:
        """
        Apply a filter to the multiplex series in place.
        Filtering is lazy: the filter is only executed when saving or converting
        to another format. Passing ``None`` or an empty list for any argument
        means no filtering is applied for that dimension. All given dimensions
        must match for an edge to be kept.

        For advanced filtering, modify the ``edges`` property directly using
        ibis expressions.

        Args:
        - periods: list of periods to keep.
        - layers: list of layer names to keep.
        - relationtypes: list of relationtype values to keep.
        - src: list of source vertex ids (ego) to keep.
        - dst: list of destination vertex ids (non-ego) to keep.
        """
        E = self.edges

        flt: list[ibis.BooleanValue] = []

        if periods is not None and len(periods) > 0:
            flt.append(E.period.isin(periods))

        if layers is not None and len(layers) > 0:
            flt.append(E.layer.isin(layers))

        if relationtypes is not None and len(relationtypes) > 0:
            flt.append(E.relationtype.isin(relationtypes))

        logger.debug(f"Filter: {flt}")
        # The column predicates are applied first, while they still refer to
        # the table they were built from; the semi joins below then operate on
        # the already reduced table.
        if len(flt):
            E = E.filter(flt)

        if src is not None and len(src) > 0:
            vid = ibis.memtable({"id": src})
            # we use semi join because we expect the vertex list to be large
            E = E.semi_join(vid, E.src == vid.id)

        if dst is not None and len(dst) > 0:
            vid = ibis.memtable({"id": dst})
            # we use semi join because we expect the vertex list to be large
            E = E.semi_join(vid, E.dst == vid.id)

        self.edges = E

    def __copy__(self) -> "MultiplexSeries":
        """
        Return a shallow copy of this MultiplexSeries.

        Returns:
        - A new MultiplexSeries sharing the same ``edges``, ``vertices`` and
          ``relationtypes`` tables.
        """
        return MultiplexSeries(self.edges, self.vertices, self.relationtypes)

    def collapse(self) -> Multiplex:
        """
        Collapse the multiplex series into a single Multiplex by discarding period
        information. Duplicate edges across periods are removed. This is useful
        for analyses that do not require temporal information.

        Returns:
        - Multiplex containing all distinct edges across all periods, with ``period=None``.
        """
        E = self.edges.select(["src", "dst", "layer", "relationtype"]).distinct()
        if self.vertices is not None:
            V = self.vertices.select("id").distinct()
        else:
            V = None
        return Multiplex(edges=E, vertices=V, period=None)

    def collapse_to(self, dir: Path | str) -> None:
        """
        Collapse the multiplex series and save the result to disk.
        This is a convenience method equivalent to calling ``collapse()`` followed
        by ``Multiplex.save()``.

        Args:
        - dir: path to the directory where the collapsed Multiplex will be saved.
        """
        m = self.collapse()
        return m.save(dir=dir)

    def save(self, dir: Path | str, **kw_args) -> None:
        """
        Save the multiplex series to disk.
        The directory is created if it does not exist; existing files are overwritten.
        Saving also evaluates the lazy ``edges`` and ``vertices`` expressions and
        updates them to point at the saved files, which can improve subsequent
        performance. When no vertices table is present it is derived from the
        edges before saving.

        Args:
        - dir: path to the directory where the MultiplexSeries will be saved.
        - **kw_args: additional keyword arguments forwarded to ``io.save_network``.
        """
        if self.vertices is None:
            self.update_vertices()
        E, V = io.save_network(self.edges, self.vertices, dir=dir, **kw_args)
        self.edges = E
        self.vertices = V
        # keep the id table in step with the vertices that were just saved
        self.vertex_ids = V[["id"]].distinct()
@@ -0,0 +1,20 @@
1
+ import networkx as nx
2
+ import ibis
3
+ from .to_csr_matrix import to_csr_matrix
4
+
5
+
6
def to_MultiDiGraph(edges: ibis.Table, vertices: ibis.Table) -> nx.MultiDiGraph:
    """
    Build a NetworkX MultiDiGraph from an edge list and a vertex table.

    The edges are first turned into a sparse matrix with ``to_csr_matrix``;
    that matrix is then handed to the ``nx.MultiDiGraph`` constructor.

    Args:
    - edges: table with ``src`` and ``dst`` columns.
    - vertices: table with an ``id`` column.

    Returns:
    - NetworkX MultiDiGraph built from the CSR matrix representation of the edges.
    """
    # NOTE(review): the original comment here was an unfinished
    # "problem: this generates". Presumably the resulting graph labels its
    # nodes by matrix index rather than by vertex ``id`` -- TODO confirm.
    adjacency = to_csr_matrix(edges, vertices)
    return nx.MultiDiGraph(adjacency)
File without changes
@@ -0,0 +1,132 @@
1
+ from ibis import row_number, Table
2
+ import ibis
3
+ from scipy.sparse import csr_matrix
4
+ from muxpack.multiplex import Multiplex
5
+ from typing import Tuple, Generator
6
+
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+ # from collections.abc import Generator
11
+
12
+
13
def to_row_col_idx(edges: Table, vertices: Table) -> Table:
    """
    Turn an edge list into a row/column index table based on the given vertices table.

    Args:
    - edges: table with ``src`` and ``dst`` columns.
    - vertices: table with an ``id`` column; edges not referencing a vertex in this
      table are filtered out (both endpoints are inner-joined against it).

    Returns:
    - Table with columns ``data``, ``row``, and ``col`` containing the boolean edge
      indicator and the row/column indices assigned to the ``src`` and ``dst``
      vertices. Can be passed directly to ``idx_to_csr_matrix``.
    """
    # Number the vertices once; the same numbering serves as row index (src)
    # and column index (dst).
    # NOTE(review): ``row_number()`` is used without an explicit ordering, so
    # which index a given ``id`` receives presumably depends on the backend's
    # row order -- TODO confirm this is acceptable.
    v = vertices.select("id").mutate(idx=row_number())
    row = v.select(src="id", row="idx")
    col = v.select(dst="id", col="idx")

    # Duplicate (src, dst) pairs collapse into a single ``True`` entry.
    # NOTE(review): the original comment ("may sum the number of columns")
    # presumably proposed counting duplicates as a weight instead -- TODO confirm.
    idx_edges = (
        edges[["src", "dst"]]
        .distinct()
        .inner_join(row, "src")
        .inner_join(col, "dst")
        .mutate(data=True)
        .select("data", "row", "col")
    )
    # Counting executes a query against the backend, so only pay for it when
    # the debug message will actually be emitted.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "Created row-col index table with %s edges.",
            idx_edges.count().execute(),
        )
    return idx_edges
44
+
45
+
46
def idx_to_csr_matrix(idx: Table, vertices: Table) -> csr_matrix:
    """
    Convert a row-column index table to a CSR sparse matrix.

    Args:
    - idx: table with columns ``data``, ``row``, and ``col``, as produced by
      ``to_row_col_idx``.
    - vertices: table with an ``id`` column; its row count determines the matrix size.

    Returns:
    - Square CSR sparse matrix of shape ``(n_vertices, n_vertices)``.
    """
    # TODO maybe to_parquet()?
    # Materialise the index table; its columns are used as COO triplets below.
    coo = idx.execute()
    # Lazy %-style arguments: the (potentially large) table is only rendered
    # to text when debug logging is enabled.
    logger.debug("COO matrix data: %s", coo)

    n = vertices.count().execute()
    logger.debug("Number of vertices: %s", n)
    # The explicit shape keeps the matrix square even when some vertices have
    # no edges at all.
    return csr_matrix((coo["data"], (coo["row"], coo["col"])), shape=(n, n))
66
+
67
+
68
def to_csr_matrix(edges: Table, vertices: Table | None) -> csr_matrix:
    """
    Transform an edge list into a sparse matrix (csr_matrix).

    Args:
    - edges: table with ``src`` and ``dst`` columns.
    - vertices: table with an ``id`` column; edges are filtered to vertices present
      in this table. Pass ``None`` to derive the vertices from the edges table, in
      which case every distinct ``src`` or ``dst`` value becomes a vertex.

    Returns:
    - Square CSR sparse matrix of shape ``(n_vertices, n_vertices)``.
    """
    if vertices is None:
        # Previously ``None`` was passed straight on and failed on
        # ``vertices.select``; build the vertex set from both edge endpoints.
        vertices = edges.select(id="src").union(
            edges.select(id="dst"), distinct=True
        )
    else:
        # vertices may contain multiple periods, so the same id can repeat
        vertices = vertices[["id"]].distinct()
    edges_row_col = to_row_col_idx(edges, vertices=vertices)
    return idx_to_csr_matrix(edges_row_col, vertices=vertices)
86
+
87
+
88
def to_period_csr_matrix(
    edges: Table, vertices: Table | None, periods: list[int] | None = None
) -> Generator[Tuple[csr_matrix, int]]:
    """
    Generate a sparse matrix for each period.

    Args:
    - edges: table with columns ``src``, ``dst``, and ``period``.
    - vertices: table with columns ``id`` and ``period``, or ``None``; ``None`` is
      passed through unchanged to ``to_csr_matrix`` for every period.
    - periods: list of periods to generate matrices for. If ``None`` or empty, all
      distinct periods present in ``edges`` are used.

    Returns:
    - Generator of ``(csr_matrix, period)`` tuples, one per period.
    """
    # ``None`` replaces the former mutable ``[]`` default; an explicitly passed
    # empty list still means "all periods".
    if periods is None or len(periods) == 0:
        periods = edges[["period"]].distinct().to_pandas().period.tolist()
    for period in periods:
        E_y = edges.filter(edges.period == period)
        V_y = None if vertices is None else vertices.filter(vertices.period == period)
        yield to_csr_matrix(E_y, V_y), period
114
+
115
+
116
if __name__ == "__main__":
    # Manual smoke test: build a tiny in-memory network and print its matrices.
    logging.basicConfig(level=logging.DEBUG)
    import pandas as pd

    E = ibis.memtable(pd.DataFrame({"src": [100, 100], "dst": [300, 200]}))
    V = ibis.memtable(pd.DataFrame({"id": [100, 200, 300]}))

    # Keep only vertices 100 and 200; the 100 -> 300 edge then has no matching
    # ``dst`` vertex and is dropped by the inner join in to_row_col_idx.
    V1 = V.filter(V.id < 250)
    M1 = idx_to_csr_matrix(to_row_col_idx(E, V1), V1)
    print(f"M1 = {M1}")

    # Full vertex set: both edges are kept.
    print(to_csr_matrix(E, V))