muxpack 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- muxpack-0.1.0/PKG-INFO +36 -0
- muxpack-0.1.0/README.md +16 -0
- muxpack-0.1.0/pyproject.toml +45 -0
- muxpack-0.1.0/src/muxpack/__init__.py +17 -0
- muxpack-0.1.0/src/muxpack/bipartite.py +99 -0
- muxpack-0.1.0/src/muxpack/check.py +97 -0
- muxpack-0.1.0/src/muxpack/io.py +271 -0
- muxpack-0.1.0/src/muxpack/multiplex.py +141 -0
- muxpack-0.1.0/src/muxpack/multiplexseries.py +273 -0
- muxpack-0.1.0/src/muxpack/networkx.py +20 -0
- muxpack-0.1.0/src/muxpack/py.typed +0 -0
- muxpack-0.1.0/src/muxpack/to_csr_matrix.py +132 -0
muxpack-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: muxpack
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Tools to handle multiplex network data more easily
|
|
5
|
+
Author: Edwin de Jonge, Jan van der Laan
|
|
6
|
+
Author-email: Edwin de Jonge <edwindjonge@gmail.com>, Jan van der Laan <djvanderlaan@gmail.com>
|
|
7
|
+
Requires-Dist: duckdb>=1.4.4
|
|
8
|
+
Requires-Dist: ibis-framework[duckdb]>=12.0.0
|
|
9
|
+
Requires-Dist: networkx>=3.6.1
|
|
10
|
+
Requires-Dist: pandas>=3.0.1
|
|
11
|
+
Requires-Dist: pyarrow>=23.0.1
|
|
12
|
+
Requires-Dist: pyarrow-hotfix>=0.7
|
|
13
|
+
Requires-Dist: scipy>=1.17.1
|
|
14
|
+
Requires-Python: >=3.13
|
|
15
|
+
Project-URL: Homepage, https://codeberg.org/CBS-Networktools/muxpack.py
|
|
16
|
+
Project-URL: Documentation, https://readthedocs.org
|
|
17
|
+
Project-URL: Repository, https://codeberg.org/CBS-Networktools/muxpack.py
|
|
18
|
+
Project-URL: Bug Tracker, https://codeberg.org/CBS-Networktools/muxpack.py/issues
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
**Under heavy construction, do not use for serious work!**
|
|
22
|
+
|
|
23
|
+
## Muxpack
|
|
24
|
+
|
|
25
|
+
Muxpack is a Python implementation for working with multiplex network files.
|
|
26
|
+
|
|
27
|
+
## Documentation
|
|
28
|
+
|
|
29
|
+
Build docs locally using the same dependency path as Read the Docs:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
uv sync --group docs
|
|
33
|
+
uv run sphinx-build -b html docs docs/_build/html
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The generated HTML is available in `docs/_build/html/index.html`.
|
muxpack-0.1.0/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
**Under heavy construction, do not use for serious work!**
|
|
2
|
+
|
|
3
|
+
## Muxpack
|
|
4
|
+
|
|
5
|
+
Muxpack is a Python implementation for working with multiplex network files.
|
|
6
|
+
|
|
7
|
+
## Documentation
|
|
8
|
+
|
|
9
|
+
Build docs locally using the same dependency path as Read the Docs:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv sync --group docs
|
|
13
|
+
uv run sphinx-build -b html docs docs/_build/html
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
The generated HTML is available in `docs/_build/html/index.html`.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "muxpack"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Tools to handle multiplex network data more easily"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Edwin de Jonge", email = "edwindjonge@gmail.com" },
|
|
8
|
+
{ name = "Jan van der Laan", email = "djvanderlaan@gmail.com" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.13"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"duckdb>=1.4.4",
|
|
13
|
+
"ibis-framework[duckdb]>=12.0.0",
|
|
14
|
+
"networkx>=3.6.1",
|
|
15
|
+
"pandas>=3.0.1",
|
|
16
|
+
"pyarrow>=23.0.1",
|
|
17
|
+
"pyarrow-hotfix>=0.7",
|
|
18
|
+
"scipy>=1.17.1",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Homepage = "https://codeberg.org/CBS-Networktools/muxpack.py"
|
|
23
|
+
Documentation = "https://readthedocs.org"
|
|
24
|
+
Repository = "https://codeberg.org/CBS-Networktools/muxpack.py"
|
|
25
|
+
"Bug Tracker" = "https://codeberg.org/CBS-Networktools/muxpack.py/issues"
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
muxpack = "muxpack:main"
|
|
29
|
+
|
|
30
|
+
[build-system]
|
|
31
|
+
requires = ["uv_build>=0.10.6,<0.11.0"]
|
|
32
|
+
build-backend = "uv_build"
|
|
33
|
+
|
|
34
|
+
[dependency-groups]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=9.0.2",
|
|
37
|
+
"ruff>=0.15.4",
|
|
38
|
+
]
|
|
39
|
+
docs = [
|
|
40
|
+
"sphinx>=9.1.0",
|
|
41
|
+
"sphinx-rtd-theme>=3.0.0",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[tool.ruff.lint]
|
|
45
|
+
extend-select = ["B"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .check import check_edges, check_vertices
|
|
2
|
+
from .io import load_network, save_network
|
|
3
|
+
from .multiplexseries import MultiplexSeries
|
|
4
|
+
from .multiplex import Multiplex
|
|
5
|
+
from .to_csr_matrix import to_csr_matrix
|
|
6
|
+
from .bipartite import Bipartite
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"check_edges",
|
|
10
|
+
"check_vertices",
|
|
11
|
+
"load_network",
|
|
12
|
+
"Multiplex",
|
|
13
|
+
"MultiplexSeries",
|
|
14
|
+
"save_network",
|
|
15
|
+
"to_csr_matrix",
|
|
16
|
+
"Bipartite",
|
|
17
|
+
]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from ibis import Table
|
|
3
|
+
from . import io
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Bipartite:
    """
    Lazily evaluated bipartite graph stored as an ibis edge table.

    Every row of ``edges`` connects a node in the ``role_src`` column with a
    node in the ``role_dst`` column; the ``relationtype`` column labels the link.

    - sort on role_src, role_dst
    """

    edges: Table
    role_src: str
    role_dst: str
    relationtype: str

    def __init__(
        self,
        edges: Table,
        role_src: str = "src",
        role_dst: str = "dst",
        relationtype: str = "relationtype",
    ):
        """
        Store the edge table together with the column names describing it.

        Args:
            - edges: table containing the bipartite edges.
            - role_src: column name for the source role.
            - role_dst: column name for the destination role.
            - relationtype: column name for the relation type.
        """
        self.edges, self.role_src = edges, role_src
        self.role_dst, self.relationtype = role_dst, relationtype

    def _project(self, node_role: str, shared_role: str) -> Table:
        """
        Build the unipartite projection onto ``node_role``.

        The edge table is joined with itself on ``shared_role``; every pair of
        different ``node_role`` values that meet in the same ``shared_role``
        value becomes one ``src``/``dst`` row.
        """
        table = self.edges
        # TODO this is an explicit choice: relationtypes could be more complex, e.g. when
        # two different nodes have another relation with the same shared node.
        # Simplification for now: the relationtype of the left-hand row is kept.
        left = table.select(
            src=node_role, p=shared_role, relationtype=self.relationtype
        )
        right = table.select(dst=node_role, p=shared_role)

        paired = left.inner_join(right, left.p == right.p)
        # A node sharing a neighbour with itself is not an edge of the projection.
        without_loops = paired.filter(paired.src != paired.dst)
        return without_loops.select(["src", "dst", "relationtype"])

    def project_to_src(self) -> Table:
        """
        Project the bipartite graph onto the source role.

        Two source nodes are connected when they share a destination node.

        Returns:
            - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        return self._project(self.role_src, self.role_dst)

    def project_to_dst(self) -> Table:
        """
        Project the bipartite graph onto the destination role.

        Two destination nodes are connected when they share a source node.

        Returns:
            - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        # Open question from the authors: storage is sorted on role_src, role_dst, which
        # favours project_to_src; sorting the other way around would favour this projection.
        return self._project(self.role_dst, self.role_src)

    def save(self, dir: Path | str) -> None:
        """
        Write the bipartite graph to ``dir`` and re-point ``edges`` at the saved copy.

        Writing is delegated to ``io.save_bipartite`` (edges plus the
        ``role_src``/``role_dst``/``relationtype`` metadata); afterwards the
        graph is read back with ``io.read_bipartite`` and its edge table replaces
        ``self.edges``.

        Args:
            - dir: path to the directory where the Bipartite graph will be saved.
        """
        io.save_bipartite(
            edges=self.edges,
            role_src=self.role_src,
            role_dst=self.role_dst,
            relationtype=self.relationtype,
            dir=dir,
        )
        self.edges = io.read_bipartite(dir=dir).edges
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from ibis.expr.types import Table
|
|
2
|
+
from ibis import dtype
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def check_edges(edges: Table, check_period=True) -> bool:
    """
    Validate the schema of an edges table.

    Args:
        - edges: the edges table to check.
        - check_period: whether to require a ``period`` column.

    Returns:
        - ``True`` if the edges table is valid, ``False`` otherwise.
    """
    # Integer columns may be of any integer width (int32, int64, ...); ``layer`` must be a string.
    # The insertion order below is the order in which the columns are checked.
    expect_types = {"src": "integer", "dst": "integer"}
    if check_period:
        expect_types["period"] = "integer"
    expect_types["layer"] = "string"
    expect_types["relationtype"] = "integer"

    return check_column_type(edges, expect_types)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def check_vertices(vertices: Table, check_period=True) -> bool:
    """
    Validate the schema of a vertices table.

    Args:
        - vertices: the vertices table to check.
        - check_period: whether to require a ``period`` column.

    Returns:
        - ``True`` if the vertices table is valid, ``False`` otherwise.
    """
    required_columns = {"id", "period"} if check_period else {"id"}

    # Report every absent column in one warning before looking at any types.
    absent = required_columns - set(vertices.columns)
    if absent:
        logger.warning(f"Missing columns: {absent}")
        return False

    if check_period:
        expect_types = {"id": "integer", "period": "integer"}
    else:
        expect_types = {"id": "integer"}
    return check_column_type(vertices, expect_types)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def check_column_type(t: Table, expected_types: dict[str, str]) -> bool:
    """
    Check that the columns in a table have the expected types.

    ``"integer"`` and ``"string"`` are treated as type categories (any integer
    width, any string type). Every other type string is parsed with
    ``ibis.dtype`` and must equal the column type exactly.

    Args:
        - t: the table to check.
        - expected_types: dictionary mapping column names to expected type strings
          (e.g., ``"integer"``, ``"string"``).

    Returns:
        - ``True`` if all specified columns exist and have the expected types,
          ``False`` otherwise. The first problem found is logged as a warning.
    """
    for column, expected_type in expected_types.items():
        # Indexing a table with an unknown column name raises rather than
        # yielding None, so membership has to be tested before the lookup.
        if column not in t.columns:
            logger.warning(f"Column '{column}' is missing.")
            return False

        coltype = t[column].type()
        if expected_type == "integer":
            matches = coltype.is_integer()
        elif expected_type == "string":
            matches = coltype.is_string()
        else:
            # Most specific check: the expected type must be exactly the column type.
            # Only reached for concrete type strings, so a mismatch on one of the
            # categories above never depends on the category name being parseable.
            matches = dtype(expected_type) == coltype

        if not matches:
            logger.warning(
                f"Incorrect type for column '{column}': '{coltype}', expected {expected_type}"
            )
            return False
    return True
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
import ibis
|
|
2
|
+
|
|
3
|
+
from muxpack.bipartite import Bipartite
|
|
4
|
+
from .multiplexseries import MultiplexSeries
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import os
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load_network(dir: Path | str) -> MultiplexSeries:
    """
    Load a multiplex network from a directory containing Parquet files.

    The expected directory structure is::

        dir/
            <period>/
                edges/
                    <layer>/
                        *.parquet
                vertices.parquet
                relationtypes.csv

    Edges are mandatory. Vertices and relation types are optional: when they
    cannot be read, the reason is logged and ``None`` is used instead.

    Args:
        - dir: path to the root directory containing the Parquet files.

    Returns:
        - MultiplexSeries loaded from the directory.
    """
    logger.info(f"Loading data from {dir}...")
    # One connection for every table, so that edges, vertices and relation types
    # live in the same backend (same approach as ``save_network``).
    con = ibis.duckdb.connect()

    logger.info("Loading edges...")
    edges = con.read_parquet(f"{dir}/*/edges/**/*.parquet", table_name="edges")

    logger.info("Loading vertices")
    try:
        vertices = con.read_parquet(
            f"{dir}/*/vertices.parquet", table_name="vertices"
        )
    except Exception as e:
        # Best effort: a network without vertex files is still loadable.
        logger.info(f"No vertices found: {e}")
        vertices = None

    try:
        # The relation types are stored as CSV, so they need the CSV reader.
        relationtypes = con.read_csv(
            f"{dir}/*/relationtypes.csv", table_name="relationtypes"
        )
    except Exception as e:
        # Best effort: relation types are optional metadata.
        logger.info(f"No relationtypes found: {e}")
        relationtypes = None

    return MultiplexSeries(
        edges=edges, vertices=vertices, relationtypes=relationtypes
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def save_network(
    edges: ibis.Table,
    vertices: ibis.Table,
    dir: Path | str,
    existing_data_behavior="delete_matching",
    **kwargs,
) -> Tuple[ibis.Table, ibis.Table]:
    """
    Write edges and vertices to disk in the muxpack directory layout.

    One sub-directory is created per period found in ``edges``; it receives a
    ``vertices.parquet`` file and an ``edges/<layer>/`` directory per layer.
    Missing directories are created. Edges and vertices are not validated for
    consistency.

    Args:
        - edges: edge table to save.
        - vertices: vertex table to save.
        - dir: root path where the network will be saved.
        - existing_data_behavior: forwarded to ``to_parquet_dir`` (documented by the
          authors as being passed through to ``pyarrow.dataset.write_dataset``).
        - **kwargs: additional keyword arguments forwarded the same way.

    Returns:
        - Tuple of ``(edges, vertices)`` table objects pointing to the saved files.
    """
    root = Path(dir)
    logger.info(f"Saving network to {root}...")

    # Partitioning is done by hand for maximum control. Letting duckdb partition
    # might be more efficient, but:
    # - the Hive naming convention does not follow the muxpack specification
    # - Hive partitioning removes the partitioned columns.
    all_periods = edges[["period"]].distinct().to_pandas()["period"]

    for period in all_periods:
        period_root = root / f"{period}"
        period_root.mkdir(parents=True, exist_ok=True)

        # vertices of this period
        vertices.filter(vertices.period == period).to_parquet(
            period_root / "vertices.parquet"
        )

        # edges of this period, one directory per layer
        edges_root = period_root / "edges"
        edges_root.mkdir(parents=True, exist_ok=True)
        period_edges = edges.filter(edges.period == period)
        layers = period_edges[["layer"]].distinct().to_pandas()["layer"]
        logger.info(f"layers: {layers}")
        for layer in layers:
            layer_root = edges_root / f"{layer}"
            # TODO further partition?
            layer_root.mkdir(parents=True, exist_ok=True)
            layer_edges = period_edges.filter(period_edges.layer == layer)
            layer_edges.order_by(["src", "relationtype", "dst"]).to_parquet_dir(
                layer_root, existing_data_behavior=existing_data_behavior, **kwargs
            )
            logger.info(f"\t\tSaved layer {layer}")
        logger.info(f"\tFinished saving period {period}")
    logger.info(f"Finished saving network to {root}.")

    # Hand back tables that read from the files that were just written.
    con = ibis.duckdb.connect()
    saved_edges = con.read_parquet(
        f"{root}/*/edges/**/*.parquet", table_name="edges"
    )
    saved_vertices = con.read_parquet(
        f"{root}/*/vertices.parquet", table_name="vertices"
    )
    return saved_edges, saved_vertices
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _restrict_to_period(table: ibis.Table, period: int | None) -> ibis.Table:
    """Return the rows of ``table`` for ``period``; unchanged if there is nothing to filter on."""
    if period is None or "period" not in table.columns:
        return table
    return table.filter(table.period == period)


def save_multiplex(
    edges: ibis.Table,
    vertices: ibis.Table,
    dir: Path | str,
    period: int | None,
    existing_data_behavior="delete_matching",
    **kwargs,
) -> Tuple[ibis.Table, ibis.Table]:
    """
    Save a single-period multiplex to disk following the muxpack directory structure.

    ``dir`` receives a ``vertices.parquet`` file and one ``edges/<layer>/``
    directory per layer. The directory and all sub-directories are created if
    they do not exist. Edges and vertices are not validated for consistency.

    Args:
        - edges: edge table to save.
        - vertices: vertex table to save.
        - dir: root path where the multiplex will be saved.
        - period: the period for this multiplex, or ``None`` to skip period filtering.
          Filtering is also skipped for a table that has no ``period`` column.
        - existing_data_behavior: forwarded to ``to_parquet_dir`` (documented by the
          authors as being passed through to ``pyarrow.dataset.write_dataset``).
        - **kwargs: additional keyword arguments forwarded the same way.

    Returns:
        - Tuple of ``(edges, vertices)`` table objects pointing to the saved files.
    """
    dir = Path(dir)

    logger.info(f"Saving multiplex to {dir}...")

    # We do a manual partitioning to have maximum control.
    # An alternative and potentially more efficient way would be partitioning using
    # duckdb, however, that would pose some problems:
    # - Hive naming convention does not follow the muxpack specification
    # - Hive partitioning removes columns that are partitioned.
    os.makedirs(dir, exist_ok=True)

    # writing vertices
    # TODO(review): the authors' note suggests a missing ``period`` column should
    # eventually be added to the table; for now such a table is written as is.
    V = _restrict_to_period(vertices, period)
    V.to_parquet(dir / "vertices.parquet")

    # writing edges
    edges_dir = dir / "edges"
    os.makedirs(edges_dir, exist_ok=True)
    # Same rule as for the vertices: only filter when a period was requested and
    # the table can actually be filtered on it.
    E_period = _restrict_to_period(edges, period)
    layers = E_period[["layer"]].distinct().to_pandas().layer
    logger.info(f"layers: {layers}")
    for layer in layers:
        layer_dir = edges_dir / f"{layer}"
        # TODO further partition?
        os.makedirs(layer_dir, exist_ok=True)
        E_period_layer = E_period.filter(E_period.layer == layer).order_by(
            ["src", "relationtype", "dst"]
        )
        E_period_layer.to_parquet_dir(
            layer_dir, existing_data_behavior=existing_data_behavior, **kwargs
        )
        logger.info(f"\t\tSaved layer {layer}")
    logger.info("\tFinished saving")

    # Hand back tables that read from the files that were just written.
    con = ibis.duckdb.connect()
    edges = con.read_parquet(f"{dir}/edges/**/*.parquet", table_name="edges")
    vertices = con.read_parquet(f"{dir}/vertices.parquet", table_name="vertices")
    return edges, vertices
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def save_multiplexseries(
    edges: ibis.Table, vertices: ibis.Table, dir: Path | str
) -> None:
    """
    Save a multiplex series to disk by writing each period as a separate sub-directory.

    The sub-directory of a period is named after the period value, and its
    content is written by ``save_multiplex``.

    Args:
        - edges: edge table with a ``period`` column.
        - vertices: vertex table with a ``period`` column.
        - dir: root path where the multiplex series will be saved.
    """
    dir = Path(dir)
    periods = (
        edges.select("period").distinct().to_pyarrow().column("period").to_pylist()
    )
    for period in periods:
        E = edges.filter(edges.period == period)
        V = vertices.filter(vertices.period == period)
        # Periods are integers: they must be turned into text to form a path
        # component, and ``save_multiplex`` requires the period explicitly.
        save_multiplex(edges=E, vertices=V, dir=dir / f"{period}", period=period)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def save_bipartite(
    edges: ibis.Table, role_src: str, role_dst: str, relationtype: str, dir: Path | str
) -> None:
    """
    Write a bipartite graph to ``dir`` as ``edges.parquet`` plus ``metadata.json``.

    The directory is created when it does not exist yet. The JSON file records
    the three column names so the graph can be reconstructed later.

    Args:
        - edges: edge table to save.
        - role_src: column name used for the source role.
        - role_dst: column name used for the destination role.
        - relationtype: column name used for the relation type.
        - dir: path to the directory where the files will be saved.
    """
    import json

    target = Path(dir)
    target.mkdir(parents=True, exist_ok=True)

    edges.to_parquet(target / "edges.parquet")

    with open(target / "metadata.json", "w") as metadata_file:
        json.dump(
            {
                "role_src": role_src,
                "role_dst": role_dst,
                "relationtype": relationtype,
            },
            metadata_file,
        )
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def read_bipartite(dir: Path | str) -> Bipartite:
    """
    Load a bipartite graph from disk.

    Args:
        - dir: path to the directory containing ``edges.parquet`` and ``metadata.json``.

    Returns:
        - Bipartite object with edges and metadata loaded from disk.

    Raises:
        - KeyError: if ``metadata.json`` lacks ``role_src``, ``role_dst`` or ``relationtype``.
    """
    import json

    dir = Path(dir)
    edges = ibis.read_parquet(dir / "edges.parquet")
    with open(dir / "metadata.json", "r") as f:
        metadata = json.load(f)

    # The class is named ``Bipartite`` (lower-case "p"), matching the import at the
    # top of this module.
    return Bipartite(
        edges=edges,
        role_src=metadata["role_src"],
        role_dst=metadata["role_dst"],
        relationtype=metadata["relationtype"],
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
if __name__ == "__main__":
    # Ad-hoc manual run: load the network stored under ./data and write it
    # back out under ./data2, logging progress at INFO level.
    logging.basicConfig(level=logging.INFO)
    m = load_network("data")

    save_network(edges=m.edges, vertices=m.vertices, dir="data2")
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import ibis
|
|
2
|
+
|
|
3
|
+
from .check import check_edges, check_vertices
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from . import io
|
|
6
|
+
import logging
|
|
7
|
+
from scipy.sparse import csr_matrix
|
|
8
|
+
import networkx as nx
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Multiplex:
    """
    A multiplex is a graph with multiple layers.
    Each layer represents a different type of relationship between the same set of vertices, during one period.
    For example, in a social network, one layer could represent friendships, while
    another layer could represent professional connections.
    For multiple periods, use MultiplexSeries.
    """

    #: The edges of the multiplex. This is a table with columns "src", "dst", "layer" and "relationtype".
    edges: ibis.Table

    #: The vertices of the multiplex. This is a table with a column "id" and optional additional columns.
    #: ``None`` until provided or derived with ``update_vertices``.
    vertices: ibis.Table | None

    #: The period this multiplex belongs to, or ``None`` if not applicable.
    period: int | None

    def __init__(
        self,
        edges: ibis.Table,
        vertices: ibis.Table | None = None,
        period: int | None = None,
    ) -> None:
        """
        Initialize a multiplex with the given edges and vertices tables.

        Args:
            - edges: table with columns ``src``, ``dst``, ``layer``, and ``relationtype``.
            - vertices: table with column ``id`` and optional additional columns.
            - period: the period this multiplex belongs to, or ``None`` if not applicable.

        Raises:
            - ValueError: if the edges table does not satisfy the required schema.
            - ValueError: if the vertices table does not satisfy the required schema.
        """
        # A single multiplex does not need a ``period`` column in its tables.
        if not check_edges(edges, check_period=False):
            raise ValueError("Invalid edges table")

        if vertices is not None and not check_vertices(vertices, check_period=False):
            raise ValueError("Invalid vertices table")

        self.period = period
        self.edges = edges
        # TODO derive vertices from edges if not provided
        self.vertices = vertices

    def layers(self) -> list[str]:
        """
        Get the list of layers present in the multiplex.

        Returns:
            - List of layer names.
        """
        layers = self.edges[["layer"]].distinct().to_pandas().layer.tolist()
        return layers

    def update_vertices(self) -> None:
        """
        Update the vertices table by deriving it from the edges table.
        This is useful when the vertices table was not provided at initialization.
        ``self.vertices`` is updated in place.
        """
        src = self.edges.select(id="src").distinct()
        dst = self.edges.select(id="dst").distinct()

        # The union is materialised and wrapped in an in-memory table.
        V = src.union(dst, distinct=True).to_pyarrow()
        self.vertices = ibis.memtable(V)

    def to_csr_matrix(self) -> csr_matrix[bool]:
        """
        Transform the multiplex into a sparse matrix, collapsing all layers into one.
        To keep layers separate, use ``to_csr_matrices`` instead.

        Returns:
            - Sparse boolean matrix of shape ``(n_vertices, n_vertices)``.
        """
        from .to_csr_matrix import to_row_col_idx, idx_to_csr_matrix

        idx = to_row_col_idx(self.edges, self.vertices)
        M = idx_to_csr_matrix(idx, self.vertices)
        return M

    def to_csr_matrices(self) -> dict[str, csr_matrix]:
        """
        Transform the multiplex into a dictionary of sparse matrices, one per layer.

        Returns:
            - Dictionary mapping layer name to a sparse boolean matrix of shape ``(n_vertices, n_vertices)``.
        """
        from .to_csr_matrix import to_row_col_idx, idx_to_csr_matrix

        layers = self.layers()
        matrices = {}
        for layer in layers:
            idx = to_row_col_idx(
                self.edges.filter(self.edges.layer == layer), self.vertices
            )
            M = idx_to_csr_matrix(idx, self.vertices)
            matrices[layer] = M
        return matrices

    def to_networkx(self) -> nx.MultiDiGraph:
        """
        Convert the multiplex to a NetworkX MultiDiGraph.

        Returns:
            - NetworkX MultiDiGraph built from the edges and vertices of the multiplex.
        """
        from .networkx import to_MultiDiGraph

        return to_MultiDiGraph(self.edges, self.vertices)

    def save(self, dir: Path | str, **kw_args) -> None:
        """
        Save the multiplex to disk.
        The directory is created if it does not exist; existing files are overwritten.
        Saving also evaluates the lazy ``edges`` and ``vertices`` expressions and
        updates them to point at the saved files, which can improve subsequent performance.
        When no vertices table is present, it is first derived from the edges.

        Args:
            - dir: path to the directory where the Multiplex will be saved.
            - **kw_args: additional keyword arguments forwarded to ``io.save_multiplex``.
        """
        if self.vertices is None:
            self.update_vertices()

        # Everything is passed by keyword: positionally, the third parameter of
        # ``io.save_multiplex`` is ``dir``, not ``period``.
        edges, vertices = io.save_multiplex(
            edges=self.edges,
            vertices=self.vertices,
            dir=dir,
            period=self.period,
            **kw_args,
        )
        self.edges = edges
        self.vertices = vertices
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
import ibis
|
|
2
|
+
|
|
3
|
+
from .check import check_edges, check_vertices
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from . import io
|
|
6
|
+
from .multiplex import Multiplex
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MultiplexSeries:
|
|
14
|
+
"""
|
|
15
|
+
A multiplexseries is a series of Multiplex graphs with multiple layers, spanning multiple periods.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
#: The edges of the multiplex. This is a table with columns "src", "dst", "period", "layer" and "relationtype".
|
|
19
|
+
edges: ibis.Table
|
|
20
|
+
|
|
21
|
+
#: The vertices of the multiplex. This is a table with a column "id","period" and optional additional columns.
|
|
22
|
+
vertices: ibis.Table | None
|
|
23
|
+
|
|
24
|
+
#
|
|
25
|
+
vertex_ids: ibis.Table
|
|
26
|
+
|
|
27
|
+
relationtypes: ibis.Table | None
|
|
28
|
+
|
|
29
|
+
def __init__(
|
|
30
|
+
self,
|
|
31
|
+
edges: ibis.Table,
|
|
32
|
+
vertices: ibis.Table = None,
|
|
33
|
+
relationtypes: ibis.Table = None,
|
|
34
|
+
) -> None:
|
|
35
|
+
"""
|
|
36
|
+
Initialize a multiplex series with the given edges and vertices tables.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
- edges: table with columns ``src``, ``dst``, ``period``, ``layer``, and ``relationtype``.
|
|
40
|
+
- vertices: table with column ``id``, ``period``, and optional additional columns.
|
|
41
|
+
Must have a ``period`` column because the edges table has one.
|
|
42
|
+
- relationtypes: table with columns ``relationtype``, ``layer``, ``label``,
|
|
43
|
+
and optional additional columns.
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
- ValueError: if the edges table does not satisfy the required schema.
|
|
47
|
+
- ValueError: if the vertices table does not satisfy the required schema.
|
|
48
|
+
"""
|
|
49
|
+
if not check_edges(edges):
|
|
50
|
+
raise ValueError("Invalid edges table")
|
|
51
|
+
|
|
52
|
+
if vertices is not None and not check_vertices(vertices):
|
|
53
|
+
raise ValueError("Invalid vertices table")
|
|
54
|
+
|
|
55
|
+
self.edges = edges
|
|
56
|
+
# TODO derive vertices from edges if not provided
|
|
57
|
+
self.vertices = vertices
|
|
58
|
+
self.relationtypes = relationtypes
|
|
59
|
+
|
|
60
|
+
if not vertices is None:
|
|
61
|
+
logger.info("Vertices table provided, using it as is.")
|
|
62
|
+
self.vertex_ids = vertices[["id"]].distinct()
|
|
63
|
+
|
|
64
|
+
def periods(self) -> list[int]:
    """
    List the periods that occur in the edges table.

    Returns:
        - Period values, de-duplicated and sorted in ascending order.
    """
    distinct_periods = (
        self.edges.select(self.edges.period).distinct().order_by("period")
    )
    # Materialise through Arrow and hand back plain Python values.
    arrow_table = distinct_periods.to_pyarrow()
    return arrow_table.column("period").to_pylist()
|
|
81
|
+
|
|
82
|
+
def layers(self) -> list[str]:
    """
    List the layers that occur in the edges table.

    Returns:
        - Layer names, de-duplicated and sorted.
    """
    distinct_layers = (
        self.edges.select(self.edges.layer).distinct().order_by("layer")
    )
    # Materialise through Arrow and hand back plain Python values.
    arrow_table = distinct_layers.to_pyarrow()
    return arrow_table.column("layer").to_pylist()
|
|
98
|
+
|
|
99
|
+
def update_vertices(self) -> None:
    """
    Update the vertices table by deriving it from the edges table.
    This is useful when the vertices table was not provided at initialization.
    Both ``self.vertices`` and ``self.vertex_ids`` are updated in place.

    ``self.vertices`` gets one row per distinct ``(id, period)`` pair that
    occurs as an edge endpoint; ``self.vertex_ids`` gets one row per distinct
    ``id``. Both results are executed and stored as in-memory tables.
    """
    src = self.edges.select(id="src", period="period").distinct()
    dst = self.edges.select(id="dst", period="period").distinct()

    V = src.union(dst, distinct=True)
    # V is distinct on (id, period), so an id that is present in several
    # periods appears several times. De-duplicate on id alone so that
    # ``vertex_ids`` matches what ``__init__`` builds from a vertices table.
    V_ids = V.select(V.id).distinct()
    self.vertices = ibis.memtable(V.to_pyarrow())
    self.vertex_ids = ibis.memtable(V_ids.to_pyarrow())
|
|
112
|
+
|
|
113
|
+
def update_relationtypes(self) -> None:
    """
    Rebuild ``self.relationtypes`` from the edges table.

    Every distinct ``(relationtype, layer)`` combination found in the edges
    becomes one row, ordered by layer and then relationtype. A ``label``
    column of the form ``"<layer>_<relationtype>"`` is added. Handy when no
    relationtypes table was supplied at initialization.
    """
    combos = self.edges.select(self.edges.relationtype, self.edges.layer).distinct()
    frame = combos.order_by("layer", "relationtype").to_pandas()

    # The label is built in pandas from the string form of both columns.
    labels = frame["layer"].astype(str) + "_" + frame["relationtype"].astype(str)
    relationtypes = frame.assign(label=labels)

    logger.debug(
        f"Updated relationtypes table with {len(relationtypes)} unique relationtypes."
    )
    self.relationtypes = ibis.memtable(relationtypes)
|
|
135
|
+
|
|
136
|
+
def get_multiplex(self, period: int) -> Multiplex:
    """
    Build the Multiplex for a single period.

    Args:
        - period: the period to select.

    Returns:
        - Multiplex holding the edges whose ``period`` equals the given value,
          and the vertices for that period (``None`` when the series has no
          vertices table).
    """
    period_edges = self.edges.filter(self.edges.period == period)

    all_vertices = self.vertices
    period_vertices = (
        None
        if all_vertices is None
        else all_vertices.filter(all_vertices.period == period)
    )
    return Multiplex(edges=period_edges, vertices=period_vertices, period=period)
|
|
152
|
+
|
|
153
|
+
def multiplexes(self) -> list[Tuple[int, Multiplex]]:
    """
    Split the series into one Multiplex per period.

    Returns:
        - List of ``(period, Multiplex)`` tuples in the order given by ``periods()``.
    """
    per_period = []
    for period in self.periods():
        per_period.append((period, self.get_multiplex(period)))
    return per_period
|
|
162
|
+
|
|
163
|
+
def add_filter(
    self,
    periods: list[int] | None = None,
    layers: list[str] | None = None,
    relationtypes: list[int] | None = None,
    src: list[int] | None = None,
    dst: list[int] | None = None,
) -> None:
    """
    Apply a filter to the multiplex series in place.
    Filtering is lazy: the filter is only executed when saving or converting
    to another format. Passing ``None`` or an empty list for any argument
    means no filtering is applied for that dimension.

    For advanced filtering, modify the ``edges`` property directly using
    ibis expressions.

    Only ``self.edges`` is replaced; the vertices and relationtypes tables
    are left untouched.

    Args:
        - periods: list of periods to keep.
        - layers: list of layer names to keep.
        - relationtypes: list of relationtype values to keep.
        - src: list of source vertex ids (ego) to keep.
        - dst: list of destination vertex ids (non-ego) to keep.
    """
    E = self.edges

    flt: list[ibis.BooleanValue] = []

    if periods is not None and len(periods) > 0:
        flt.append(E.period.isin(periods))

    if layers is not None and len(layers) > 0:
        flt.append(E.layer.isin(layers))

    if relationtypes is not None and len(relationtypes) > 0:
        flt.append(E.relationtype.isin(relationtypes))

    if src is not None and len(src) > 0:
        vid = ibis.memtable({"id": src})
        # we use semi join because we expect the vertex list to be large
        E = E.semi_join(vid, E.src == vid.id)

    if dst is not None and len(dst) > 0:
        vid = ibis.memtable({"id": dst})
        # we use semi join because we expect the vertex list to be large
        E = E.semi_join(vid, E.dst == vid.id)

    # Lazy %-formatting: the predicate list is only rendered when debug
    # logging is enabled. (The original logged the literal text "f{flt}".)
    logger.debug("Filter: %s", flt)
    if len(flt):
        E = E.filter(flt)

    self.edges = E
|
|
215
|
+
|
|
216
|
+
def __copy__(self) -> "MultiplexSeries":
    """
    Return a shallow copy of this MultiplexSeries.

    The copy is created through the regular constructor, so the edges (and
    vertices, if present) are validated again.

    Returns:
        - A new MultiplexSeries sharing the same ``edges``, ``vertices`` and
          ``relationtypes`` tables.
    """
    # Pass relationtypes along as well; otherwise the copy silently loses them.
    return MultiplexSeries(self.edges, self.vertices, self.relationtypes)
|
|
224
|
+
|
|
225
|
+
def collapse(self) -> Multiplex:
    """
    Flatten the series into one Multiplex by dropping the period dimension.

    Edges that occur in several periods are kept once. Useful for analyses
    that do not need temporal information.

    Returns:
        - Multiplex with the distinct ``(src, dst, layer, relationtype)`` edges
          over all periods and ``period=None``. Its vertices are the distinct
          ids of the vertices table, or ``None`` when there is no such table.
    """
    edge_columns = ["src", "dst", "layer", "relationtype"]
    collapsed_edges = self.edges.select(edge_columns).distinct()

    collapsed_vertices = None
    if self.vertices is not None:
        collapsed_vertices = self.vertices.select("id").distinct()

    return Multiplex(edges=collapsed_edges, vertices=collapsed_vertices, period=None)
|
|
240
|
+
|
|
241
|
+
def collapse_to(self, dir: Path | str) -> None:
    """
    Collapse the series and write the resulting Multiplex to disk.

    Shorthand for ``collapse()`` followed by ``Multiplex.save()``; whatever
    ``Multiplex.save()`` returns is passed through.

    Args:
        - dir: path to the directory where the collapsed Multiplex will be saved.
    """
    return self.collapse().save(dir=dir)
|
|
252
|
+
|
|
253
|
+
def save(self, dir: Path | str, **kw_args) -> None:
    """
    Write the multiplex series to disk via ``io.save_network``.

    When the series has no vertices table, one is derived from the edges
    (through a temporary MultiplexSeries and ``update_vertices()``) before
    saving. Afterwards ``self.edges`` and ``self.vertices`` are replaced by
    the two values returned from ``io.save_network``.

    Args:
        - dir: path to the directory where the MultiplexSeries will be saved.
        - **kw_args: additional keyword arguments forwarded to ``io.save_network``.
    """
    vertices = self.vertices
    if vertices is None:
        # Derive the vertices on a scratch object so this instance is only
        # modified once the save has succeeded.
        scratch = MultiplexSeries(edges=self.edges)
        scratch.update_vertices()
        vertices = scratch.vertices

    self.edges, self.vertices = io.save_network(
        self.edges, vertices, dir=dir, **kw_args
    )
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
import ibis
|
|
3
|
+
from .to_csr_matrix import to_csr_matrix
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def to_MultiDiGraph(edges: ibis.Table, vertices: ibis.Table) -> nx.MultiDiGraph:
    """
    Build a NetworkX MultiDiGraph from an edge list and a vertex table.

    Args:
        - edges: table with ``src`` and ``dst`` columns.
        - vertices: table with an ``id`` column.

    Returns:
        - NetworkX MultiDiGraph constructed from the CSR matrix produced by
          ``to_csr_matrix(edges, vertices)``.
    """
    # NOTE(review): the original comment here was cut off ("problem: this
    # generates"). Presumably the graph's nodes are matrix indices rather than
    # the original vertex ids -- confirm.
    adjacency = to_csr_matrix(edges, vertices)
    return nx.MultiDiGraph(adjacency)
|
|
File without changes
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from ibis import row_number, Table
|
|
2
|
+
import ibis
|
|
3
|
+
from scipy.sparse import csr_matrix
|
|
4
|
+
from muxpack.multiplex import Multiplex
|
|
5
|
+
from typing import Tuple, Generator
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
# from collections.abc import Generator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def to_row_col_idx(edges: Table, vertices: Table) -> Table:
    """
    Turn an edge list into a row/column index table based on the given vertices table.

    Each vertex id is given an index with ``row_number()``; edges are then
    inner-joined against that numbering on both ``src`` and ``dst``.

    Args:
        - edges: table with ``src`` and ``dst`` columns.
        - vertices: table with an ``id`` column; edges not referencing a vertex in this
          table are filtered out.

    Returns:
        - Table with columns ``data``, ``row``, and ``col`` containing the boolean edge
          indicator and the row/column indices assigned to the vertices.
          Can be passed directly to ``idx_to_csr_matrix``.
    """
    v = vertices.select("id").mutate(idx=row_number())
    row = v.select(src="id", row="idx")
    col = v.select(dst="id", col="idx")

    # Duplicate (src, dst) pairs are collapsed by distinct(), so ``data`` is a
    # plain indicator. (Original note: "may sum the number of columns" --
    # presumably a reminder that multiplicities could be summed instead; confirm.)
    idx_edges = (
        edges[["src", "dst"]]
        .distinct()
        .inner_join(row, "src")
        .inner_join(col, "dst")
        .mutate(data=True)
        .select("data", "row", "col")
    )
    # Counting the edges runs a query against the backend, so only do it when
    # the debug message will actually be emitted.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "Created row-col index table with %s edges.",
            idx_edges.count().execute(),
        )
    return idx_edges
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def idx_to_csr_matrix(idx: Table, vertices: Table) -> csr_matrix:
    """
    Build a CSR sparse matrix from a row/column index table.

    Args:
        - idx: table with columns ``data``, ``row``, and ``col``, as produced by
          ``to_row_col_idx``.
        - vertices: table whose row count determines the matrix size.

    Returns:
        - Square CSR sparse matrix of shape ``(n_vertices, n_vertices)``.
    """
    # TODO maybe to_parquet()?
    coo = idx.execute()
    logger.debug(f"COO matrix data: {coo}")

    n = vertices.count().execute()
    logger.debug(f"Number of vertices: {n}")

    values = coo["data"]
    coordinates = (coo["row"], coo["col"])
    return csr_matrix((values, coordinates), shape=(n, n))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def to_csr_matrix(edges: Table, vertices: Table | None) -> csr_matrix:
    """
    Transform an edge list into a sparse matrix (csr_matrix).

    Args:
        - edges: table with ``src`` and ``dst`` columns.
        - vertices: table with an ``id`` column; edges are filtered to vertices present
          in this table. Pass ``None`` to derive vertices from the edges table
          (the distinct union of the ``src`` and ``dst`` values).

    Returns:
        - Square CSR sparse matrix of shape ``(n_vertices, n_vertices)``.
    """
    if vertices is None:
        # Derive the vertex set from the edge endpoints. Previously ``None``
        # was passed straight through and failed on ``vertices.select(...)``.
        src_ids = edges.select(id="src")
        dst_ids = edges.select(id="dst")
        vertices = src_ids.union(dst_ids, distinct=True)
    else:
        # vertices may contain multiple periods
        vertices = vertices[["id"]].distinct()

    edges_row_col = to_row_col_idx(edges, vertices=vertices)
    M = idx_to_csr_matrix(edges_row_col, vertices=vertices)
    return M
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def to_period_csr_matrix(
    edges: Table, vertices: Table | None, periods: list[int] | None = None
) -> Generator[Tuple[csr_matrix, int]]:
    """
    Generate a sparse matrix for each period.

    Args:
        - edges: table with columns ``src``, ``dst``, and ``period``.
        - vertices: table with columns ``id`` and ``period``, or ``None``. The
          per-period vertices (or ``None``) are forwarded to ``to_csr_matrix``.
        - periods: list of periods to generate matrices for. If ``None`` or empty,
          all periods present in ``edges`` are used, in ascending order.

    Returns:
        - Generator of ``(csr_matrix, period)`` tuples, one per period.
    """
    # ``None`` replaces the former mutable ``[]`` default; an explicitly passed
    # empty list keeps its meaning of "all periods".
    if periods is None or len(periods) == 0:
        # Order the derived periods so the matrices come out in a predictable
        # sequence.
        periods = (
            edges[["period"]]
            .distinct()
            .order_by("period")
            .to_pandas()
            .period.tolist()
        )
    for period in periods:
        E_y = edges.filter(edges.period == period)
        if vertices is not None:
            V_y = vertices.filter(vertices.period == period)
        else:
            V_y = None

        yield to_csr_matrix(E_y, V_y), period
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
    # Small manual demo: build a three-vertex example and print the matrices.
    import pandas as pd

    logging.basicConfig(level=logging.DEBUG)

    E = ibis.memtable(pd.DataFrame({"src": [100, 100], "dst": [300, 200]}))
    V = ibis.memtable(pd.DataFrame({"id": [100, 200, 300]}))

    # Restrict the vertex set first, so the edge to vertex 300 is filtered out.
    V1 = V.filter(V.id < 250)
    M1 = idx_to_csr_matrix(to_row_col_idx(E, V1), V1)
    print(f"M1 = {M1}")

    # Full vertex set.
    print(to_csr_matrix(E, V))
|