submine 0.1.1__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- submine/__init__.py +37 -0
- submine/algorithms/__init__.py +23 -0
- submine/algorithms/base.py +143 -0
- submine/algorithms/gspan.py +156 -0
- submine/algorithms/gspan_cpp.cpython-312-darwin.so +0 -0
- submine/algorithms/sopagrami.py +250 -0
- submine/algorithms/sopagrami_cpp.cpython-312-darwin.so +0 -0
- submine/api.py +134 -0
- submine/backends/__init__.py +0 -0
- submine/backends/gspan/CMakeLists.txt +65 -0
- submine/backends/gspan/dfs.cpp +98 -0
- submine/backends/gspan/graph.cpp +165 -0
- submine/backends/gspan/gspan.cpp +776 -0
- submine/backends/gspan/gspan.h +296 -0
- submine/backends/gspan/ismin.cpp +124 -0
- submine/backends/gspan/main.cpp +106 -0
- submine/backends/gspan/misc.cpp +177 -0
- submine/backends/gspan/python_bindings.cpp +133 -0
- submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
- submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
- submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
- submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
- submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
- submine/backends/sopagrami/cpp/src/main.cpp +94 -0
- submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
- submine/cli/__init__.py +6 -0
- submine/cli/main.py +87 -0
- submine/core/__init__.py +12 -0
- submine/core/graph.py +179 -0
- submine/core/result.py +121 -0
- submine/datasets/__init__.py +11 -0
- submine/datasets/loaders.py +145 -0
- submine/errors.py +41 -0
- submine/io/__init__.py +30 -0
- submine/io/common.py +173 -0
- submine/io/gexf.py +88 -0
- submine/io/gspan.py +268 -0
- submine/io/sopagrami.py +143 -0
- submine/io/transcode.py +147 -0
- submine/registry.py +8 -0
- submine/utils/__init__.py +6 -0
- submine/utils/checks.py +115 -0
- submine/utils/logging.py +41 -0
- submine-0.1.1.dist-info/METADATA +178 -0
- submine-0.1.1.dist-info/RECORD +47 -0
- submine-0.1.1.dist-info/WHEEL +6 -0
- submine-0.1.1.dist-info/licenses/LICENSE +21 -0
submine/core/graph.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Core graph container used throughout *submine*.
|
|
2
|
+
|
|
3
|
+
Design goals
|
|
4
|
+
------------
|
|
5
|
+
1) Keep a lightweight, dependency-free representation.
|
|
6
|
+
2) Preserve the existing public surface used by wrappers:
|
|
7
|
+
- ``Graph.nodes`` : list of node ids
|
|
8
|
+
- ``Graph.edges`` : list of (u, v)
|
|
9
|
+
- ``Graph.node_labels`` : dict[node_id] -> label (optional)
|
|
10
|
+
- ``Graph.edge_labels`` : dict[(u, v)] -> label (optional)
|
|
11
|
+
3) Add *optional* edge weights without breaking unweighted algorithms.
|
|
12
|
+
|
|
13
|
+
Weights are stored as ``Graph.edge_weights`` (dict[(u, v)] -> float). If a
|
|
14
|
+
weight is missing for an edge, it is treated as 1.0.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Any, Dict, Hashable, Iterable, Iterator, List, Optional, Protocol, Tuple
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class GraphSource(Protocol):
    """Structural type for any object that can be iterated to produce graphs.

    Any class defining a compatible ``__iter__`` satisfies this protocol; no
    explicit inheritance is required.
    """

    def __iter__(self) -> Iterator["Graph"]:  # pragma: no cover
        """Return an iterator over the ``Graph`` objects of this source."""
        # The iterator protocol requires ``__iter__`` to hand back an
        # Iterator (something with ``__next__``), not merely an Iterable.
        ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(eq=True, frozen=True)
class Node:
    """Immutable record describing a single graph node.

    Equality compares ``id``, ``label`` and ``data``. Hashing uses ``id``
    only, so instances stay hashable even though ``data`` is a dict.
    """

    # Identifier of the node inside its graph.
    id: Any
    # Optional label attached to the node.
    label: Optional[Any] = None
    # Free-form attributes; ``None`` is replaced by a fresh dict on creation.
    data: Dict[str, Any] | None = None

    def __post_init__(self) -> None:
        """Replace a missing ``data`` mapping with a new empty dict."""
        if self.data is None:
            # The dataclass is frozen, so the normal attribute assignment is
            # blocked; go through object.__setattr__ instead.
            object.__setattr__(self, "data", {})

    def __hash__(self) -> int:
        """Hash by ``id`` only.

        The hash that ``eq=True, frozen=True`` would generate includes the
        ``data`` dict and therefore always raises ``TypeError``. Equal nodes
        necessarily share the same ``id``, so this stays consistent with
        ``__eq__``.
        """
        return hash(self.id)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Graph:
    """Lightweight labeled (optionally weighted) undirected graph.

    Public attributes
    -----------------
    nodes : list
        Node ids, in insertion order.
    edges : list of (u, v)
        Edges as given; duplicates and both orientations may be present.
    node_labels : dict or None
        Optional mapping ``node_id -> label``.
    edge_labels : dict or None
        Optional mapping ``(u, v) -> label``.
    edge_weights : dict or None
        Optional mapping ``(u, v) -> float``; a missing entry means ``1.0``.

    Edge label and weight lookups try ``(u, v)`` first and ``(v, u)`` second.
    """

    def __init__(
        self,
        nodes: Optional[Iterable[Hashable]] = None,
        edges: Optional[Iterable[Tuple[Hashable, Hashable]]] = None,
        node_labels: Optional[Dict[Hashable, Any]] = None,
        edge_labels: Optional[Dict[Tuple[Hashable, Hashable], Any]] = None,
        edge_weights: Optional[Dict[Tuple[Hashable, Hashable], float]] = None,
    ) -> None:
        """Build a graph from optional node/edge collections.

        ``nodes`` and ``edges`` are copied into lists; the label and weight
        dictionaries are stored as passed (not copied).
        """
        self.nodes: List[Hashable] = list(nodes) if nodes is not None else []
        self.edges: List[Tuple[Hashable, Hashable]] = list(edges) if edges is not None else []
        self.node_labels: Optional[Dict[Hashable, Any]] = node_labels
        self.edge_labels: Optional[Dict[Tuple[Hashable, Hashable], Any]] = edge_labels
        self.edge_weights: Optional[Dict[Tuple[Hashable, Hashable], float]] = edge_weights

        # Internal indices backing the incremental add_node/add_edge API.
        self._nodes: Dict[Hashable, Node] = {}
        self._adj: Dict[Hashable, List[Tuple[Hashable, Optional[Any], float]]] = {}

        # Seed the indices from the constructor arguments so that later
        # add_node/add_edge calls see the pre-existing content.
        for nid in self.nodes:
            lbl = self.node_labels.get(nid) if self.node_labels else None
            self._nodes[nid] = Node(nid, lbl, {})
            self._adj.setdefault(nid, [])
        for (u, v) in self.edges:
            lbl = self._edge_label(u, v)
            w = self._edge_weight(u, v)
            self._adj.setdefault(u, []).append((v, lbl, w))
            self._adj.setdefault(v, []).append((u, lbl, w))

    def _edge_label(self, u: Hashable, v: Hashable) -> Optional[Any]:
        """Return the label stored for ``(u, v)`` or ``(v, u)``, else ``None``."""
        if self.edge_labels is None:
            return None
        return self.edge_labels.get((u, v), self.edge_labels.get((v, u)))

    def _edge_weight(self, u: Hashable, v: Hashable) -> float:
        """Return the weight stored for ``(u, v)`` or ``(v, u)``, else ``1.0``."""
        if self.edge_weights is None:
            return 1.0
        return float(self.edge_weights.get((u, v), self.edge_weights.get((v, u), 1.0)))

    @staticmethod
    def _canonical_pair(u: Hashable, v: Hashable) -> Tuple[Hashable, Hashable]:
        """Return the endpoints in a deterministic, orientation-free order.

        Comparable ids are ordered with ``<=``. Ids that cannot be compared
        (for example an ``int`` and a ``str`` in the same edge) fall back to
        ordering by ``(type name, str(value))`` instead of raising.
        """
        try:
            return (u, v) if u <= v else (v, u)
        except TypeError:
            key_u = (type(u).__name__, str(u))
            key_v = (type(v).__name__, str(v))
            return (u, v) if key_u <= key_v else (v, u)

    def _unique_edges(self) -> Iterator[Tuple[Tuple[Hashable, Hashable], Tuple[Hashable, Hashable]]]:
        """Yield ``(canonical_pair, original_pair)`` once per undirected edge."""
        seen: set = set()
        for (u, v) in self.edges:
            pair = self._canonical_pair(u, v)
            if pair in seen:
                continue
            seen.add(pair)
            yield pair, (u, v)

    @property
    def is_weighted(self) -> bool:
        """``True`` if at least one stored edge weight differs from 1.0."""
        if not self.edge_weights:
            return False
        return any(float(w) != 1.0 for w in self.edge_weights.values())

    def add_node(self, node_id: Hashable, label: Optional[Any] = None, **data: Any) -> Node:
        """Add a node, or update an existing one, and return its ``Node``.

        For an existing node the previous label is kept when ``label`` is
        ``None`` and ``data`` is merged into the previous attributes.
        """
        if node_id in self._nodes:
            n = self._nodes[node_id]
            lbl = n.label if label is None else label
            merged = dict(n.data)
            merged.update(data)
            n2 = Node(node_id, lbl, merged)
            self._nodes[node_id] = n2
            if node_id not in self.nodes:
                self.nodes.append(node_id)
            # Create the public label map on demand; otherwise a label given
            # to an already-known node (e.g. one auto-created by add_edge)
            # would never show up in ``node_labels``.
            if self.node_labels is None and lbl is not None:
                self.node_labels = {}
            if self.node_labels is not None:
                self.node_labels[node_id] = lbl
            self._adj.setdefault(node_id, [])
            return n2

        n = Node(node_id, label, data or None)
        self._nodes[node_id] = n
        self._adj.setdefault(node_id, [])
        if node_id not in self.nodes:
            self.nodes.append(node_id)
        if label is not None:
            if self.node_labels is None:
                self.node_labels = {}
            self.node_labels[node_id] = label
        return n

    def add_edge(
        self,
        u: Hashable,
        v: Hashable,
        label: Optional[Any] = None,
        weight: float = 1.0,
    ) -> None:
        """Add an undirected edge between ``u`` and ``v``.

        Unknown endpoints are added as unlabeled nodes. The label is recorded
        only when it is not ``None`` and the weight only when it differs from
        1.0.

        Raises
        ------
        ValueError
            If ``u == v`` (self loops are not supported).
        """
        if u == v:
            raise ValueError("Self loops are not supported.")
        if u not in self._nodes:
            self.add_node(u)
        if v not in self._nodes:
            self.add_node(v)

        w = float(weight)
        self.edges.append((u, v))
        self._adj.setdefault(u, []).append((v, label, w))
        self._adj.setdefault(v, []).append((u, label, w))

        if label is not None:
            if self.edge_labels is None:
                self.edge_labels = {}
            self.edge_labels[(u, v)] = label

        if w != 1.0:
            if self.edge_weights is None:
                self.edge_weights = {}
            self.edge_weights[(u, v)] = w

    def iter_edges(self) -> Iterator[Tuple[Hashable, Hashable, Optional[Any]]]:
        """Iterate edges once, yielding ``(u, v, label)``.

        Each undirected edge is reported a single time with its endpoints in
        canonical order. This preserves the legacy shape expected by existing
        writers.
        """
        for (a, b), (u, v) in self._unique_edges():
            yield (a, b, self._edge_label(u, v))

    def iter_edges_with_weights(
        self,
    ) -> Iterator[Tuple[Hashable, Hashable, Optional[Any], float]]:
        """Iterate edges once, yielding ``(u, v, label, weight)``."""
        for (a, b), (u, v) in self._unique_edges():
            yield (a, b, self._edge_label(u, v), self._edge_weight(u, v))

    def number_of_nodes(self) -> int:
        """Return the length of ``nodes``."""
        return len(self.nodes)

    def number_of_edges(self) -> int:
        """Return the length of ``edges`` (duplicates are counted)."""
        return len(self.edges)

    def __repr__(self) -> str:
        return f"Graph(num_nodes={self.number_of_nodes()}, num_edges={self.number_of_edges()}, weighted={self.is_weighted})"
|
submine/core/result.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# submine/core/result.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple
|
|
6
|
+
|
|
7
|
+
from .graph import Graph
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"SubgraphOccurrence",
|
|
11
|
+
"SubgraphPattern",
|
|
12
|
+
"MiningResult",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class SubgraphOccurrence:
    """A single embedding of a pattern inside one graph of the dataset.

    Fields
    ------
    graph_id : int
        0-based position of the host graph within the input dataset.
    node_mapping : Mapping[int, int]
        Pattern-local node id -> node id in the host graph. For a pattern
        with k nodes the pattern-local ids are expected to be 0..k-1.
    extra : dict
        Free-form, algorithm-specific details such as an edge mapping, a
        score or a position.
    """

    graph_id: int
    node_mapping: Mapping[int, int]
    # A new dict per instance, so occurrences never share the same mapping.
    extra: Dict[str, Any] = field(default_factory=dict)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class SubgraphPattern:
    """One subgraph pattern produced by a mining run.

    Fields
    ------
    pid : int
        Identifier of the pattern, unique within its MiningResult.
    graph : Graph
        The pattern structure, using pattern-local node ids 0..k-1.
    support : int
        Support count (for instance the number of graphs or embeddings in
        which the pattern appears).
    frequency : Optional[float]
        Relative frequency such as support / num_graphs, when available.
    occurrences : list[SubgraphOccurrence]
        Concrete embeddings in the input graphs; may stay empty because not
        every algorithm reports embeddings.
    attributes : dict
        Algorithm-specific values (score, interestingness, DFS code, ...).
    """

    pid: int
    graph: Graph
    support: int
    frequency: Optional[float] = None
    # Per-instance containers, created fresh for every pattern.
    occurrences: List[SubgraphOccurrence] = field(default_factory=list)
    attributes: Dict[str, Any] = field(default_factory=dict)

    def add_occurrence(self, occ: SubgraphOccurrence) -> None:
        """Record one more embedding at the end of ``occurrences``."""
        self.occurrences.append(occ)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class MiningResult:
    """Container for the output of a subgraph mining run.

    Fields
    ------
    patterns : list[SubgraphPattern]
        List of mined patterns.
    algorithm : str
        Name of the algorithm that produced these patterns.
    params : dict
        Parameters used for the run (min_support, max_size, etc.).
    runtime : Optional[float]
        Wall-clock runtime in seconds, if measured.
    metadata : dict
        Additional metadata (dataset info, version, logs, etc.).
    """

    patterns: List[SubgraphPattern]
    algorithm: str
    params: Dict[str, Any] = field(default_factory=dict)
    runtime: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __len__(self) -> int:
        """Return the number of mined patterns."""
        return len(self.patterns)

    def top_k(self, k: int, key: str = "support") -> List[SubgraphPattern]:
        """Return the ``k`` highest-ranked patterns, best first.

        Parameters
        ----------
        k : int
            Number of patterns to return. Values ``<= 0`` yield an empty
            list.
        key : str
            Sort key: 'support', 'frequency', or the name of a numeric value
            stored in ``pattern.attributes[key]``. A missing frequency or
            attribute counts as ``0.0``.

        Returns
        -------
        list[SubgraphPattern]
            At most ``k`` patterns in descending key order; ties keep their
            original relative order.

        Raises
        ------
        TypeError, ValueError
            If an attribute selected by ``key`` cannot be converted to float.
        """
        import heapq

        # A negative k used as a slice bound would drop patterns from the
        # end instead of returning none, so handle it explicitly.
        if k <= 0:
            return []

        def key_fn(p: SubgraphPattern):
            if key == "support":
                return p.support
            if key == "frequency":
                return p.frequency if p.frequency is not None else 0.0
            # Anything else is looked up in the algorithm-specific attributes.
            return float(p.attributes.get(key, 0.0))

        # nlargest is documented as equivalent to
        # sorted(..., key=key_fn, reverse=True)[:k] without a full sort.
        return heapq.nlargest(k, self.patterns, key=key_fn)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Built‑in datasets for submine.
|
|
2
|
+
|
|
3
|
+
This package provides convenience loaders for commonly used benchmark
|
|
4
|
+
datasets in frequent subgraph mining research. Datasets are returned
|
|
5
|
+
as lists of :class:`~submine.core.graph.Graph` objects. The available
|
|
6
|
+
datasets are documented in :mod:`submine.datasets.loaders`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .loaders import get_dataset # noqa: F401
|
|
10
|
+
|
|
11
|
+
__all__ = ["get_dataset"]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Dataset loaders for submine.
|
|
2
|
+
|
|
3
|
+
This module contains functions to load well‑known benchmark datasets
|
|
4
|
+
such as MUTAG, ENZYMES and more. Where possible it attempts to use
|
|
5
|
+
existing libraries to fetch the datasets (e.g. torch_geometric or
|
|
6
|
+
networkx) and raises :class:`NotImplementedError` if these are
|
|
7
|
+
unavailable. To add a new dataset, implement a function named
|
|
8
|
+
``load_<datasetname>()`` that returns a list of
|
|
9
|
+
:class:`~submine.core.graph.Graph` objects and register it in the
|
|
10
|
+
``DATASET_LOADERS`` dictionary.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Callable, Dict, List
|
|
16
|
+
|
|
17
|
+
from ..core.graph import Graph
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _load_toy() -> List[Graph]:
    """Build the tiny synthetic dataset used for tests and demos.

    Two graphs are returned: a triangle over nodes labeled A, B, C and a
    three-node path over nodes labeled D, E, F. Node labels are single
    characters and every edge carries a two-letter label.
    """
    # One entry per graph: (node labels in id order, ((u, v), edge label)...).
    blueprints = (
        ("ABC", (((0, 1), "ab"), ((1, 2), "bc"), ((2, 0), "ca"))),
        ("DEF", (((0, 1), "de"), ((1, 2), "ef"))),
    )
    dataset: List[Graph] = []
    for node_names, labelled_edges in blueprints:
        graph = Graph()
        for idx, name in enumerate(node_names):
            graph.add_node(idx, label=name)
        for (src, dst), tag in labelled_edges:
            graph.add_edge(src, dst, label=tag)
        dataset.append(graph)
    return dataset
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _tensor_label(value) -> str:
    """Collapse a scalar or one-hot tensor entry into a string label.

    Multi-element entries are reduced with ``argmax`` first.
    NOTE(review): this assumes multi-element rows are one-hot encodings, as
    TUDataset presumably provides for ``x`` / ``edge_attr`` -- confirm.
    """
    if value.numel() > 1:
        value = value.argmax()
    return str(int(value))


def _load_tudataset(name: str) -> List[Graph]:
    """Download TUDataset ``name`` and convert every entry to a ``Graph``.

    Graphs are built directly with ``Graph.add_node`` / ``Graph.add_edge``;
    ``Graph`` has no ``from_networkx`` constructor, so no networkx round trip
    is involved.

    Raises
    ------
    NotImplementedError
        If torch_geometric cannot be imported. Errors raised while
        downloading or reading the dataset propagate unchanged.
    """
    try:
        from torch_geometric.datasets import TUDataset  # type: ignore
    except Exception as e:
        raise NotImplementedError(f"Loading {name} requires torch_geometric and internet access") from e

    dataset = TUDataset(root=f"/tmp/{name}", name=name)
    graphs: List[Graph] = []
    for data in dataset:
        g = Graph()

        # Prefer an explicit ``node_label`` attribute; otherwise fall back to
        # the (one-hot) ``x`` feature matrix if present.
        node_labels = getattr(data, "node_label", None)
        if node_labels is None:
            node_labels = getattr(data, "x", None)
        for i in range(data.num_nodes):
            label = _tensor_label(node_labels[i]) if node_labels is not None else None
            g.add_node(i, label=label)

        edge_attr = data.edge_attr
        seen = set()
        for j in range(data.edge_index.size(1)):
            u = int(data.edge_index[0, j])
            v = int(data.edge_index[1, j])
            # Graph is undirected and rejects self loops: keep each
            # unordered pair once and skip u == v.
            if u == v or (u, v) in seen or (v, u) in seen:
                continue
            seen.add((u, v))
            label = _tensor_label(edge_attr[j]) if edge_attr is not None else None
            g.add_edge(u, v, label=label)
        graphs.append(g)
    return graphs


def _load_mutag() -> List[Graph]:
    """Load the MUTAG dataset if torch_geometric is available.

    Raises ``NotImplementedError`` when torch_geometric is missing.
    """
    return _load_tudataset("MUTAG")


def _load_enzymes() -> List[Graph]:
    """Load the ENZYMES dataset if torch_geometric is available.

    Raises ``NotImplementedError`` when torch_geometric is missing.
    """
    return _load_tudataset("ENZYMES")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
DATASET_LOADERS: Dict[str, Callable[[], List[Graph]]] = {
|
|
111
|
+
"toy": _load_toy,
|
|
112
|
+
"mutag": _load_mutag,
|
|
113
|
+
"enzymes": _load_enzymes,
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_dataset(name: str, **kwargs) -> List[Graph]:
    """Return the dataset registered under ``name``.

    The lookup is case insensitive; ``"toy"``, ``"mutag"`` and ``"enzymes"``
    are among the supported names.

    Parameters
    ----------
    name: str
        Identifier of the dataset to load.
    **kwargs: dict
        Accepted for forward compatibility; currently ignored and not passed
        to the loader.

    Returns
    -------
    List[Graph]
        The graphs that make up the dataset.

    Raises
    ------
    KeyError
        If no loader is registered for ``name``.
    """
    loader = DATASET_LOADERS.get(name.lower())
    if loader is None:
        raise KeyError(f"Unknown dataset '{name}'. Available datasets: {list(DATASET_LOADERS.keys())}")
    return loader()
|
submine/errors.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Submine exception hierarchy.
|
|
2
|
+
|
|
3
|
+
These exceptions provide stable, semantically meaningful error types that
|
|
4
|
+
downstream applications can catch without parsing strings.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SubmineError(Exception):
|
|
11
|
+
"""Base class for all library-defined exceptions."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SubmineInputError(SubmineError, ValueError):
|
|
15
|
+
"""Raised when user-supplied inputs (files, graphs, parameters) are invalid."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ParameterValidationError(SubmineInputError):
|
|
19
|
+
"""Raised when algorithm parameters fail validation."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BackendUnavailableError(SubmineError, RuntimeError):
|
|
23
|
+
"""Raised when an optional external backend (binary, JVM, etc.) is unavailable."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class BackendExecutionError(SubmineError, RuntimeError):
|
|
27
|
+
"""Raised when an external backend fails during execution."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ResourceLimitError(SubmineInputError):
|
|
31
|
+
"""Raised when an input exceeds configured resource limits."""
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"SubmineError",
|
|
35
|
+
"SubmineInputError",
|
|
36
|
+
"ParameterValidationError",
|
|
37
|
+
"BackendUnavailableError",
|
|
38
|
+
"BackendExecutionError",
|
|
39
|
+
"ResourceLimitError",
|
|
40
|
+
]
|
|
41
|
+
|
submine/io/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Input/Output utilities for submine.
|
|
2
|
+
|
|
3
|
+
This package contains helper functions to serialise graphs to the input
|
|
4
|
+
formats expected by different subgraph mining algorithms and to parse
|
|
5
|
+
their outputs back into :class:`~submine.core.graph.Graph` objects.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .common import ensure_dir # noqa: F401
|
|
9
|
+
from .gspan import write_gspan_dataset # noqa: F401
|
|
10
|
+
from .transcode import ( # noqa: F401
|
|
11
|
+
detect_format,
|
|
12
|
+
load_graphs,
|
|
13
|
+
transcode_path,
|
|
14
|
+
FMT_EDGELIST,
|
|
15
|
+
FMT_GEXF,
|
|
16
|
+
FMT_GSPAN,
|
|
17
|
+
FMT_LG,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"ensure_dir",
|
|
22
|
+
"write_gspan_dataset",
|
|
23
|
+
"detect_format",
|
|
24
|
+
"load_graphs",
|
|
25
|
+
"transcode_path",
|
|
26
|
+
"FMT_EDGELIST",
|
|
27
|
+
"FMT_GEXF",
|
|
28
|
+
"FMT_GSPAN",
|
|
29
|
+
"FMT_LG",
|
|
30
|
+
]
|
submine/io/common.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Common I/O helpers used by algorithm wrappers.
|
|
2
|
+
|
|
3
|
+
Functions defined here are not tied to a specific algorithm. They
|
|
4
|
+
perform tasks such as ensuring that directories exist or creating
|
|
5
|
+
temporary working directories.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from contextlib import contextmanager
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from tempfile import TemporaryDirectory
|
|
14
|
+
from typing import Iterator
|
|
15
|
+
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import List, Optional, Tuple, Any
|
|
18
|
+
|
|
19
|
+
from ..core.graph import Graph
|
|
20
|
+
from ..errors import SubmineInputError
|
|
21
|
+
from ..utils.checks import iter_text_lines
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def ensure_dir(path: str | Path) -> Path:
    """Ensure that the directory at ``path`` exists.

    Missing parent directories are created as well.

    Parameters
    ----------
    path: str or pathlib.Path
        Directory path to create if it does not exist.

    Returns
    -------
    pathlib.Path
        The Path instance corresponding to ``path``.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    p = Path(path)
    # Call mkdir unconditionally: ``exist_ok=True`` already tolerates an
    # existing directory, avoids the check-then-create race, and still fails
    # loudly when the path is occupied by a regular file.
    p.mkdir(parents=True, exist_ok=True)
    return p
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@contextmanager
def temporary_directory() -> Iterator[Path]:
    """Provide a scratch directory, as a Path, for the duration of a ``with``.

    The directory and everything inside it are deleted when the block is
    left, which makes it convenient for files handed to external algorithms.
    """
    scratch = TemporaryDirectory()
    try:
        yield Path(scratch.name)
    finally:
        # Runs on normal exit and on exceptions alike.
        scratch.cleanup()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _maybe_int(x: str) -> Any:
|
|
55
|
+
"""Try to parse as int, fallback to string."""
|
|
56
|
+
try:
|
|
57
|
+
return int(x)
|
|
58
|
+
except ValueError:
|
|
59
|
+
return x
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def read_edgelist_dataset(path: str | Path) -> List[Graph]:
    """
    Read an edge-list dataset file and return a list of Graph objects.

    Supported formats
    -----------------
    1) Single graph (no 't # gid' headers):

        u v
        u v label

       - Nodes are inferred from all endpoints.
       - Edge labels are used if a third column is present on ANY line.

    2) Multiple graphs (with gSpan-like headers):

        t # 0
        u v
        u v label
        t # 1
        u v
        ...

       - Each 't # gid' starts a new graph; the gid value itself is ignored.
       - Nodes are inferred per graph.

    Blank lines and lines starting with '#' are skipped. Columns may be
    separated by whitespace or commas. Tokens that parse as integers become
    ``int``; everything else stays a string.

    Returns
    -------
    List[Graph]
        A list of Graph objects constructed from the file.

    Raises
    ------
    SubmineInputError
        If an edge line has fewer than two columns. ``SubmineInputError``
        subclasses ``ValueError``, so existing ``except ValueError`` handlers
        keep working.
    """
    path = Path(path)
    graphs: List[Graph] = []

    # ``edges is None`` means "no graph in progress"; an empty list is a
    # started graph that has not received any edge yet.
    edges: Optional[List[Tuple[Any, Any]]] = None
    labels: dict[Tuple[Any, Any], Any] = {}

    def flush_current_graph() -> None:
        """Turn the accumulated edges into a Graph and reset the state."""
        nonlocal edges, labels
        if edges is None:
            return
        endpoints = {node for edge in edges for node in edge}
        # Deterministic node ordering for reproducible transcoding/writing.
        # Edge lists may mix ints/strings; sort by type name then by string value.
        nodes = sorted(endpoints, key=lambda x: (type(x).__name__, str(x)))
        graphs.append(
            Graph(
                nodes=nodes,
                edges=edges,
                node_labels=None,  # edge-list doesn't carry node labels
                # Only pass a label map when at least one line had a label.
                edge_labels=labels or None,
            )
        )
        edges, labels = None, {}

    for raw_line in iter_text_lines(path):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        # Accept both whitespace-separated and comma-separated edge lists.
        line = line.replace(",", " ")
        parts = line.split()

        # Multi-graph header: t # gid
        if parts[0] == "t" and len(parts) >= 3 and parts[1] == "#":
            flush_current_graph()
            edges, labels = [], {}
            continue

        # Otherwise this is an edge line: u v [label]. Without a preceding
        # header the file is treated as a single graph.
        if edges is None:
            edges, labels = [], {}

        if len(parts) < 2:
            raise SubmineInputError(f"Malformed edge line: {line!r}")

        u = _maybe_int(parts[0])
        v = _maybe_int(parts[1])
        edges.append((u, v))
        if len(parts) >= 3:
            labels[(u, v)] = _maybe_int(parts[2])

    # Flush final graph (single-graph files without a trailing header)
    flush_current_graph()
    return graphs
|