submine 0.1.1__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- submine/__init__.py +37 -0
- submine/algorithms/__init__.py +23 -0
- submine/algorithms/base.py +143 -0
- submine/algorithms/gspan.py +156 -0
- submine/algorithms/gspan_cpp.cpython-312-darwin.so +0 -0
- submine/algorithms/sopagrami.py +250 -0
- submine/algorithms/sopagrami_cpp.cpython-312-darwin.so +0 -0
- submine/api.py +134 -0
- submine/backends/__init__.py +0 -0
- submine/backends/gspan/CMakeLists.txt +65 -0
- submine/backends/gspan/dfs.cpp +98 -0
- submine/backends/gspan/graph.cpp +165 -0
- submine/backends/gspan/gspan.cpp +776 -0
- submine/backends/gspan/gspan.h +296 -0
- submine/backends/gspan/ismin.cpp +124 -0
- submine/backends/gspan/main.cpp +106 -0
- submine/backends/gspan/misc.cpp +177 -0
- submine/backends/gspan/python_bindings.cpp +133 -0
- submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
- submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
- submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
- submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
- submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
- submine/backends/sopagrami/cpp/src/main.cpp +94 -0
- submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
- submine/cli/__init__.py +6 -0
- submine/cli/main.py +87 -0
- submine/core/__init__.py +12 -0
- submine/core/graph.py +179 -0
- submine/core/result.py +121 -0
- submine/datasets/__init__.py +11 -0
- submine/datasets/loaders.py +145 -0
- submine/errors.py +41 -0
- submine/io/__init__.py +30 -0
- submine/io/common.py +173 -0
- submine/io/gexf.py +88 -0
- submine/io/gspan.py +268 -0
- submine/io/sopagrami.py +143 -0
- submine/io/transcode.py +147 -0
- submine/registry.py +8 -0
- submine/utils/__init__.py +6 -0
- submine/utils/checks.py +115 -0
- submine/utils/logging.py +41 -0
- submine-0.1.1.dist-info/METADATA +178 -0
- submine-0.1.1.dist-info/RECORD +47 -0
- submine-0.1.1.dist-info/WHEEL +6 -0
- submine-0.1.1.dist-info/licenses/LICENSE +21 -0
submine/io/gexf.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""GEXF reader.
|
|
2
|
+
|
|
3
|
+
We use NetworkX's GEXF parser to support common graph exports.
|
|
4
|
+
|
|
5
|
+
Mapping rules
|
|
6
|
+
-------------
|
|
7
|
+
- Node labels: if a node attribute named ``label`` exists, it is used; otherwise
|
|
8
|
+
we use the node id as its label.
|
|
9
|
+
- Edge labels: if an edge attribute named ``label`` exists, it is used.
|
|
10
|
+
- Edge weights: if an edge attribute named ``weight`` exists and is not 1.0, it
|
|
11
|
+
is stored in :attr:`submine.core.graph.Graph.edge_weights`.
|
|
12
|
+
|
|
13
|
+
Notes
|
|
14
|
+
-----
|
|
15
|
+
GEXF can represent directed graphs, multi-edges, and parallel edges. The internal
|
|
16
|
+
``Graph`` container in *submine* is currently undirected and does not preserve
|
|
17
|
+
parallel edges. We therefore:
|
|
18
|
+
- coerce to an undirected simple graph
|
|
19
|
+
- keep the first-seen label/weight for each undirected edge
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any, Dict, Hashable, Tuple
|
|
26
|
+
|
|
27
|
+
import networkx as nx
|
|
28
|
+
|
|
29
|
+
from ..core.graph import Graph
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def read_gexf(path: str | Path) -> Graph:
    """Parse a GEXF file with NetworkX and convert it to a :class:`Graph`.

    The parsed graph is first reduced to an undirected simple graph: for
    multigraphs only the first edge seen between two nodes is kept and
    self-loops are dropped. Node labels come from the ``label`` attribute
    (falling back to ``str(node_id)``); edge labels come from the ``label``
    attribute when present and not ``None``; edge weights are recorded only
    when they differ from 1.0.

    Parameters
    ----------
    path:
        Location of the GEXF file.

    Returns
    -------
    Graph
        Graph whose edge keys are ordered by the string form of the endpoints.
    """
    parsed = nx.read_gexf(Path(path))

    # Collapse whatever NetworkX returned into a plain ``nx.Graph``.
    if not isinstance(parsed, (nx.MultiGraph, nx.MultiDiGraph)):
        simple = nx.Graph(parsed)
    else:
        simple = nx.Graph()
        for src, dst, attrs in parsed.edges(data=True):
            # Skip self-loops and any edge already present (first one wins).
            if src != dst and not simple.has_edge(src, dst):
                simple.add_edge(src, dst, **(attrs or {}))
        # Re-attach node attributes; this also adds isolated nodes.
        for node, attrs in parsed.nodes(data=True):
            simple.add_node(node, **(attrs or {}))

    nodes = list(simple.nodes())

    node_labels: Dict[Hashable, Any] = {
        node: (str(node) if attrs is None else attrs.get("label", str(node)))
        for node, attrs in simple.nodes(data=True)
    }

    edges = []
    edge_labels: Dict[Tuple[Hashable, Hashable], Any] = {}
    edge_weights: Dict[Tuple[Hashable, Hashable], float] = {}

    for src, dst, attrs in simple.edges(data=True):
        if src == dst:
            continue

        # Canonical key: endpoints ordered by their string representation.
        key = (src, dst) if str(src) <= str(dst) else (dst, src)
        edges.append(key)

        if not attrs:
            continue

        label = attrs.get("label")
        if label is not None:
            edge_labels[key] = label

        raw_weight = attrs.get("weight")
        if raw_weight is None:
            continue
        try:
            weight = float(raw_weight)
        except Exception:
            # Unparseable weights are treated as the default weight.
            weight = 1.0
        if weight != 1.0:
            edge_weights[key] = weight

    return Graph(
        nodes=nodes,
        edges=edges,
        node_labels=node_labels or None,
        edge_labels=edge_labels or None,
        edge_weights=edge_weights or None,
    )
|
submine/io/gspan.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# submine/io/gspan.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Hashable, Iterable, List, Tuple
|
|
6
|
+
|
|
7
|
+
from ..core.graph import Graph
|
|
8
|
+
from ..errors import SubmineInputError
|
|
9
|
+
from ..utils.checks import iter_text_lines
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Iterable, List, Optional
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def read_gspan_dataset(path: Path | str) -> List[Graph]:
    """
    Read a gSpan-formatted dataset file and return a list of Graph objects.

    Expected format:

        t # N        # start of N-th graph
        v M L        # vertex M has label L
        e P Q L      # edge (P, Q) has label L
        ...
        t # -1       # end of file sentinel (required by some gSpan impls)

    Notes
    -----
    - Vertex ids and labels are parsed with ``int``.
    - Edges are treated as undirected; we store them as (min(u, v), max(u, v))
      and keep only the first occurrence of each edge.
    - A vertex id that appears on several ``v`` lines is listed once in the
      graph's nodes; the label from the last such line is kept.
    - Parsing stops at the ``t # -1`` sentinel; if the sentinel is missing,
      the graphs read so far are still returned.

    Raises
    ------
    ValueError
        For a malformed ``t``/``v``/``e`` line, a ``v``/``e`` line that appears
        before any ``t`` header, or a non-integer id/label token.
    SubmineInputError
        For a line whose record type is not ``t``, ``v`` or ``e``.
    """
    path = Path(path)

    graphs: List[Graph] = []

    # Accumulators for the graph currently being parsed; ``None`` means that
    # no ``t`` header has opened a graph yet.
    current_nodes: Optional[list[int]] = None
    current_node_labels: Optional[dict[int, int]] = None
    current_edges: Optional[list[tuple[int, int]]] = None
    current_edge_labels: Optional[dict[tuple[int, int], int]] = None

    def flush_current_graph() -> None:
        """Append the graph under construction (if any) and reset the accumulators."""
        nonlocal current_nodes, current_node_labels, current_edges, current_edge_labels
        if current_nodes is None:
            return
        graphs.append(
            Graph(
                nodes=current_nodes,
                edges=current_edges,
                node_labels=current_node_labels,
                edge_labels=current_edge_labels,
            )
        )
        current_nodes = None
        current_node_labels = None
        current_edges = None
        current_edge_labels = None

    for raw_line in iter_text_lines(path):
        line = raw_line.strip()
        if not line:
            continue
        parts = line.split()
        rec_type = parts[0]

        # Graph header: t # gid
        if rec_type == "t":
            if len(parts) < 3:
                raise ValueError(f"Malformed 't' line: {line!r}")
            gid = int(parts[2])

            # Either way the previous graph (if any) is complete.
            flush_current_graph()
            if gid == -1:
                # End-of-dataset sentinel: stop reading.
                break

            # Start a new graph.
            current_nodes = []
            current_node_labels = {}
            current_edges = []
            current_edge_labels = {}

        elif rec_type == "v":
            if current_nodes is None:
                raise ValueError(f"Vertex line outside of any graph: {line!r}")
            if len(parts) < 3:
                raise ValueError(f"Malformed 'v' line: {line!r}")
            vid = int(parts[1])
            lbl = int(parts[2])
            # Record each vertex id once, even if the file repeats it.
            if vid not in current_node_labels:  # type: ignore[operator]
                current_nodes.append(vid)
            current_node_labels[vid] = lbl  # type: ignore[index]

        elif rec_type == "e":
            if current_edges is None:
                raise ValueError(f"Edge line outside of any graph: {line!r}")
            if len(parts) < 4:
                raise ValueError(f"Malformed 'e' line: {line!r}")
            u = int(parts[1])
            v = int(parts[2])
            lbl = int(parts[3])

            # Treat as undirected: canonical (smaller, larger) key, first one wins.
            key = (u, v) if u <= v else (v, u)
            if key not in current_edge_labels:  # type: ignore[operator]
                current_edges.append(key)
                current_edge_labels[key] = lbl  # type: ignore[index]

        else:
            raise SubmineInputError(f"Unknown record type '{rec_type}' in line: {line!r}")

    # If the file ended without the terminator, still return what we have.
    flush_current_graph()
    return graphs
|
|
127
|
+
|
|
128
|
+
def convert_gspan_graph(gspan_g) -> Graph:
    """
    Build a submine.core.graph.Graph from a vendored gSpan graph object.

    The input is expected to expose:
    - ``vertices``: a dict ``{vid: Vertex}``
    - ``Vertex.vlb`` (label) and ``Vertex.edges`` (dict ``{to_vid: Edge}``)
    - ``Edge.frm``, ``Edge.to`` and ``Edge.elb`` (label)

    Each undirected edge is emitted once, keyed as (smaller id, larger id),
    with the label of the first Edge object encountered for that pair.
    """
    # Nodes and their labels, in the iteration order of ``vertices``.
    node_labels = {vid: vertex.vlb for vid, vertex in gspan_g.vertices.items()}
    nodes = list(node_labels)

    edges = []
    edge_labels = {}
    for vertex in gspan_g.vertices.values():
        for edge in vertex.edges.values():
            src, dst = edge.frm, edge.to
            canonical = (src, dst) if src <= dst else (dst, src)
            # The label map doubles as the "already seen" set.
            if canonical not in edge_labels:
                edges.append(canonical)
                edge_labels[canonical] = edge.elb

    return Graph(
        nodes=nodes,
        edges=edges,
        node_labels=node_labels,
        edge_labels=edge_labels,
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _build_label_maps(graphs: List[Graph]):
    """
    Map arbitrary node/edge labels to consecutive ints >= 2,
    because gSpan forbids 0 and 1.

    Labels are numbered in order of first appearance across ``graphs``.
    Node labels that are ``None`` are skipped. Both maps always contain a
    default entry (``"__default_node__"`` / ``"__default_edge__"``) so that
    unlabeled nodes and edges can be written even when other items in the
    dataset carry labels; the default takes the next free integer (2 when
    no labels were found at all).

    Parameters
    ----------
    graphs:
        Graphs exposing ``nodes``, ``node_labels`` and ``edge_labels``
        (the label mappings may be ``None``).

    Returns
    -------
    tuple
        ``(node_label_map, edge_label_map)``, each mapping a raw label to
        its integer code.
    """
    node_label_map: Dict[Hashable, int] = {}
    edge_label_map: Dict[Hashable, int] = {}

    for G in graphs:
        # Node labels: only nodes listed in G.nodes are considered.
        if G.node_labels is not None:
            for nid in G.nodes:
                lbl = G.node_labels.get(nid)
                if lbl is not None and lbl not in node_label_map:
                    # Codes start at 2 and grow with the map.
                    node_label_map[lbl] = len(node_label_map) + 2

        # Edge labels
        if G.edge_labels is not None:
            for lbl in G.edge_labels.values():
                if lbl not in edge_label_map:
                    edge_label_map[lbl] = len(edge_label_map) + 2

    # Always provide a default label. Previously it was only added when the
    # map was empty, so a mix of labeled and unlabeled items had no default
    # entry to fall back on.
    node_label_map.setdefault("__default_node__", len(node_label_map) + 2)
    edge_label_map.setdefault("__default_edge__", len(edge_label_map) + 2)

    return node_label_map, edge_label_map
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def write_gspan_dataset(graphs: Iterable[Graph], path: Path) -> None:
    """
    Write a list of Graph objects to a gSpan-compatible file.

    Format:
        t # N     -> N-th graph
        v M L     -> vertex M has label L
        e P Q L   -> edge (P, Q) has label L
        ...
        t # -1    -> end of file

    NOTE:
    - Vertex ids are remapped to 0..n-1 *within each graph*, following the
      order of ``G.nodes``.
    - All labels are mapped to integers >= 2 via ``_build_label_maps``;
      nodes/edges without a known label get the default label.
    - Edge labels are looked up in ``G.edge_labels`` under ``(u, v)`` and,
      failing that, ``(v, u)``, since edges are undirected.

    Parameters
    ----------
    graphs:
        Graphs to write; the iterable is consumed once.
    path:
        Destination file (a ``Path``; plain strings are accepted as well).

    Raises
    ------
    ValueError
        If an entry of ``G.edges`` is not a 2- or 3-tuple.
    """
    graphs = list(graphs)
    node_label_map, edge_label_map = _build_label_maps(graphs)

    # Make sure the fallback labels exist even when other labels are present,
    # so unlabeled nodes/edges never hit a missing key below.
    node_label_map.setdefault(
        "__default_node__", max(node_label_map.values(), default=1) + 1
    )
    edge_label_map.setdefault(
        "__default_edge__", max(edge_label_map.values(), default=1) + 1
    )

    with Path(path).open("w", encoding="utf-8") as f:
        for gid, G in enumerate(graphs):
            f.write(f"t # {gid}\n")

            # Remap node ids to 0..n-1 locally
            id_map = {orig_id: new_id for new_id, orig_id in enumerate(G.nodes)}

            # Write vertices
            for orig_id in G.nodes:
                if G.node_labels is None:
                    label_key = "__default_node__"
                else:
                    raw_lbl = G.node_labels.get(orig_id, "__default_node__")
                    label_key = raw_lbl if raw_lbl in node_label_map else "__default_node__"
                f.write(f"v {id_map[orig_id]} {node_label_map[label_key]}\n")

            # Write edges
            for e in G.edges:
                if len(e) == 2:
                    u, v = e
                    raw_elbl = "__default_edge__"
                elif len(e) == 3:
                    u, v, raw_elbl = e
                else:
                    raise ValueError(f"Edge tuple must be (u,v) or (u,v,label), got {e!r}")

                if G.edge_labels is not None:
                    # Labels may be keyed in either orientation of the edge.
                    raw_elbl = G.edge_labels.get(
                        (u, v), G.edge_labels.get((v, u), raw_elbl)
                    )

                label_key = raw_elbl if raw_elbl in edge_label_map else "__default_edge__"
                f.write(f"e {id_map[u]} {id_map[v]} {edge_label_map[label_key]}\n")

        # Required terminator for this implementation
        f.write("t # -1\n")
|
submine/io/sopagrami.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Hashable, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from ..core.graph import Graph
|
|
8
|
+
from ..errors import SubmineInputError
|
|
9
|
+
from ..utils.checks import iter_text_lines
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def read_lg(path: str | Path) -> Graph:
    """Parse one SoPaGraMi/gSpan-style ``.lg`` file into a :class:`~submine.core.graph.Graph`.

    Recognised edge line shapes:

    - ``e u v``
    - ``e u v label``
    - ``e u v label weight``
    - ``e u v weight`` (rare; treated as unlabeled edge with weight)

    With exactly one extra token after ``u v``, the token is taken as a
    weight when it parses as a float and as a label otherwise. Blank lines,
    lines starting with ``#``, ``t`` lines and unknown record types are
    skipped. Weights equal to 1.0 are not stored. If the file lists no
    vertices at all, the node list is the sorted set of edge endpoints.

    The file is read line by line.

    Raises
    ------
    SubmineInputError
        If a ``v`` or ``e`` line has fewer than three tokens.
    """
    path = Path(path)

    nodes: list[int] = []
    node_labels: dict[int, str] = {}
    edges: list[Tuple[int, int]] = []
    edge_labels: dict[Tuple[int, int], str] = {}
    edge_weights: dict[Tuple[int, int], float] = {}

    with path.open("r", encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            text = raw.strip()
            if not text or text.startswith("#"):
                continue

            fields = text.split()
            kind = fields[0]

            if kind == "v":
                if len(fields) < 3:
                    raise SubmineInputError(f"Malformed vertex line: {text!r}")
                vid = int(fields[1])
                # First sighting fixes the node order; a later line only
                # replaces the label.
                if vid not in node_labels:
                    nodes.append(vid)
                node_labels[vid] = fields[2]

            elif kind == "e":
                if len(fields) < 3:
                    raise SubmineInputError(f"Malformed edge line: {text!r}")
                u = int(fields[1])
                v = int(fields[2])
                edges.append((u, v))

                label: Optional[str] = None
                weight: Optional[float] = None
                extra = fields[3:]
                if len(extra) == 1:
                    # Ambiguous single token: weight if numeric, else label.
                    try:
                        weight = float(extra[0])
                    except ValueError:
                        label = extra[0]
                elif len(extra) >= 2:
                    label = extra[0]
                    try:
                        weight = float(extra[1])
                    except ValueError:
                        weight = None

                # Label/weight maps are keyed by (smaller id, larger id).
                key = (u, v) if u <= v else (v, u)
                if label is not None:
                    edge_labels[key] = label
                if weight is not None and weight != 1.0:
                    edge_weights[key] = weight

            # "t" markers and unknown record types are ignored.

    # In case vertices were not explicitly listed, infer nodes from edges.
    if not nodes:
        nodes = sorted({endpoint for edge in edges for endpoint in edge})

    return Graph(
        nodes=nodes,
        edges=edges,
        node_labels=node_labels or None,
        edge_labels=edge_labels or None,
        edge_weights=edge_weights or None,
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def write_lg(graph: Graph, path: str | Path, directed: bool = False, include_weight: bool = False) -> None:
    """
    Write a single Graph to SoPaGraMi's .lg format.

    The output has one ``v <idx> <label>`` line per node followed by one
    ``e <u> <v> [label] [weight]`` line per entry of ``graph.edges``.

    - Nodes are reindexed to 0..n-1 in the order of ``graph.nodes``.
    - Node labels come from graph.node_labels (fallback to string of node id).
    - Edge labels come from graph.edge_labels, looked up as ``(u, v)`` and
      then ``(v, u)``; an edge without a label is written without one.
      Falsy but valid labels such as the integer ``0`` are kept.
    - The file is written as UTF-8.

    Parameters
    ----------
    graph:
        Graph to serialise.
    path:
        Destination file.
    directed:
        Currently not used by this function.
    include_weight:
        When true, edges whose weight differs from 1.0 are written with the
        weight as a trailing field. Unlabeled weighted edges get the
        placeholder label ``_`` so the line stays unambiguous.
    """
    path = Path(path)

    # Map original node ids -> contiguous [0..n-1]
    node_ids = list(graph.nodes)
    id_map: Dict[Hashable, int] = {nid: i for i, nid in enumerate(node_ids)}

    node_labels = graph.node_labels or {}
    edge_labels = graph.edge_labels or {}
    edge_weights = graph.edge_weights or {}

    with path.open("w", encoding="utf-8") as f:
        # vertices
        for nid in node_ids:
            f.write(f"v {id_map[nid]} {node_labels.get(nid, str(nid))}\n")

        # edges
        for (u_orig, v_orig) in graph.edges:
            u = id_map[u_orig]
            v = id_map[v_orig]

            # Look the label up in both orientations. Only a missing/empty
            # entry counts as "no label"; a plain truthiness test would also
            # discard labels like 0.
            lbl = edge_labels.get((u_orig, v_orig))
            if lbl is None or lbl == "":
                lbl = edge_labels.get((v_orig, u_orig))
            if lbl is None:
                lbl = ""

            w = edge_weights.get((u_orig, v_orig), edge_weights.get((v_orig, u_orig), 1.0))

            if include_weight and float(w) != 1.0:
                # If there is no label, we still emit a placeholder label to
                # keep parsing unambiguous.
                shown = "_" if lbl == "" else lbl
                f.write(f"e {u} {v} {shown} {float(w)}\n")
            elif lbl == "":
                f.write(f"e {u} {v}\n")
            else:
                f.write(f"e {u} {v} {lbl}\n")
|
submine/io/transcode.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Format detection and transcoding utilities.
|
|
2
|
+
|
|
3
|
+
Many third-party miners operate on a *native on-disk* format (e.g., gSpan datasets
|
|
4
|
+
or SoPaGraMi ``.lg``). For these miners, the most efficient pipeline is:
|
|
5
|
+
|
|
6
|
+
user input file -> transcode to miner native file -> miner runs
|
|
7
|
+
|
|
8
|
+
This module implements the 'transcode to native' step so the API does not need to
|
|
9
|
+
round-trip through an intermediate :class:`~submine.core.graph.Graph` unless the
|
|
10
|
+
input itself is not in the miner's native format.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Iterable, List, Optional
|
|
18
|
+
import re
|
|
19
|
+
from ..core.graph import Graph
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class UnknownFormatError(ValueError):
    """Raised when a graph file format cannot be detected or is not supported."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class FormatSpec:
    """Immutable pairing of a format key with the filename suffixes that select it."""

    # Canonical identifier of the format (e.g. "lg", "gspan").
    key: str
    # Filename suffixes, including the leading dot, that map to ``key``.
    suffixes: tuple[str, ...]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Canonical format keys used across the library.
FMT_LG = "lg"              # SoPaGraMi single-graph format
FMT_GSPAN = "gspan"        # gSpan dataset format (t/v/e ... t#-1)
FMT_GEXF = "gexf"          # NetworkX-readable GEXF
FMT_EDGELIST = "edgelist"  # whitespace-separated u v [label]

# File names ending in ".data" or ".data.<tag>" (e.g. .data.x, .data.2).
_GSPAN_DATA_RE = re.compile(r".*\.data(\.[A-Za-z0-9_-]+)?$")

# Suffix table consulted by detect_format(), in lookup order.
_KNOWN_FORMATS: List[FormatSpec] = [
    FormatSpec(FMT_LG, (".lg",)),
    # Both classic .gspan and Gatech-like *.data / *.data.x are gSpan datasets.
    FormatSpec(FMT_GSPAN, (".gspan", ".data", ".data.x")),
    FormatSpec(FMT_GEXF, (".gexf",)),
    FormatSpec(FMT_EDGELIST, (".edgelist", ".txt", ".tsv", ".csv")),
]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def detect_format(path: str | Path) -> str:
    """Detect the most likely input format from a file path.

    Detection is filename-based (not content-based) by design for speed and
    is case-insensitive. The checks run in this order:

    1. The final suffix is matched against the known formats, so a
       recognised extension always wins (``x.data.csv`` is an edge list,
       not a gSpan dataset).
    2. Gatech-style gSpan names ``*.data`` / ``*.data.<tag>`` where ``<tag>``
       may be an integer shard index (e.g., ``.data.2``) or another token
       (e.g., ``.data.x``) are treated as :data:`FMT_GSPAN`.
    3. As a last resort the whole file name is checked for a known ending,
       which covers names for which ``Path.suffix`` is empty.

    Notes
    -----
    - For ``.txt/.csv/.tsv`` we assume edge-list unless otherwise specified.

    Parameters
    ----------
    path:
        File path or name to classify; the file is not opened.

    Returns
    -------
    str
        One of the ``FMT_*`` format keys.

    Raises
    ------
    UnknownFormatError
        If no rule matches the file name.
    """
    p = Path(path)
    name = p.name.lower()
    suf = p.suffix.lower()

    # 1. Exact match on the final suffix.
    for spec in _KNOWN_FORMATS:
        if suf in spec.suffixes:
            return spec.key

    # 2. Multi-suffix gSpan dataset conventions (.data, .data.<tag>). Checked
    #    after step 1 so that a known extension following ".data." is honoured.
    if _GSPAN_DATA_RE.match(name):
        return FMT_GSPAN

    # 3. Fallback on the full name, for endings Path.suffix does not report.
    for spec in _KNOWN_FORMATS:
        if name.endswith(spec.suffixes):
            return spec.key

    raise UnknownFormatError(f"Cannot detect graph format from file: {p}")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def load_graphs(path: str | Path, *, fmt: Optional[str] = None) -> List[Graph]:
    """Read every graph stored in ``path`` and return them as a list.

    When ``fmt`` is not given (or is empty) the format is taken from
    :func:`detect_format`. Readers are imported lazily, so only the module
    for the selected format is loaded. Single-graph formats (``.lg``, GEXF)
    yield a one-element list.

    Raises
    ------
    UnknownFormatError
        If the (given or detected) format has no reader.
    """
    source = Path(path)
    resolved = fmt or detect_format(source)

    if resolved == FMT_GSPAN:
        from .gspan import read_gspan_dataset

        loaded = list(read_gspan_dataset(source))
    elif resolved == FMT_LG:
        from .sopagrami import read_lg

        loaded = [read_lg(source)]
    elif resolved == FMT_EDGELIST:
        from .common import read_edgelist_dataset

        loaded = list(read_edgelist_dataset(source))
    elif resolved == FMT_GEXF:
        from .gexf import read_gexf

        loaded = [read_gexf(source)]
    else:
        raise UnknownFormatError(f"Unsupported input format: {resolved}")

    return loaded
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def write_graphs(graphs: Iterable[Graph], path: str | Path, *, fmt: str) -> Path:
    """Serialise ``graphs`` to ``path`` in one of the miner-native formats.

    Supported targets are :data:`FMT_GSPAN` (any number of graphs) and
    :data:`FMT_LG` (exactly one graph). Writers are imported lazily.

    Returns
    -------
    Path
        The destination path, as a ``Path`` object.

    Raises
    ------
    UnknownFormatError
        If ``fmt`` is not a supported output format.
    ValueError
        If ``fmt`` is ``.lg`` and the number of graphs is not exactly one.
    """
    target = Path(path)

    # Reject unsupported targets up front, before touching the iterable.
    if fmt not in (FMT_GSPAN, FMT_LG):
        raise UnknownFormatError(f"Unsupported output format: {fmt}")

    materialised = list(graphs)

    if fmt == FMT_GSPAN:
        from .gspan import write_gspan_dataset

        write_gspan_dataset(materialised, target)
    else:
        from .sopagrami import write_lg

        count = len(materialised)
        if count != 1:
            raise ValueError(f".lg expects exactly one graph; got {count}")
        write_lg(materialised[0], target)

    return target
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def transcode_path(
    src_path: str | Path,
    dst_path: str | Path,
    *,
    dst_fmt: str,
    src_fmt: Optional[str] = None,
) -> Path:
    """Convert an on-disk graph dataset from one format to another.

    The source is loaded into in-memory graphs with :func:`load_graphs`
    (using ``src_fmt`` when given, otherwise the detected format) and then
    written to ``dst_path`` with :func:`write_graphs` in ``dst_fmt``.

    Returns
    -------
    Path
        The path returned by :func:`write_graphs`.
    """
    return write_graphs(
        load_graphs(src_path, fmt=src_fmt),
        dst_path,
        fmt=dst_fmt,
    )
|
submine/registry.py
ADDED