submine 0.1.0__cp311-cp311-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. submine/__init__.py +37 -0
  2. submine/algorithms/__init__.py +23 -0
  3. submine/algorithms/base.py +143 -0
  4. submine/algorithms/gspan.py +156 -0
  5. submine/algorithms/gspan_cpp.cpython-311-x86_64-linux-musl.so +0 -0
  6. submine/algorithms/sopagrami.py +250 -0
  7. submine/algorithms/sopagrami_cpp.cpython-311-x86_64-linux-musl.so +0 -0
  8. submine/api.py +134 -0
  9. submine/backends/__init__.py +0 -0
  10. submine/backends/gspan/CMakeLists.txt +65 -0
  11. submine/backends/gspan/dfs.cpp +98 -0
  12. submine/backends/gspan/graph.cpp +165 -0
  13. submine/backends/gspan/gspan.cpp +776 -0
  14. submine/backends/gspan/gspan.h +296 -0
  15. submine/backends/gspan/ismin.cpp +124 -0
  16. submine/backends/gspan/main.cpp +106 -0
  17. submine/backends/gspan/misc.cpp +177 -0
  18. submine/backends/gspan/python_bindings.cpp +133 -0
  19. submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
  20. submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
  21. submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
  22. submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
  23. submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
  24. submine/backends/sopagrami/cpp/src/main.cpp +94 -0
  25. submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
  26. submine/cli/__init__.py +6 -0
  27. submine/cli/main.py +87 -0
  28. submine/core/__init__.py +12 -0
  29. submine/core/graph.py +179 -0
  30. submine/core/result.py +121 -0
  31. submine/datasets/__init__.py +11 -0
  32. submine/datasets/loaders.py +145 -0
  33. submine/errors.py +41 -0
  34. submine/io/__init__.py +30 -0
  35. submine/io/common.py +173 -0
  36. submine/io/gexf.py +88 -0
  37. submine/io/gspan.py +268 -0
  38. submine/io/sopagrami.py +143 -0
  39. submine/io/transcode.py +147 -0
  40. submine/registry.py +8 -0
  41. submine/utils/__init__.py +6 -0
  42. submine/utils/checks.py +115 -0
  43. submine/utils/logging.py +41 -0
  44. submine-0.1.0.dist-info/METADATA +178 -0
  45. submine-0.1.0.dist-info/RECORD +49 -0
  46. submine-0.1.0.dist-info/WHEEL +5 -0
  47. submine-0.1.0.dist-info/licenses/LICENSE +21 -0
  48. submine.libs/libgcc_s-2298274a.so.1 +0 -0
  49. submine.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
submine/io/gexf.py ADDED
@@ -0,0 +1,88 @@
1
+ """GEXF reader.
2
+
3
+ We use NetworkX's GEXF parser to support common graph exports.
4
+
5
+ Mapping rules
6
+ -------------
7
+ - Node labels: if a node attribute named ``label`` exists, it is used; otherwise
8
+ we use the node id as its label.
9
+ - Edge labels: if an edge attribute named ``label`` exists, it is used.
10
+ - Edge weights: if an edge attribute named ``weight`` exists and is not 1.0, it
11
+ is stored in :attr:`submine.core.graph.Graph.edge_weights`.
12
+
13
+ Notes
14
+ -----
15
+ GEXF can represent directed graphs, self-loops, and parallel (multi-)edges. The internal
16
+ ``Graph`` container in *submine* is currently undirected and does not preserve
17
+ parallel edges. We therefore:
18
+ - coerce to an undirected simple graph
19
+ - keep the first-seen label/weight for each undirected edge
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from pathlib import Path
25
+ from typing import Any, Dict, Hashable, Tuple
26
+
27
+ import networkx as nx
28
+
29
+ from ..core.graph import Graph
30
+
31
+
32
def read_gexf(path: str | Path) -> Graph:
    """Read a GEXF file into an undirected, simple :class:`Graph`.

    The file is parsed with :func:`networkx.read_gexf` and then coerced to an
    undirected simple graph: self-loops are dropped and, for multigraphs, only
    the first-seen attributes of each undirected edge are kept.

    Parameters
    ----------
    path:
        Location of the ``.gexf`` file.

    Returns
    -------
    Graph
        Node labels come from the ``label`` node attribute (falling back to
        ``str(node_id)``); edge labels from the ``label`` edge attribute; edge
        weights from the ``weight`` edge attribute when it differs from 1.0.
    """
    g_nx = nx.read_gexf(Path(path))

    # Coerce to undirected simple graph to match our internal container.
    if isinstance(g_nx, (nx.MultiGraph, nx.MultiDiGraph)):
        g_simple = nx.Graph()
        # Register nodes first so the resulting node order follows the parsed
        # graph's node order instead of the order in which edges mention them.
        g_simple.add_nodes_from(g_nx.nodes(data=True))
        for u, v, data in g_nx.edges(data=True):
            # Skip self-loops and keep only the first-seen parallel edge.
            if u == v or g_simple.has_edge(u, v):
                continue
            g_simple.add_edge(u, v, **(data or {}))
        g_nx = g_simple
    else:
        g_nx = nx.Graph(g_nx)

    nodes = list(g_nx.nodes())

    node_labels: Dict[Hashable, Any] = {
        n: (data or {}).get("label", str(n)) for n, data in g_nx.nodes(data=True)
    }

    edges = []
    edge_labels: Dict[Tuple[Hashable, Hashable], Any] = {}
    edge_weights: Dict[Tuple[Hashable, Hashable], float] = {}

    for u, v, data in g_nx.edges(data=True):
        if u == v:
            continue
        # Canonical orientation: order endpoints by their string form so the
        # same undirected edge always produces the same key.
        key = (u, v) if str(u) <= str(v) else (v, u)
        edges.append(key)

        if not data:
            continue
        label = data.get("label")
        if label is not None:
            edge_labels[key] = label
        raw_weight = data.get("weight")
        if raw_weight is not None:
            try:
                w = float(raw_weight)
            except (TypeError, ValueError, OverflowError):
                # Unparseable weights are treated as the implicit default.
                w = 1.0
            if w != 1.0:
                edge_weights[key] = w

    return Graph(
        nodes=nodes,
        edges=edges,
        node_labels=node_labels or None,
        edge_labels=edge_labels or None,
        edge_weights=edge_weights or None,
    )
submine/io/gspan.py ADDED
@@ -0,0 +1,268 @@
1
+ # submine/io/gspan.py
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ from typing import Dict, Hashable, Iterable, List, Tuple
6
+
7
+ from ..core.graph import Graph
8
+ from ..errors import SubmineInputError
9
+ from ..utils.checks import iter_text_lines
10
+
11
+
12
+
13
+ from pathlib import Path
14
+ from typing import Iterable, List, Optional
15
+
16
+
17
+
18
def read_gspan_dataset(path: Path | str) -> List[Graph]:
    """
    Read a gSpan-formatted dataset file and return a list of Graph objects.

    Expected format::

        t # N        start of N-th graph
        v M L        vertex M has label L
        e P Q L      edge (P, Q) has label L
        ...
        t # -1       end-of-file sentinel (required by some gSpan impls)

    Notes
    -----
    - Vertex ids and labels are parsed with ``int``.
    - Edges are treated as undirected: they are stored as
      ``(min(u, v), max(u, v))`` and a repeated edge keeps its first label.
    - A vertex id that appears on several ``v`` lines of the same graph is
      listed once; the last label wins.
    - Parsing stops at the first ``t # -1``; a missing sentinel is tolerated.

    Raises
    ------
    ValueError
        On a ``t``/``v``/``e`` line with too few fields or a non-integer
        field, or on a ``v``/``e`` line that precedes the first ``t`` line.
    SubmineInputError
        On a line whose record type is not ``t``, ``v`` or ``e``.
    """
    path = Path(path)

    graphs: List[Graph] = []

    # State of the graph currently being parsed; ``None`` means no graph is open.
    current_nodes: Optional[list[int]] = None
    current_node_labels: Optional[dict[int, int]] = None
    current_edges: Optional[list[tuple[int, int]]] = None
    current_edge_labels: Optional[dict[tuple[int, int], int]] = None

    def flush_current_graph():
        """Append the open graph (if any) to ``graphs`` and reset the state."""
        nonlocal current_nodes, current_node_labels, current_edges, current_edge_labels
        if current_nodes is None:
            return
        graphs.append(
            Graph(
                nodes=current_nodes,
                edges=current_edges,
                node_labels=current_node_labels,
                edge_labels=current_edge_labels,
            )
        )
        current_nodes = None
        current_node_labels = None
        current_edges = None
        current_edge_labels = None

    for raw_line in iter_text_lines(path):
        line = raw_line.strip()
        if not line:
            continue
        parts = line.split()
        rec_type = parts[0]

        if rec_type == "t":
            # Graph header: t # gid
            if len(parts) < 3:
                raise ValueError(f"Malformed 't' line: {line!r}")
            gid = int(parts[2])

            # Either way the previous graph (if any) is complete now.
            flush_current_graph()
            if gid == -1:
                # End-of-dataset sentinel: stop reading.
                break

            current_nodes = []
            current_node_labels = {}
            current_edges = []
            current_edge_labels = {}

        elif rec_type == "v":
            if current_nodes is None:
                raise ValueError(f"Vertex line outside of any graph: {line!r}")
            if len(parts) < 3:
                raise ValueError(f"Malformed 'v' line: {line!r}")
            vid = int(parts[1])
            lbl = int(parts[2])
            # Guard against repeated vertex lines so the node list stays unique.
            if vid not in current_node_labels:  # type: ignore[operator]
                current_nodes.append(vid)
            current_node_labels[vid] = lbl  # type: ignore[index]

        elif rec_type == "e":
            if current_edges is None:
                raise ValueError(f"Edge line outside of any graph: {line!r}")
            if len(parts) < 4:
                raise ValueError(f"Malformed 'e' line: {line!r}")
            u = int(parts[1])
            v = int(parts[2])
            lbl = int(parts[3])

            # Treat as undirected and skip duplicates (first label is kept).
            key = (u, v) if u <= v else (v, u)
            if key not in current_edge_labels:  # type: ignore[operator]
                current_edges.append(key)
                current_edge_labels[key] = lbl  # type: ignore[index]

        else:
            raise SubmineInputError(f"Unknown record type '{rec_type}' in line: {line!r}")

    # If file ended without the required terminator, still return what we have.
    flush_current_graph()
    return graphs
127
+
128
def convert_gspan_graph(gspan_g) -> Graph:
    """
    Build a submine ``Graph`` from a vendored gSpan graph object.

    The input is accessed as follows:
      - ``gspan_g.vertices`` is a mapping ``{vid: vertex}``
      - each vertex exposes ``vlb`` and ``edges`` (a mapping to edge objects)
      - each edge exposes ``frm``, ``to`` and ``elb``

    Every undirected edge is emitted once, keyed as ``(smaller, larger)``
    endpoint, with the label of its first occurrence.
    """
    vertices = gspan_g.vertices

    # Nodes and their labels, in the mapping's iteration order.
    nodes = list(vertices)
    node_labels = {vid: vertex.vlb for vid, vertex in vertices.items()}

    # A dict keeps first-seen order and doubles as the "already seen" set.
    edge_labels = {}
    for vertex in vertices.values():
        for edge in vertex.edges.values():
            src, dst = edge.frm, edge.to
            canonical = (src, dst) if src <= dst else (dst, src)
            if canonical in edge_labels:
                continue
            edge_labels[canonical] = edge.elb
    edges = list(edge_labels)

    return Graph(
        nodes=nodes,
        edges=edges,
        node_labels=node_labels,
        edge_labels=edge_labels,
    )
167
+
168
+
169
def _build_label_maps(graphs: List[Graph]):
    """
    Map arbitrary node/edge labels to consecutive ints >= 2,
    because gSpan forbids 0 and 1.

    Labels are numbered in first-seen order across all graphs. Node labels
    that are ``None`` are skipped. The fallback keys ``"__default_node__"``
    and ``"__default_edge__"`` are always present in the returned maps so the
    writer can label unlabeled nodes/edges even when other nodes/edges in the
    dataset carry labels.

    Returns
    -------
    tuple
        ``(node_label_map, edge_label_map)``.
    """
    node_label_map: Dict[Hashable, int] = {}
    edge_label_map: Dict[Hashable, int] = {}

    for G in graphs:
        if G.node_labels is not None:
            for nid in G.nodes:
                lbl = G.node_labels.get(nid)
                if lbl is not None and lbl not in node_label_map:
                    node_label_map[lbl] = len(node_label_map) + 2

        if G.edge_labels is not None:
            for lbl in G.edge_labels.values():
                if lbl not in edge_label_map:
                    edge_label_map[lbl] = len(edge_label_map) + 2

    # Always register the fallbacks. When a map is empty this yields 2;
    # otherwise the fallback takes the next free integer.
    node_label_map.setdefault("__default_node__", len(node_label_map) + 2)
    edge_label_map.setdefault("__default_edge__", len(edge_label_map) + 2)

    return node_label_map, edge_label_map


def write_gspan_dataset(graphs: Iterable[Graph], path: Path) -> None:
    """
    Write a list of Graph objects to a gSpan-compatible file.

    Format::

        t # N      -> N-th graph
        v M L      -> vertex M has label L
        e P Q L    -> edge (P, Q) has label L
        ...
        t # -1     -> end of file

    Notes
    -----
    - Vertex ids are remapped to 0..n-1 *within each graph*, following the
      order of ``G.nodes``.
    - Labels are mapped to integers >= 2 via :func:`_build_label_maps`;
      unlabeled nodes/edges receive the map's default label.
    - Edge labels are looked up as ``(u, v)`` first and ``(v, u)`` second.
    - ``path`` may be a ``Path`` or a string.

    Raises
    ------
    ValueError
        If an edge is not a ``(u, v)`` or ``(u, v, label)`` tuple.
    KeyError
        If an edge references a node that is not in ``G.nodes``.
    """
    graphs = list(graphs)
    node_label_map, edge_label_map = _build_label_maps(graphs)
    path = Path(path)

    with path.open("w", encoding="utf-8") as f:
        for gid, G in enumerate(graphs):
            f.write(f"t # {gid}\n")

            # Remap node ids to 0..n-1 locally
            id_map = {orig_id: new_id for new_id, orig_id in enumerate(G.nodes)}
            node_labels = G.node_labels or {}
            edge_labels = G.edge_labels or {}

            # Write vertices
            for orig_id in G.nodes:
                raw_lbl = node_labels.get(orig_id, "__default_node__")
                label_key = raw_lbl if raw_lbl in node_label_map else "__default_node__"
                f.write(f"v {id_map[orig_id]} {node_label_map[label_key]}\n")

            # Write edges
            for e in G.edges:
                if len(e) == 2:
                    u, v = e
                    raw_elbl = "__default_edge__"
                elif len(e) == 3:
                    u, v, raw_elbl = e
                else:
                    raise ValueError(f"Edge tuple must be (u,v) or (u,v,label), got {e!r}")

                u_new = id_map[u]
                v_new = id_map[v]

                # Undirected: accept a label stored under either orientation.
                if (u, v) in edge_labels:
                    raw_elbl = edge_labels[(u, v)]
                elif (v, u) in edge_labels:
                    raw_elbl = edge_labels[(v, u)]

                label_key = raw_elbl if raw_elbl in edge_label_map else "__default_edge__"
                f.write(f"e {u_new} {v_new} {edge_label_map[label_key]}\n")

        # Required terminator for this implementation
        f.write("t # -1\n")
@@ -0,0 +1,143 @@
1
+
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ from typing import Dict, Hashable, Optional, Tuple
6
+
7
+ from ..core.graph import Graph
8
+ from ..errors import SubmineInputError
9
+ from ..utils.checks import iter_text_lines
10
+
11
+
12
def read_lg(path: str | Path) -> Graph:
    """Read a single SoPaGraMi/gSpan-style ``.lg`` file into :class:`~submine.core.graph.Graph`.

    Supports edge lines of the form:

    - ``e u v``
    - ``e u v label``
    - ``e u v label weight``
    - ``e u v weight`` (rare; treated as unlabeled edge with weight)

    In the ``label weight`` form, a label of ``_`` is the placeholder that
    :func:`write_lg` emits for an unlabeled weighted edge; it is read back as
    "no label" so that a write/read round trip does not invent a label.

    Other behavior:

    - Blank lines, lines starting with ``#``, ``t`` lines and unknown record
      types are ignored.
    - A vertex listed more than once is kept once; the last label wins.
    - Edge labels/weights are keyed by ``(min(u, v), max(u, v))``; weights
      equal to 1.0 are not stored.
    - If no ``v`` line is present, the node list is the sorted set of edge
      endpoints.

    The file is consumed line by line.

    Raises
    ------
    SubmineInputError
        If a ``v`` or ``e`` line has fewer than three fields.
    ValueError
        If a vertex id is not an integer.
    """
    path = Path(path)

    nodes: list[int] = []
    node_labels: dict[int, str] = {}
    edges: list[Tuple[int, int]] = []
    edge_labels: dict[Tuple[int, int], str] = {}
    edge_weights: dict[Tuple[int, int], float] = {}

    with path.open("r", encoding="utf-8", errors="replace") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            parts = line.split()
            rec = parts[0]

            if rec == "v":
                if len(parts) < 3:
                    raise SubmineInputError(f"Malformed vertex line: {line!r}")
                vid = int(parts[1])
                if vid not in node_labels:
                    nodes.append(vid)
                node_labels[vid] = parts[2]

            elif rec == "e":
                if len(parts) < 3:
                    raise SubmineInputError(f"Malformed edge line: {line!r}")
                u = int(parts[1])
                v = int(parts[2])
                edges.append((u, v))

                # Parse optional label / weight. We accept a few variants.
                edge_lbl: Optional[str] = None
                weight: Optional[float] = None
                if len(parts) == 4:
                    # Could be label or weight. Prefer weight if it parses cleanly.
                    try:
                        weight = float(parts[3])
                    except ValueError:
                        edge_lbl = parts[3]
                elif len(parts) >= 5:
                    # "_" marks "no label" in the weighted form (see write_lg).
                    if parts[3] != "_":
                        edge_lbl = parts[3]
                    try:
                        weight = float(parts[4])
                    except ValueError:
                        weight = None

                key = (u, v) if u <= v else (v, u)
                if edge_lbl is not None:
                    edge_labels[key] = edge_lbl
                if weight is not None and weight != 1.0:
                    edge_weights[key] = weight

            # "t" dataset markers and unknown record types are ignored for
            # robustness; SoPaGraMi uses a single graph in practice.

    # In case vertices were not explicitly listed, infer nodes from edges.
    if not nodes:
        nodes = sorted({endpoint for edge in edges for endpoint in edge})

    return Graph(
        nodes=nodes,
        edges=edges,
        node_labels=node_labels or None,
        edge_labels=edge_labels or None,
        edge_weights=edge_weights or None,
    )
97
+
98
+
99
def write_lg(graph: Graph, path: str | Path, directed: bool = False, include_weight: bool = False) -> None:
    """
    Write a single Graph to SoPaGraMi's .lg format.

    - Nodes are reindexed to 0..n-1 in the order of ``graph.nodes``.
    - Node labels come from ``graph.node_labels`` (fallback: ``str(node_id)``).
    - Edge labels come from ``graph.edge_labels``, looked up under either
      orientation of the edge. A label that is ``None`` or ``""`` counts as
      missing; other falsy labels such as ``0`` are written as-is.
    - Edge weights are written only when ``include_weight`` is true and the
      weight differs from 1.0. An unlabeled weighted edge gets the placeholder
      label ``_`` so the line stays unambiguous.
    - The file is written as UTF-8, matching what ``read_lg`` expects.

    Parameters
    ----------
    graph:
        Graph to serialize.
    path:
        Destination file.
    directed:
        Accepted for API compatibility; this writer does not read it.
    include_weight:
        Emit non-default edge weights as a fifth field.

    Raises
    ------
    KeyError
        If an edge references a node that is not in ``graph.nodes``.
    """
    path = Path(path)

    # Map original node ids -> contiguous [0..n-1]
    node_ids = list(graph.nodes)
    id_map: Dict[Hashable, int] = {nid: i for i, nid in enumerate(node_ids)}

    node_labels = graph.node_labels or {}
    edge_labels = graph.edge_labels or {}
    edge_weights = graph.edge_weights or {}

    with path.open("w", encoding="utf-8") as f:
        # vertices
        for nid in node_ids:
            f.write(f"v {id_map[nid]} {node_labels.get(nid, str(nid))}\n")

        # edges
        for (u_orig, v_orig) in graph.edges:
            u = id_map[u_orig]
            v = id_map[v_orig]
            forward, backward = (u_orig, v_orig), (v_orig, u_orig)

            # Explicit None/"" checks instead of `or`, so labels like 0 survive.
            lbl = ""
            for key in (forward, backward):
                candidate = edge_labels.get(key)
                if candidate is not None and candidate != "":
                    lbl = candidate
                    break

            w = float(edge_weights.get(forward, edge_weights.get(backward, 1.0)))

            fields = [f"e {u} {v}"]
            if include_weight and w != 1.0:
                # Placeholder label keeps "label weight" parsing unambiguous.
                fields.append("_" if lbl == "" else f"{lbl}")
                fields.append(f"{w}")
            elif lbl != "":
                fields.append(f"{lbl}")
            f.write(" ".join(fields) + "\n")
@@ -0,0 +1,147 @@
1
+ """Format detection and transcoding utilities.
2
+
3
+ Many third-party miners operate on a *native on-disk* format (e.g., gSpan datasets
4
+ or SoPaGraMi ``.lg``). For these miners, the most efficient pipeline is:
5
+
6
+ user input file -> transcode to miner native file -> miner runs
7
+
8
+ This module implements the 'transcode to native' step so the API does not need to
9
+ round-trip through an intermediate :class:`~submine.core.graph.Graph` unless the
10
+ input itself is not in the miner's native format.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Iterable, List, Optional
18
+ import re
19
+ from ..core.graph import Graph
20
+
21
+
22
class UnknownFormatError(ValueError):
    """Raised when a graph file format cannot be detected or is not supported."""
24
+
25
+
26
@dataclass(frozen=True)
class FormatSpec:
    """Immutable pairing of a format key with the filename suffixes that select it."""

    # Canonical identifier of the format (one of the ``FMT_*`` constants).
    key: str
    # Filename suffixes recognised for this format, e.g. ``(".lg",)``.
    suffixes: tuple[str, ...]
32
+
33
+
34
# Canonical format keys used across the library.
FMT_LG = "lg"  # SoPaGraMi single-graph format
FMT_GSPAN = "gspan"  # gSpan dataset format (t/v/e ... t#-1)
FMT_GEXF = "gexf"  # NetworkX-readable GEXF
FMT_EDGELIST = "edgelist"  # whitespace-separated u v [label]

# Filenames ending in ``.data`` with at most one trailing ``.<tag>``
# (e.g. ``.data``, ``.data.x``, ``.data.2``).
_GSPAN_DATA_RE = re.compile(r".*\.data(\.[A-Za-z0-9_-]+)?$")

# Suffix table consulted by ``detect_format``; earlier entries win.
_KNOWN_FORMATS: List[FormatSpec] = [
    FormatSpec(key=FMT_LG, suffixes=(".lg",)),
    # Both the classic ``.gspan`` and Gatech-like ``*.data`` / ``*.data.x``
    # names denote gSpan datasets.
    FormatSpec(key=FMT_GSPAN, suffixes=(".gspan", ".data", ".data.x")),
    FormatSpec(key=FMT_GEXF, suffixes=(".gexf",)),
    FormatSpec(key=FMT_EDGELIST, suffixes=(".edgelist", ".txt", ".tsv", ".csv")),
]
48
+
49
+
50
def detect_format(path: str | Path) -> str:
    """Guess the input format of a graph file from its name alone.

    The file is never opened; only the lower-cased filename is inspected.

    Resolution order
    ----------------
    1. Names matching the gSpan ``*.data`` / ``*.data.<tag>`` convention
       (``<tag>`` being e.g. a shard index such as ``.data.2`` or a token such
       as ``.data.x``) resolve to :data:`FMT_GSPAN`.
    2. Otherwise the final suffix is looked up in the known-format table;
       ``.txt/.csv/.tsv`` therefore resolve to the edge-list format.
    3. Otherwise the whole name is tested against every known suffix, which
       covers compound suffixes that ``Path.suffix`` truncates.

    Raises
    ------
    UnknownFormatError
        If none of the rules matches.
    """
    p = Path(path)
    name = p.name.lower()

    # Rule 1: multi-suffix gSpan dataset conventions take precedence.
    if _GSPAN_DATA_RE.match(name):
        return FMT_GSPAN

    # Rule 2: exact match on the last suffix.
    suffix = p.suffix.lower()
    by_suffix = next(
        (spec.key for spec in _KNOWN_FORMATS if suffix in spec.suffixes), None
    )
    if by_suffix is not None:
        return by_suffix

    # Rule 3: compound suffixes such as ".data.x", where Path.suffix == ".x".
    by_tail = next(
        (spec.key for spec in _KNOWN_FORMATS if name.endswith(spec.suffixes)), None
    )
    if by_tail is not None:
        return by_tail

    raise UnknownFormatError(f"Cannot detect graph format from file: {p}")
82
+
83
+
84
def load_graphs(path: str | Path, *, fmt: Optional[str] = None) -> List[Graph]:
    """Read every graph stored in ``path`` and return them as a list.

    ``fmt`` selects the reader explicitly; when it is omitted (or empty) the
    format is derived from the filename via :func:`detect_format`.
    Single-graph formats (``lg``, ``gexf``) yield a one-element list. Reader
    modules are imported lazily, only for the format actually requested.

    Raises
    ------
    UnknownFormatError
        If the resolved format has no reader.
    """
    source = Path(path)
    chosen = fmt if fmt else detect_format(source)

    if chosen == FMT_GSPAN:
        from .gspan import read_gspan_dataset as read_many

        return list(read_many(source))

    if chosen == FMT_LG:
        from .sopagrami import read_lg as read_one

        return [read_one(source)]

    if chosen == FMT_EDGELIST:
        from .common import read_edgelist_dataset as read_many

        return list(read_many(source))

    if chosen == FMT_GEXF:
        from .gexf import read_gexf as read_one

        return [read_one(source)]

    raise UnknownFormatError(f"Unsupported input format: {chosen}")
110
+
111
+
112
def write_graphs(graphs: Iterable[Graph], path: str | Path, *, fmt: str) -> Path:
    """Serialize ``graphs`` to ``path`` in a miner-native format.

    Supported targets are the gSpan dataset format (any number of graphs) and
    the SoPaGraMi ``.lg`` format (exactly one graph).

    Returns
    -------
    Path
        The destination path, as a :class:`~pathlib.Path`.

    Raises
    ------
    ValueError
        If ``fmt`` is ``lg`` and ``graphs`` does not hold exactly one graph.
    UnknownFormatError
        If ``fmt`` has no writer.
    """
    target = Path(path)

    if fmt == FMT_GSPAN:
        from .gspan import write_gspan_dataset

        write_gspan_dataset(list(graphs), target)
    elif fmt == FMT_LG:
        from .sopagrami import write_lg

        materialized = list(graphs)
        count = len(materialized)
        if count != 1:
            raise ValueError(f".lg expects exactly one graph; got {count}")
        (only_graph,) = materialized
        write_lg(only_graph, target)
    else:
        raise UnknownFormatError(f"Unsupported output format: {fmt}")

    return target
132
+
133
+
134
def transcode_path(
    src_path: str | Path,
    dst_path: str | Path,
    *,
    dst_fmt: str,
    src_fmt: Optional[str] = None,
) -> Path:
    """Convert a graph file on disk from one format to another.

    The source is loaded with :func:`load_graphs` (using ``src_fmt``, or
    filename detection when it is ``None``) and the resulting graphs are
    handed to :func:`write_graphs` with ``dst_fmt``.

    Returns
    -------
    Path
        The path returned by :func:`write_graphs`.
    """
    return write_graphs(
        load_graphs(src_path, fmt=src_fmt),
        dst_path,
        fmt=dst_fmt,
    )
submine/registry.py ADDED
@@ -0,0 +1,8 @@
1
+ # submine/registry.py
2
+ from __future__ import annotations
3
+
4
+ from typing import Dict, Type
5
+
6
+ # We intentionally don't import SubgraphMiner here to avoid cycles.
7
+ # We just store "type" and let base.py handle typing.
8
+ available_algorithms: Dict[str, Type] = {}
@@ -0,0 +1,6 @@
1
+ """Miscellaneous utilities used across submine."""
2
+
3
+ from .logging import get_logger # noqa: F401
4
+ from .checks import is_tool_available # noqa: F401
5
+
6
+ __all__ = ["get_logger", "is_tool_available"]