submine 0.1.0__cp311-cp311-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. submine/__init__.py +37 -0
  2. submine/algorithms/__init__.py +23 -0
  3. submine/algorithms/base.py +143 -0
  4. submine/algorithms/gspan.py +156 -0
  5. submine/algorithms/gspan_cpp.cpython-311-x86_64-linux-musl.so +0 -0
  6. submine/algorithms/sopagrami.py +250 -0
  7. submine/algorithms/sopagrami_cpp.cpython-311-x86_64-linux-musl.so +0 -0
  8. submine/api.py +134 -0
  9. submine/backends/__init__.py +0 -0
  10. submine/backends/gspan/CMakeLists.txt +65 -0
  11. submine/backends/gspan/dfs.cpp +98 -0
  12. submine/backends/gspan/graph.cpp +165 -0
  13. submine/backends/gspan/gspan.cpp +776 -0
  14. submine/backends/gspan/gspan.h +296 -0
  15. submine/backends/gspan/ismin.cpp +124 -0
  16. submine/backends/gspan/main.cpp +106 -0
  17. submine/backends/gspan/misc.cpp +177 -0
  18. submine/backends/gspan/python_bindings.cpp +133 -0
  19. submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
  20. submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
  21. submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
  22. submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
  23. submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
  24. submine/backends/sopagrami/cpp/src/main.cpp +94 -0
  25. submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
  26. submine/cli/__init__.py +6 -0
  27. submine/cli/main.py +87 -0
  28. submine/core/__init__.py +12 -0
  29. submine/core/graph.py +179 -0
  30. submine/core/result.py +121 -0
  31. submine/datasets/__init__.py +11 -0
  32. submine/datasets/loaders.py +145 -0
  33. submine/errors.py +41 -0
  34. submine/io/__init__.py +30 -0
  35. submine/io/common.py +173 -0
  36. submine/io/gexf.py +88 -0
  37. submine/io/gspan.py +268 -0
  38. submine/io/sopagrami.py +143 -0
  39. submine/io/transcode.py +147 -0
  40. submine/registry.py +8 -0
  41. submine/utils/__init__.py +6 -0
  42. submine/utils/checks.py +115 -0
  43. submine/utils/logging.py +41 -0
  44. submine-0.1.0.dist-info/METADATA +178 -0
  45. submine-0.1.0.dist-info/RECORD +49 -0
  46. submine-0.1.0.dist-info/WHEEL +5 -0
  47. submine-0.1.0.dist-info/licenses/LICENSE +21 -0
  48. submine.libs/libgcc_s-2298274a.so.1 +0 -0
  49. submine.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
submine/core/graph.py ADDED
@@ -0,0 +1,179 @@
1
+ """Core graph container used throughout *submine*.
2
+
3
+ Design goals
4
+ ------------
5
+ 1) Keep a lightweight, dependency-free representation.
6
+ 2) Preserve the existing public surface used by wrappers:
7
+ - ``Graph.nodes`` : list of node ids
8
+ - ``Graph.edges`` : list of (u, v)
9
+ - ``Graph.node_labels`` : dict[node_id] -> label (optional)
10
+ - ``Graph.edge_labels`` : dict[(u, v)] -> label (optional)
11
+ 3) Add *optional* edge weights without breaking unweighted algorithms.
12
+
13
+ Weights are stored as ``Graph.edge_weights`` (dict[(u, v)] -> float). If a
14
+ weight is missing for an edge, it is treated as 1.0.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from dataclasses import dataclass
20
+ from typing import Any, Dict, Hashable, Iterable, Iterator, List, Optional, Protocol, Tuple
21
+
22
+
23
class GraphSource(Protocol):
    """Structural type for anything that can be iterated to obtain ``Graph`` objects."""

    # ``__iter__`` must hand back an *iterator* (something with ``__next__``),
    # so the return annotation is ``Iterator`` rather than ``Iterable``.
    def __iter__(self) -> Iterator["Graph"]:  # pragma: no cover
        ...
28
+
29
+
30
@dataclass(eq=True, frozen=True)
class Node:
    """Immutable node record: an id, an optional label and a free-form data dict.

    When ``data`` is omitted (or passed as ``None``) each instance receives its
    own fresh empty dict.
    """

    id: Any
    label: Optional[Any] = None
    data: Dict[str, Any] | None = None

    def __post_init__(self) -> None:
        if self.data is not None:
            return
        # The dataclass is frozen, so ordinary attribute assignment is blocked;
        # install the default through object.__setattr__ instead.
        object.__setattr__(self, "data", dict())
39
+
40
+
41
class Graph:
    """Lightweight labeled (optionally weighted) undirected graph.

    Public attributes
    -----------------
    nodes : list of node ids
    edges : list of ``(u, v)`` tuples, in insertion order
    node_labels : optional ``dict[node_id] -> label``
    edge_labels : optional ``dict[(u, v)] -> label``
    edge_weights : optional ``dict[(u, v)] -> float``; a missing weight is 1.0

    Label and weight lookups accept either orientation of an edge. The
    dictionaries passed to the constructor are stored by reference, not copied.
    """

    def __init__(
        self,
        nodes: Optional[Iterable[Hashable]] = None,
        edges: Optional[Iterable[Tuple[Hashable, Hashable]]] = None,
        node_labels: Optional[Dict[Hashable, Any]] = None,
        edge_labels: Optional[Dict[Tuple[Hashable, Hashable], Any]] = None,
        edge_weights: Optional[Dict[Tuple[Hashable, Hashable], float]] = None,
    ) -> None:
        self.nodes: List[Hashable] = list(nodes) if nodes is not None else []
        self.edges: List[Tuple[Hashable, Hashable]] = list(edges) if edges is not None else []
        self.node_labels: Optional[Dict[Hashable, Any]] = node_labels
        self.edge_labels: Optional[Dict[Tuple[Hashable, Hashable], Any]] = edge_labels
        self.edge_weights: Optional[Dict[Tuple[Hashable, Hashable], float]] = edge_weights

        # Internal indices backing the incremental add_node/add_edge API.
        self._nodes: Dict[Hashable, Node] = {}
        self._adj: Dict[Hashable, List[Tuple[Hashable, Optional[Any], float]]] = {}

        # Seed the indices from whatever was passed in so that later
        # add_node/add_edge calls see the pre-existing content.
        for nid in self.nodes:
            lbl = self.node_labels.get(nid) if self.node_labels else None
            self._nodes[nid] = Node(nid, lbl, {})
            self._adj.setdefault(nid, [])
        for (u, v) in self.edges:
            lbl = self._edge_label(u, v)
            w = self._edge_weight(u, v)
            self._adj.setdefault(u, []).append((v, lbl, w))
            self._adj.setdefault(v, []).append((u, lbl, w))

    @staticmethod
    def _canonical_pair(u: Hashable, v: Hashable) -> Tuple[Hashable, Hashable]:
        """Return the endpoints in a deterministic order.

        Natural ordering is used when the ids are comparable. Ids of
        incomparable types (e.g. ``1`` and ``"a"``) fall back to ordering by
        ``(type name, str(value))`` instead of raising ``TypeError``.
        """
        try:
            return (u, v) if u <= v else (v, u)  # type: ignore[operator]
        except TypeError:
            key_u = (type(u).__name__, str(u))
            key_v = (type(v).__name__, str(v))
            return (u, v) if key_u <= key_v else (v, u)

    def _edge_label(self, u: Hashable, v: Hashable) -> Optional[Any]:
        """Look up the label of edge ``u-v`` in either orientation (``None`` if absent)."""
        if self.edge_labels is None:
            return None
        return self.edge_labels.get((u, v), self.edge_labels.get((v, u)))

    def _edge_weight(self, u: Hashable, v: Hashable) -> float:
        """Look up the weight of edge ``u-v`` in either orientation (1.0 if absent)."""
        if self.edge_weights is None:
            return 1.0
        return float(self.edge_weights.get((u, v), self.edge_weights.get((v, u), 1.0)))

    @property
    def is_weighted(self) -> bool:
        """``True`` when at least one stored edge weight differs from 1.0."""
        if not self.edge_weights:
            return False
        return any(float(w) != 1.0 for w in self.edge_weights.values())

    def add_node(self, node_id: Hashable, label: Optional[Any] = None, **data: Any) -> "Node":
        """Add a node, or update an existing one, and return its ``Node`` record.

        For an existing node the previous label is kept unless a new non-None
        ``label`` is given, and ``data`` is merged into the existing data.
        """
        if node_id in self._nodes:
            n = self._nodes[node_id]
            lbl = n.label if label is None else label
            merged = dict(n.data)
            merged.update(data)
            n2 = Node(node_id, lbl, merged)
            self._nodes[node_id] = n2
            if node_id not in self.nodes:
                self.nodes.append(node_id)
            # A label supplied for a known node must reach ``node_labels`` even
            # when that mapping has not been created yet (mirrors the new-node path).
            if label is not None and self.node_labels is None:
                self.node_labels = {}
            if self.node_labels is not None:
                self.node_labels[node_id] = lbl
            self._adj.setdefault(node_id, [])
            return n2

        n = Node(node_id, label, data or None)
        self._nodes[node_id] = n
        self._adj.setdefault(node_id, [])
        if node_id not in self.nodes:
            self.nodes.append(node_id)
        if label is not None:
            if self.node_labels is None:
                self.node_labels = {}
            self.node_labels[node_id] = label
        return n

    def add_edge(
        self,
        u: Hashable,
        v: Hashable,
        label: Optional[Any] = None,
        weight: float = 1.0,
    ) -> None:
        """Append the undirected edge ``u-v``, creating missing endpoints.

        The label is recorded only when it is not ``None`` and the weight only
        when it differs from 1.0.

        Raises
        ------
        ValueError
            If ``u == v`` (self loops are not supported).
        """
        if u == v:
            raise ValueError("Self loops are not supported.")
        if u not in self._nodes:
            self.add_node(u)
        if v not in self._nodes:
            self.add_node(v)

        w = float(weight)
        self.edges.append((u, v))
        self._adj.setdefault(u, []).append((v, label, w))
        self._adj.setdefault(v, []).append((u, label, w))

        if label is not None:
            if self.edge_labels is None:
                self.edge_labels = {}
            self.edge_labels[(u, v)] = label

        if w != 1.0:
            if self.edge_weights is None:
                self.edge_weights = {}
            self.edge_weights[(u, v)] = w

    def iter_edges(self) -> Iterator[Tuple[Hashable, Hashable, Optional[Any]]]:
        """Iterate edges once, yielding (u, v, label).

        This preserves the legacy shape expected by existing writers. Each
        undirected edge is reported a single time with its endpoints in
        canonical order, regardless of how often or in which orientation it
        appears in ``self.edges``.
        """
        seen: set[Tuple[Hashable, Hashable]] = set()
        for (u, v) in self.edges:
            pair = self._canonical_pair(u, v)
            if pair in seen:
                continue
            seen.add(pair)
            yield (pair[0], pair[1], self._edge_label(u, v))

    def iter_edges_with_weights(
        self,
    ) -> Iterator[Tuple[Hashable, Hashable, Optional[Any], float]]:
        """Iterate edges once, yielding (u, v, label, weight)."""
        seen: set[Tuple[Hashable, Hashable]] = set()
        for (u, v) in self.edges:
            pair = self._canonical_pair(u, v)
            if pair in seen:
                continue
            seen.add(pair)
            yield (pair[0], pair[1], self._edge_label(u, v), self._edge_weight(u, v))

    def number_of_nodes(self) -> int:
        """Return the number of entries in ``self.nodes``."""
        return len(self.nodes)

    def number_of_edges(self) -> int:
        """Return the number of entries in ``self.edges`` (duplicates included)."""
        return len(self.edges)

    def __repr__(self) -> str:
        return f"Graph(num_nodes={self.number_of_nodes()}, num_edges={self.number_of_edges()}, weighted={self.is_weighted})"
submine/core/result.py ADDED
@@ -0,0 +1,121 @@
1
+ # submine/core/result.py
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass, field
5
+ from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple
6
+
7
+ from .graph import Graph
8
+
9
+ __all__ = [
10
+ "SubgraphOccurrence",
11
+ "SubgraphPattern",
12
+ "MiningResult",
13
+ ]
14
+
15
+
16
@dataclass
class SubgraphOccurrence:
    """A single embedding of a pattern in one input graph.

    Attributes
    ----------
    graph_id : int
        0-based index of the host graph within the dataset.
    node_mapping : Mapping[int, int]
        Maps pattern-local node ids (expected to be 0..k-1 for a k-node
        pattern) to node ids of the host graph.
    extra : dict
        Free-form, algorithm-specific details such as an edge mapping,
        a score or a position. Defaults to a fresh empty dict.
    """

    graph_id: int
    node_mapping: Mapping[int, int]
    extra: Dict[str, Any] = field(default_factory=dict)
35
+
36
+
37
@dataclass
class SubgraphPattern:
    """One mined subgraph pattern together with its statistics.

    Attributes
    ----------
    pid : int
        Identifier of the pattern, unique inside a MiningResult.
    graph : Graph
        The pattern as a Graph whose node ids are pattern-local (0..k-1).
    support : int
        Support count, e.g. how many graphs or embeddings contain it.
    frequency : Optional[float]
        Optional relative frequency (e.g. support / num_graphs).
    occurrences : list[SubgraphOccurrence]
        Concrete embeddings in the input graphs; may stay empty because
        not every algorithm reports embeddings.
    attributes : dict
        Algorithm-specific extras (score, interestingness, DFS code, ...).
    """

    pid: int
    graph: Graph
    support: int
    frequency: Optional[float] = None
    occurrences: List[SubgraphOccurrence] = field(default_factory=list)
    attributes: Dict[str, Any] = field(default_factory=dict)

    def add_occurrence(self, occ: SubgraphOccurrence) -> None:
        """Record one more embedding of this pattern."""
        self.occurrences.append(occ)
67
+
68
+
69
@dataclass
class MiningResult:
    """Output of one subgraph mining run.

    Attributes
    ----------
    patterns : list[SubgraphPattern]
        The mined patterns.
    algorithm : str
        Name of the algorithm that produced them.
    params : dict
        Parameters of the run (min_support, max_size, ...).
    runtime : Optional[float]
        Wall-clock runtime in seconds when it was measured.
    metadata : dict
        Anything else worth keeping (dataset info, version, logs, ...).
    """

    patterns: List[SubgraphPattern]
    algorithm: str
    params: Dict[str, Any] = field(default_factory=dict)
    runtime: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __len__(self) -> int:
        return len(self.patterns)

    def top_k(self, k: int, key: str = "support") -> List[SubgraphPattern]:
        """
        Return the ``k`` highest-ranked patterns for the chosen criterion.

        Parameters
        ----------
        k : int
            How many patterns to return.
        key : str
            Ranking criterion: 'support', 'frequency', or the name of a
            numeric entry in ``pattern.attributes``. A missing frequency or
            attribute counts as 0.0.

        Returns
        -------
        list[SubgraphPattern]
            Patterns in descending order of the criterion; ties keep their
            original relative order.
        """
        def by_support(pattern: SubgraphPattern):
            return pattern.support

        def by_frequency(pattern: SubgraphPattern):
            freq = pattern.frequency
            return 0.0 if freq is None else freq

        def by_attribute(pattern: SubgraphPattern):
            return float(pattern.attributes.get(key, 0.0))

        if key == "support":
            score = by_support
        elif key == "frequency":
            score = by_frequency
        else:
            score = by_attribute

        ranked = list(self.patterns)
        ranked.sort(key=score, reverse=True)
        return ranked[:k]
@@ -0,0 +1,11 @@
1
+ """Built‑in datasets for submine.
2
+
3
+ This package provides convenience loaders for commonly used benchmark
4
+ datasets in frequent subgraph mining research. Datasets are returned
5
+ as lists of :class:`~submine.core.graph.Graph` objects. The available
6
+ datasets are documented in :mod:`submine.datasets.loaders`.
7
+ """
8
+
9
+ from .loaders import get_dataset # noqa: F401
10
+
11
+ __all__ = ["get_dataset"]
@@ -0,0 +1,145 @@
1
+ """Dataset loaders for submine.
2
+
3
+ This module contains functions to load well‑known benchmark datasets
4
+ such as MUTAG, ENZYMES and more. Where possible it attempts to use
5
+ existing libraries to fetch the datasets (e.g. torch_geometric or
6
+ networkx), but falls back to a simple synthetic dataset if these are
7
+ unavailable. To add a new dataset, implement a function named
8
+ ``_load_<datasetname>()`` that returns a list of
9
+ :class:`~submine.core.graph.Graph` objects and register it in the
10
+ ``DATASET_LOADERS`` dictionary.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Callable, Dict, List
16
+
17
+ from ..core.graph import Graph
18
+
19
+
20
def _load_toy() -> List[Graph]:
    """Build the tiny synthetic dataset used for tests.

    Two graphs are returned: a triangle A-B-C-A and a three-node path D-E-F.
    Node labels are single characters; edge labels are two-character strings.
    """
    # Each spec: (node labels indexed by node id, (u, v, edge label) triples).
    specs = (
        ("ABC", ((0, 1, "ab"), (1, 2, "bc"), (2, 0, "ca"))),
        ("DEF", ((0, 1, "de"), (1, 2, "ef"))),
    )
    graphs: List[Graph] = []
    for node_labels, edge_specs in specs:
        g = Graph()
        for node_id, node_label in enumerate(node_labels):
            g.add_node(node_id, label=node_label)
        for u, v, edge_label in edge_specs:
            g.add_edge(u, v, label=edge_label)
        graphs.append(g)
    return graphs
42
+
43
+
44
def _tudataset_label(value) -> str:
    """Turn one tensor entry into a string label.

    A single-element entry is read directly. A multi-element entry is treated
    as a one-hot row and its arg-max index is used.
    NOTE(review): assumes multi-element rows are one-hot encodings, as in the
    default TUDataset layout -- confirm for other datasets.
    """
    if value.numel() == 1:
        return str(int(value.item()))
    return str(int(value.argmax().item()))


def _load_tudataset(name: str) -> List[Graph]:
    """Download/load the TUDataset called ``name`` and convert it to ``Graph`` objects.

    Graphs are built directly through ``Graph.add_node`` / ``Graph.add_edge``;
    no networkx round trip is involved.

    Raises
    ------
    NotImplementedError
        If torch_geometric cannot be imported.
    """
    try:
        from torch_geometric.datasets import TUDataset  # type: ignore
        import torch_geometric.utils  # noqa: F401  # type: ignore
    except Exception as e:
        raise NotImplementedError(f"Loading {name} requires torch_geometric and internet access") from e

    dataset = TUDataset(root=f"/tmp/{name}", name=name)
    graphs: List[Graph] = []
    for data in dataset:
        g = Graph()

        # Prefer an explicit ``node_label`` attribute; otherwise fall back to
        # the feature matrix ``x`` (presumably one-hot node labels -- verify).
        node_source = getattr(data, "node_label", None)
        if node_source is None:
            node_source = getattr(data, "x", None)
        for i in range(data.num_nodes):
            label = _tudataset_label(node_source[i]) if node_source is not None else None
            g.add_node(i, label=label)

        edge_attr = data.edge_attr
        seen = set()
        for j in range(data.edge_index.size(1)):
            u = int(data.edge_index[0, j])
            v = int(data.edge_index[1, j])
            if u == v:
                # Graph.add_edge rejects self loops, so they are skipped.
                continue
            # edge_index may list both directions of an undirected edge;
            # keep only the first occurrence.
            pair = (u, v) if u <= v else (v, u)
            if pair in seen:
                continue
            seen.add(pair)
            label = _tudataset_label(edge_attr[j]) if edge_attr is not None else None
            g.add_edge(u, v, label=label)
        graphs.append(g)
    return graphs


def _load_mutag() -> List[Graph]:
    """Load the MUTAG dataset if torch_geometric is available.

    If the library is missing a NotImplementedError is raised.
    """
    return _load_tudataset("MUTAG")


def _load_enzymes() -> List[Graph]:
    """Load the ENZYMES dataset if torch_geometric is available.

    If the library is missing a NotImplementedError is raised.
    """
    return _load_tudataset("ENZYMES")
108
+
109
+
110
# Registry of built-in datasets: lower-case dataset name -> zero-argument
# loader returning a list of Graph objects.
DATASET_LOADERS: Dict[str, Callable[[], List[Graph]]] = {
    "toy": _load_toy,
    "mutag": _load_mutag,
    "enzymes": _load_enzymes,
}
115
+
116
+
117
def get_dataset(name: str, **kwargs) -> List[Graph]:
    """Return the built-in dataset registered under ``name``.

    The lookup is case insensitive; ``"toy"``, ``"mutag"`` and ``"enzymes"``
    are the registered names.

    Parameters
    ----------
    name: str
        Dataset identifier.
    **kwargs: dict
        Accepted for forward compatibility; currently ignored and not
        forwarded to the loader.

    Returns
    -------
    List[Graph]
        The graphs that make up the dataset.

    Raises
    ------
    KeyError
        If no loader is registered for ``name``.
    """
    loader = DATASET_LOADERS.get(name.lower())
    if loader is None:
        raise KeyError(f"Unknown dataset '{name}'. Available datasets: {list(DATASET_LOADERS.keys())}")
    return loader()
submine/errors.py ADDED
@@ -0,0 +1,41 @@
1
+ """Submine exception hierarchy.
2
+
3
+ These exceptions provide stable, semantically meaningful error types that
4
+ downstream applications can catch without parsing strings.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
class SubmineError(Exception):
    """Root of every exception defined by this library."""


class SubmineInputError(SubmineError, ValueError):
    """Signals invalid user input: a file, a graph or a parameter."""


class ParameterValidationError(SubmineInputError):
    """Signals that algorithm parameters did not pass validation."""


class BackendUnavailableError(SubmineError, RuntimeError):
    """Signals that an optional external backend (binary, JVM, etc.) cannot be used."""


class BackendExecutionError(SubmineError, RuntimeError):
    """Signals that an external backend failed while running."""


class ResourceLimitError(SubmineInputError):
    """Signals that an input is larger than the configured resource limits."""
32
+
33
+ __all__ = [
34
+ "SubmineError",
35
+ "SubmineInputError",
36
+ "ParameterValidationError",
37
+ "BackendUnavailableError",
38
+ "BackendExecutionError",
39
+ "ResourceLimitError",
40
+ ]
41
+
submine/io/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ """Input/Output utilities for submine.
2
+
3
+ This package contains helper functions to serialise graphs to the input
4
+ formats expected by different subgraph mining algorithms and to parse
5
+ their outputs back into :class:`~submine.core.graph.Graph` objects.
6
+ """
7
+
8
+ from .common import ensure_dir # noqa: F401
9
+ from .gspan import write_gspan_dataset # noqa: F401
10
+ from .transcode import ( # noqa: F401
11
+ detect_format,
12
+ load_graphs,
13
+ transcode_path,
14
+ FMT_EDGELIST,
15
+ FMT_GEXF,
16
+ FMT_GSPAN,
17
+ FMT_LG,
18
+ )
19
+
20
+ __all__ = [
21
+ "ensure_dir",
22
+ "write_gspan_dataset",
23
+ "detect_format",
24
+ "load_graphs",
25
+ "transcode_path",
26
+ "FMT_EDGELIST",
27
+ "FMT_GEXF",
28
+ "FMT_GSPAN",
29
+ "FMT_LG",
30
+ ]
submine/io/common.py ADDED
@@ -0,0 +1,173 @@
1
+ """Common I/O helpers used by algorithm wrappers.
2
+
3
+ Functions defined here are not tied to a specific algorithm. They
4
+ perform tasks such as ensuring that directories exist or creating
5
+ temporary working directories.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from contextlib import contextmanager
12
+ from pathlib import Path
13
+ from tempfile import TemporaryDirectory
14
+ from typing import Iterator
15
+
16
+ from pathlib import Path
17
+ from typing import List, Optional, Tuple, Any
18
+
19
+ from ..core.graph import Graph
20
+ from ..errors import SubmineInputError
21
+ from ..utils.checks import iter_text_lines
22
+
23
+
24
def ensure_dir(path: str | Path) -> Path:
    """Ensure that the directory at ``path`` exists.

    Missing parent directories are created as well. The directory is created
    unconditionally with ``exist_ok=True`` rather than after an ``exists()``
    check: that avoids a race with concurrent creators and means a path that
    exists but is *not* a directory is reported instead of silently accepted.

    Parameters
    ----------
    path: str or pathlib.Path
        Directory path to create if it does not exist.

    Returns
    -------
    pathlib.Path
        The Path instance corresponding to ``path``.

    Raises
    ------
    FileExistsError
        If ``path`` exists and is not a directory.
    """
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p
41
+
42
+
43
@contextmanager
def temporary_directory() -> Iterator[Path]:
    """Yield a freshly created temporary directory as a ``Path``.

    The directory, including everything written into it, is deleted when the
    ``with`` block is left. Handy for scratch files handed to external
    algorithms.
    """
    with TemporaryDirectory() as scratch:
        scratch_path = Path(scratch)
        yield scratch_path
52
+
53
+
54
+ def _maybe_int(x: str) -> Any:
55
+ """Try to parse as int, fallback to string."""
56
+ try:
57
+ return int(x)
58
+ except ValueError:
59
+ return x
60
+
61
+
62
def read_edgelist_dataset(path: str | Path) -> List[Graph]:
    """
    Read an edge-list dataset file and return a list of Graph objects.

    Blank lines and lines starting with ``#`` are ignored. Fields may be
    separated by whitespace or commas. Endpoints and labels that parse as
    integers are converted to ``int``; everything else stays a string.

    Supported formats
    -----------------
    1) Single graph (no 't # gid' headers):

        u v
        u v label

       - Nodes are inferred from all endpoints.
       - Edge labels are kept for the lines that carry a third column; the
         graph has no edge-label mapping when no line carries one.

    2) Multiple graphs (with gSpan-like headers):

        t # 0
        u v
        u v label
        t # 1
        u v
        ...

       - Each 't # gid' starts a new graph (the gid value itself is ignored).
       - Nodes are inferred per graph.

    Returns
    -------
    List[Graph]
        A list of Graph objects constructed from the file.

    Raises
    ------
    SubmineInputError
        If a non-header line has fewer than two fields. This is a
        ``ValueError`` subclass, so existing ``except ValueError`` handlers
        keep working.
    """
    path = Path(path)

    def build_graph(
        edges: List[Tuple[Any, Any]],
        labels: dict[Tuple[Any, Any], Any],
    ) -> Graph:
        # Deterministic node ordering for reproducible transcoding/writing.
        # Edge lists may mix ints/strings; sort by type name then by string value.
        endpoints = {node for edge in edges for node in edge}
        nodes = sorted(endpoints, key=lambda x: (type(x).__name__, str(x)))
        return Graph(
            nodes=nodes,
            edges=edges,
            node_labels=None,  # edge-list doesn't carry node labels
            edge_labels=labels or None,
        )

    graphs: List[Graph] = []
    # ``current_edges`` stays None until the first header or edge line is seen.
    current_edges: Optional[List[Tuple[Any, Any]]] = None
    current_labels: dict[Tuple[Any, Any], Any] = {}

    for raw_line in iter_text_lines(path):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        # Accept both whitespace-separated and comma-separated edge lists.
        if "," in line:
            line = line.replace(",", " ")
        parts = line.split()

        # Multi-graph header: t # gid
        if len(parts) >= 3 and parts[0] == "t" and parts[1] == "#":
            if current_edges is not None:
                graphs.append(build_graph(current_edges, current_labels))
            current_edges, current_labels = [], {}
            continue

        # Everything else must be an edge line: u v [label]. This also
        # rejects lines that become empty once commas are removed.
        if len(parts) < 2:
            raise SubmineInputError(f"Malformed edge line: {line!r}")

        if current_edges is None:
            # No graph header seen yet: assume single-graph file.
            current_edges, current_labels = [], {}

        u = _maybe_int(parts[0])
        v = _maybe_int(parts[1])
        current_edges.append((u, v))
        if len(parts) >= 3:
            current_labels[(u, v)] = _maybe_int(parts[2])

    # Flush the final graph; no trailing header terminates it.
    if current_edges is not None:
        graphs.append(build_graph(current_edges, current_labels))
    return graphs