PyPI - submine - Versions diffs - 0.1.0__cp311-cp311-musllinux_1_2_x86_64.whl - Mend

submine 0.1.0__cp311-cp311-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

submine/__init__.py +37 -0
submine/algorithms/__init__.py +23 -0
submine/algorithms/base.py +143 -0
submine/algorithms/gspan.py +156 -0
submine/algorithms/gspan_cpp.cpython-311-x86_64-linux-musl.so +0 -0
submine/algorithms/sopagrami.py +250 -0
submine/algorithms/sopagrami_cpp.cpython-311-x86_64-linux-musl.so +0 -0
submine/api.py +134 -0
submine/backends/__init__.py +0 -0
submine/backends/gspan/CMakeLists.txt +65 -0
submine/backends/gspan/dfs.cpp +98 -0
submine/backends/gspan/graph.cpp +165 -0
submine/backends/gspan/gspan.cpp +776 -0
submine/backends/gspan/gspan.h +296 -0
submine/backends/gspan/ismin.cpp +124 -0
submine/backends/gspan/main.cpp +106 -0
submine/backends/gspan/misc.cpp +177 -0
submine/backends/gspan/python_bindings.cpp +133 -0
submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
submine/backends/sopagrami/cpp/src/main.cpp +94 -0
submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
submine/cli/__init__.py +6 -0
submine/cli/main.py +87 -0
submine/core/__init__.py +12 -0
submine/core/graph.py +179 -0
submine/core/result.py +121 -0
submine/datasets/__init__.py +11 -0
submine/datasets/loaders.py +145 -0
submine/errors.py +41 -0
submine/io/__init__.py +30 -0
submine/io/common.py +173 -0
submine/io/gexf.py +88 -0
submine/io/gspan.py +268 -0
submine/io/sopagrami.py +143 -0
submine/io/transcode.py +147 -0
submine/registry.py +8 -0
submine/utils/__init__.py +6 -0
submine/utils/checks.py +115 -0
submine/utils/logging.py +41 -0
submine-0.1.0.dist-info/METADATA +178 -0
submine-0.1.0.dist-info/RECORD +49 -0
submine-0.1.0.dist-info/WHEEL +5 -0
submine-0.1.0.dist-info/licenses/LICENSE +21 -0
submine.libs/libgcc_s-2298274a.so.1 +0 -0
submine.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0

submine/__init__.py ADDED Viewed

@@ -0,0 +1,37 @@
+# submine/__init__.py
+from __future__ import annotations
+__version__ = "0.1.0"
+from .registry import available_algorithms
+from .errors import (
+    SubmineError,
+    SubmineInputError,
+    ParameterValidationError,
+    BackendUnavailableError,
+    BackendExecutionError,
+    ResourceLimitError,
+)
+# Import algorithms so they register themselves via @register
+# (you can add more as you implement them)
+from .algorithms import gspan  # noqa: F401
+# from .algorithms import grami  # noqa: F401
+from .algorithms import sopagrami  # noqa: F401
+# ...
+def get_mining_algorithm(name: str):
+    key = name.lower()
+    try:
+        return available_algorithms[key]
+    except KeyError:
+        raise ValueError(
+            f"Unknown algorithm '{name}'. "
+            f"Available: {sorted(available_algorithms.keys())}"
+        )
+def list_algorithms():
+    return sorted(available_algorithms.keys())

submine/algorithms/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Algorithm implementations for submine.
+Each submodule in this package implements a specific subgraph mining
+algorithm. Modules are expected to define a subclass of
+:class:`~submine.algorithms.base.SubgraphMiner` and register it via
+:func:`~submine.algorithms.base.register`. Registered algorithms will
+automatically appear in :func:`submine.list_algorithms` and can be looked up with :func:`submine.get_mining_algorithm`.
+To avoid the cost of importing heavy dependencies at module import
+time, algorithm modules should not perform expensive setup at the top
+level. Instead they should defer initialization to the constructor or
+:meth:`SubgraphMiner.check_availability`.
+"""
+from .base import SubgraphMiner  # noqa: F401
+# Import algorithm modules so they can register themselves when this
+# package is imported. Additional algorithms can be added here.
+from .gspan import GSpanMiner  # noqa: F401
+from .sopagrami import SoPaGraMiMiner  # noqa: F401
+__all__ = ["SubgraphMiner", "GSpanMiner", "SoPaGraMiMiner"]

submine/algorithms/base.py ADDED Viewed

@@ -0,0 +1,143 @@
+# submine/algorithms/base.py
+from __future__ import annotations
+import subprocess
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Optional
+from ..registry import available_algorithms
+from ..core.graph import Graph
+from ..core.result import MiningResult
+from ..utils.logging import get_logger
+from ..errors import BackendExecutionError, ParameterValidationError
+from typing import Iterable
+__all__ = ["SubgraphMiner", "register"]
+class SubgraphMiner(ABC):
+    name: str = "base"
+    # Native file input contract
+    # -------------------------
+    # Some algorithms consume an on-disk dataset in a specific format.
+    # If `expected_input_format` is set (e.g., "lg" or "gspan"), the high-level
+    # API can transcode user-provided files to this format and call `mine_native`.
+    #
+    # By default, miners operate on in-memory Graph objects via `mine()`.
+    expected_input_format: str | None = None
+    multi_graph_policy: str = "reject"  # reject | batch | merge (reserved)
+    # Weight handling
+    # -------------
+    # Most classical subgraph miners operate on *labeled* (unweighted) graphs.
+    # We therefore make weight support explicit. If the input graph contains
+    # weights and the algorithm does not support them, the weight_strategy
+    # controls what happens.
+    supports_weighted: bool = False
+    weight_strategy: str = "ignore"  # one of: ignore | reject
+    def __init__(self, verbose: bool = False) -> None:
+        self.verbose = verbose
+        self.logger = get_logger(self.__class__.__name__)
+        if self.verbose:
+            self.logger.setLevel("DEBUG")
+    def _handle_weights(self, graphs: Iterable[Graph]) -> Iterable[Graph]:
+        """Apply the configured weight strategy to input graphs.
+        - If the algorithm supports weights: pass through.
+        - If it does not and the graph is weighted:
+            * ignore: drop weights (treat as unweighted)
+            * reject: raise
+        """
+        for g in graphs:
+            if getattr(g, "is_weighted", False) and g.is_weighted and not self.supports_weighted:
+                if self.weight_strategy == "reject":
+                    raise ValueError(
+                        f"Algorithm '{self.name}' does not support weighted graphs; "
+                        "set weight_strategy='ignore' to drop weights explicitly."
+                    )
+                # ignore: drop weights
+                g.edge_weights = None
+            yield g
+    @abstractmethod
+    def mine(self, graphs: Iterable[Graph], min_support: int, **kwargs) -> MiningResult:
+        raise NotImplementedError
+    def mine_native(self, path: str | Path, min_support: int, **kwargs) -> MiningResult:
+        """Run the miner on a native on-disk dataset.
+        Miners with `expected_input_format != None` should override this method.
+        The default implementation indicates that the miner does not accept a
+        native path entrypoint.
+        """
+        raise NotImplementedError(
+            f"Algorithm '{self.name}' does not implement mine_native(); "
+            "use mine(graphs=...) instead."
+        )
+    def check_availability(self) -> None:
+        return None
+    def run_external(
+        self,
+        cmd: List[str],
+        *,
+        cwd: Optional[Path] = None,
+        timeout_s: int = 300,
+        env: Optional[dict[str, str]] = None,
+    ) -> str:
+        """Run an external command defensively.
+        - Uses ``shell=False`` implicitly (we pass a list).
+        - Applies a default timeout to avoid hung processes.
+        - Captures stdout/stderr for error reporting.
+        """
+        if not cmd or not isinstance(cmd[0], str):
+            raise ParameterValidationError("cmd must be a non-empty list of strings")
+        # Basic hardening against accidental injection via newlines/NULs.
+        for part in cmd:
+            if not isinstance(part, str):
+                raise TypeError("All cmd parts must be strings")
+            if "\x00" in part or "\n" in part or "\r" in part:
+                raise ParameterValidationError("Unsafe characters in command argument")
+        self.logger.debug("Running external command: %s", " ".join(cmd))
+        completed = subprocess.run(
+            cmd,
+            cwd=cwd,
+            text=True,
+            capture_output=True,
+            timeout=timeout_s,
+            env=env,
+            check=False,
+            close_fds=True,
+        )
+        self.logger.debug("Command stdout: %s", completed.stdout)
+        if completed.returncode != 0:
+            self.logger.error("Command failed with stderr: %s", completed.stderr)
+            raise RuntimeError(
+                f"Command '{' '.join(cmd)}' failed with exit code {completed.returncode}\n"
+                f"stderr:\n{completed.stderr}"
+            )
+        return completed.stdout
+def register(cls: type[SubgraphMiner]) -> type[SubgraphMiner]:
+    if not issubclass(cls, SubgraphMiner):
+        raise TypeError("Only subclasses of SubgraphMiner can be registered")
+    name = getattr(cls, "name", None)
+    if not isinstance(name, str):
+        raise TypeError("Subgraph miner must define a string 'name' attribute")
+    key = name.lower()
+    if key in available_algorithms:
+        raise ValueError(f"Algorithm '{name}' is already registered")
+    available_algorithms[key] = cls
+    return cls

submine/algorithms/gspan.py ADDED Viewed

@@ -0,0 +1,156 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import List, Optional
+from ..utils.checks import safe_read_text, assert_regular_file
+from ..errors import ParameterValidationError
+import tempfile
+import time
+from .base import SubgraphMiner, register
+from ..core.graph import Graph
+from ..core.result import MiningResult, SubgraphPattern
+from ..io.gspan import write_gspan_dataset,convert_gspan_graph
+from typing import Iterable
+@register
+class GSpanMiner(SubgraphMiner):
+    name = "gspan"
+    expected_input_format = "gspan"
+    def __init__(
+        self,
+        min_support: int = 2,
+        directed: bool = False,
+        min_vertices: int = 1,
+        max_vertices: Optional[int] = None,
+        visualize: bool = False,
+        write_out: bool = True,
+        verbose: bool = False,
+    ) -> None:
+        super().__init__(verbose=verbose)
+        # Parameter validation (publish-safe defaults)
+        if not isinstance(min_support, int) or min_support <= 0:
+            raise ParameterValidationError(f"min_support must be a positive int; got {min_support!r}")
+        if not isinstance(min_vertices, int) or min_vertices < 1:
+            raise ParameterValidationError(f"min_vertices must be an int >= 1; got {min_vertices!r}")
+        if max_vertices is not None:
+            if not isinstance(max_vertices, int) or max_vertices < min_vertices:
+                raise ParameterValidationError(
+                    f"max_vertices must be None or an int >= min_vertices ({min_vertices}); got {max_vertices!r}"
+                )
+        self.min_support = min_support
+        self.directed = directed
+        self.min_vertices = min_vertices
+        self.max_vertices = max_vertices
+        self.visualize = visualize
+        self.write_out = write_out
+    def _run_on_dataset(self, db_path: Path, support: int):
+        from . import gspan_cpp as gspan_mine
+        t0 = time.time()
+        db_path = assert_regular_file(db_path)
+        data = safe_read_text(db_path)
+        # TODO: plumb through additional kwargs once exposed by the binding.
+        res = gspan_mine.mine_from_string(
+            data,
+            minsup=support,
+            directed=self.directed,
+            maxpat_min=self.min_vertices,
+            maxpat_max=self.max_vertices if self.max_vertices is not None else 0xFFFFFFFF,
+        )
+        runtime = time.time() - t0
+        return runtime, res
+    def mine(self, graphs: List[Graph], min_support: Optional[int] = None, **kwargs) -> MiningResult:
+        graphs = list(self._handle_weights(graphs))
+        support = int(min_support if min_support is not None else self.min_support)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+            db_path = tmpdir_path / "gspan_db.data"
+            # write graphs in gspan format
+            write_gspan_dataset(graphs, db_path)
+            runtime, gs = self._run_on_dataset(db_path, support)
+        patterns = []
+        for pid,rec in enumerate(gs):
+            pattern_graph = Graph(edges=rec["edges"],nodes=rec['nodes'])
+            support = rec['support']
+            patterns.append(
+                SubgraphPattern(
+                    pid=pid,
+                    graph=pattern_graph,
+                    support=support,
+                    frequency=None,
+                    occurrences=[],  # can fill later if track embeddings
+                    attributes={
+                        "num_vertices": pattern_graph.number_of_nodes(),
+                        "graph_ids": rec["graph_ids"],
+                    },
+                )
+            )
+        return MiningResult(
+            patterns=patterns,
+            algorithm=self.name,
+            params=dict(
+                min_support=support,
+                directed=self.directed,
+                min_vertices=self.min_vertices,
+                max_vertices=self.max_vertices,
+                visualize=self.visualize,
+                write_out=self.write_out,
+            ),
+            runtime=runtime,
+            metadata={"backend": "gspan-mining"},
+        )
+    def mine_native(self, path: str | Path, min_support: int, **kwargs) -> MiningResult:
+        """Run gSpan directly on a user-supplied gSpan dataset file."""
+        db_path = Path(path)
+        support = int(min_support)
+        runtime, gs = self._run_on_dataset(db_path, support)
+        patterns = []
+        for pid,rec in enumerate(gs):
+            pattern_graph = Graph(edges=rec["edges"],nodes=rec['nodes'])
+            support = rec['support']
+            patterns.append(
+                SubgraphPattern(
+                    pid=pid,
+                    graph=pattern_graph,
+                    support=support,
+                    frequency=None,
+                    occurrences=[],  # can fill later if track embeddings
+                    attributes={
+                        "num_vertices": pattern_graph.number_of_nodes(),
+                        "graph_ids": rec["graph_ids"],
+                    },
+                )
+            )
+        return MiningResult(
+            patterns=patterns,
+            algorithm=self.name,
+            params=dict(
+                min_support=support,
+                directed=self.directed,
+                min_vertices=self.min_vertices,
+                max_vertices=self.max_vertices,
+                visualize=self.visualize,
+                write_out=self.write_out,
+                input_format="gspan",
+            ),
+            runtime=runtime,
+            metadata={"backend": "gspan-mining", "input_dataset": str(db_path)},
+        )

submine/algorithms/gspan_cpp.cpython-311-x86_64-linux-musl.so ADDED Viewed

Binary file

submine/algorithms/sopagrami.py ADDED Viewed

@@ -0,0 +1,250 @@
+# submine/algorithms/sopagrami.py
+from __future__ import annotations
+import tempfile
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+from .base import SubgraphMiner, register
+from ..core.graph import Graph
+from ..core.result import MiningResult, SubgraphPattern
+from ..io.sopagrami import read_lg, write_lg
+from ..errors import ParameterValidationError
+@register
+class SoPaGraMiMiner(SubgraphMiner):
+    """
+    Python wrapper around the C++ SoPaGraMi implementation.
+    Note: SoPaGraMi mines frequent subgraphs from a *single* large graph,
+    not a dataset of many graphs.
+    """
+    name = "sopagrami"
+    expected_input_format = "lg"
+    multi_graph_policy = "reject"
+    def __init__(
+        self,
+        tau: int = 2,
+        directed: bool = False,
+        sorted_seeds: bool = True,
+        num_threads: int = 0,
+        compute_full_support: bool = True,
+        verbose: bool = False,
+    ) -> None:
+        super().__init__(verbose=verbose)
+        # Parameter validation (publish-safe defaults)
+        if not isinstance(tau, int) or tau < 1:
+            raise ParameterValidationError(f"tau must be an int >= 1; got {tau!r}")
+        if not isinstance(num_threads, int) or num_threads < 0:
+            raise ParameterValidationError(f"num_threads must be an int >= 0; got {num_threads!r}")
+        self.tau = tau
+        self.directed = directed
+        self.sorted_seeds = sorted_seeds
+        self.num_threads = num_threads
+        self.compute_full_support = compute_full_support
+    def check_availability(self):
+        try:
+            from . import sopagrami_cpp
+        except ImportError as e:
+            raise RuntimeError("SoPaGraMi backend not available") from e
+    def mine(
+        self,
+        graphs: Iterable[Graph],
+        min_support: Optional[int] = None,
+        out_dir:str=None,dump_images_csv:bool=False,
+                           max_images_per_vertex:int=50,dump_sample_embeddings:bool=False
+    ) -> MiningResult:
+        self.check_availability()
+        # Handle weights explicitly (SoPaGraMi backend treats graphs as labeled, not weighted).
+        graphs = self._handle_weights(graphs)
+        # SoPaGraMi expects a single graph
+        graphs_list = list(graphs)
+        if len(graphs_list) != 1:
+            raise ValueError(
+                "SoPaGraMiMiner currently expects exactly one Graph (single large graph). "
+                f"Got {len(graphs_list)}."
+            )
+        G = graphs_list[0]
+        tau = int(min_support if min_support is not None else self.tau)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+            lg_path = tmpdir_path / "graph.lg"
+            # 1) write graph as .lg
+            write_lg(G, lg_path, directed=self.directed)
+            # 2) call C++ binding
+            runtime, patterns_raw = self._run_backend_on_lg(lg_path, tau=tau,out_dir=out_dir,dump_images_csv=dump_images_csv,
+                           max_images_per_vertex=max_images_per_vertex,dump_sample_embeddings=dump_sample_embeddings)
+        # 3) Convert to our SubgraphPattern representation
+        patterns: List[SubgraphPattern] = []
+        for pid, pd in enumerate(patterns_raw):
+            node_labels = list(pd["node_labels"])
+            edges_raw = list(pd["edges"])
+            support = int(pd["full_support"])
+            key = pd["key"]
+            # SoPaGraMi pattern node IDs are 0..k-1
+            k = len(node_labels)
+            nodes = list(range(k))
+            # Build our Graph for the pattern
+            pat_edges = []
+            edge_labels = {}
+            for (a, b, el, dirflag) in edges_raw:
+                a = int(a)
+                b = int(b)
+                # Our Graph is undirected; we store the undirected edge,
+                # and put direction info into the label if needed.
+                u, v = (a, b) if a <= b else (b, a)
+                pat_edges.append((u, v))
+                label = el
+                if self.directed and dirflag == 1:
+                    # encode direction in the label for now
+                    label = f"{el}->"
+                edge_labels[(u, v)] = label
+            node_label_map = {i: lbl for i, lbl in enumerate(node_labels)}
+            pat_graph = Graph(
+                nodes=nodes,
+                edges=pat_edges,
+                node_labels=node_label_map,
+                edge_labels=edge_labels,
+            )
+            patterns.append(
+                SubgraphPattern(
+                    pid=pid,
+                    graph=pat_graph,
+                    support=support,
+                    frequency=None,
+                    occurrences=[],
+                    attributes={
+                        "key": key,
+                        "k": k,
+                        "num_edges": len(pat_edges),
+                    },
+                )
+            )
+        return MiningResult(
+            patterns=patterns,
+            algorithm=self.name,
+            params={
+                "tau": tau,
+                "directed": self.directed,
+                "sorted_seeds": self.sorted_seeds,
+                "num_threads": self.num_threads,
+                "compute_full_support": self.compute_full_support,
+            },
+            runtime=runtime,
+            metadata={"backend": "sopagrami_cpp"},
+        )
+    def mine_native(self, lg_path: str | Path, min_support: Optional[int] = None, out_dir:str=None,dump_images_csv:bool=False,
+                           max_images_per_vertex:int=50,dump_sample_embeddings:bool=False) -> MiningResult:
+        """Run SoPaGraMi directly on a user-supplied ``.lg`` file.
+        This avoids re-parsing/re-writing the file, which is important for
+        large graphs and for preserving any optional attributes present in the
+        original ``.lg``.
+        """
+        self.check_availability()
+        lg_path = Path(lg_path)
+        if lg_path.suffix.lower() != ".lg":
+            raise ValueError(f"Expected a .lg file for SoPaGraMi; got: {lg_path}")
+        tau = int(min_support if min_support is not None else self.tau)
+        runtime, patterns_raw = self._run_backend_on_lg(lg_path, tau=tau,out_dir=out_dir,dump_images_csv=dump_images_csv,
+                           max_images_per_vertex=max_images_per_vertex,dump_sample_embeddings=dump_sample_embeddings)
+        # Convert patterns (same as in mine())
+        patterns: List[SubgraphPattern] = []
+        for pid, pd in enumerate(patterns_raw):
+            node_labels = list(pd["node_labels"])
+            edges_raw = list(pd["edges"])
+            support = int(pd["full_support"])
+            key = pd["key"]
+            k = len(node_labels)
+            nodes = list(range(k))
+            pat_edges = []
+            edge_labels = {}
+            for (a, b, el, dirflag) in edges_raw:
+                a = int(a)
+                b = int(b)
+                u, v = (a, b) if a <= b else (b, a)
+                pat_edges.append((u, v))
+                label = el
+                if self.directed and dirflag == 1:
+                    label = f"{el}->"
+                edge_labels[(u, v)] = label
+            node_label_map = {i: lbl for i, lbl in enumerate(node_labels)}
+            pat_graph = Graph(nodes=nodes, edges=pat_edges, node_labels=node_label_map, edge_labels=edge_labels)
+            patterns.append(
+                SubgraphPattern(
+                    pid=pid,
+                    graph=pat_graph,
+                    support=support,
+                    frequency=None,
+                    occurrences=[],
+                    attributes={"key": key, "k": k, "num_edges": len(pat_edges)},
+                )
+            )
+        return MiningResult(
+            patterns=patterns,
+            algorithm=self.name,
+            params={
+                "tau": tau,
+                "directed": self.directed,
+                "sorted_seeds": self.sorted_seeds,
+                "num_threads": self.num_threads,
+                "compute_full_support": self.compute_full_support,
+                "input_format": "lg"
+            },
+            runtime=runtime,
+            metadata={"backend": "sopagrami_cpp", "input_lg": str(lg_path)},
+        )
+    def _run_backend_on_lg(self, lg_path: Path, tau: int,out_dir:str=None,dump_images_csv:bool=False,
+                           max_images_per_vertex:int=50,dump_sample_embeddings:bool=False):
+        from . import sopagrami_cpp
+        t0 = time.time()
+        self.logger.debug("Running SoPaGraMi on %s", lg_path)
+        if out_dir is None:
+            out_dir = "sopagrami_result"
+        patterns_raw = sopagrami_cpp.run_on_lg_file(
+            str(lg_path),
+            tau=tau,
+            directed=self.directed,
+            sorted_seeds=self.sorted_seeds,
+            num_threads=self.num_threads,
+            compute_full_support=self.compute_full_support,
+            dump_images_csv = dump_images_csv,
+            out_dir = out_dir,
+            max_images_per_vertex = max_images_per_vertex,
+            dump_sample_embeddings=dump_sample_embeddings
+        )
+        return time.time() - t0, patterns_raw

submine/algorithms/sopagrami_cpp.cpython-311-x86_64-linux-musl.so ADDED Viewed

Binary file