PyPI - submine - Versions diffs - 0.1.1__cp312-cp312-macosx_11_0_arm64.whl - Mend

submine 0.1.1__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

submine/__init__.py +37 -0
submine/algorithms/__init__.py +23 -0
submine/algorithms/base.py +143 -0
submine/algorithms/gspan.py +156 -0
submine/algorithms/gspan_cpp.cpython-312-darwin.so +0 -0
submine/algorithms/sopagrami.py +250 -0
submine/algorithms/sopagrami_cpp.cpython-312-darwin.so +0 -0
submine/api.py +134 -0
submine/backends/__init__.py +0 -0
submine/backends/gspan/CMakeLists.txt +65 -0
submine/backends/gspan/dfs.cpp +98 -0
submine/backends/gspan/graph.cpp +165 -0
submine/backends/gspan/gspan.cpp +776 -0
submine/backends/gspan/gspan.h +296 -0
submine/backends/gspan/ismin.cpp +124 -0
submine/backends/gspan/main.cpp +106 -0
submine/backends/gspan/misc.cpp +177 -0
submine/backends/gspan/python_bindings.cpp +133 -0
submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
submine/backends/sopagrami/cpp/src/main.cpp +94 -0
submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
submine/cli/__init__.py +6 -0
submine/cli/main.py +87 -0
submine/core/__init__.py +12 -0
submine/core/graph.py +179 -0
submine/core/result.py +121 -0
submine/datasets/__init__.py +11 -0
submine/datasets/loaders.py +145 -0
submine/errors.py +41 -0
submine/io/__init__.py +30 -0
submine/io/common.py +173 -0
submine/io/gexf.py +88 -0
submine/io/gspan.py +268 -0
submine/io/sopagrami.py +143 -0
submine/io/transcode.py +147 -0
submine/registry.py +8 -0
submine/utils/__init__.py +6 -0
submine/utils/checks.py +115 -0
submine/utils/logging.py +41 -0
submine-0.1.1.dist-info/METADATA +178 -0
submine-0.1.1.dist-info/RECORD +47 -0
submine-0.1.1.dist-info/WHEEL +6 -0
submine-0.1.1.dist-info/licenses/LICENSE +21 -0

submine/backends/sopagrami/cpp/src/dump.cpp ADDED Viewed

@@ -0,0 +1,262 @@
+#include "alg.hpp"
+#include <filesystem>
+#include <fstream>
+#include <sstream>
+#include <functional>
+#include <climits>
+namespace fs = std::filesystem;
+namespace algo {
+    // ---- utilities to reuse your edge-check logic ----
+// Check one pattern edge `e` against a candidate pair of graph nodes.
+// (va, vb) are the pattern endpoints under consideration and (ga, gb) the graph
+// nodes they are mapped to. An edge that does not join va and vb imposes no
+// constraint here and therefore passes.
+static inline bool ok_edge_map(const algo::DataGraph& G,
+                               const algo::Pattern::PEdge& e,
+                               int va, int vb,   // pattern endpoints
+                               int ga, int gb)   // graph nodes mapped to (va,vb)
+{
+    const bool same_order    = (e.a == va && e.b == vb);
+    const bool swapped_order = (e.a == vb && e.b == va);
+    if (!same_order && !swapped_order) return true;
+    if (e.dir != 1) {
+        // Not a directed pattern edge: a graph edge in either direction will do.
+        return G.has_edge(ga, gb, e.el) || G.has_edge(gb, ga, e.el);
+    }
+    // Directed pattern edge (dir == 1): the graph edge must follow a -> b.
+    return same_order ? G.has_edge(ga, gb, e.el)
+                      : G.has_edge(gb, ga, e.el);
+}
+    // Forward check: quick “does gi have any neighbor candidate for each incident edge?”
+// Forward-checking test for tentatively mapping pattern vertex `v` to graph node `gi`.
+// For every pattern edge incident to `v`:
+//   * if the other endpoint is already assigned, the edge must hold for that pair;
+//   * otherwise at least one node in the other endpoint's domain must support it.
+// `assign[w] == -1` marks an unassigned pattern vertex.
+static bool forward_ok(const algo::DataGraph& G, const algo::Pattern& P,
+                       int v, int gi,
+                       const std::vector<int>& assign,
+                       const std::vector<std::vector<int>>& dom)
+{
+    for (const auto& edge : P.pedges) {
+        const bool v_is_a = (edge.a == v);
+        if (!v_is_a && edge.b != v) continue;   // edge does not touch v
+        const int other = v_is_a ? edge.b : edge.a;
+        // Keep the graph-node arguments aligned with (edge.a, edge.b):
+        // gi takes whichever slot v occupies in the edge.
+        auto supported_by = [&](int g_other) {
+            return v_is_a ? ok_edge_map(G, edge, edge.a, edge.b, gi, g_other)
+                          : ok_edge_map(G, edge, edge.a, edge.b, g_other, gi);
+        };
+        if (assign[other] != -1) {
+            if (!supported_by(assign[other])) return false;
+            continue;
+        }
+        bool has_support = false;
+        for (int candidate : dom[other]) {
+            if (supported_by(candidate)) { has_support = true; break; }
+        }
+        if (!has_support) return false;
+    }
+    return true;
+}
+// Backtracking search for one embedding of P in G in which pattern vertex
+// `fixVar` is mapped to graph node `fixNode`.
+// Candidates for each pattern vertex are the graph nodes carrying the same label,
+// and no graph node is used for two pattern vertices.
+// Returns true and leaves the full mapping (pattern vertex -> graph node) in
+// `assignment`; returns false when no such embedding is found.
+static bool find_embedding_with_fixed(const algo::DataGraph& G,
+                                      const algo::Pattern& P,
+                                      int fixVar, int fixNode,
+                                      std::vector<int>& assignment)
+{
+    const int k = (int)P.vlab.size();
+    assignment.assign(k, -1);
+    // Candidate domain of each pattern vertex: every graph node with its label.
+    std::vector<std::vector<int>> dom(k);
+    for (int var = 0; var < k; ++var) {
+        const auto hit = G.lab2nodes.find(P.vlab[var]);
+        if (hit == G.lab2nodes.end()) return false;
+        dom[var].assign(hit->second.begin(), hit->second.end());
+        if (dom[var].empty()) return false;
+    }
+    // Pin the fixed vertex and reserve its graph node.
+    assignment[fixVar] = fixNode;
+    std::vector<char> used(G.vlabels.size(), 0);
+    used[fixNode] = 1;
+    // Pick the unassigned vertex with the fewest viable candidates. Counting stops
+    // early once a vertex can no longer beat the current best.
+    auto pick_most_constrained = [&](){
+        int chosen = -1;
+        int fewest = INT_MAX;
+        for (int var = 0; var < k; ++var) {
+            if (assignment[var] != -1) continue;
+            int viable = 0;
+            for (int node : dom[var]) {
+                if (used[node]) continue;
+                if (!forward_ok(G, P, var, node, assignment, dom)) continue;
+                if (++viable >= fewest) break;
+            }
+            if (viable < fewest) { chosen = var; fewest = viable; }
+        }
+        return chosen;
+    };
+    std::function<bool()> extend = [&]() -> bool {
+        bool complete = true;
+        for (int var = 0; var < k && complete; ++var) {
+            complete = (assignment[var] != -1);
+        }
+        if (complete) return true;
+        const int var = pick_most_constrained();
+        if (var == -1) return false;
+        for (int node : dom[var]) {
+            if (used[node]) continue;
+            if (!forward_ok(G, P, var, node, assignment, dom)) continue;
+            assignment[var] = node;
+            used[node] = 1;
+            if (extend()) return true;
+            // Undo the tentative choice before trying the next candidate.
+            used[node] = 0;
+            assignment[var] = -1;
+        }
+        return false;
+    };
+    return extend();
+}
+// Compute, for every pattern vertex, graph nodes that take part in at least one
+// full embedding of P (the MNI "image set" of that vertex).
+// Each candidate node is tested by searching for a single embedding with that
+// vertex pinned to the node. A node is stored first and the cap checked
+// afterwards; a negative `max_per_vertex` disables the cap.
+// If some pattern label does not occur in G, every image set is returned empty.
+static std::vector<std::vector<int>>
+collect_mni_image_sets(const algo::DataGraph& G,
+                       const algo::Pattern& P,
+                       int max_per_vertex = 100)
+{
+    const int k = (int)P.vlab.size();
+    std::vector<std::vector<int>> images(k);
+    // Label-consistent candidates per pattern vertex.
+    std::vector<std::vector<int>> candidates(k);
+    for (int v = 0; v < k; ++v) {
+        const auto hit = G.lab2nodes.find(P.vlab[v]);
+        if (hit == G.lab2nodes.end()) return images;
+        candidates[v].assign(hit->second.begin(), hit->second.end());
+    }
+    const bool capped = (max_per_vertex >= 0);
+    for (int v = 0; v < k; ++v) {
+        std::vector<int>& kept = images[v];
+        for (int node : candidates[v]) {
+            std::vector<int> scratch;   // embedding found, if any; not kept
+            if (!find_embedding_with_fixed(G, P, v, node, scratch)) continue;
+            kept.push_back(node);
+            if (capped && (int)kept.size() >= max_per_vertex) break;
+        }
+    }
+    return images;
+}
+// Write per-vertex image sets as CSV with header "pattern_vertex,graph_node_id":
+// one row per (pattern vertex index, graph node id) pair.
+// Does nothing if the file cannot be opened. `P` is not used by this function.
+static void write_pattern_images_csv(const algo::Pattern& P,
+                                     const std::vector<std::vector<int>>& images,
+                                     const std::string& path_csv)
+{
+    std::ofstream csv(path_csv);
+    if (!csv) return;
+    csv << "pattern_vertex,graph_node_id\n";
+    size_t vertex = 0;
+    for (const auto& nodes : images) {
+        for (int node : nodes) {
+            csv << vertex << "," << node << "\n";
+        }
+        ++vertex;
+    }
+}
+// Write sample embeddings as CSV: one row per embedding, prefixed by its index.
+// The header is "emb_id,v0,v1,..." with one column per pattern vertex; each row
+// lists the values of the corresponding `emb` entry in order.
+// Does nothing if the file cannot be opened.
+static void write_sample_embeddings_csv(const algo::Pattern& P,
+                                        const std::vector<std::vector<int>>& emb,
+                                        const std::string& path_csv)
+{
+    std::ofstream csv(path_csv);
+    if (!csv) return;
+    // Header row.
+    csv << "emb_id";
+    const size_t columns = P.vlab.size();
+    for (size_t col = 0; col < columns; ++col) {
+        csv << ",v" << col;
+    }
+    csv << "\n";
+    // Data rows.
+    size_t emb_id = 0;
+    for (const auto& row : emb) {
+        csv << emb_id;
+        for (int node : row) {
+            csv << "," << node;
+        }
+        csv << "\n";
+        ++emb_id;
+    }
+}
+    // _____________________________________________________
+// Escape a string for use inside a double-quoted DOT attribute value:
+// every '"' and '\' is prefixed with a backslash; other characters are copied.
+static std::string sanitize_dot(const std::string& s){
+    std::string escaped;
+    escaped.reserve(s.size() * 2);   // worst case: every character is escaped
+    for (const char ch : s) {
+        switch (ch) {
+            case '"':
+            case '\\':
+                escaped.push_back('\\');
+                break;
+            default:
+                break;
+        }
+        escaped.push_back(ch);
+    }
+    return escaped;
+}
+// Write the pattern in .lg text form: one "v <index> <label>" line per vertex
+// followed by one "e <a> <b> <label>" line per edge.
+// Does nothing if the file cannot be opened.
+static void write_pattern_as_lg(const algo::Pattern& P, const std::string& path){
+    std::ofstream lg(path);
+    if (!lg) return;
+    size_t vid = 0;
+    for (const auto& label : P.vlab) {
+        lg << "v " << vid << " " << label << "\n";
+        ++vid;
+    }
+    for (const auto& edge : P.pedges) {
+        lg << "e " << edge.a << " " << edge.b << " " << edge.el << "\n";
+    }
+}
+// Write the pattern as a Graphviz DOT file.
+//   P        : pattern to render (vertex labels and labelled edges)
+//   directed : true if the mined graph is directed; every edge is then an arrow
+//   path     : output file; nothing is written if it cannot be opened
+// An edge is drawn with an arrow when `directed` is set or the edge itself has
+// dir == 1. DOT only accepts the `->` operator inside a `digraph`, so the file is
+// emitted as a digraph whenever at least one edge needs an arrow; edges without
+// an arrow in such a file are marked `dir=none`. A pattern with no arrows is
+// emitted as a plain `graph` using `--`.
+static void write_pattern_as_dot(const algo::Pattern& P, bool directed, const std::string& path){
+    std::ofstream out(path);
+    if (!out) return;
+    // Decide the graph kind up front: mixing `->` into `graph { }` is a syntax error.
+    bool any_arrow = directed;
+    for (const auto& e : P.pedges){
+        if (any_arrow) break;
+        any_arrow = (e.dir == 1);
+    }
+    out << (any_arrow ? "digraph G {\n" : "graph G {\n");
+    // nodes
+    for (size_t i=0;i<P.vlab.size();++i){
+        out << "  " << i << " [shape=circle,label=\"" << sanitize_dot(P.vlab[i]) << "\"];\n";
+    }
+    // edges
+    for (const auto& e : P.pedges){
+        const bool use_arrow = directed || e.dir==1;
+        out << "  " << e.a << (any_arrow ? " -> " : " -- ") << e.b
+            << " [label=\"" << sanitize_dot(e.el) << "\"";
+        // Inside a digraph an edge without orientation is drawn without arrowhead.
+        if (any_arrow && !use_arrow) out << ",dir=none";
+        out << "];\n";
+    }
+    out << "}\n";
+}
+// Dump every frequent pattern in `out` to files under `dump_dir`.
+//   index.tsv                : one row per pattern (id, k, m, full_support, key, paths)
+//   pat_<id>_k<k>_e<m>_full<s>.lg / .dot : always written
+//   ....images.csv           : written when `dump_images_csv` is set; image sets
+//                              are computed against G, capped by `max_images_per_vertex`
+//   ....emb.csv              : written when `dump_sample_embeddings` is set; embedding
+//                              enumeration is not implemented yet, so only the
+//                              header row is produced and `sample_limit` is unused
+// The directory is created if needed.
+void dump_patterns_to_dir(
+    const Output& out,
+    const std::string& dump_dir,
+    bool directed,
+    const DataGraph& G,
+    bool dump_images_csv,
+    int  max_images_per_vertex,
+    bool dump_sample_embeddings,
+    int  sample_limit
+) {
+    fs::create_directories(dump_dir);
+    // ---- index.tsv ----
+    std::ofstream index(fs::path(dump_dir) / "index.tsv");
+    index << "id\tk\tm\tfull_support\tkey\tlg_path\tdot_path\n";
+    size_t id = 0;
+    for (const auto& found : out.frequent_patterns) {
+        const auto& pattern = found.pat;
+        const size_t k = pattern.vlab.size();
+        const size_t m = pattern.pedges.size();
+        // Common file-name stem shared by all artifacts of this pattern.
+        std::string base = dump_dir;
+        base += "/pat_";
+        base += std::to_string(id);
+        base += "_k";
+        base += std::to_string(k);
+        base += "_e";
+        base += std::to_string(m);
+        base += "_full";
+        base += std::to_string(found.full_support);
+        const std::string lg_path  = base + ".lg";
+        const std::string dot_path = base + ".dot";
+        // Shape artifacts are always written.
+        write_pattern_as_lg (pattern, lg_path);
+        write_pattern_as_dot(pattern, directed, dot_path);
+        // Optional: image sets (can be heavy).
+        if (dump_images_csv) {
+            const auto images = collect_mni_image_sets(G, pattern, max_images_per_vertex);
+            write_pattern_images_csv(pattern, images, base + ".images.csv");
+        }
+        // Optional: sample full embeddings (stub).
+        if (dump_sample_embeddings) {
+            std::vector<std::vector<int>> samples;
+            // TODO: enumerate_embeddings(G, pattern, sample_limit, samples); // not implemented yet
+            write_sample_embeddings_csv(pattern, samples, base + ".emb.csv");
+        }
+        index << id << '\t' << k << '\t' << m << '\t'
+              << found.full_support << '\t' << pattern.key()
+              << '\t' << lg_path << '\t' << dot_path << "\n";
+        ++id;
+    }
+}
+} // namespace algo

submine/backends/sopagrami/cpp/src/main.cpp ADDED Viewed

@@ -0,0 +1,94 @@
+#include "alg.hpp"
+#include <climits>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+using namespace algo;
+namespace fs = std::filesystem;
+int main(int argc, char** argv){
+    // Usage:
+    //   run <graph.lg> [tau] [directed(0/1)] [sorted(0/1)] [threads]
+    //       [dump_dir] [dump_images(0/1)] [max_images_per_vertex]
+    //       [dump_emb(0/1)] [sample_limit]
+    //
+    // Defaults:
+    //   tau=2, directed=0, sorted=1 (sorted seed ordering), threads=4,
+    //   dump_dir="" (nothing dumped), dump_images=0, max_images_per_vertex=200,
+    //   dump_emb=0, sample_limit=50
+    //
+    // Returns 0 on success, 1 on missing or malformed arguments.
+    const auto print_usage = [](){
+        std::cerr
+          << "Usage: run <graph.lg> [tau] [directed(0/1)] [sorted(0/1)] [threads]\n"
+          << "               [dump_dir] [dump_images(0/1)] [max_images_per_vertex]\n"
+          << "               [dump_emb(0/1)] [sample_limit]\n";
+    };
+    if (argc < 2){
+        print_usage();
+        return 1;
+    }
+    const std::string path = argv[1];
+    int  tau      = 2;
+    bool directed = false;
+    bool sorted   = true;
+    int  threads  = 4;
+    std::string dump_dir = (argc > 6 ? argv[6] : "");
+    bool dump_images_csv = false;
+    int  max_images_per_vertex = 200;
+    bool dump_sample_embeddings = false;
+    int  sample_limit = 50;
+    // Parse every optional numeric argument before mining: std::stoi throws on
+    // malformed input, and a typo in a dump option should be reported right away
+    // rather than aborting the process after the (possibly long) mining run.
+    try {
+        if (argc > 2)  tau      = std::stoi(argv[2]);
+        if (argc > 3)  directed = (std::stoi(argv[3]) != 0);
+        if (argc > 4)  sorted   = (std::stoi(argv[4]) != 0);
+        if (argc > 5)  threads  = std::stoi(argv[5]);
+        if (argc > 7)  dump_images_csv        = (std::stoi(argv[7]) != 0);
+        if (argc > 8)  max_images_per_vertex  = std::stoi(argv[8]);
+        if (argc > 9)  dump_sample_embeddings = (std::stoi(argv[9]) != 0);
+        if (argc > 10) sample_limit           = std::stoi(argv[10]);
+    } catch (const std::invalid_argument&) {
+        std::cerr << "Error: numeric arguments must be integers\n";
+        print_usage();
+        return 1;
+    } catch (const std::out_of_range&) {
+        std::cerr << "Error: numeric argument out of range\n";
+        print_usage();
+        return 1;
+    }
+    DataGraph G;
+    G.load_from_lg(path, directed);
+    // Graph stats
+    std::cout << "Graph loaded: |V|=" << G.vlabels.size() << ", |E|=";
+    long long edge_count = 0;
+    for (const auto& adj_list : G.adj) edge_count += (long long)adj_list.size();
+    // Adjacency entries are halved for undirected input to report each edge once.
+    if (!directed) edge_count /= 2;
+    std::cout << edge_count << "\n";
+    // Params
+    Params p;
+    p.tau = tau;
+    p.directed = directed;
+    p.sorted_seeds = sorted;     // sorted seed ordering toggle
+    p.num_threads = threads;     // original note: run_sopagrami treats <=0 as "all available"
+    p.compute_full_support = true;
+    std::cout << "Settings: tau=" << p.tau
+              << " directed=" << (p.directed?1:0)
+              << " sorted=" << (p.sorted_seeds?1:0)
+              << " threads=" << p.num_threads
+              << "\n\n";
+    // Run
+    auto out = run_sopagrami(G, p);
+    // Output
+    std::cout << "Frequent patterns: " << out.frequent_patterns.size() << "\n";
+    for (const auto& f : out.frequent_patterns){
+        std::cout << "k=" << f.pat.vlab.size()
+                  << " |E|=" << f.pat.pedges.size()
+                  << " full=" << f.full_support
+                  << " key=" << f.pat.key() << "\n";
+    }
+    // Dump patterns to a directory only when one was given.
+    if (!dump_dir.empty()){
+        dump_patterns_to_dir(out, dump_dir, p.directed, G,
+                            dump_images_csv, max_images_per_vertex,
+                            dump_sample_embeddings, sample_limit);
+        std::cout << "Wrote pattern files to: " << dump_dir
+                << " (index.tsv, .lg, .dot"
+                << (dump_images_csv ? ", .images.csv" : "")
+                << (dump_sample_embeddings ? ", .emb.csv" : "")
+                << ")\n";
+    }
+    return 0;
+}

submine/backends/sopagrami/cpp/src/python_bindings.cpp ADDED Viewed

@@ -0,0 +1,123 @@
+// python_bindings.cpp
+#include "alg.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+namespace py = pybind11;
+using namespace algo;
+// Load the .lg graph at `path`, run SoPaGraMi with the given parameters and
+// return the frequent patterns as a Python list of dicts with the keys
+// "node_labels", "edges", "full_support" and "key".
+// When `out_dir` is non-empty the pattern artifacts are also dumped there
+// (see dump_patterns_to_dir for the dump-related arguments).
+py::list run_on_lg_file(
+    const std::string& path,
+    int  tau,
+    bool directed,
+    bool sorted_seeds,
+    int  num_threads,
+    bool compute_full_support,
+    const std::string& out_dir,
+    bool dump_images_csv,
+    int  max_images_per_vertex,
+    bool dump_sample_embeddings,
+    int  sample_limit
+) {
+    DataGraph graph;
+    graph.load_from_lg(path, directed);
+    Params params;
+    params.tau                  = tau;
+    params.directed             = directed;
+    params.sorted_seeds         = sorted_seeds;
+    params.num_threads          = num_threads;
+    params.compute_full_support = compute_full_support;
+    Output mined = run_sopagrami(graph, params);
+    // Optional side effect: write pattern files to the requested directory.
+    const bool want_dump = !out_dir.empty();
+    if (want_dump) {
+        dump_patterns_to_dir(mined, out_dir, params.directed, graph,
+                             dump_images_csv, max_images_per_vertex,
+                             dump_sample_embeddings, sample_limit);
+    }
+    // Convert the patterns to plain Python objects.
+    py::list patterns;
+    for (const auto& found : mined.frequent_patterns) {
+        const Pattern& pat = found.pat;
+        // Each edge becomes (a, b, label, dir); dir: 0 undirected, 1 a->b.
+        py::list edge_tuples;
+        for (const auto& pe : pat.pedges) {
+            edge_tuples.append(py::make_tuple(pe.a, pe.b, pe.el, pe.dir));
+        }
+        py::dict entry;
+        entry["node_labels"]  = pat.vlab; // std::vector<std::string>
+        entry["edges"]        = std::move(edge_tuples);
+        entry["full_support"] = found.full_support;
+        entry["key"]          = pat.key();
+        patterns.append(std::move(entry));
+    }
+    return patterns;
+}
+// Python module definition: exposes run_on_lg_file with keyword arguments and
+// defaults. The docstring below must stay in sync with the py::arg defaults.
+PYBIND11_MODULE(sopagrami_cpp, m) {
+    m.doc() = "pybind11 bindings for SoPaGraMi (C++17)";
+    m.def(
+        "run_on_lg_file",
+        &run_on_lg_file,
+        py::arg("path"),
+        py::arg("tau")                  = 2,
+        py::arg("directed")             = false,
+        py::arg("sorted_seeds")         = true,
+        py::arg("num_threads")          = 0,
+        py::arg("compute_full_support") = true,
+        // dump-related args
+        py::arg("out_dir")              = std::string("result"),
+        py::arg("dump_images_csv")      = false,
+        py::arg("max_images_per_vertex")= 200,
+        py::arg("dump_sample_embeddings")= false,
+        py::arg("sample_limit")         = 50,
+        R"doc(
+Run SoPaGraMi on an input .lg graph.
+Parameters
+----------
+path : str
+    Path to input .lg file.
+tau : int, default=2
+directed : bool, default=False
+sorted_seeds : bool, default=True
+num_threads : int, default=0
+    0 means "use default / auto" as implemented in C++ core.
+compute_full_support : bool, default=True
+out_dir : str, default="result"
+    If non-empty, dumps pattern artifacts to this directory:
+    index.tsv, per-pattern .lg, .dot, plus optional .images.csv and .emb.csv.
+    Pass an empty string to skip writing any files.
+dump_images_csv : bool, default=False
+max_images_per_vertex : int, default=200
+dump_sample_embeddings : bool, default=False
+sample_limit : int, default=50
+Returns
+-------
+list[dict]
+    Each dict contains: node_labels, edges, full_support, key.
+)doc"
+    );
+}

submine/cli/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Command line interface for submine.
+This subpackage defines the entry point for the optional `submine`
+command line tool. Users can run mining algorithms from the shell
+without writing Python code. See :mod:`submine.cli.main` for details.
+"""

submine/cli/main.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""Entry point for the submine command line interface.
+Use this CLI to run frequent subgraph mining algorithms from the shell.
+It supports selecting an algorithm, loading a dataset and specifying
+common parameters such as the minimum support threshold. The results
+are printed to standard output.
+Example::
+    python -m submine.cli.main --algorithm gspan --dataset toy --min-support 2
+"""
+from __future__ import annotations
+import argparse
+from typing import List
+from .. import get_algorithm, load_dataset
+def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run frequent subgraph mining algorithms")
+    parser.add_argument(
+        "--algorithm",
+        "-a",
+        required=True,
+        help="Name of the algorithm to run (e.g., gspan, grami)"
+    )
+    parser.add_argument(
+        "--dataset",
+        "-d",
+        default="toy",
+        help="Dataset name to load (e.g., toy, mutag, enzymes)"
+    )
+    parser.add_argument(
+        "--min-support",
+        "-s",
+        type=int,
+        default=1,
+        help="Minimum support threshold (positive integer)"
+    )
+    parser.add_argument(
+        "--top-k",
+        "-k",
+        type=int,
+        default=5,
+        help="Print the top K subgraphs by support"
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Enable verbose logging"
+    )
+    return parser.parse_args(argv)
+def main(argv: List[str] | None = None) -> None:
+    args = parse_args(argv)
+    # Load dataset
+    try:
+        graphs = load_dataset(args.dataset)
+    except Exception as e:
+        raise SystemExit(f"Failed to load dataset '{args.dataset}': {e}")
+    # Instantiate algorithm
+    try:
+        miner = get_algorithm(args.algorithm, verbose=args.verbose)
+    except KeyError as e:
+        raise SystemExit(str(e))
+    # Run mining
+    try:
+        result = miner.mine(graphs, min_support=args.min_support)
+    except NotImplementedError as e:
+        raise SystemExit(str(e))
+    except Exception as e:
+        raise SystemExit(f"Error while running algorithm '{args.algorithm}': {e}")
+    # Print results
+    top = result.top_k(args.top_k)
+    print(f"Found {len(result)} frequent subgraphs (displaying top {len(top)})")
+    for idx, fs in enumerate(top, start=1):
+        # Provide a simple textual representation
+        print(f"#{idx}: support={fs.support}, nodes={fs.pattern.number_of_nodes()}, edges={fs.pattern.number_of_edges()}")
+# Run the CLI when the module is executed as a script,
+# e.g. ``python -m submine.cli.main``; importing it has no side effects.
+if __name__ == "__main__":
+    main()

submine/core/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Core data structures for the submine library.
+This subpackage contains fundamental classes used throughout the
+library. Currently the primary exported objects are
+:class:`~submine.core.graph.Graph` for representing graphs, and
+:class:`~submine.core.result.SubgraphPattern` and
+:class:`~submine.core.result.MiningResult` from the result module.
+"""
+from .graph import Graph
+from .result import SubgraphPattern, MiningResult
+__all__ = ["Graph", "SubgraphPattern", "MiningResult"]