submine 0.1.0__cp311-cp311-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- submine/__init__.py +37 -0
- submine/algorithms/__init__.py +23 -0
- submine/algorithms/base.py +143 -0
- submine/algorithms/gspan.py +156 -0
- submine/algorithms/gspan_cpp.cpython-311-x86_64-linux-musl.so +0 -0
- submine/algorithms/sopagrami.py +250 -0
- submine/algorithms/sopagrami_cpp.cpython-311-x86_64-linux-musl.so +0 -0
- submine/api.py +134 -0
- submine/backends/__init__.py +0 -0
- submine/backends/gspan/CMakeLists.txt +65 -0
- submine/backends/gspan/dfs.cpp +98 -0
- submine/backends/gspan/graph.cpp +165 -0
- submine/backends/gspan/gspan.cpp +776 -0
- submine/backends/gspan/gspan.h +296 -0
- submine/backends/gspan/ismin.cpp +124 -0
- submine/backends/gspan/main.cpp +106 -0
- submine/backends/gspan/misc.cpp +177 -0
- submine/backends/gspan/python_bindings.cpp +133 -0
- submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
- submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
- submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
- submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
- submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
- submine/backends/sopagrami/cpp/src/main.cpp +94 -0
- submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
- submine/cli/__init__.py +6 -0
- submine/cli/main.py +87 -0
- submine/core/__init__.py +12 -0
- submine/core/graph.py +179 -0
- submine/core/result.py +121 -0
- submine/datasets/__init__.py +11 -0
- submine/datasets/loaders.py +145 -0
- submine/errors.py +41 -0
- submine/io/__init__.py +30 -0
- submine/io/common.py +173 -0
- submine/io/gexf.py +88 -0
- submine/io/gspan.py +268 -0
- submine/io/sopagrami.py +143 -0
- submine/io/transcode.py +147 -0
- submine/registry.py +8 -0
- submine/utils/__init__.py +6 -0
- submine/utils/checks.py +115 -0
- submine/utils/logging.py +41 -0
- submine-0.1.0.dist-info/METADATA +178 -0
- submine-0.1.0.dist-info/RECORD +49 -0
- submine-0.1.0.dist-info/WHEEL +5 -0
- submine-0.1.0.dist-info/licenses/LICENSE +21 -0
- submine.libs/libgcc_s-2298274a.so.1 +0 -0
- submine.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
#include <./pybind11/pybind11.h>
|
|
2
|
+
#include <./pybind11/stl.h>
|
|
3
|
+
|
|
4
|
+
#include <sstream>
|
|
5
|
+
#include <unordered_set>
|
|
6
|
+
#include <vector>
|
|
7
|
+
#include <utility>
|
|
8
|
+
#include <cstdint>
|
|
9
|
+
#include <algorithm>
|
|
10
|
+
|
|
11
|
+
#include "gspan.h"
|
|
12
|
+
|
|
13
|
+
namespace py = pybind11;
|
|
14
|
+
|
|
15
|
+
// Hash functor for std::pair<int,int>, suitable as the Hash argument of
// std::unordered_set / std::unordered_map.
//
// Each component is reduced to its 32-bit pattern and the two are packed into
// one 64-bit word using std::uint64_t arithmetic. The previous version shifted
// a std::size_t left by 32, which is undefined behaviour on targets where
// std::size_t is only 32 bits wide. The packed word is then passed through a
// bijective 64-bit finaliser (the splitmix64 mixing steps) so that both
// components influence every output bit before the value is narrowed to
// std::size_t.
struct PairHash {
    std::size_t operator()(const std::pair<int,int>& p) const noexcept {
        const std::uint64_t hi = static_cast<std::uint32_t>(p.first);
        const std::uint64_t lo = static_cast<std::uint32_t>(p.second);

        std::uint64_t z = (hi << 32) | lo;
        z ^= z >> 30;
        z *= 0xbf58476d1ce4e5b9ULL;
        z ^= z >> 27;
        z *= 0x94d049bb133111ebULL;
        z ^= z >> 31;

        return static_cast<std::size_t>(z);
    }
};
|
|
21
|
+
|
|
22
|
+
// Converts one mined pattern into a Python dict.
//
// Keys written, in this order:
//   "nodes"       : vertex indices 0..n-1, where n = g.size()
//   "edges"       : one (u, to) pair per adjacency entry of every vertex
//   "node_labels" : g[i].label for each vertex, aligned with "nodes"
//   "edge_labels" : e.elabel for each adjacency entry, aligned with "edges"
//   "support"     : the `support` argument, passed through unchanged
//   "graph_ids"   : sorted, de-duplicated `id` values of the entries in
//                   *projected, or None when `projected` is null
//
// `directed` is accepted for interface symmetry but is not consulted here.
static py::dict graph_to_dict(const GSPAN::Graph& g,
                              unsigned int support,
                              const GSPAN::Projected* projected,
                              bool directed)
{
    const int vertex_count = static_cast<int>(g.size());

    std::vector<int> nodes;
    std::vector<int> node_labels;
    std::vector<std::pair<int,int>> edges;
    std::vector<int> edge_labels;
    nodes.reserve(vertex_count);
    node_labels.reserve(vertex_count);

    // IMPORTANT: adjacency entries are exported exactly as stored; no
    // de-duplication is applied to the edge list.
    for (int v = 0; v < vertex_count; ++v) {
        nodes.push_back(v);
        node_labels.push_back(static_cast<int>(g[v].label));

        for (const auto& adj : g[v].edge) {
            edges.emplace_back(v, static_cast<int>(adj.to));
            edge_labels.push_back(static_cast<int>(adj.elabel));
        }
    }

    py::dict out;
    out["nodes"] = std::move(nodes);
    out["edges"] = std::move(edges);
    out["node_labels"] = std::move(node_labels);
    out["edge_labels"] = std::move(edge_labels);
    out["support"] = support;

    if (!projected) {
        out["graph_ids"] = py::none();
        return out;
    }

    // Every projection entry carries the id of the input graph it came from;
    // collect them all, then sort and drop repeats.
    std::vector<unsigned int> graph_ids;
    graph_ids.reserve(projected->size());
    for (const auto& entry : *projected) {
        graph_ids.push_back(entry.id);
    }
    std::sort(graph_ids.begin(), graph_ids.end());
    graph_ids.erase(std::unique(graph_ids.begin(), graph_ids.end()),
                    graph_ids.end());

    out["graph_ids"] = std::move(graph_ids);
    return out;
}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
static std::vector<py::dict> mine_from_string(const std::string& gspan_data,
|
|
84
|
+
unsigned int minsup,
|
|
85
|
+
unsigned int maxpat_min,
|
|
86
|
+
unsigned int maxpat_max,
|
|
87
|
+
bool enc,
|
|
88
|
+
bool where,
|
|
89
|
+
bool directed)
|
|
90
|
+
{
|
|
91
|
+
std::istringstream is(gspan_data);
|
|
92
|
+
std::ostringstream null_out; // discard textual output by default
|
|
93
|
+
|
|
94
|
+
GSPAN::gSpan miner;
|
|
95
|
+
std::vector<py::dict> results;
|
|
96
|
+
results.reserve(1024);
|
|
97
|
+
|
|
98
|
+
miner.set_callback([&](const GSPAN::Graph& pattern,
|
|
99
|
+
unsigned int sup,
|
|
100
|
+
const GSPAN::Projected* projected)
|
|
101
|
+
{
|
|
102
|
+
results.push_back(graph_to_dict(pattern, sup, projected, directed));
|
|
103
|
+
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
miner.run(is, null_out, minsup, maxpat_min, maxpat_max, enc, where, directed);
|
|
107
|
+
return results;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Python module definition: exposes mine_from_string as
// gspan_cpp.mine_from_string with keyword arguments and defaults.
PYBIND11_MODULE(gspan_cpp, m) {
    // Docstring attached to the exported function.
    static const char* const kMineFromStringDoc = R"pbdoc(
        Mine frequent subgraphs from a gSpan text dataset passed as a string.

        Returns: list of dicts:
          - nodes: [0..n-1]
          - edges: list of (u,v)
          - node_labels: list aligned to nodes
          - edge_labels: list aligned to edges
          - support: int
          - graph_ids: list of graph IDs where pattern occurs, or None
    )pbdoc";

    m.doc() = "Pure C++ gSpan bindings (structured results)";

    m.def("mine_from_string", &mine_from_string,
          py::arg("gspan_data"),
          py::arg("minsup") = 1,
          py::arg("maxpat_min") = 0,
          py::arg("maxpat_max") = 0xffffffffu,
          py::arg("enc") = false,
          py::arg("where") = false,
          py::arg("directed") = false,
          kMineFromStringDoc);
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.15)
project(SopaGrami LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Core algorithm library. Each source is listed exactly once; previously
# src/alg.cpp appeared both in add_library() and in target_sources().
add_library(alg STATIC
  src/alg.cpp
  src/dump.cpp
)

target_include_directories(alg PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
# The static library is linked into a Python extension module (a shared
# object), so it has to be built as position-independent code.
set_target_properties(alg PROPERTIES POSITION_INDEPENDENT_CODE ON)

include(FetchContent)

# Try system / vcpkg / conda / installed pybind11 first
find_package(pybind11 CONFIG QUIET)

if (NOT pybind11_FOUND)
  message(STATUS "pybind11 not found via find_package; fetching with FetchContent...")

  FetchContent_Declare(
    pybind11
    GIT_REPOSITORY https://github.com/pybind/pybind11.git
    GIT_TAG v2.12.0
  )
  FetchContent_MakeAvailable(pybind11)
endif()

# Python extension module wrapping the algorithm library
pybind11_add_module(sopagrami_cpp src/python_bindings.cpp)

target_include_directories(sopagrami_cpp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(sopagrami_cpp PRIVATE alg)
set_target_properties(sopagrami_cpp PROPERTIES PREFIX "")

# Install into scikit-build-core wheel
install(TARGETS sopagrami_cpp
  LIBRARY DESTINATION "submine/algorithms"
  RUNTIME DESTINATION "submine/algorithms"
  ARCHIVE DESTINATION "submine/algorithms"
)
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <vector>
|
|
3
|
+
#include <string>
|
|
4
|
+
#include <unordered_map>
|
|
5
|
+
#include <unordered_set>
|
|
6
|
+
#include <utility>
|
|
7
|
+
#include <optional>
|
|
8
|
+
#include <tuple>
|
|
9
|
+
#include <limits>
|
|
10
|
+
#include <bitset>
|
|
11
|
+
#include <cstdint>
|
|
12
|
+
namespace algo {
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
// --- portable popcount64 ---
// Returns the number of set bits in the low 64 bits of `x`.
//
// GCC and Clang (including clang-cl) use the compiler builtin. Every other
// compiler uses a branch-free SWAR reduction, so no header has to be included
// from inside this namespace and no x86-only intrinsic is referenced. The
// previous MSVC path included <intrin.h> inside namespace algo and selected
// __popcnt64 for ARM64 builds, where that intrinsic is not provided.
#if defined(__GNUC__) || defined(__clang__)
static inline int popcount64(unsigned long long x) {
  return __builtin_popcountll(x);
}
#else
static inline int popcount64(unsigned long long x) {
  // Sum adjacent bits into 2-bit fields, then 4-bit fields, then bytes.
  x = x - ((x >> 1) & 0x5555555555555555ull);
  x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;
  // The multiply accumulates all byte sums into the top byte of the low 64 bits.
  return (int)(((x * 0x0101010101010101ull) >> 56) & 0xffull);
}
#endif
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
// ---------- Data graph and supporting types ----------
|
|
36
|
+
// A labelled edge between vertex indices `u` and `v`.
struct Edge {
  int u;
  int v;
  std::string label;
};
|
|
37
|
+
// Dynamically sized bit set stored as 64-bit words (bit i lives in word i/64).
//
// set / reset / test do not range-check: `i` must lie in [0, 64 * w.size()).
// The binary operations accept an operand with a different word count; bits
// the operand does not have are treated as zero, so nothing is ever read past
// the end of the shorter vector.
struct Bitset {
  std::vector<uint64_t> w;

  // Resizes to hold `n` bits, all cleared. A non-positive `n` yields an empty set.
  void init(int n){
    const size_t bits = n > 0 ? static_cast<size_t>(n) : 0;
    w.assign((bits + 63) >> 6, 0ull);
  }
  inline void set(int i){ w[i>>6] |= (1ull<<(i&63)); }
  inline void reset(int i){ w[i>>6] &= ~(1ull<<(i&63)); }
  inline bool test(int i) const { return (w[i>>6] >> (i&63)) & 1ull; }

  // True if at least one bit is set.
  inline bool any() const {
    for (auto x : w) if (x) return true;
    return false;
  }

  // Number of set bits.
  inline size_t count() const {
    size_t c = 0;
    for (auto x : w) c += std::bitset<64>(x).count();
    return c;
  }

  // this &= other
  inline void and_inplace(const Bitset& o){
    const size_t shared = w.size() < o.w.size() ? w.size() : o.w.size();
    for (size_t i = 0; i < shared; ++i) w[i] &= o.w[i];
    // Words the operand lacks are all-zero, so the intersection clears them.
    for (size_t i = shared; i < w.size(); ++i) w[i] = 0;
  }

  // this &= ~other
  inline void andnot_inplace(const Bitset& o){
    const size_t shared = w.size() < o.w.size() ? w.size() : o.w.size();
    // Words the operand lacks remove nothing, so the tail is left untouched.
    for (size_t i = 0; i < shared; ++i) w[i] &= ~o.w[i];
  }

  // (this & other).any() without allocating
  inline bool any_and(const Bitset& o) const {
    const size_t shared = w.size() < o.w.size() ? w.size() : o.w.size();
    for (size_t i = 0; i < shared; ++i) if (w[i] & o.w[i]) return true;
    return false;
  }

  // make a copy intersected with 'o'; the result has this set's word count
  inline Bitset copy_and(const Bitset& o) const {
    const size_t shared = w.size() < o.w.size() ? w.size() : o.w.size();
    Bitset t;
    t.w.assign(w.size(), 0);
    for (size_t i = 0; i < shared; ++i) t.w[i] = w[i] & o.w[i];
    return t;
  }
};
|
|
66
|
+
struct DataGraph {
|
|
67
|
+
bool directed = false;
|
|
68
|
+
std::vector<std::string> vlabels; // node labels
|
|
69
|
+
std::vector<std::vector<std::pair<int,std::string>>> adj, rev; // (nbr, label)
|
|
70
|
+
std::vector<std::unordered_map<int, std::unordered_set<std::string>>> adj_set, rev_set; // fast has_edge
|
|
71
|
+
std::unordered_map<std::string, std::unordered_set<int>> lab2nodes; // node-label -> nodes
|
|
72
|
+
std::vector<std::unordered_map<std::string, std::vector<int>>> out_by_el, in_by_el;
|
|
73
|
+
// Bitset indices (speedup for undirected and directed)
|
|
74
|
+
std::unordered_map<std::string, Bitset> label_bits; // label -> nodes
|
|
75
|
+
std::vector<std::unordered_map<std::string, Bitset>> adj_el_bits; // per u: el -> bitset(neighbors via el)
|
|
76
|
+
|
|
77
|
+
void load_from_lg(const std::string& path, bool as_directed);
|
|
78
|
+
|
|
79
|
+
bool has_edge(int u, int v, const std::string& label) const {
|
|
80
|
+
auto it = adj_set[u].find(v);
|
|
81
|
+
if(it==adj_set[u].end()) return false;
|
|
82
|
+
if(label.empty()) return !it->second.empty();
|
|
83
|
+
return it->second.count(label)>0;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Edge type key for frequent 1-edge seeds (SoGraMi)
|
|
87
|
+
struct EdgeTypeKey {
|
|
88
|
+
std::string lu, lv, el; int dirflag; // 0 undirected, 1 u->v
|
|
89
|
+
bool operator==(EdgeTypeKey const& o) const {
|
|
90
|
+
return lu==o.lu && lv==o.lv && el==o.el && dirflag==o.dirflag;
|
|
91
|
+
}
|
|
92
|
+
bool operator<(EdgeTypeKey const& o) const {
|
|
93
|
+
if(lu!=o.lu) return lu<o.lu;
|
|
94
|
+
if(lv!=o.lv) return lv<o.lv;
|
|
95
|
+
if(el!=o.el) return el<o.el;
|
|
96
|
+
return dirflag<o.dirflag;
|
|
97
|
+
}
|
|
98
|
+
};
|
|
99
|
+
|
|
100
|
+
struct EdgeTypeStat { EdgeTypeKey key; int count; };
|
|
101
|
+
|
|
102
|
+
// Python-parity: counts in FIRST-SEEN INSERTION ORDER while scanning adjacency
|
|
103
|
+
std::vector<EdgeTypeStat> edge_type_counts_insertion_order() const;
|
|
104
|
+
};
|
|
105
|
+
|
|
106
|
+
// ---------- Algorithm API ----------
|
|
107
|
+
// Tunable settings passed to run_sopagrami.
struct Params {
  // frequency threshold
  int tau{2};
  // interpret graph as directed?
  bool directed{false};
  // SoGraMi ordering
  bool sorted_seeds{true};
  // 0 => use hardware_concurrency
  int num_threads{0};
  // if false, use MNI only
  bool compute_full_support{true};
};
|
|
114
|
+
|
|
115
|
+
// Compact labelled pattern.
// Nodes have labels; edges are (a, b, el, dir) with a and b in [0..k-1],
// where k is the number of pattern nodes.
struct Pattern {
  std::vector<std::string> vlab; // per pattern-node label

  struct PEdge {
    int a, b;
    std::string el;
    int dir; // 0 undirected, 1 a->b
  };
  std::vector<PEdge> pedges;

  // canonical key for de-duplication (label seq + sorted edges)
  std::string key() const;

  // parent[new_vertex] = anchor on RMP for forward edges, -1 for seed
  std::vector<int> parent;

  // Index of the last-added vertex. Initialised so that a default-constructed
  // Pattern never exposes an indeterminate value; 0 matches what `Pattern{}`
  // already produced through value-initialisation.
  int rightmost = 0;
};
|
|
129
|
+
|
|
130
|
+
struct Found {
|
|
131
|
+
Pattern pat;
|
|
132
|
+
long long full_support = 0; // number of isomorphisms (or MNI if compute_full_support=false)
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
struct Output {
|
|
136
|
+
std::vector<Found> frequent_patterns;
|
|
137
|
+
};
|
|
138
|
+
void dump_patterns_to_dir(
|
|
139
|
+
const Output& out,
|
|
140
|
+
const std::string& dump_dir,
|
|
141
|
+
bool directed,
|
|
142
|
+
const DataGraph& G,
|
|
143
|
+
bool dump_images_csv,
|
|
144
|
+
int max_images_per_vertex,
|
|
145
|
+
bool dump_sample_embeddings,
|
|
146
|
+
int sample_limit
|
|
147
|
+
);
|
|
148
|
+
// Mining entry point: takes the data graph `G` and the settings in `p` and
// returns an Output holding the frequent patterns. Declaration only; the
// definition is not part of this header.
Output run_sopagrami(const DataGraph& G, const Params& p);
|
|
149
|
+
|
|
150
|
+
} // namespace algo
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <chrono>
|
|
3
|
+
|
|
4
|
+
namespace common {
|
|
5
|
+
|
|
6
|
+
// Minimal stopwatch built on std::chrono::steady_clock.
//
// The reference instant is captured at construction, so ms() returns a
// meaningful elapsed time even if start() is never called. Previously the
// reference was a default-constructed time_point, which made ms() report the
// time since the clock's epoch until the first start().
class Timer {
public:
  using clock = std::chrono::steady_clock;

  // Restarts the measurement from the current instant.
  void start() { t0_ = clock::now(); }

  // Milliseconds elapsed since the most recent start(), or since construction
  // when start() has not been called.
  double ms() const {
    const auto elapsed = clock::now() - t0_;
    return std::chrono::duration<double, std::milli>(elapsed).count();
  }

private:
  clock::time_point t0_ = clock::now();
};
|
|
17
|
+
|
|
18
|
+
} // namespace common
|