submine 0.1.1__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- submine/__init__.py +37 -0
- submine/algorithms/__init__.py +23 -0
- submine/algorithms/base.py +143 -0
- submine/algorithms/gspan.py +156 -0
- submine/algorithms/gspan_cpp.cpython-312-darwin.so +0 -0
- submine/algorithms/sopagrami.py +250 -0
- submine/algorithms/sopagrami_cpp.cpython-312-darwin.so +0 -0
- submine/api.py +134 -0
- submine/backends/__init__.py +0 -0
- submine/backends/gspan/CMakeLists.txt +65 -0
- submine/backends/gspan/dfs.cpp +98 -0
- submine/backends/gspan/graph.cpp +165 -0
- submine/backends/gspan/gspan.cpp +776 -0
- submine/backends/gspan/gspan.h +296 -0
- submine/backends/gspan/ismin.cpp +124 -0
- submine/backends/gspan/main.cpp +106 -0
- submine/backends/gspan/misc.cpp +177 -0
- submine/backends/gspan/python_bindings.cpp +133 -0
- submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
- submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
- submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
- submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
- submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
- submine/backends/sopagrami/cpp/src/main.cpp +94 -0
- submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
- submine/cli/__init__.py +6 -0
- submine/cli/main.py +87 -0
- submine/core/__init__.py +12 -0
- submine/core/graph.py +179 -0
- submine/core/result.py +121 -0
- submine/datasets/__init__.py +11 -0
- submine/datasets/loaders.py +145 -0
- submine/errors.py +41 -0
- submine/io/__init__.py +30 -0
- submine/io/common.py +173 -0
- submine/io/gexf.py +88 -0
- submine/io/gspan.py +268 -0
- submine/io/sopagrami.py +143 -0
- submine/io/transcode.py +147 -0
- submine/registry.py +8 -0
- submine/utils/__init__.py +6 -0
- submine/utils/checks.py +115 -0
- submine/utils/logging.py +41 -0
- submine-0.1.1.dist-info/METADATA +178 -0
- submine-0.1.1.dist-info/RECORD +47 -0
- submine-0.1.1.dist-info/WHEEL +6 -0
- submine-0.1.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,805 @@
|
|
|
1
|
+
// alg.cpp
|
|
2
|
+
#include "alg.hpp"
|
|
3
|
+
|
|
4
|
+
#include <algorithm>
|
|
5
|
+
#include <fstream>
|
|
6
|
+
#include <functional>
|
|
7
|
+
#include <iostream>
|
|
8
|
+
#include <limits>
|
|
9
|
+
#include <queue>
|
|
10
|
+
#include <sstream>
|
|
11
|
+
#include <unordered_map>
|
|
12
|
+
#include <unordered_set>
|
|
13
|
+
#include <utility>
|
|
14
|
+
#include <vector>
|
|
15
|
+
#include <climits>
|
|
16
|
+
#include <map>
|
|
17
|
+
|
|
18
|
+
#ifdef _OPENMP
|
|
19
|
+
#include <omp.h>
|
|
20
|
+
#endif
|
|
21
|
+
|
|
22
|
+
using namespace std;
|
|
23
|
+
|
|
24
|
+
namespace algo {
|
|
25
|
+
|
|
26
|
+
// =========================== DataGraph ===============================
|
|
27
|
+
|
|
28
|
+
static inline string encKey(const DataGraph::EdgeTypeKey& k){
|
|
29
|
+
return k.lu + '\t' + k.lv + '\t' + k.el + '\t' + char('0' + k.dirflag);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/// Populate this graph from a text file in ".lg" format.
///
/// Recognised lines (any other leading tag is ignored):
///   v <id> <label>        vertex declaration ("v" or "V")
///   e <u> <v> [<label>]   edge declaration   ("e" or "E")
///
/// Edge labels made only of digits, '.', '-' and '+' are parsed as a number
/// and replaced by its integer part (truncated toward zero).
///
/// Malformed lines (ids that do not parse, or negative ids) are skipped.
/// Vertex ids that are never declared but lie below the largest id seen on
/// any vertex or edge line receive the empty label "".
///
/// @param path         file to read
/// @param as_directed  when false every edge is stored in both directions
void DataGraph::load_from_lg(const std::string& path, bool as_directed){
    directed = as_directed;
    std::ifstream fin(path);
    // NOTE(review): terminating the process is kept for backward compatibility;
    // consider reporting the failure to the caller instead.
    if(!fin){ std::cerr<<"Cannot open "<<path<<"\n"; std::exit(1); }

    struct EdgeRec { int u,v; std::string el; };
    std::unordered_map<int,std::string> vlab_map;
    std::vector<EdgeRec> edges;
    int max_id = -1;   // largest vertex id referenced by any accepted line

    std::string line;
    while(std::getline(fin,line)){
        if(line.empty()) continue;
        std::stringstream ss(line);
        std::string tag; ss>>tag;

        if(tag=="v"||tag=="V"){
            int idx = -1;
            // A failed extraction would otherwise yield 0 and overwrite vertex 0.
            if(!(ss>>idx) || idx < 0) continue;
            std::string lab; ss>>lab;          // missing label -> ""
            vlab_map[idx] = lab;
            max_id = std::max(max_id, idx);
        }else if(tag=="e"||tag=="E"){
            int u = -1, v = -1;
            if(!(ss>>u>>v) || u < 0 || v < 0) continue;
            std::string elab;
            if(!(ss>>elab)) elab.clear();

            const bool numlike = !elab.empty() &&
                std::all_of(elab.begin(), elab.end(), [](unsigned char c){
                    return std::isdigit(c) || c=='.' || c=='-' || c=='+';
                });
            if(numlike){
                try{
                    const double d = std::stod(elab);
                    // Only convert values that fit in long long; the cast is
                    // undefined otherwise, so such labels are kept verbatim.
                    if(d > -9.0e18 && d < 9.0e18){
                        elab = std::to_string(static_cast<long long>(d)); // toward zero
                    }
                }catch(...){
                    // e.g. "-" or "+.": not a number after all, keep the text label
                }
            }
            edges.push_back({u,v,elab});
            max_id = std::max(max_id, std::max(u, v));
        }
    }

    // Size every per-vertex table so that all referenced ids are in range.
    const std::size_t n = (max_id < 0) ? 0 : static_cast<std::size_t>(max_id) + 1;

    vlabels.assign(n,"");
    for(const auto& entry : vlab_map) vlabels[entry.first] = entry.second;

    adj.assign(n,{}); rev.assign(n,{});
    adj_set.assign(n,{}); rev_set.assign(n,{});

    for(const auto& e: edges){
        adj[e.u].push_back({e.v,e.el});
        adj_set[e.u][e.v].insert(e.el);
        rev[e.v].push_back({e.u,e.el});
        rev_set[e.v][e.u].insert(e.el);
        if(!directed){
            // store both ways for undirected graphs
            // (an undirected self-loop therefore appears twice in adj[u])
            adj[e.v].push_back({e.u,e.el});
            adj_set[e.v][e.u].insert(e.el);
            rev[e.u].push_back({e.v,e.el});
            rev_set[e.u][e.v].insert(e.el);
        }
    }

    lab2nodes.clear();
    for(std::size_t i=0;i<n;++i) lab2nodes[vlabels[i]].insert(static_cast<int>(i));
}
|
|
97
|
+
|
|
98
|
+
vector<DataGraph::EdgeTypeStat> DataGraph::edge_type_counts_insertion_order() const {
|
|
99
|
+
vector<EdgeTypeStat> stats;
|
|
100
|
+
unordered_map<string,int> idx; idx.reserve(1<<12);
|
|
101
|
+
auto idstr = [](const EdgeTypeKey& k){
|
|
102
|
+
return k.lu + "\t" + k.lv + "\t" + k.el + "\t" + char('0'+k.dirflag);
|
|
103
|
+
};
|
|
104
|
+
|
|
105
|
+
int n = (int)vlabels.size();
|
|
106
|
+
if(directed){
|
|
107
|
+
for(int u=0; u<n; ++u){
|
|
108
|
+
const string& lu = vlabels[u];
|
|
109
|
+
for(auto [v,el]: adj[u]){
|
|
110
|
+
const string& lv = vlabels[v];
|
|
111
|
+
EdgeTypeKey k{lu, lv, el, 1};
|
|
112
|
+
string s = idstr(k);
|
|
113
|
+
auto it = idx.find(s);
|
|
114
|
+
if(it==idx.end()){
|
|
115
|
+
idx.emplace(s, (int)stats.size());
|
|
116
|
+
stats.push_back({k,1});
|
|
117
|
+
}else{
|
|
118
|
+
stats[it->second].count++;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
}else{
|
|
123
|
+
for(int u=0; u<n; ++u){
|
|
124
|
+
const string& lu = vlabels[u];
|
|
125
|
+
for(auto [v,el]: adj[u]){
|
|
126
|
+
if(u>v) continue; // count each undirected edge once
|
|
127
|
+
const string& lv = vlabels[v];
|
|
128
|
+
auto a = lu<=lv? lu:lv;
|
|
129
|
+
auto b = lu<=lv? lv:lu;
|
|
130
|
+
EdgeTypeKey k{a,b,el,0};
|
|
131
|
+
string s = idstr(k);
|
|
132
|
+
auto it = idx.find(s);
|
|
133
|
+
if(it==idx.end()){
|
|
134
|
+
idx.emplace(s, (int)stats.size());
|
|
135
|
+
stats.push_back({k,1});
|
|
136
|
+
}else{
|
|
137
|
+
stats[it->second].count++;
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
return stats;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
// ============================ Pattern =================================
|
|
146
|
+
|
|
147
|
+
string Pattern::key() const {
|
|
148
|
+
// Canonical-ish: labels as given + edges normalized (min,max,dcode,label) sorted
|
|
149
|
+
string s; s.reserve(vlab.size()*8 + pedges.size()*16);
|
|
150
|
+
s += "V:";
|
|
151
|
+
for (size_t i=0;i<vlab.size();++i){ s += vlab[i]; s += '|'; }
|
|
152
|
+
|
|
153
|
+
vector<tuple<int,int,int,string>> es; es.reserve(pedges.size());
|
|
154
|
+
for(const auto& e: pedges){
|
|
155
|
+
int a=e.a,b=e.b;
|
|
156
|
+
int dcode = 0; // 0 undirected, 1 a->b oriented to min(a,b), 2 b->a oriented
|
|
157
|
+
if(e.dir==1) dcode = (a<b ? 1 : 2);
|
|
158
|
+
es.emplace_back(min(a,b), max(a,b), dcode, e.el);
|
|
159
|
+
}
|
|
160
|
+
sort(es.begin(), es.end());
|
|
161
|
+
s += "E:";
|
|
162
|
+
for(auto& t: es){
|
|
163
|
+
s += to_string(get<0>(t))+"-"+to_string(get<1>(t))+"-"+to_string(get<2>(t))+"-"+get<3>(t)+"|";
|
|
164
|
+
}
|
|
165
|
+
return s;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
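// Canonical key across permutations within equal-label groups (exact): encode_with_order() serialises one vertex ordering; canonical_key() takes the minimum over all of them.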
|
|
169
|
+
static string encode_with_order(const Pattern& S, const vector<int>& order){
|
|
170
|
+
const int k = (int)S.vlab.size();
|
|
171
|
+
vector<int> pos(k);
|
|
172
|
+
for (int i=0;i<k;++i) pos[order[i]] = i;
|
|
173
|
+
|
|
174
|
+
string s; s.reserve(k*8 + S.pedges.size()*16);
|
|
175
|
+
s += "V:";
|
|
176
|
+
for (int i=0;i<k;++i){ s += S.vlab[order[i]]; s += '|'; }
|
|
177
|
+
|
|
178
|
+
vector<tuple<int,int,int,string>> es; es.reserve(S.pedges.size());
|
|
179
|
+
for (const auto& e : S.pedges){
|
|
180
|
+
int a = pos[e.a], b = pos[e.b];
|
|
181
|
+
int dcode = 0;
|
|
182
|
+
if (e.dir==1) dcode = (a<b ? 1 : 2);
|
|
183
|
+
es.emplace_back(std::min(a,b), std::max(a,b), dcode, e.el);
|
|
184
|
+
}
|
|
185
|
+
sort(es.begin(), es.end());
|
|
186
|
+
|
|
187
|
+
s += "E:";
|
|
188
|
+
for (auto& t : es){
|
|
189
|
+
s += to_string(get<0>(t))+"-"+to_string(get<1>(t))+"-"+to_string(get<2>(t))+"-"+get<3>(t)+"|";
|
|
190
|
+
}
|
|
191
|
+
return s;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
static std::string canonical_key(const Pattern& S){
|
|
195
|
+
const int k = (int)S.vlab.size();
|
|
196
|
+
if (k<=1) return S.key();
|
|
197
|
+
|
|
198
|
+
// group vertex indices by label (local 'groups' lives only in this function)
|
|
199
|
+
std::map<std::string, std::vector<int>> groups;
|
|
200
|
+
for (int i=0;i<k;++i) groups[S.vlab[i]].push_back(i);
|
|
201
|
+
for (auto& kv : groups) std::sort(kv.second.begin(), kv.second.end());
|
|
202
|
+
|
|
203
|
+
// collect labels to iterate deterministically
|
|
204
|
+
std::vector<std::string> labels; labels.reserve(groups.size());
|
|
205
|
+
for (auto& kv : groups) labels.push_back(kv.first);
|
|
206
|
+
|
|
207
|
+
auto encode_with_order = [&](const std::vector<int>& order)->std::string{
|
|
208
|
+
std::vector<int> pos(k);
|
|
209
|
+
for (int i=0;i<k;++i) pos[order[i]] = i;
|
|
210
|
+
|
|
211
|
+
std::string s; s.reserve(k*8 + S.pedges.size()*16);
|
|
212
|
+
s += "V:";
|
|
213
|
+
for (int i=0;i<k;++i){ s += S.vlab[order[i]]; s += '|'; }
|
|
214
|
+
|
|
215
|
+
std::vector<std::tuple<int,int,int,std::string>> es; es.reserve(S.pedges.size());
|
|
216
|
+
for (const auto& e : S.pedges){
|
|
217
|
+
int a = pos[e.a], b = pos[e.b];
|
|
218
|
+
int dcode = 0; // 0 undirected; 1 a->b aligned to min; 2 b->a aligned
|
|
219
|
+
if (e.dir==1) dcode = (a<b ? 1 : 2);
|
|
220
|
+
es.emplace_back(std::min(a,b), std::max(a,b), dcode, e.el);
|
|
221
|
+
}
|
|
222
|
+
std::sort(es.begin(), es.end());
|
|
223
|
+
|
|
224
|
+
s += "E:";
|
|
225
|
+
for (auto& t : es){
|
|
226
|
+
s += std::to_string(std::get<0>(t)) + "-" +
|
|
227
|
+
std::to_string(std::get<1>(t)) + "-" +
|
|
228
|
+
std::to_string(std::get<2>(t)) + "-" +
|
|
229
|
+
std::get<3>(t) + "|";
|
|
230
|
+
}
|
|
231
|
+
return s;
|
|
232
|
+
};
|
|
233
|
+
|
|
234
|
+
std::string best; bool have=false;
|
|
235
|
+
std::vector<int> current; current.reserve(k);
|
|
236
|
+
|
|
237
|
+
// backtrack over permutations within each equal-label group
|
|
238
|
+
std::function<void(int)> dfs = [&](int gi){
|
|
239
|
+
if (gi == (int)labels.size()){
|
|
240
|
+
std::string code = encode_with_order(current);
|
|
241
|
+
if (!have || code < best){ best = std::move(code); have = true; }
|
|
242
|
+
return;
|
|
243
|
+
}
|
|
244
|
+
const auto& g = groups[labels[gi]];
|
|
245
|
+
std::vector<int> perm = g;
|
|
246
|
+
do{
|
|
247
|
+
size_t old = current.size();
|
|
248
|
+
current.insert(current.end(), perm.begin(), perm.end());
|
|
249
|
+
dfs(gi+1);
|
|
250
|
+
current.resize(old);
|
|
251
|
+
} while (std::next_permutation(perm.begin(), perm.end()));
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
dfs(0);
|
|
255
|
+
return best;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
// ======================= Seeds (1-edge) ================================
|
|
259
|
+
|
|
260
|
+
// One frequent single-edge pattern ("seed"). Instances are brace-initialised
// as {key, mni, full}, so the field order must not change.
struct SeedInfo {
    DataGraph::EdgeTypeKey key; // edge type: endpoint labels lu/lv, edge label el, dirflag
    int mni; // MNI support for the 1-edge pattern
    long long full; // number of edges of that type
};
|
|
265
|
+
|
|
266
|
+
// Correct undirected equal-label handling: mni = |S| (union of endpoints)
|
|
267
|
+
static vector<SeedInfo> compute_frequent_edge_seeds(const DataGraph& G, int tau){
|
|
268
|
+
using K = DataGraph::EdgeTypeKey;
|
|
269
|
+
|
|
270
|
+
struct AccDir { unordered_set<int> L, R; long long full=0; K key; }; // directed or undirected lu!=lv
|
|
271
|
+
struct AccEq { unordered_set<int> S; long long full=0; K key; }; // undirected lu==lv
|
|
272
|
+
|
|
273
|
+
unordered_map<string, AccDir> acc_lr; acc_lr.reserve(1<<14);
|
|
274
|
+
unordered_map<string, AccEq> acc_eq; acc_eq.reserve(1<<14);
|
|
275
|
+
|
|
276
|
+
const int n = (int)G.vlabels.size();
|
|
277
|
+
|
|
278
|
+
if (G.directed){
|
|
279
|
+
for (int u=0; u<n; ++u){
|
|
280
|
+
const string& lu = G.vlabels[u];
|
|
281
|
+
for (auto [v, el] : G.adj[u]){
|
|
282
|
+
const string& lv = G.vlabels[v];
|
|
283
|
+
K k{lu, lv, el, 1};
|
|
284
|
+
auto &A = acc_lr[ encKey(k) ];
|
|
285
|
+
if (A.full == 0) A.key = k;
|
|
286
|
+
A.L.insert(u);
|
|
287
|
+
A.R.insert(v);
|
|
288
|
+
A.full += 1;
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
} else {
|
|
292
|
+
for (int u=0; u<n; ++u){
|
|
293
|
+
const string& lu = G.vlabels[u];
|
|
294
|
+
for (auto [v, el] : G.adj[u]){
|
|
295
|
+
if (u > v) continue; // each undirected edge once
|
|
296
|
+
const string& lv = G.vlabels[v];
|
|
297
|
+
|
|
298
|
+
if (lu == lv){
|
|
299
|
+
K k{lu, lv, el, 0};
|
|
300
|
+
auto &E = acc_eq[ encKey(k) ];
|
|
301
|
+
if (E.full == 0) E.key = k;
|
|
302
|
+
E.S.insert(u);
|
|
303
|
+
E.S.insert(v);
|
|
304
|
+
E.full += 1;
|
|
305
|
+
} else {
|
|
306
|
+
K k;
|
|
307
|
+
int leftNode, rightNode;
|
|
308
|
+
if (lu <= lv) { k = {lu, lv, el, 0}; leftNode=u; rightNode=v; }
|
|
309
|
+
else { k = {lv, lu, el, 0}; leftNode=v; rightNode=u; }
|
|
310
|
+
|
|
311
|
+
auto &A = acc_lr[ encKey(k) ];
|
|
312
|
+
if (A.full == 0) A.key = k;
|
|
313
|
+
A.L.insert(leftNode);
|
|
314
|
+
A.R.insert(rightNode);
|
|
315
|
+
A.full += 1;
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
vector<SeedInfo> seeds;
|
|
322
|
+
seeds.reserve(acc_lr.size() + acc_eq.size());
|
|
323
|
+
|
|
324
|
+
for (auto &kv : acc_lr){
|
|
325
|
+
auto &A = kv.second;
|
|
326
|
+
int mni = std::min((int)A.L.size(), (int)A.R.size());
|
|
327
|
+
if (mni >= tau) seeds.push_back({A.key, mni, A.full});
|
|
328
|
+
}
|
|
329
|
+
for (auto &kv : acc_eq){
|
|
330
|
+
auto &E = kv.second;
|
|
331
|
+
int mni = (int)E.S.size();
|
|
332
|
+
if (mni >= tau) seeds.push_back({E.key, mni, E.full});
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
return seeds;
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
// Build O(1) seed MNI lookups
|
|
339
|
+
static unordered_map<string,int> build_seed_mni_map(const vector<SeedInfo>& seeds){
|
|
340
|
+
unordered_map<string,int> m; m.reserve(seeds.size()*2);
|
|
341
|
+
for (auto &s : seeds) m[encKey(s.key)] = s.mni;
|
|
342
|
+
return m;
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// ==================== MNI (exact, AC + MRV) ===========================
|
|
346
|
+
|
|
347
|
+
static inline bool consistent_edge_map(const DataGraph& G,
|
|
348
|
+
const Pattern::PEdge& e,
|
|
349
|
+
int va, int vb, // pattern endpoints
|
|
350
|
+
int ga, int gb) { // graph nodes mapped to (va,vb)
|
|
351
|
+
if (e.dir == 1) {
|
|
352
|
+
if (e.a == va && e.b == vb) return G.has_edge(ga, gb, e.el);
|
|
353
|
+
if (e.a == vb && e.b == va) return G.has_edge(gb, ga, e.el);
|
|
354
|
+
return true;
|
|
355
|
+
} else {
|
|
356
|
+
if ((e.a == va && e.b == vb) || (e.a == vb && e.b == va)) {
|
|
357
|
+
return G.has_edge(ga, gb, e.el) || G.has_edge(gb, ga, e.el);
|
|
358
|
+
}
|
|
359
|
+
return true;
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
// Local AC (neighbor-existence) with scans (safe for directed + undirected)
|
|
364
|
+
static void filter_domains_by_local_constraints(const DataGraph& G,
|
|
365
|
+
const Pattern& P,
|
|
366
|
+
vector<vector<int>>& dom)
|
|
367
|
+
{
|
|
368
|
+
const int k = (int)P.vlab.size();
|
|
369
|
+
|
|
370
|
+
for (int v = 0; v < k; ++v){
|
|
371
|
+
if (dom[v].empty()) continue;
|
|
372
|
+
vector<int> keep; keep.reserve(dom[v].size());
|
|
373
|
+
|
|
374
|
+
for (int gi : dom[v]){
|
|
375
|
+
bool ok_all = true;
|
|
376
|
+
for (const auto& e : P.pedges){
|
|
377
|
+
if (e.a != v && e.b != v) continue;
|
|
378
|
+
|
|
379
|
+
const int nb = (e.a==v? e.b : e.a);
|
|
380
|
+
const string& needLab = P.vlab[nb];
|
|
381
|
+
const string& el = e.el;
|
|
382
|
+
|
|
383
|
+
bool ok_edge = false;
|
|
384
|
+
if (e.dir == 1){
|
|
385
|
+
if (e.a == v){
|
|
386
|
+
for (auto [x, lbl] : G.adj[gi]) {
|
|
387
|
+
if (lbl == el && G.vlabels[x] == needLab){ ok_edge = true; break; }
|
|
388
|
+
}
|
|
389
|
+
} else {
|
|
390
|
+
for (auto [x, lbl] : G.rev[gi]) {
|
|
391
|
+
if (lbl == el && G.vlabels[x] == needLab){ ok_edge = true; break; }
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
} else {
|
|
395
|
+
for (auto [x, lbl] : G.adj[gi]) {
|
|
396
|
+
if (lbl == el && G.vlabels[x] == needLab){ ok_edge = true; break; }
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
if (!ok_edge){ ok_all = false; break; }
|
|
401
|
+
}
|
|
402
|
+
if (ok_all) keep.push_back(gi);
|
|
403
|
+
}
|
|
404
|
+
dom[v].swap(keep);
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
// Existence of a full injective embedding with x_fixVar = fixNode
|
|
409
|
+
static bool exists_solution_with(const DataGraph& G, const Pattern& P,
|
|
410
|
+
int fixVar, int fixNode,
|
|
411
|
+
const vector<vector<int>>& domainsInit)
|
|
412
|
+
{
|
|
413
|
+
const int k = (int)P.vlab.size();
|
|
414
|
+
const int n = (int)G.vlabels.size();
|
|
415
|
+
|
|
416
|
+
vector<vector<int>> dom = domainsInit; // do not mutate caller’s copy
|
|
417
|
+
vector<int> assign(k, -1);
|
|
418
|
+
vector<char> used(n, 0);
|
|
419
|
+
|
|
420
|
+
assign[fixVar] = fixNode;
|
|
421
|
+
used[fixNode] = 1;
|
|
422
|
+
|
|
423
|
+
auto choose_var = [&]()->int{
|
|
424
|
+
int best=-1, bestCnt=INT_MAX;
|
|
425
|
+
for (int v=0; v<k; ++v){
|
|
426
|
+
if (assign[v]!=-1) continue;
|
|
427
|
+
int cnt=0;
|
|
428
|
+
for (int gi : dom[v]){
|
|
429
|
+
if (used[gi]) continue;
|
|
430
|
+
bool ok = true;
|
|
431
|
+
for (const auto& e : P.pedges){
|
|
432
|
+
if (e.a == v && assign[e.b] != -1){
|
|
433
|
+
ok = consistent_edge_map(G,e,e.a,e.b,gi,assign[e.b]);
|
|
434
|
+
} else if (e.b == v && assign[e.a] != -1){
|
|
435
|
+
ok = consistent_edge_map(G,e,e.a,e.b,assign[e.a],gi);
|
|
436
|
+
}
|
|
437
|
+
if (!ok) break;
|
|
438
|
+
}
|
|
439
|
+
if (ok){ ++cnt; if (cnt >= bestCnt) break; }
|
|
440
|
+
}
|
|
441
|
+
if (cnt < bestCnt){ best=v; bestCnt=cnt; }
|
|
442
|
+
}
|
|
443
|
+
return best;
|
|
444
|
+
};
|
|
445
|
+
|
|
446
|
+
function<bool()> dfs = [&](){
|
|
447
|
+
bool done=true; for (int i=0;i<k;++i) if (assign[i]==-1){ done=false; break; }
|
|
448
|
+
if (done) return true;
|
|
449
|
+
|
|
450
|
+
int v = choose_var();
|
|
451
|
+
if (v==-1) return false;
|
|
452
|
+
|
|
453
|
+
for (int gi : dom[v]){
|
|
454
|
+
if (used[gi]) continue;
|
|
455
|
+
|
|
456
|
+
bool ok = true;
|
|
457
|
+
for (const auto& e : P.pedges){
|
|
458
|
+
if (e.a == v && assign[e.b] != -1){
|
|
459
|
+
ok = consistent_edge_map(G,e,e.a,e.b,gi,assign[e.b]);
|
|
460
|
+
} else if (e.b == v && assign[e.a] != -1){
|
|
461
|
+
ok = consistent_edge_map(G,e,e.a,e.b,assign[e.a],gi);
|
|
462
|
+
}
|
|
463
|
+
if (!ok) break;
|
|
464
|
+
}
|
|
465
|
+
if (!ok) continue;
|
|
466
|
+
|
|
467
|
+
assign[v]=gi; used[gi]=1;
|
|
468
|
+
if (dfs()) return true;
|
|
469
|
+
used[gi]=0; assign[v]=-1;
|
|
470
|
+
}
|
|
471
|
+
return false;
|
|
472
|
+
};
|
|
473
|
+
|
|
474
|
+
return dfs();
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
// Exact MNI: per-variable existence, with local AC
|
|
478
|
+
static int compute_MNI_support_exact(const DataGraph& G, const Pattern& P, int tau){
|
|
479
|
+
const int k = (int)P.vlab.size();
|
|
480
|
+
if (k == 0) return 0;
|
|
481
|
+
|
|
482
|
+
vector<vector<int>> dom(k);
|
|
483
|
+
for (int i=0; i<k; ++i){
|
|
484
|
+
auto it = G.lab2nodes.find(P.vlab[i]);
|
|
485
|
+
if (it != G.lab2nodes.end()) dom[i].assign(it->second.begin(), it->second.end());
|
|
486
|
+
if ((int)dom[i].size() < tau) return 0;
|
|
487
|
+
}
|
|
488
|
+
filter_domains_by_local_constraints(G, P, dom);
|
|
489
|
+
for (int i=0;i<k;++i) if ((int)dom[i].size() < tau) return 0;
|
|
490
|
+
|
|
491
|
+
int support = numeric_limits<int>::max();
|
|
492
|
+
|
|
493
|
+
for (int v=0; v<k; ++v){
|
|
494
|
+
vector<int> Dv = dom[v];
|
|
495
|
+
int count_v = 0;
|
|
496
|
+
|
|
497
|
+
for (int u : Dv){
|
|
498
|
+
if (exists_solution_with(G, P, v, u, dom)){
|
|
499
|
+
++count_v;
|
|
500
|
+
} else {
|
|
501
|
+
auto &Dref = dom[v];
|
|
502
|
+
auto it = find(Dref.begin(), Dref.end(), u);
|
|
503
|
+
if (it != Dref.end()) Dref.erase(it);
|
|
504
|
+
if ((int)Dref.size() < tau) return 0;
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
support = min(support, count_v);
|
|
509
|
+
if (support < tau) return 0;
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
return (support==numeric_limits<int>::max()? 0 : support);
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
// Quick seed support (2 nodes, 1 edge) from seed map
|
|
516
|
+
static int mni_support_seed_from_map(const DataGraph& G, const Pattern& P,
|
|
517
|
+
const unordered_map<string,int>& seed_mni)
|
|
518
|
+
{
|
|
519
|
+
const auto& e = P.pedges[0];
|
|
520
|
+
DataGraph::EdgeTypeKey k;
|
|
521
|
+
if (G.directed){
|
|
522
|
+
k = { P.vlab[e.a], P.vlab[e.b], e.el, 1 };
|
|
523
|
+
}else{
|
|
524
|
+
const string &la = P.vlab[e.a], &lb = P.vlab[e.b];
|
|
525
|
+
if (la <= lb) k = { la, lb, e.el, 0 };
|
|
526
|
+
else k = { lb, la, e.el, 0 };
|
|
527
|
+
}
|
|
528
|
+
auto it = seed_mni.find(encKey(k));
|
|
529
|
+
return (it==seed_mni.end()? 0 : it->second);
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
// Hybrid: use O(1) seed for k=2, exact for larger patterns
|
|
533
|
+
static int compute_MNI_support_hybrid(const DataGraph& G, const Pattern& P, int tau,
|
|
534
|
+
const unordered_map<string,int>& seed_mni)
|
|
535
|
+
{
|
|
536
|
+
if (P.vlab.size()==2 && P.pedges.size()==1){
|
|
537
|
+
return mni_support_seed_from_map(G, P, seed_mni);
|
|
538
|
+
}
|
|
539
|
+
return compute_MNI_support_exact(G, P, tau);
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
// =================== Candidate enumeration ============================
|
|
543
|
+
|
|
544
|
+
static inline bool edge_already_in_pattern(const Pattern& S,
|
|
545
|
+
int a, int b,
|
|
546
|
+
const string& el,
|
|
547
|
+
int dirflag)
|
|
548
|
+
{
|
|
549
|
+
for (const auto& e : S.pedges){
|
|
550
|
+
if (e.el != el) continue;
|
|
551
|
+
if (dirflag == 0 && e.dir == 0){
|
|
552
|
+
if ((e.a == a && e.b == b) || (e.a == b && e.b == a)) return true;
|
|
553
|
+
} else if (dirflag == 1 && e.dir == 1){
|
|
554
|
+
if (e.a == a && e.b == b) return true;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
return false;
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
// Necessary seed lower bound: every edge-type in P must have seed MNI >= tau
|
|
561
|
+
static bool seed_lower_bound_ok(const DataGraph& G, const Pattern& P, int tau,
|
|
562
|
+
const unordered_map<string,int>& seed_mni)
|
|
563
|
+
{
|
|
564
|
+
for (const auto& e : P.pedges){
|
|
565
|
+
DataGraph::EdgeTypeKey k;
|
|
566
|
+
if (G.directed){
|
|
567
|
+
k = { P.vlab[e.a], P.vlab[e.b], e.el, 1 };
|
|
568
|
+
}else{
|
|
569
|
+
const string &la = P.vlab[e.a], &lb = P.vlab[e.b];
|
|
570
|
+
if (la <= lb) k = { la, lb, e.el, 0 };
|
|
571
|
+
else k = { lb, la, e.el, 0 };
|
|
572
|
+
}
|
|
573
|
+
auto it = seed_mni.find(encKey(k));
|
|
574
|
+
if (it == seed_mni.end() || it->second < tau) return false;
|
|
575
|
+
}
|
|
576
|
+
return true;
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
static void enumerate_candidates(const DataGraph& G,
|
|
580
|
+
const Pattern& S,
|
|
581
|
+
const vector<SeedInfo>& seeds,
|
|
582
|
+
const unordered_map<string,int>& seed_mni,
|
|
583
|
+
int tau,
|
|
584
|
+
vector<Pattern>& out)
|
|
585
|
+
{
|
|
586
|
+
const int k = (int)S.vlab.size();
|
|
587
|
+
unordered_set<string> seen; seen.reserve(512);
|
|
588
|
+
|
|
589
|
+
// Node-extensions: connect a new node to any existing node via a frequent edge-type
|
|
590
|
+
for (int u = 0; u < k; ++u){
|
|
591
|
+
for (const auto& s : seeds){
|
|
592
|
+
const auto& ek = s.key;
|
|
593
|
+
// u must match one side's label
|
|
594
|
+
if (!(ek.lu == S.vlab[u] || ek.lv == S.vlab[u])) continue;
|
|
595
|
+
|
|
596
|
+
if (ek.dirflag == 1){
|
|
597
|
+
// u as source
|
|
598
|
+
if (S.vlab[u] == ek.lu){
|
|
599
|
+
Pattern ext = S;
|
|
600
|
+
ext.vlab.push_back(ek.lv);
|
|
601
|
+
int newId = (int)ext.vlab.size() - 1;
|
|
602
|
+
if (!edge_already_in_pattern(S, u, newId, ek.el, 1)){
|
|
603
|
+
ext.pedges.push_back({u, newId, ek.el, 1});
|
|
604
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
605
|
+
string key = canonical_key(ext);
|
|
606
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
// u as target
|
|
611
|
+
if (S.vlab[u] == ek.lv){
|
|
612
|
+
Pattern ext = S;
|
|
613
|
+
ext.vlab.push_back(ek.lu);
|
|
614
|
+
int newId = (int)ext.vlab.size() - 1;
|
|
615
|
+
if (!edge_already_in_pattern(S, newId, u, ek.el, 1)){
|
|
616
|
+
ext.pedges.push_back({newId, u, ek.el, 1});
|
|
617
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
618
|
+
string key = canonical_key(ext);
|
|
619
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
}
|
|
623
|
+
} else {
|
|
624
|
+
// undirected
|
|
625
|
+
if (S.vlab[u] == ek.lu){
|
|
626
|
+
Pattern ext = S;
|
|
627
|
+
ext.vlab.push_back(ek.lv);
|
|
628
|
+
int newId = (int)ext.vlab.size() - 1;
|
|
629
|
+
if (!edge_already_in_pattern(S, u, newId, ek.el, 0)){
|
|
630
|
+
ext.pedges.push_back({u, newId, ek.el, 0});
|
|
631
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
632
|
+
string key = canonical_key(ext);
|
|
633
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
634
|
+
}
|
|
635
|
+
}
|
|
636
|
+
} else if (S.vlab[u] == ek.lv){
|
|
637
|
+
Pattern ext = S;
|
|
638
|
+
ext.vlab.push_back(ek.lu);
|
|
639
|
+
int newId = (int)ext.vlab.size() - 1;
|
|
640
|
+
if (!edge_already_in_pattern(S, u, newId, ek.el, 0)){
|
|
641
|
+
ext.pedges.push_back({u, newId, ek.el, 0});
|
|
642
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
643
|
+
string key = canonical_key(ext);
|
|
644
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
// Edge-extensions: add a new edge between existing nodes if frequent
|
|
653
|
+
for (int a = 0; a < k; ++a){
|
|
654
|
+
for (int b = a + 1; b < k; ++b){
|
|
655
|
+
for (const auto& s : seeds){
|
|
656
|
+
const auto& ek = s.key;
|
|
657
|
+
|
|
658
|
+
if (ek.dirflag == 0){
|
|
659
|
+
if (!((ek.lu==S.vlab[a] && ek.lv==S.vlab[b]) ||
|
|
660
|
+
(ek.lu==S.vlab[b] && ek.lv==S.vlab[a])))
|
|
661
|
+
continue;
|
|
662
|
+
|
|
663
|
+
if (edge_already_in_pattern(S, a, b, ek.el, 0)) continue;
|
|
664
|
+
|
|
665
|
+
Pattern ext = S;
|
|
666
|
+
ext.pedges.push_back({a, b, ek.el, 0});
|
|
667
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
668
|
+
string key = canonical_key(ext);
|
|
669
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
670
|
+
}
|
|
671
|
+
} else {
|
|
672
|
+
// a->b
|
|
673
|
+
if (ek.lu==S.vlab[a] && ek.lv==S.vlab[b]){
|
|
674
|
+
if (!edge_already_in_pattern(S, a, b, ek.el, 1)){
|
|
675
|
+
Pattern ext = S;
|
|
676
|
+
ext.pedges.push_back({a, b, ek.el, 1});
|
|
677
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
678
|
+
string key = canonical_key(ext);
|
|
679
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
// b->a
|
|
684
|
+
if (ek.lu==S.vlab[b] && ek.lv==S.vlab[a]){
|
|
685
|
+
if (!edge_already_in_pattern(S, b, a, ek.el, 1)){
|
|
686
|
+
Pattern ext = S;
|
|
687
|
+
ext.pedges.push_back({b, a, ek.el, 1});
|
|
688
|
+
if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
|
|
689
|
+
string key = canonical_key(ext);
|
|
690
|
+
if (seen.insert(key).second) out.push_back(std::move(ext));
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
// ======================= SUBGRAPHEXTENSION ============================
|
|
701
|
+
|
|
702
|
+
static void SUBGRAPHEXTENSION(const DataGraph& G, int tau,
|
|
703
|
+
const vector<SeedInfo>& seeds,
|
|
704
|
+
const unordered_map<string,int>& seed_mni,
|
|
705
|
+
const Pattern& S,
|
|
706
|
+
unordered_set<string>& emitted,
|
|
707
|
+
vector<Found>& out)
|
|
708
|
+
{
|
|
709
|
+
const string K = canonical_key(S);
|
|
710
|
+
if (!emitted.insert(K).second) return;
|
|
711
|
+
|
|
712
|
+
int mni = compute_MNI_support_hybrid(G, S, tau, seed_mni);
|
|
713
|
+
if (mni < tau) return;
|
|
714
|
+
|
|
715
|
+
out.push_back({S, (long long)mni});
|
|
716
|
+
|
|
717
|
+
vector<Pattern> cand;
|
|
718
|
+
enumerate_candidates(G, S, seeds, seed_mni, tau, cand);
|
|
719
|
+
|
|
720
|
+
for (const auto& c : cand){
|
|
721
|
+
int s = compute_MNI_support_hybrid(G, c, tau, seed_mni);
|
|
722
|
+
if (s >= tau){
|
|
723
|
+
SUBGRAPHEXTENSION(G, tau, seeds, seed_mni, c, emitted, out);
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
// ============================ Driver ==================================
|
|
729
|
+
|
|
730
|
+
/// Mine all frequent patterns of G for threshold p.tau.
///
/// Steps: compute the frequent single-edge seeds, order them by edge count
/// (descending), build the seed support lookup, grow patterns from every
/// seed (in parallel when compiled with OpenMP, one result buffer and one
/// visited set per thread), then merge the per-thread results, drop
/// duplicates by canonical key, and order by edge count ascending and
/// support descending.
Output run_sopagrami(const DataGraph& G, const Params& p){
    // 1) frequent 1-edge seeds by true MNI
    std::vector<SeedInfo> seeds = compute_frequent_edge_seeds(G, p.tau);
    if (seeds.empty()){
        Output nothing;
        return nothing;
    }

    // 2) most numerous edge types first
    std::sort(seeds.begin(), seeds.end(),
              [](const SeedInfo& lhs, const SeedInfo& rhs){ return lhs.full > rhs.full; });

    // 3) seed support lookup (lower bound + single-edge support)
    const auto seed_mni = build_seed_mni_map(seeds);

    // 4) expand every seed
    int T = 1;
#ifdef _OPENMP
    T = (p.num_threads > 0 ? p.num_threads : omp_get_max_threads());
    if (T < 1) T = 1;
    omp_set_num_threads(T);
#endif

    std::vector<std::vector<Found>> locals(T);
    std::vector<std::unordered_set<std::string>> local_emitted(T);
    for (int t = 0; t < T; ++t){
        locals[t].reserve(1<<12);
        local_emitted[t].reserve(1<<14);
    }

    // Two-vertex, one-edge pattern for seed number i.
    const auto make_seed = [&seeds](std::size_t i){
        Pattern seed;
        seed.vlab = {seeds[i].key.lu, seeds[i].key.lv};
        seed.pedges.push_back({0,1,seeds[i].key.el,seeds[i].key.dirflag});
        return seed;
    };

#ifdef _OPENMP
    #pragma omp parallel
    {
        const int tid = omp_get_thread_num();
        auto& found_here   = locals[tid];
        auto& emitted_here = local_emitted[tid];

        #pragma omp for schedule(dynamic)
        for (int i = 0; i < (int)seeds.size(); ++i){
            const Pattern seed = make_seed((std::size_t)i);
            SUBGRAPHEXTENSION(G, p.tau, seeds, seed_mni, seed, emitted_here, found_here);
        }
    }
#else
    // Fallback single-thread
    for (std::size_t i = 0; i < seeds.size(); ++i){
        const Pattern seed = make_seed(i);
        SUBGRAPHEXTENSION(G, p.tau, seeds, seed_mni, seed,
                          local_emitted[0], locals[0]);
    }
#endif

    // 5) merge & sort
    Output out;
    std::unordered_set<std::string> global_emitted;
    global_emitted.reserve(1<<16);
    for (int t = 0; t < T; ++t){
        for (auto& f : locals[t]){
            std::string K = canonical_key(f.pat);
            if (global_emitted.insert(K).second){
                out.frequent_patterns.push_back(std::move(f));
            }
        }
    }

    std::stable_sort(out.frequent_patterns.begin(), out.frequent_patterns.end(),
        [](const Found& A, const Found& B){
            const auto ea = A.pat.pedges.size();
            const auto eb = B.pat.pedges.size();
            if (ea != eb) return ea < eb;
            return A.full_support > B.full_support;
        });

    return out;
}
|
|
804
|
+
|
|
805
|
+
} // namespace algo
|