submine 0.1.1__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. submine/__init__.py +37 -0
  2. submine/algorithms/__init__.py +23 -0
  3. submine/algorithms/base.py +143 -0
  4. submine/algorithms/gspan.py +156 -0
  5. submine/algorithms/gspan_cpp.cpython-312-darwin.so +0 -0
  6. submine/algorithms/sopagrami.py +250 -0
  7. submine/algorithms/sopagrami_cpp.cpython-312-darwin.so +0 -0
  8. submine/api.py +134 -0
  9. submine/backends/__init__.py +0 -0
  10. submine/backends/gspan/CMakeLists.txt +65 -0
  11. submine/backends/gspan/dfs.cpp +98 -0
  12. submine/backends/gspan/graph.cpp +165 -0
  13. submine/backends/gspan/gspan.cpp +776 -0
  14. submine/backends/gspan/gspan.h +296 -0
  15. submine/backends/gspan/ismin.cpp +124 -0
  16. submine/backends/gspan/main.cpp +106 -0
  17. submine/backends/gspan/misc.cpp +177 -0
  18. submine/backends/gspan/python_bindings.cpp +133 -0
  19. submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
  20. submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
  21. submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
  22. submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
  23. submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
  24. submine/backends/sopagrami/cpp/src/main.cpp +94 -0
  25. submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
  26. submine/cli/__init__.py +6 -0
  27. submine/cli/main.py +87 -0
  28. submine/core/__init__.py +12 -0
  29. submine/core/graph.py +179 -0
  30. submine/core/result.py +121 -0
  31. submine/datasets/__init__.py +11 -0
  32. submine/datasets/loaders.py +145 -0
  33. submine/errors.py +41 -0
  34. submine/io/__init__.py +30 -0
  35. submine/io/common.py +173 -0
  36. submine/io/gexf.py +88 -0
  37. submine/io/gspan.py +268 -0
  38. submine/io/sopagrami.py +143 -0
  39. submine/io/transcode.py +147 -0
  40. submine/registry.py +8 -0
  41. submine/utils/__init__.py +6 -0
  42. submine/utils/checks.py +115 -0
  43. submine/utils/logging.py +41 -0
  44. submine-0.1.1.dist-info/METADATA +178 -0
  45. submine-0.1.1.dist-info/RECORD +47 -0
  46. submine-0.1.1.dist-info/WHEEL +6 -0
  47. submine-0.1.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,805 @@
1
+ // alg.cpp
2
+ #include "alg.hpp"
3
+
4
+ #include <algorithm>
5
+ #include <fstream>
6
+ #include <functional>
7
+ #include <iostream>
8
+ #include <limits>
9
+ #include <queue>
10
+ #include <sstream>
11
+ #include <unordered_map>
12
+ #include <unordered_set>
13
+ #include <utility>
14
+ #include <vector>
15
+ #include <climits>
16
+ #include <map>
17
+
18
+ #ifdef _OPENMP
19
+ #include <omp.h>
20
+ #endif
21
+
22
+ using namespace std;
23
+
24
+ namespace algo {
25
+
26
+ // =========================== DataGraph ===============================
27
+
28
+ static inline string encKey(const DataGraph::EdgeTypeKey& k){
29
+ return k.lu + '\t' + k.lv + '\t' + k.el + '\t' + char('0' + k.dirflag);
30
+ }
31
+
32
+ void DataGraph::load_from_lg(const std::string& path, bool as_directed){
33
+ directed = as_directed;
34
+ std::ifstream fin(path);
35
+ if(!fin){ std::cerr<<"Cannot open "<<path<<"\n"; std::exit(1); }
36
+
37
+ struct EdgeRec { int u,v; std::string el; };
38
+ std::unordered_map<int,std::string> vlab_map;
39
+ std::vector<EdgeRec> edges;
40
+
41
+ std::string line;
42
+ while(std::getline(fin,line)){
43
+ if(line.empty()) continue;
44
+ std::stringstream ss(line);
45
+ std::string tag; ss>>tag;
46
+
47
+ if(tag=="v"||tag=="V"){
48
+ int idx; std::string lab; ss>>idx>>lab;
49
+ vlab_map[idx]=lab;
50
+ }else if(tag=="e"||tag=="E"){
51
+ int u,v; std::string elab; ss>>u>>v;
52
+ if(!(ss>>elab)) elab = "";
53
+ if(!elab.empty()){
54
+ bool numlike=true;
55
+ for(unsigned char c : elab){
56
+ if(!(std::isdigit(c)||c=='.'||c=='-'||c=='+')){ numlike=false; break; }
57
+ }
58
+ if(numlike){
59
+ try{
60
+ double d = std::stod(elab);
61
+ long long iv = (long long)d; // toward zero
62
+ elab = std::to_string(iv);
63
+ }catch(...){}
64
+ }
65
+ }
66
+ edges.push_back({u,v,elab});
67
+ }
68
+ }
69
+
70
+ const int n = vlab_map.empty()? 0 :
71
+ (std::max_element(vlab_map.begin(), vlab_map.end(),
72
+ [](auto&a, auto&b){ return a.first < b.first; })->first + 1);
73
+
74
+ vlabels.assign(n,"");
75
+ for(int i=0;i<n;++i) if(vlab_map.count(i)) vlabels[i]=vlab_map[i];
76
+
77
+ adj.assign(n,{}); rev.assign(n,{});
78
+ adj_set.assign(n,{}); rev_set.assign(n,{});
79
+
80
+ for(const auto& e: edges){
81
+ adj[e.u].push_back({e.v,e.el});
82
+ adj_set[e.u][e.v].insert(e.el);
83
+ rev[e.v].push_back({e.u,e.el});
84
+ rev_set[e.v][e.u].insert(e.el);
85
+ if(!directed){
86
+ // store both ways for undirected graphs
87
+ adj[e.v].push_back({e.u,e.el});
88
+ adj_set[e.v][e.u].insert(e.el);
89
+ rev[e.u].push_back({e.v,e.el});
90
+ rev_set[e.u][e.v].insert(e.el);
91
+ }
92
+ }
93
+
94
+ lab2nodes.clear();
95
+ for(int i=0;i<n;++i) lab2nodes[vlabels[i]].insert(i);
96
+ }
97
+
98
+ vector<DataGraph::EdgeTypeStat> DataGraph::edge_type_counts_insertion_order() const {
99
+ vector<EdgeTypeStat> stats;
100
+ unordered_map<string,int> idx; idx.reserve(1<<12);
101
+ auto idstr = [](const EdgeTypeKey& k){
102
+ return k.lu + "\t" + k.lv + "\t" + k.el + "\t" + char('0'+k.dirflag);
103
+ };
104
+
105
+ int n = (int)vlabels.size();
106
+ if(directed){
107
+ for(int u=0; u<n; ++u){
108
+ const string& lu = vlabels[u];
109
+ for(auto [v,el]: adj[u]){
110
+ const string& lv = vlabels[v];
111
+ EdgeTypeKey k{lu, lv, el, 1};
112
+ string s = idstr(k);
113
+ auto it = idx.find(s);
114
+ if(it==idx.end()){
115
+ idx.emplace(s, (int)stats.size());
116
+ stats.push_back({k,1});
117
+ }else{
118
+ stats[it->second].count++;
119
+ }
120
+ }
121
+ }
122
+ }else{
123
+ for(int u=0; u<n; ++u){
124
+ const string& lu = vlabels[u];
125
+ for(auto [v,el]: adj[u]){
126
+ if(u>v) continue; // count each undirected edge once
127
+ const string& lv = vlabels[v];
128
+ auto a = lu<=lv? lu:lv;
129
+ auto b = lu<=lv? lv:lu;
130
+ EdgeTypeKey k{a,b,el,0};
131
+ string s = idstr(k);
132
+ auto it = idx.find(s);
133
+ if(it==idx.end()){
134
+ idx.emplace(s, (int)stats.size());
135
+ stats.push_back({k,1});
136
+ }else{
137
+ stats[it->second].count++;
138
+ }
139
+ }
140
+ }
141
+ }
142
+ return stats;
143
+ }
144
+
145
+ // ============================ Pattern =================================
146
+
147
+ string Pattern::key() const {
148
+ // Canonical-ish: labels as given + edges normalized (min,max,dcode,label) sorted
149
+ string s; s.reserve(vlab.size()*8 + pedges.size()*16);
150
+ s += "V:";
151
+ for (size_t i=0;i<vlab.size();++i){ s += vlab[i]; s += '|'; }
152
+
153
+ vector<tuple<int,int,int,string>> es; es.reserve(pedges.size());
154
+ for(const auto& e: pedges){
155
+ int a=e.a,b=e.b;
156
+ int dcode = 0; // 0 undirected, 1 a->b oriented to min(a,b), 2 b->a oriented
157
+ if(e.dir==1) dcode = (a<b ? 1 : 2);
158
+ es.emplace_back(min(a,b), max(a,b), dcode, e.el);
159
+ }
160
+ sort(es.begin(), es.end());
161
+ s += "E:";
162
+ for(auto& t: es){
163
+ s += to_string(get<0>(t))+"-"+to_string(get<1>(t))+"-"+to_string(get<2>(t))+"-"+get<3>(t)+"|";
164
+ }
165
+ return s;
166
+ }
167
+
168
+ // Canonical key across permutations within equal-label groups (exact)
169
+ static string encode_with_order(const Pattern& S, const vector<int>& order){
170
+ const int k = (int)S.vlab.size();
171
+ vector<int> pos(k);
172
+ for (int i=0;i<k;++i) pos[order[i]] = i;
173
+
174
+ string s; s.reserve(k*8 + S.pedges.size()*16);
175
+ s += "V:";
176
+ for (int i=0;i<k;++i){ s += S.vlab[order[i]]; s += '|'; }
177
+
178
+ vector<tuple<int,int,int,string>> es; es.reserve(S.pedges.size());
179
+ for (const auto& e : S.pedges){
180
+ int a = pos[e.a], b = pos[e.b];
181
+ int dcode = 0;
182
+ if (e.dir==1) dcode = (a<b ? 1 : 2);
183
+ es.emplace_back(std::min(a,b), std::max(a,b), dcode, e.el);
184
+ }
185
+ sort(es.begin(), es.end());
186
+
187
+ s += "E:";
188
+ for (auto& t : es){
189
+ s += to_string(get<0>(t))+"-"+to_string(get<1>(t))+"-"+to_string(get<2>(t))+"-"+get<3>(t)+"|";
190
+ }
191
+ return s;
192
+ }
193
+
194
+ static std::string canonical_key(const Pattern& S){
195
+ const int k = (int)S.vlab.size();
196
+ if (k<=1) return S.key();
197
+
198
+ // group vertex indices by label (local 'groups' lives only in this function)
199
+ std::map<std::string, std::vector<int>> groups;
200
+ for (int i=0;i<k;++i) groups[S.vlab[i]].push_back(i);
201
+ for (auto& kv : groups) std::sort(kv.second.begin(), kv.second.end());
202
+
203
+ // collect labels to iterate deterministically
204
+ std::vector<std::string> labels; labels.reserve(groups.size());
205
+ for (auto& kv : groups) labels.push_back(kv.first);
206
+
207
+ auto encode_with_order = [&](const std::vector<int>& order)->std::string{
208
+ std::vector<int> pos(k);
209
+ for (int i=0;i<k;++i) pos[order[i]] = i;
210
+
211
+ std::string s; s.reserve(k*8 + S.pedges.size()*16);
212
+ s += "V:";
213
+ for (int i=0;i<k;++i){ s += S.vlab[order[i]]; s += '|'; }
214
+
215
+ std::vector<std::tuple<int,int,int,std::string>> es; es.reserve(S.pedges.size());
216
+ for (const auto& e : S.pedges){
217
+ int a = pos[e.a], b = pos[e.b];
218
+ int dcode = 0; // 0 undirected; 1 a->b aligned to min; 2 b->a aligned
219
+ if (e.dir==1) dcode = (a<b ? 1 : 2);
220
+ es.emplace_back(std::min(a,b), std::max(a,b), dcode, e.el);
221
+ }
222
+ std::sort(es.begin(), es.end());
223
+
224
+ s += "E:";
225
+ for (auto& t : es){
226
+ s += std::to_string(std::get<0>(t)) + "-" +
227
+ std::to_string(std::get<1>(t)) + "-" +
228
+ std::to_string(std::get<2>(t)) + "-" +
229
+ std::get<3>(t) + "|";
230
+ }
231
+ return s;
232
+ };
233
+
234
+ std::string best; bool have=false;
235
+ std::vector<int> current; current.reserve(k);
236
+
237
+ // backtrack over permutations within each equal-label group
238
+ std::function<void(int)> dfs = [&](int gi){
239
+ if (gi == (int)labels.size()){
240
+ std::string code = encode_with_order(current);
241
+ if (!have || code < best){ best = std::move(code); have = true; }
242
+ return;
243
+ }
244
+ const auto& g = groups[labels[gi]];
245
+ std::vector<int> perm = g;
246
+ do{
247
+ size_t old = current.size();
248
+ current.insert(current.end(), perm.begin(), perm.end());
249
+ dfs(gi+1);
250
+ current.resize(old);
251
+ } while (std::next_permutation(perm.begin(), perm.end()));
252
+ };
253
+
254
+ dfs(0);
255
+ return best;
256
+ }
257
+
258
+ // ======================= Seeds (1-edge) ================================
259
+
260
+ struct SeedInfo {
261
+ DataGraph::EdgeTypeKey key;
262
+ int mni; // MNI support for the 1-edge pattern
263
+ long long full; // number of edges of that type
264
+ };
265
+
266
+ // Correct undirected equal-label handling: mni = |S| (union of endpoints)
267
+ static vector<SeedInfo> compute_frequent_edge_seeds(const DataGraph& G, int tau){
268
+ using K = DataGraph::EdgeTypeKey;
269
+
270
+ struct AccDir { unordered_set<int> L, R; long long full=0; K key; }; // directed or undirected lu!=lv
271
+ struct AccEq { unordered_set<int> S; long long full=0; K key; }; // undirected lu==lv
272
+
273
+ unordered_map<string, AccDir> acc_lr; acc_lr.reserve(1<<14);
274
+ unordered_map<string, AccEq> acc_eq; acc_eq.reserve(1<<14);
275
+
276
+ const int n = (int)G.vlabels.size();
277
+
278
+ if (G.directed){
279
+ for (int u=0; u<n; ++u){
280
+ const string& lu = G.vlabels[u];
281
+ for (auto [v, el] : G.adj[u]){
282
+ const string& lv = G.vlabels[v];
283
+ K k{lu, lv, el, 1};
284
+ auto &A = acc_lr[ encKey(k) ];
285
+ if (A.full == 0) A.key = k;
286
+ A.L.insert(u);
287
+ A.R.insert(v);
288
+ A.full += 1;
289
+ }
290
+ }
291
+ } else {
292
+ for (int u=0; u<n; ++u){
293
+ const string& lu = G.vlabels[u];
294
+ for (auto [v, el] : G.adj[u]){
295
+ if (u > v) continue; // each undirected edge once
296
+ const string& lv = G.vlabels[v];
297
+
298
+ if (lu == lv){
299
+ K k{lu, lv, el, 0};
300
+ auto &E = acc_eq[ encKey(k) ];
301
+ if (E.full == 0) E.key = k;
302
+ E.S.insert(u);
303
+ E.S.insert(v);
304
+ E.full += 1;
305
+ } else {
306
+ K k;
307
+ int leftNode, rightNode;
308
+ if (lu <= lv) { k = {lu, lv, el, 0}; leftNode=u; rightNode=v; }
309
+ else { k = {lv, lu, el, 0}; leftNode=v; rightNode=u; }
310
+
311
+ auto &A = acc_lr[ encKey(k) ];
312
+ if (A.full == 0) A.key = k;
313
+ A.L.insert(leftNode);
314
+ A.R.insert(rightNode);
315
+ A.full += 1;
316
+ }
317
+ }
318
+ }
319
+ }
320
+
321
+ vector<SeedInfo> seeds;
322
+ seeds.reserve(acc_lr.size() + acc_eq.size());
323
+
324
+ for (auto &kv : acc_lr){
325
+ auto &A = kv.second;
326
+ int mni = std::min((int)A.L.size(), (int)A.R.size());
327
+ if (mni >= tau) seeds.push_back({A.key, mni, A.full});
328
+ }
329
+ for (auto &kv : acc_eq){
330
+ auto &E = kv.second;
331
+ int mni = (int)E.S.size();
332
+ if (mni >= tau) seeds.push_back({E.key, mni, E.full});
333
+ }
334
+
335
+ return seeds;
336
+ }
337
+
338
+ // Build O(1) seed MNI lookups
339
+ static unordered_map<string,int> build_seed_mni_map(const vector<SeedInfo>& seeds){
340
+ unordered_map<string,int> m; m.reserve(seeds.size()*2);
341
+ for (auto &s : seeds) m[encKey(s.key)] = s.mni;
342
+ return m;
343
+ }
344
+
345
+ // ==================== MNI (exact, AC + MRV) ===========================
346
+
347
+ static inline bool consistent_edge_map(const DataGraph& G,
348
+ const Pattern::PEdge& e,
349
+ int va, int vb, // pattern endpoints
350
+ int ga, int gb) { // graph nodes mapped to (va,vb)
351
+ if (e.dir == 1) {
352
+ if (e.a == va && e.b == vb) return G.has_edge(ga, gb, e.el);
353
+ if (e.a == vb && e.b == va) return G.has_edge(gb, ga, e.el);
354
+ return true;
355
+ } else {
356
+ if ((e.a == va && e.b == vb) || (e.a == vb && e.b == va)) {
357
+ return G.has_edge(ga, gb, e.el) || G.has_edge(gb, ga, e.el);
358
+ }
359
+ return true;
360
+ }
361
+ }
362
+
363
+ // Local AC (neighbor-existence) with scans (safe for directed + undirected)
364
+ static void filter_domains_by_local_constraints(const DataGraph& G,
365
+ const Pattern& P,
366
+ vector<vector<int>>& dom)
367
+ {
368
+ const int k = (int)P.vlab.size();
369
+
370
+ for (int v = 0; v < k; ++v){
371
+ if (dom[v].empty()) continue;
372
+ vector<int> keep; keep.reserve(dom[v].size());
373
+
374
+ for (int gi : dom[v]){
375
+ bool ok_all = true;
376
+ for (const auto& e : P.pedges){
377
+ if (e.a != v && e.b != v) continue;
378
+
379
+ const int nb = (e.a==v? e.b : e.a);
380
+ const string& needLab = P.vlab[nb];
381
+ const string& el = e.el;
382
+
383
+ bool ok_edge = false;
384
+ if (e.dir == 1){
385
+ if (e.a == v){
386
+ for (auto [x, lbl] : G.adj[gi]) {
387
+ if (lbl == el && G.vlabels[x] == needLab){ ok_edge = true; break; }
388
+ }
389
+ } else {
390
+ for (auto [x, lbl] : G.rev[gi]) {
391
+ if (lbl == el && G.vlabels[x] == needLab){ ok_edge = true; break; }
392
+ }
393
+ }
394
+ } else {
395
+ for (auto [x, lbl] : G.adj[gi]) {
396
+ if (lbl == el && G.vlabels[x] == needLab){ ok_edge = true; break; }
397
+ }
398
+ }
399
+
400
+ if (!ok_edge){ ok_all = false; break; }
401
+ }
402
+ if (ok_all) keep.push_back(gi);
403
+ }
404
+ dom[v].swap(keep);
405
+ }
406
+ }
407
+
408
+ // Existence of a full injective embedding with x_fixVar = fixNode
409
+ static bool exists_solution_with(const DataGraph& G, const Pattern& P,
410
+ int fixVar, int fixNode,
411
+ const vector<vector<int>>& domainsInit)
412
+ {
413
+ const int k = (int)P.vlab.size();
414
+ const int n = (int)G.vlabels.size();
415
+
416
+ vector<vector<int>> dom = domainsInit; // do not mutate caller’s copy
417
+ vector<int> assign(k, -1);
418
+ vector<char> used(n, 0);
419
+
420
+ assign[fixVar] = fixNode;
421
+ used[fixNode] = 1;
422
+
423
+ auto choose_var = [&]()->int{
424
+ int best=-1, bestCnt=INT_MAX;
425
+ for (int v=0; v<k; ++v){
426
+ if (assign[v]!=-1) continue;
427
+ int cnt=0;
428
+ for (int gi : dom[v]){
429
+ if (used[gi]) continue;
430
+ bool ok = true;
431
+ for (const auto& e : P.pedges){
432
+ if (e.a == v && assign[e.b] != -1){
433
+ ok = consistent_edge_map(G,e,e.a,e.b,gi,assign[e.b]);
434
+ } else if (e.b == v && assign[e.a] != -1){
435
+ ok = consistent_edge_map(G,e,e.a,e.b,assign[e.a],gi);
436
+ }
437
+ if (!ok) break;
438
+ }
439
+ if (ok){ ++cnt; if (cnt >= bestCnt) break; }
440
+ }
441
+ if (cnt < bestCnt){ best=v; bestCnt=cnt; }
442
+ }
443
+ return best;
444
+ };
445
+
446
+ function<bool()> dfs = [&](){
447
+ bool done=true; for (int i=0;i<k;++i) if (assign[i]==-1){ done=false; break; }
448
+ if (done) return true;
449
+
450
+ int v = choose_var();
451
+ if (v==-1) return false;
452
+
453
+ for (int gi : dom[v]){
454
+ if (used[gi]) continue;
455
+
456
+ bool ok = true;
457
+ for (const auto& e : P.pedges){
458
+ if (e.a == v && assign[e.b] != -1){
459
+ ok = consistent_edge_map(G,e,e.a,e.b,gi,assign[e.b]);
460
+ } else if (e.b == v && assign[e.a] != -1){
461
+ ok = consistent_edge_map(G,e,e.a,e.b,assign[e.a],gi);
462
+ }
463
+ if (!ok) break;
464
+ }
465
+ if (!ok) continue;
466
+
467
+ assign[v]=gi; used[gi]=1;
468
+ if (dfs()) return true;
469
+ used[gi]=0; assign[v]=-1;
470
+ }
471
+ return false;
472
+ };
473
+
474
+ return dfs();
475
+ }
476
+
477
+ // Exact MNI: per-variable existence, with local AC
478
+ static int compute_MNI_support_exact(const DataGraph& G, const Pattern& P, int tau){
479
+ const int k = (int)P.vlab.size();
480
+ if (k == 0) return 0;
481
+
482
+ vector<vector<int>> dom(k);
483
+ for (int i=0; i<k; ++i){
484
+ auto it = G.lab2nodes.find(P.vlab[i]);
485
+ if (it != G.lab2nodes.end()) dom[i].assign(it->second.begin(), it->second.end());
486
+ if ((int)dom[i].size() < tau) return 0;
487
+ }
488
+ filter_domains_by_local_constraints(G, P, dom);
489
+ for (int i=0;i<k;++i) if ((int)dom[i].size() < tau) return 0;
490
+
491
+ int support = numeric_limits<int>::max();
492
+
493
+ for (int v=0; v<k; ++v){
494
+ vector<int> Dv = dom[v];
495
+ int count_v = 0;
496
+
497
+ for (int u : Dv){
498
+ if (exists_solution_with(G, P, v, u, dom)){
499
+ ++count_v;
500
+ } else {
501
+ auto &Dref = dom[v];
502
+ auto it = find(Dref.begin(), Dref.end(), u);
503
+ if (it != Dref.end()) Dref.erase(it);
504
+ if ((int)Dref.size() < tau) return 0;
505
+ }
506
+ }
507
+
508
+ support = min(support, count_v);
509
+ if (support < tau) return 0;
510
+ }
511
+
512
+ return (support==numeric_limits<int>::max()? 0 : support);
513
+ }
514
+
515
+ // Quick seed support (2 nodes, 1 edge) from seed map
516
+ static int mni_support_seed_from_map(const DataGraph& G, const Pattern& P,
517
+ const unordered_map<string,int>& seed_mni)
518
+ {
519
+ const auto& e = P.pedges[0];
520
+ DataGraph::EdgeTypeKey k;
521
+ if (G.directed){
522
+ k = { P.vlab[e.a], P.vlab[e.b], e.el, 1 };
523
+ }else{
524
+ const string &la = P.vlab[e.a], &lb = P.vlab[e.b];
525
+ if (la <= lb) k = { la, lb, e.el, 0 };
526
+ else k = { lb, la, e.el, 0 };
527
+ }
528
+ auto it = seed_mni.find(encKey(k));
529
+ return (it==seed_mni.end()? 0 : it->second);
530
+ }
531
+
532
+ // Hybrid: use O(1) seed for k=2, exact for larger patterns
533
+ static int compute_MNI_support_hybrid(const DataGraph& G, const Pattern& P, int tau,
534
+ const unordered_map<string,int>& seed_mni)
535
+ {
536
+ if (P.vlab.size()==2 && P.pedges.size()==1){
537
+ return mni_support_seed_from_map(G, P, seed_mni);
538
+ }
539
+ return compute_MNI_support_exact(G, P, tau);
540
+ }
541
+
542
+ // =================== Candidate enumeration ============================
543
+
544
+ static inline bool edge_already_in_pattern(const Pattern& S,
545
+ int a, int b,
546
+ const string& el,
547
+ int dirflag)
548
+ {
549
+ for (const auto& e : S.pedges){
550
+ if (e.el != el) continue;
551
+ if (dirflag == 0 && e.dir == 0){
552
+ if ((e.a == a && e.b == b) || (e.a == b && e.b == a)) return true;
553
+ } else if (dirflag == 1 && e.dir == 1){
554
+ if (e.a == a && e.b == b) return true;
555
+ }
556
+ }
557
+ return false;
558
+ }
559
+
560
+ // Necessary seed lower bound: every edge-type in P must have seed MNI >= tau
561
+ static bool seed_lower_bound_ok(const DataGraph& G, const Pattern& P, int tau,
562
+ const unordered_map<string,int>& seed_mni)
563
+ {
564
+ for (const auto& e : P.pedges){
565
+ DataGraph::EdgeTypeKey k;
566
+ if (G.directed){
567
+ k = { P.vlab[e.a], P.vlab[e.b], e.el, 1 };
568
+ }else{
569
+ const string &la = P.vlab[e.a], &lb = P.vlab[e.b];
570
+ if (la <= lb) k = { la, lb, e.el, 0 };
571
+ else k = { lb, la, e.el, 0 };
572
+ }
573
+ auto it = seed_mni.find(encKey(k));
574
+ if (it == seed_mni.end() || it->second < tau) return false;
575
+ }
576
+ return true;
577
+ }
578
+
579
+ static void enumerate_candidates(const DataGraph& G,
580
+ const Pattern& S,
581
+ const vector<SeedInfo>& seeds,
582
+ const unordered_map<string,int>& seed_mni,
583
+ int tau,
584
+ vector<Pattern>& out)
585
+ {
586
+ const int k = (int)S.vlab.size();
587
+ unordered_set<string> seen; seen.reserve(512);
588
+
589
+ // Node-extensions: connect a new node to any existing node via a frequent edge-type
590
+ for (int u = 0; u < k; ++u){
591
+ for (const auto& s : seeds){
592
+ const auto& ek = s.key;
593
+ // u must match one side's label
594
+ if (!(ek.lu == S.vlab[u] || ek.lv == S.vlab[u])) continue;
595
+
596
+ if (ek.dirflag == 1){
597
+ // u as source
598
+ if (S.vlab[u] == ek.lu){
599
+ Pattern ext = S;
600
+ ext.vlab.push_back(ek.lv);
601
+ int newId = (int)ext.vlab.size() - 1;
602
+ if (!edge_already_in_pattern(S, u, newId, ek.el, 1)){
603
+ ext.pedges.push_back({u, newId, ek.el, 1});
604
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
605
+ string key = canonical_key(ext);
606
+ if (seen.insert(key).second) out.push_back(std::move(ext));
607
+ }
608
+ }
609
+ }
610
+ // u as target
611
+ if (S.vlab[u] == ek.lv){
612
+ Pattern ext = S;
613
+ ext.vlab.push_back(ek.lu);
614
+ int newId = (int)ext.vlab.size() - 1;
615
+ if (!edge_already_in_pattern(S, newId, u, ek.el, 1)){
616
+ ext.pedges.push_back({newId, u, ek.el, 1});
617
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
618
+ string key = canonical_key(ext);
619
+ if (seen.insert(key).second) out.push_back(std::move(ext));
620
+ }
621
+ }
622
+ }
623
+ } else {
624
+ // undirected
625
+ if (S.vlab[u] == ek.lu){
626
+ Pattern ext = S;
627
+ ext.vlab.push_back(ek.lv);
628
+ int newId = (int)ext.vlab.size() - 1;
629
+ if (!edge_already_in_pattern(S, u, newId, ek.el, 0)){
630
+ ext.pedges.push_back({u, newId, ek.el, 0});
631
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
632
+ string key = canonical_key(ext);
633
+ if (seen.insert(key).second) out.push_back(std::move(ext));
634
+ }
635
+ }
636
+ } else if (S.vlab[u] == ek.lv){
637
+ Pattern ext = S;
638
+ ext.vlab.push_back(ek.lu);
639
+ int newId = (int)ext.vlab.size() - 1;
640
+ if (!edge_already_in_pattern(S, u, newId, ek.el, 0)){
641
+ ext.pedges.push_back({u, newId, ek.el, 0});
642
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
643
+ string key = canonical_key(ext);
644
+ if (seen.insert(key).second) out.push_back(std::move(ext));
645
+ }
646
+ }
647
+ }
648
+ }
649
+ }
650
+ }
651
+
652
+ // Edge-extensions: add a new edge between existing nodes if frequent
653
+ for (int a = 0; a < k; ++a){
654
+ for (int b = a + 1; b < k; ++b){
655
+ for (const auto& s : seeds){
656
+ const auto& ek = s.key;
657
+
658
+ if (ek.dirflag == 0){
659
+ if (!((ek.lu==S.vlab[a] && ek.lv==S.vlab[b]) ||
660
+ (ek.lu==S.vlab[b] && ek.lv==S.vlab[a])))
661
+ continue;
662
+
663
+ if (edge_already_in_pattern(S, a, b, ek.el, 0)) continue;
664
+
665
+ Pattern ext = S;
666
+ ext.pedges.push_back({a, b, ek.el, 0});
667
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
668
+ string key = canonical_key(ext);
669
+ if (seen.insert(key).second) out.push_back(std::move(ext));
670
+ }
671
+ } else {
672
+ // a->b
673
+ if (ek.lu==S.vlab[a] && ek.lv==S.vlab[b]){
674
+ if (!edge_already_in_pattern(S, a, b, ek.el, 1)){
675
+ Pattern ext = S;
676
+ ext.pedges.push_back({a, b, ek.el, 1});
677
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
678
+ string key = canonical_key(ext);
679
+ if (seen.insert(key).second) out.push_back(std::move(ext));
680
+ }
681
+ }
682
+ }
683
+ // b->a
684
+ if (ek.lu==S.vlab[b] && ek.lv==S.vlab[a]){
685
+ if (!edge_already_in_pattern(S, b, a, ek.el, 1)){
686
+ Pattern ext = S;
687
+ ext.pedges.push_back({b, a, ek.el, 1});
688
+ if (seed_lower_bound_ok(G, ext, tau, seed_mni)){
689
+ string key = canonical_key(ext);
690
+ if (seen.insert(key).second) out.push_back(std::move(ext));
691
+ }
692
+ }
693
+ }
694
+ }
695
+ }
696
+ }
697
+ }
698
+ }
699
+
700
+ // ======================= SUBGRAPHEXTENSION ============================
701
+
702
+ static void SUBGRAPHEXTENSION(const DataGraph& G, int tau,
703
+ const vector<SeedInfo>& seeds,
704
+ const unordered_map<string,int>& seed_mni,
705
+ const Pattern& S,
706
+ unordered_set<string>& emitted,
707
+ vector<Found>& out)
708
+ {
709
+ const string K = canonical_key(S);
710
+ if (!emitted.insert(K).second) return;
711
+
712
+ int mni = compute_MNI_support_hybrid(G, S, tau, seed_mni);
713
+ if (mni < tau) return;
714
+
715
+ out.push_back({S, (long long)mni});
716
+
717
+ vector<Pattern> cand;
718
+ enumerate_candidates(G, S, seeds, seed_mni, tau, cand);
719
+
720
+ for (const auto& c : cand){
721
+ int s = compute_MNI_support_hybrid(G, c, tau, seed_mni);
722
+ if (s >= tau){
723
+ SUBGRAPHEXTENSION(G, tau, seeds, seed_mni, c, emitted, out);
724
+ }
725
+ }
726
+ }
727
+
728
+ // ============================ Driver ==================================
729
+
730
+ Output run_sopagrami(const DataGraph& G, const Params& p){
731
+ // 1) frequent 1-edge seeds by true MNI
732
+ auto seeds = compute_frequent_edge_seeds(G, p.tau);
733
+ if (seeds.empty()){
734
+ Output out; return out;
735
+ }
736
+
737
+ // 2) sort seeds by full support (edge count) descending (SoGraMi ordering)
738
+ sort(seeds.begin(), seeds.end(),
739
+ [](const SeedInfo& a, const SeedInfo& b){ return a.full > b.full; });
740
+
741
+ // 3) precompute O(1) seed MNI map (for fast lower bound + seed support)
742
+ auto seed_mni = build_seed_mni_map(seeds);
743
+
744
+ // 4) parallel expand per seed
745
+ int T = 1;
746
+ #ifdef _OPENMP
747
+ T = (p.num_threads > 0 ? p.num_threads : omp_get_max_threads());
748
+ if (T < 1) T = 1;
749
+ omp_set_num_threads(T);
750
+ #endif
751
+
752
+ vector<vector<Found>> locals(T);
753
+ vector<unordered_set<string>> local_emitted(T);
754
+ for (int t=0; t<T; ++t){
755
+ locals[t].reserve(1<<12);
756
+ local_emitted[t].reserve(1<<14);
757
+ }
758
+
759
+ #ifdef _OPENMP
760
+ #pragma omp parallel
761
+ {
762
+ const int tid = omp_get_thread_num();
763
+ auto& out_loc = locals[tid];
764
+ auto& emit_loc = local_emitted[tid];
765
+
766
+ #pragma omp for schedule(dynamic)
767
+ for (int i = 0; i < (int)seeds.size(); ++i){
768
+ Pattern seed;
769
+ seed.vlab = {seeds[i].key.lu, seeds[i].key.lv};
770
+ seed.pedges.push_back({0,1,seeds[i].key.el,seeds[i].key.dirflag});
771
+ SUBGRAPHEXTENSION(G, p.tau, seeds, seed_mni, seed, emit_loc, out_loc);
772
+ }
773
+ }
774
+ #else
775
+ // Fallback single-thread
776
+ for (size_t i=0; i<seeds.size(); ++i){
777
+ Pattern seed;
778
+ seed.vlab = {seeds[i].key.lu, seeds[i].key.lv};
779
+ seed.pedges.push_back({0,1,seeds[i].key.el,seeds[i].key.dirflag});
780
+ SUBGRAPHEXTENSION(G, p.tau, seeds, seed_mni, seed,
781
+ local_emitted[0], locals[0]);
782
+ }
783
+ #endif
784
+
785
+ // 5) merge & sort
786
+ Output out;
787
+ unordered_set<string> global_emitted; global_emitted.reserve(1<<16);
788
+ for (int t=0; t<T; ++t){
789
+ for (auto &f : locals[t]){
790
+ string K = canonical_key(f.pat);
791
+ if (global_emitted.insert(K).second) out.frequent_patterns.push_back(std::move(f));
792
+ }
793
+ }
794
+
795
+ stable_sort(out.frequent_patterns.begin(), out.frequent_patterns.end(),
796
+ [](const Found& A, const Found& B){
797
+ if (A.pat.pedges.size()!=B.pat.pedges.size())
798
+ return A.pat.pedges.size() < B.pat.pedges.size();
799
+ return A.full_support > B.full_support;
800
+ });
801
+
802
+ return out;
803
+ }
804
+
805
+ } // namespace algo