submine 0.1.0__cp311-cp311-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. submine/__init__.py +37 -0
  2. submine/algorithms/__init__.py +23 -0
  3. submine/algorithms/base.py +143 -0
  4. submine/algorithms/gspan.py +156 -0
  5. submine/algorithms/gspan_cpp.cpython-311-x86_64-linux-musl.so +0 -0
  6. submine/algorithms/sopagrami.py +250 -0
  7. submine/algorithms/sopagrami_cpp.cpython-311-x86_64-linux-musl.so +0 -0
  8. submine/api.py +134 -0
  9. submine/backends/__init__.py +0 -0
  10. submine/backends/gspan/CMakeLists.txt +65 -0
  11. submine/backends/gspan/dfs.cpp +98 -0
  12. submine/backends/gspan/graph.cpp +165 -0
  13. submine/backends/gspan/gspan.cpp +776 -0
  14. submine/backends/gspan/gspan.h +296 -0
  15. submine/backends/gspan/ismin.cpp +124 -0
  16. submine/backends/gspan/main.cpp +106 -0
  17. submine/backends/gspan/misc.cpp +177 -0
  18. submine/backends/gspan/python_bindings.cpp +133 -0
  19. submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
  20. submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
  21. submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
  22. submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
  23. submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
  24. submine/backends/sopagrami/cpp/src/main.cpp +94 -0
  25. submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
  26. submine/cli/__init__.py +6 -0
  27. submine/cli/main.py +87 -0
  28. submine/core/__init__.py +12 -0
  29. submine/core/graph.py +179 -0
  30. submine/core/result.py +121 -0
  31. submine/datasets/__init__.py +11 -0
  32. submine/datasets/loaders.py +145 -0
  33. submine/errors.py +41 -0
  34. submine/io/__init__.py +30 -0
  35. submine/io/common.py +173 -0
  36. submine/io/gexf.py +88 -0
  37. submine/io/gspan.py +268 -0
  38. submine/io/sopagrami.py +143 -0
  39. submine/io/transcode.py +147 -0
  40. submine/registry.py +8 -0
  41. submine/utils/__init__.py +6 -0
  42. submine/utils/checks.py +115 -0
  43. submine/utils/logging.py +41 -0
  44. submine-0.1.0.dist-info/METADATA +178 -0
  45. submine-0.1.0.dist-info/RECORD +49 -0
  46. submine-0.1.0.dist-info/WHEEL +5 -0
  47. submine-0.1.0.dist-info/licenses/LICENSE +21 -0
  48. submine.libs/libgcc_s-2298274a.so.1 +0 -0
  49. submine.libs/libstdc++-08d5c7eb.so.6.0.33 +0 -0
@@ -0,0 +1,776 @@
1
+ /*
2
+ $Id: gspan.cpp,v 1.8 2004/05/21 09:27:17 taku-ku Exp $;
3
+
4
+ Copyright (C) 2004 Taku Kudo, All rights reserved.
5
+ This is free software with ABSOLUTELY NO WARRANTY.
6
+
7
+ This program is free software; you can redistribute it and/or modify
8
+ it under the terms of the GNU General Public License as published by
9
+ the Free Software Foundation; either version 2 of the License, or
10
+ (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20
+ 02111-1307, USA
21
+ */
22
+ #include "gspan.h"
23
+ #include <iterator>
24
+
25
+ #include <cstdio>
26
+ #include <stdexcept>
27
+
28
+ #include <stdlib.h>
29
+ #include <thread>
30
+ #include <sstream>
31
+
32
+
33
+
34
+ namespace GSPAN {
35
+
36
+ gSpan::gSpan (void)
37
+ {
38
+ boost = false;
39
+ }
40
+
41
+ // MATLAB/MEX support removed.
42
+
43
+ std::istream &gSpan::read (std::istream &is)
44
+ {
45
+ Graph g(directed);
46
+ while (true) {
47
+ g.read (is);
48
+ if (g.empty()) break;
49
+ TRANS.push_back (g);
50
+ }
51
+ return is;
52
+ }
53
+
54
+
55
+ /* 2-class LPBoosting gain function
56
+ */
57
+ double
58
+ gSpan::gain2 (Projected &projected, double y)
59
+ {
60
+ /* Compute actual gain (eq. (10) in [Dimiriz2002]) and Problem 1 in graph
61
+ * boost paper.
62
+ *
63
+ * For L samples x_i, calculate
64
+ * gain = \sum_{i=1}^{L} d_i y_i h(x_i)
65
+ * where h is implicitly defined by this current subgraph pattern.
66
+ */
67
+ unsigned int oid = 0xffffffff;
68
+ double gainm = 0.0;
69
+
70
+ Projected::iterator cur = projected.begin();
71
+ for ( ; cur != projected.end() ; ++cur) {
72
+ if (oid != cur->id) {
73
+ /* A positive pattern: it appears in the graph.
74
+ *
75
+ * h_{<t,y>} = y, if t \subseteq x,
76
+ * = -y, otherwise.
77
+ *
78
+ * t is implicitly defined by projected.
79
+ * First, process all skipped graphs.
80
+ */
81
+ for (unsigned int skipn = oid+1 ; skipn < cur->id ; ++skipn)
82
+ gainm += boostY[skipn]*boostWeights[skipn]*(-y);
83
+
84
+ /* Now process positive pattern.
85
+ * This is also the gain function for 1.5 LP boosting, as we have
86
+ * the constraint
87
+ * \sum_{n=1}^N \lambda_n h_j(x_{1,n})
88
+ * - \sum_{m=1}^M \mu_m h_j(x_{2,m}) <= \gamma.
89
+ * In order to maximize the violation of the current ensemble, we
90
+ * search for the classifier that maximizes the left side.
91
+ */
92
+ gainm += boostY[cur->id]*boostWeights[cur->id]*y;
93
+ }
94
+ oid = cur->id;
95
+ }
96
+
97
+ /* All skipped graphs to the end.
98
+ */
99
+ for (unsigned int skipn = oid+1 ; skipn < boostY.size() ; ++skipn)
100
+ gainm += boostY[skipn]*boostWeights[skipn]*(-y);
101
+
102
+ return (gainm);
103
+ }
104
+
105
+
106
+ /* 1.5-class LPBoosting gain function
107
+ */
108
+ double
109
+ gSpan::gain1d5 (Projected &projected, double y)
110
+ {
111
+ /* Compute actual gain (eq. (10) in [Dimiriz2002]) and Problem 1 in graph
112
+ * boost paper.
113
+ *
114
+ * For L samples x_i, calculate
115
+ * gain = \sum_{i=1}^{L} d_i y_i h(x_i)
116
+ * where h is implicitly defined by this current subgraph pattern. In the
117
+ * 1.5-class case, h(x_i) is zero if the pattern does not appear, hence we
118
+ * can skip those instances silently.
119
+ */
120
+ unsigned int oid = 0xffffffff;
121
+ double gain = 0.0;
122
+
123
+ Projected::iterator cur = projected.begin();
124
+ for ( ; cur != projected.end() ; ++cur) {
125
+ if (oid != cur->id)
126
+ gain += boostY[cur->id]*boostWeights[cur->id]*y;
127
+
128
+ oid = cur->id;
129
+ }
130
+
131
+ return (gain);
132
+ }
133
+
134
+ /* Gain bound for 1.5-class formulation
135
+ */
136
+ double
137
+ gSpan::gainbound1d5 (Projected &projected)
138
+ {
139
+ double gain = 0.0;
140
+ unsigned int oid = 0xffffffff;
141
+
142
+ for (Projected::iterator cur = projected.begin() ;
143
+ cur != projected.end() ; ++cur)
144
+ {
145
+ if (oid != cur->id) {
146
+ oid = cur->id;
147
+ if (boostY[cur->id] <= 0.0)
148
+ continue;
149
+
150
+ gain += boostWeights[cur->id];
151
+ }
152
+ }
153
+
154
+ return (gain);
155
+ }
156
+
157
+ /* Gain bound for 2-class formulation
158
+ */
159
+ double
160
+ gSpan::gainbound2 (Projected &projected)
161
+ {
162
+ /* Lemma 1, section 3.2 in the graph boosting paper. We automatically
163
+ * only enumerate those patterns which contain the subgraph 'projected'.
164
+ *
165
+ * <t^,y^> = argmax_{t \in F,y \in {+/- 1}} d_i y_i h_{<t,y>}, where
166
+ * F = \unionset_{i=1}^{L} { t | t \subseteq x_i }.
167
+ */
168
+ unsigned int oid = 0xffffffff;
169
+ unsigned int size = 0;
170
+ double gain_pos = 0.0;
171
+ double gain_neg = 0.0;
172
+
173
+ for (Projected::iterator cur = projected.begin() ;
174
+ cur != projected.end() ; ++cur)
175
+ {
176
+ if (oid != cur->id) {
177
+ if (boostY[cur->id] <= 0.0)
178
+ gain_neg += boostWeights[cur->id];
179
+ else
180
+ gain_pos += boostWeights[cur->id];
181
+
182
+ ++size;
183
+ }
184
+ oid = cur->id;
185
+ }
186
+
187
+ /* The boostWeightSum has been precomputed as it is always the same for
188
+ * one run.
189
+ */
190
+ gain_neg = 2.0*gain_neg + boostWeightSum;
191
+ gain_pos = 2.0*gain_pos - boostWeightSum;
192
+
193
+ if (gain_neg >= gain_pos)
194
+ return (gain_neg);
195
+
196
+ return (gain_pos);
197
+ }
198
+
199
+
200
+ std::map<unsigned int, unsigned int>
201
+ gSpan::support_counts (Projected &projected)
202
+ {
203
+ std::map<unsigned int, unsigned int> counts;
204
+
205
+ for (Projected::iterator cur = projected.begin() ;
206
+ cur != projected.end() ; ++cur)
207
+ {
208
+ counts[cur->id] += 1;
209
+ }
210
+
211
+ return (counts);
212
+ }
213
+
214
+
215
+ unsigned int
216
+ gSpan::support (Projected &projected)
217
+ {
218
+ unsigned int oid = 0xffffffff;
219
+ unsigned int size = 0;
220
+
221
+ for (Projected::iterator cur = projected.begin(); cur != projected.end(); ++cur) {
222
+ if (oid != cur->id) {
223
+ ++size;
224
+ }
225
+ oid = cur->id;
226
+ }
227
+
228
+ return size;
229
+ }
230
+
231
+ void gSpan::report_boosting (Projected &projected, unsigned int sup,
232
+ double gain, double yval)
233
+ {
234
+ if (maxpat_max > maxpat_min && DFS_CODE.nodeCount () > maxpat_max)
235
+ return;
236
+ if (maxpat_min > 0 && DFS_CODE.nodeCount () < maxpat_min)
237
+ return;
238
+
239
+ Graph g(directed);
240
+ DFS_CODE.toGraph (g);
241
+
242
+ // insert individual counts
243
+ std::map<unsigned int, unsigned int> GYcounts;
244
+ unsigned int oid = 0xffffffff;
245
+ for (Projected::iterator cur = projected.begin(); cur != projected.end(); ++cur) {
246
+ if (oid != cur->id)
247
+ GYcounts[cur->id] = 0;
248
+
249
+ GYcounts[cur->id] += 1;
250
+ oid = cur->id;
251
+ }
252
+
253
+ report_boosting_inter (g, sup, gain, yval, GYcounts);
254
+ }
255
+
256
+ void gSpan::report_boosting_inter (Graph &g, unsigned int sup,
257
+ double gain, double yval, std::map<unsigned int, unsigned int>& GYcounts)
258
+ {
259
+ // Pure C++ build: no MATLAB-specific graph printer.
260
+
261
+ /* Insertion sort based on the gain values.
262
+ */
263
+ unsigned int insertidx;
264
+ for (insertidx = 0 ; insertidx < bestGraphsGain.size () ; ++insertidx) {
265
+ if (gain >= bestGraphsGain[insertidx])
266
+ break;
267
+ }
268
+
269
+ bestGraphs.insert (bestGraphs.begin() + insertidx, g);
270
+ bestGraphsY.insert (bestGraphsY.begin() + insertidx, yval);
271
+ bestGraphsGain.insert (bestGraphsGain.begin() + insertidx, gain);
272
+ bestGraphsCounts.insert (bestGraphsCounts.begin() + insertidx, GYcounts);
273
+
274
+ #ifdef DEBUG
275
+ fprintf(stderr, " inserted good output graph, gain %lf, sup %d, Y %lf, new size %d\n",
276
+ gain, sup, yval, g.size());
277
+ #endif
278
+
279
+ /* If the maximum allowed size is exceeded, prune it down.
280
+ */
281
+ if (bestGraphsGain.size () > boostN) {
282
+ #ifdef DEBUG
283
+ fprintf(stderr, " one element removed at end with gain %lf\n",
284
+ bestGraphsGain[bestGraphsGain.size()-1]);
285
+ #endif
286
+
287
+ bestGraphs.pop_back ();
288
+ bestGraphsY.pop_back ();
289
+ bestGraphsGain.pop_back ();
290
+ bestGraphsCounts.pop_back ();
291
+ }
292
+
293
+ ++ID;
294
+ }
295
+
296
+
297
+ /* Special report function for single node graphs.
298
+ */
299
+ void gSpan::report_single (Graph &g, std::map<unsigned int, unsigned int>& ncount)
300
+ {
301
+ unsigned int sup = 0;
302
+ for (std::map<unsigned int, unsigned int>::iterator it = ncount.begin () ;
303
+ it != ncount.end () ; ++it)
304
+ {
305
+ sup += (*it).second;
306
+ }
307
+
308
+ if (maxpat_max > maxpat_min && g.size () > maxpat_max)
309
+ return;
310
+ if (maxpat_min > 0 && g.size () < maxpat_min)
311
+ return;
312
+
313
+ if (enc == false) {
314
+ if (where == false)
315
+ *os << "t # " << ID << " * " << sup;
316
+ *os << '\n';
317
+
318
+ g.write (*os);
319
+ if (callback_) {
320
+ std::cerr << "pattern n=" << g.size()
321
+ << " e0deg=" << (g.size() ? g[0].edge.size() : 0)
322
+ << " sup=" << sup << "\n";
323
+ callback_(g, sup, nullptr);
324
+ }
325
+
326
+ *os << '\n';
327
+ } else {
328
+ // Encoded output mode historically relied on MATLAB glue in this fork.
329
+ // If you need encoded output, implement it here in pure C++.
330
+ throw std::runtime_error("report_single: encoded output is not supported in the pure C++ build");
331
+ }
332
+ ++ID;
333
+ }
334
+
335
+ void gSpan::report(Projected &projected, unsigned int sup)
336
+ {
337
+ if (maxpat_max > maxpat_min && DFS_CODE.nodeCount() > maxpat_max) return;
338
+ if (maxpat_min > 0 && DFS_CODE.nodeCount() < maxpat_min) return;
339
+
340
+ if (where) {
341
+ *os << "<pattern>\n";
342
+ *os << "<id>" << ID << "</id>\n";
343
+ *os << "<support>" << sup << "</support>\n";
344
+ *os << "<what>";
345
+ }
346
+
347
+ // ALWAYS build Graph for callback (details)
348
+ Graph g(directed);
349
+ DFS_CODE.toGraph(g);
350
+
351
+ // Always call callback if present
352
+ if (callback_) {
353
+ callback_(g, sup, &projected);
354
+ }
355
+
356
+ // Output format to stream still depends on enc/where
357
+ if (!enc) {
358
+ if (!where) *os << "t # " << ID << " * " << sup;
359
+ *os << '\n';
360
+ g.write(*os);
361
+ } else {
362
+ if (!where) *os << '<' << ID << "> " << sup << " [";
363
+ DFS_CODE.write(*os);
364
+ if (!where) *os << ']';
365
+ }
366
+
367
+ if (where) {
368
+ *os << "</what>\n<where>";
369
+ unsigned int oid = 0xffffffff;
370
+ for (Projected::iterator cur = projected.begin(); cur != projected.end(); ++cur) {
371
+ if (oid != cur->id) {
372
+ if (cur != projected.begin()) *os << ' ';
373
+ *os << cur->id;
374
+ }
375
+ oid = cur->id;
376
+ }
377
+ *os << "</where>\n</pattern>";
378
+ }
379
+
380
+ *os << '\n';
381
+ ++ID;
382
+ }
383
+
384
+ /* Recursive subgraph mining function (similar to subprocedure 1
385
+ * Subgraph_Mining in [Yan2002]).
386
+ */
387
+ void gSpan::project (Projected &projected)
388
+ {
389
+ if (boost && (boostmax > 0 && boostseen > boostmax))
390
+ return;
391
+
392
+ /* Check if the pattern is frequent enough.
393
+ */
394
+ unsigned int sup = support (projected);
395
+ if (sup < minsup)
396
+ return;
397
+
398
+ /* The minimal DFS code check is more expensive than the support check,
399
+ * hence it is done now, after checking the support.
400
+ */
401
+ if (is_min () == false) {
402
+ // *os << "NOT MIN ["; DFS_CODE.write (*os); *os << "]" << std::endl;
403
+ return;
404
+ }
405
+
406
+ if (boost) {
407
+ /* Compute the gains for all classifiers we can build out of the current
408
+ * subgraph pattern (which is just two).
409
+ */
410
+ double gain_ypos = 0.0; // Gain if y=1 in the weak learner
411
+ double gain_yneg = 0.0; // Gain if y=-1 in the weak learner
412
+ double gainmax = 0.0; // Maximum of the negative/positive gain
413
+ double yval = 1.0; // y that achieves the maximum gain
414
+
415
+ /* Compute gains. This differs a little for the 1.5-class and the
416
+ * 2-class case. (The gain function is replaced, as well).
417
+ */
418
+ gain_ypos = (this->*gain) (projected, 1.0);
419
+ if (boostType == 1) {
420
+ gainmax = gain_ypos;
421
+ } else if (boostType == 2) {
422
+ gain_yneg = (this->*gain) (projected, -1.0);
423
+
424
+ if (gain_ypos >= gain_yneg) {
425
+ gainmax = gain_ypos;
426
+ } else {
427
+ gainmax = gain_yneg;
428
+ yval = -1.0;
429
+ }
430
+ }
431
+
432
+ /* Check if we have to replace the least best of our top performers.
433
+ */
434
+ if (gainmax > boostTau) {
435
+ /* Update best gain so far. The best gain is also the tau bound.
436
+ */
437
+ #ifdef DEBUG
438
+ fprintf(stderr, "boost gain observed with %lf, support %d\n", boostTau, sup);
439
+ #endif
440
+ report_boosting (projected, sup, gainmax, yval);
441
+
442
+ /* Update lower boosting gain bound in case we already exceeded
443
+ * the number of allowed patterns.
444
+ */
445
+ if (bestGraphsGain.size () >= boostN) {
446
+ boostTau = bestGraphsGain[bestGraphsGain.size() - 1];
447
+ fprintf(stderr, " top gains: %lf ... %lf\n",
448
+ bestGraphsGain[0], boostTau);
449
+ }
450
+ }
451
+ if (boostseen % 1000 == 0) {
452
+ Graph g(directed);
453
+ DFS_CODE.toGraph (g);
454
+
455
+ fprintf(stderr, "%7u: gain %lf this.bound %lf "
456
+ "global.bound %lf DFS.size %zu (gsize %zu)\n", boostseen,
457
+ gainmax, (this->*gainbound) (projected), boostTau,
458
+ DFS_CODE.size(), g.size());
459
+ }
460
+
461
+ boostseen += 1;
462
+ } else {
463
+ // Output the frequent substructure
464
+ report (projected, sup);
465
+ }
466
+
467
+ /* In case we have a valid upper bound and our graph already exceeds it,
468
+ * return. Note: we do not check for equality as the DFS exploration may
469
+ * still add edges within an existing subgraph, without increasing the
470
+ * number of nodes.
471
+ */
472
+ if (maxpat_max > maxpat_min && DFS_CODE.nodeCount () > maxpat_max)
473
+ return;
474
+
475
+ /* Compute gain bound for this pattern (projected) and only explore in
476
+ * case the bound allows this subgraph to be better. That is, if the
477
+ * bound lets no supergraph be better than all reported ones so far, we
478
+ * just return.
479
+ */
480
+ if (boost) {
481
+ double gainb = (this->*gainbound) (projected);
482
+ #ifdef DEBUG
483
+ fprintf(stderr, " gain bound: %lf, gain min required: %lf\n", gainb, boostTau);
484
+ fprintf(stderr, " actual gain for this subgraph: %lf\n",
485
+ (this->*gain) (projected, 1.0));
486
+ #endif
487
+
488
+ if (gainb <= boostTau + 1e-8) {
489
+ #ifdef DEBUG
490
+ fprintf(stderr, " ==> return\n");
491
+ #endif
492
+ return;
493
+ }
494
+ }
495
+
496
+ /* We just outputted a frequent subgraph. As it is frequent enough, so
497
+ * might be its (n+1)-extension-graphs, hence we enumerate them all.
498
+ */
499
+ const RMPath &rmpath = DFS_CODE.buildRMPath ();
500
+ int minlabel = DFS_CODE[0].fromlabel;
501
+ int maxtoc = DFS_CODE[rmpath[0]].to;
502
+
503
+ Projected_map3 new_fwd_root;
504
+ Projected_map2 new_bck_root;
505
+ EdgeList edges;
506
+
507
+ /* Enumerate all possible one edge extensions of the current substructure.
508
+ */
509
+ for (unsigned int n = 0; n < projected.size(); ++n) {
510
+
511
+ unsigned int id = projected[n].id;
512
+ PDFS *cur = &projected[n];
513
+ History history (TRANS[id], cur);
514
+
515
+ // XXX: do we have to change something here for directed edges?
516
+
517
+ // backward
518
+ for (int i = (int)rmpath.size()-1; i >= 1; --i) {
519
+ Edge *e = get_backward (TRANS[id], history[rmpath[i]], history[rmpath[0]], history);
520
+ if (e)
521
+ new_bck_root[DFS_CODE[rmpath[i]].from][e->elabel].push (id, e, cur);
522
+ }
523
+
524
+ // pure forward
525
+ // FIXME: here we pass a too large e->to (== history[rmpath[0]]->to
526
+ // into get_forward_pure, such that the assertion fails.
527
+ //
528
+ // The problem is:
529
+ // history[rmpath[0]]->to > TRANS[id].size()
530
+ if (get_forward_pure (TRANS[id], history[rmpath[0]], minlabel, history, edges))
531
+ for (EdgeList::iterator it = edges.begin(); it != edges.end(); ++it)
532
+ new_fwd_root[maxtoc][(*it)->elabel][TRANS[id][(*it)->to].label].push (id, *it, cur);
533
+
534
+ // backtracked forward
535
+ for (int i = 0; i < (int)rmpath.size(); ++i)
536
+ if (get_forward_rmpath (TRANS[id], history[rmpath[i]], minlabel, history, edges))
537
+ for (EdgeList::iterator it = edges.begin(); it != edges.end(); ++it)
538
+ new_fwd_root[DFS_CODE[rmpath[i]].from][(*it)->elabel][TRANS[id][(*it)->to].label].push (id, *it, cur);
539
+ }
540
+
541
+ /* Test all extended substructures.
542
+ */
543
+ // backward
544
+ for (Projected_iterator2 to = new_bck_root.begin(); to != new_bck_root.end(); ++to) {
545
+ for (Projected_iterator1 elabel = to->second.begin(); elabel != to->second.end(); ++elabel) {
546
+ DFS_CODE.push (maxtoc, to->first, -1, elabel->first, -1);
547
+ project (elabel->second);
548
+ DFS_CODE.pop();
549
+ }
550
+ }
551
+
552
+ // forward
553
+ for (Projected_riterator3 from = new_fwd_root.rbegin() ;
554
+ from != new_fwd_root.rend() ; ++from)
555
+ {
556
+ for (Projected_iterator2 elabel = from->second.begin() ;
557
+ elabel != from->second.end() ; ++elabel)
558
+ {
559
+ for (Projected_iterator1 tolabel = elabel->second.begin();
560
+ tolabel != elabel->second.end(); ++tolabel)
561
+ {
562
+ DFS_CODE.push (from->first, maxtoc+1, -1, elabel->first, tolabel->first);
563
+ project (tolabel->second);
564
+ DFS_CODE.pop ();
565
+ }
566
+ }
567
+ }
568
+
569
+ return;
570
+ }
571
+
572
+ void gSpan::boost_setup (unsigned int _boostN, double _boostTau,
573
+ unsigned int _boostmax,
574
+ std::vector<double>& _boostY,
575
+ std::vector<double>& _boostWeights,
576
+ int _boostType)
577
+ {
578
+ boost = true;
579
+ boostseen = 0;
580
+ boostmax = _boostmax;
581
+ boostN = _boostN;
582
+ boostTau = _boostTau;
583
+ boostY = _boostY;
584
+ boostWeights = _boostWeights;
585
+ boostType = _boostType;
586
+
587
+ /* Setup the corresponding function pointers for the gain function adn the
588
+ * gainbound function.
589
+ */
590
+ if (boostType == 1) {
591
+ gain = &gSpan::gain1d5;
592
+ gainbound = &gSpan::gainbound1d5;
593
+ } else if (boostType == 2) {
594
+ gain = &gSpan::gain2;
595
+ gainbound = &gSpan::gainbound2;
596
+ }
597
+
598
+ /* Precompute the overall label*weight sum-balance.
599
+ */
600
+ boostWeightSum = 0.0;
601
+ for (unsigned int n = 0 ; n < boostY.size() ; ++n)
602
+ boostWeightSum += boostWeights[n] * boostY[n];
603
+
604
+ bestGraphs.clear ();
605
+ }
606
+
607
+ // MATLAB/MEX entrypoint removed.
608
+
609
+ void gSpan::run (std::istream &is, std::ostream &_os,
610
+ unsigned int _minsup,
611
+ unsigned int _maxpat_min, unsigned int _maxpat_max,
612
+ bool _enc,
613
+ bool _where,
614
+ bool _directed)
615
+ {
616
+ os = &_os;
617
+ ID = 0;
618
+ minsup = _minsup;
619
+ maxpat_min = _maxpat_min;
620
+ maxpat_max = _maxpat_max;
621
+ enc = _enc;
622
+ where = _where;
623
+ directed = _directed;
624
+ boost = false;
625
+
626
+ read (is);
627
+ run_intern ();
628
+ }
629
+
630
+ void gSpan::run_intern (void)
631
+ {
632
+ /* In case 1 node subgraphs should also be mined for, do this as
633
+ * preprocessing step.
634
+ */
635
+ if (maxpat_min <= 1) {
636
+ /* Do single node handling, as the normal gspan DFS code based processing
637
+ * cannot find subgraphs of size |subg|==1. Hence, we find frequent node
638
+ * labels explicitly.
639
+ */
640
+ for (unsigned int id = 0; id < TRANS.size(); ++id) {
641
+ for (unsigned int nid = 0 ; nid < TRANS[id].size() ; ++nid) {
642
+ if (singleVertex[id][TRANS[id][nid].label] == 0) {
643
+ // number of graphs it appears in
644
+ singleVertexLabel[TRANS[id][nid].label] += 1;
645
+ }
646
+
647
+ singleVertex[id][TRANS[id][nid].label] += 1;
648
+ }
649
+ }
650
+ /* All minimum support node labels are frequent 'subgraphs'.
651
+ * singleVertexLabel[nodelabel] gives the number of graphs it appears
652
+ * in.
653
+ *
654
+ * 1/1.5-class case: All nodelabels that do not appear at all have a
655
+ * gain of zero, hence we do not need to consider them.
656
+ *
657
+ * 2-class case: Not appearing nodelabels are counted negatively.
658
+ */
659
+ for (std::map<unsigned int, unsigned int>::iterator it =
660
+ singleVertexLabel.begin () ; it != singleVertexLabel.end () ; ++it)
661
+ {
662
+ if ((*it).second < minsup)
663
+ continue;
664
+
665
+ unsigned int frequent_label = (*it).first;
666
+
667
+ /* Found a frequent node label, report it.
668
+ */
669
+ Graph g(directed);
670
+ g.resize (1);
671
+ g[0].label = frequent_label;
672
+
673
+ /* [graph_id] = count for current substructure
674
+ */
675
+ std::vector<unsigned int> counts (TRANS.size ());
676
+ for (std::map<unsigned int, std::map<unsigned int, unsigned int> >::iterator it2 =
677
+ singleVertex.begin () ; it2 != singleVertex.end () ; ++it2)
678
+ {
679
+ counts[(*it2).first] = (*it2).second[frequent_label];
680
+ }
681
+
682
+ if (boost) {
683
+ /* Calculate gain and yval. Here we do not use the normal
684
+ * gain function as there is no Projected/DFS_CODE there yet.
685
+ * Hence we need to make a distinction between the 1/1.5-class
686
+ * and the 2-class case here.
687
+ */
688
+ double gainm = 0.0;
689
+ double gainm_pos = 0.0;
690
+ double gainm_neg = 0.0;
691
+
692
+ for (unsigned int cid = 0 ; cid < counts.size () ; ++cid) {
693
+ if (boostType == 1) {
694
+ // Only consider the positive instances
695
+ if (counts[cid] == 0)
696
+ continue;
697
+
698
+ gainm += boostY[cid]*boostWeights[cid]; // *1.0 (Y)
699
+ } else if (boostType == 2) {
700
+ double addfactor = 1.0;
701
+
702
+ if (counts[cid] == 0)
703
+ addfactor = -1.0; // negation: pattern does not exist
704
+
705
+ gainm_pos += addfactor*boostY[cid]*boostWeights[cid];
706
+ gainm_neg += -addfactor*boostY[cid]*boostWeights[cid];
707
+ }
708
+ }
709
+ double yval = 1.0;
710
+
711
+ if (boostType == 2) {
712
+ if (gainm_pos >= gainm_neg) {
713
+ gainm = gainm_pos;
714
+ } else {
715
+ gainm = gainm_neg;
716
+ yval = -1.0;
717
+ }
718
+ }
719
+
720
+ //#ifdef DEBUG
721
+ fprintf(stderr, " single node graph, node label %d, gain %lf, yval %lf\n",
722
+ frequent_label, gainm, yval);
723
+ //#endif
724
+ if (gainm > boostTau) {
725
+ /* Copy it into vector form
726
+ */
727
+ std::map<unsigned int, unsigned int> gycounts;
728
+ for (unsigned int n = 0 ; n < counts.size () ; ++n)
729
+ gycounts[n] = counts[n];
730
+
731
+ report_boosting_inter (g, (*it).second, gainm, yval, gycounts);
732
+ }
733
+ } else {
734
+ std::map<unsigned int, unsigned int> gycounts;
735
+ for (unsigned int n = 0 ; n < counts.size () ; ++n)
736
+ gycounts[n] = counts[n];
737
+
738
+ report_single (g, gycounts);
739
+ }
740
+ }
741
+ }
742
+
743
+ EdgeList edges;
744
+ Projected_map3 root;
745
+
746
+ for (unsigned int id = 0; id < TRANS.size(); ++id) {
747
+ Graph &g = TRANS[id];
748
+ for (unsigned int from = 0; from < g.size() ; ++from) {
749
+ if (get_forward_root (g, g[from], edges)) {
750
+ for (EdgeList::iterator it = edges.begin(); it != edges.end(); ++it)
751
+ root[g[from].label][(*it)->elabel][g[(*it)->to].label].push (id, *it, 0);
752
+ }
753
+ }
754
+ }
755
+
756
+ for (Projected_iterator3 fromlabel = root.begin() ;
757
+ fromlabel != root.end() ; ++fromlabel)
758
+ {
759
+ for (Projected_iterator2 elabel = fromlabel->second.begin() ;
760
+ elabel != fromlabel->second.end() ; ++elabel)
761
+ {
762
+ for (Projected_iterator1 tolabel = elabel->second.begin();
763
+ tolabel != elabel->second.end(); ++tolabel)
764
+ {
765
+ /* Build the initial two-node graph. It will be grown
766
+ * recursively within project.
767
+ */
768
+ DFS_CODE.push (0, 1, fromlabel->first, elabel->first, tolabel->first);
769
+ project (tolabel->second);
770
+ DFS_CODE.pop ();
771
+ }
772
+ }
773
+ }
774
+ }
775
+
776
+ }