submine 0.1.1__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- submine/__init__.py +37 -0
- submine/algorithms/__init__.py +23 -0
- submine/algorithms/base.py +143 -0
- submine/algorithms/gspan.py +156 -0
- submine/algorithms/gspan_cpp.cpython-312-darwin.so +0 -0
- submine/algorithms/sopagrami.py +250 -0
- submine/algorithms/sopagrami_cpp.cpython-312-darwin.so +0 -0
- submine/api.py +134 -0
- submine/backends/__init__.py +0 -0
- submine/backends/gspan/CMakeLists.txt +65 -0
- submine/backends/gspan/dfs.cpp +98 -0
- submine/backends/gspan/graph.cpp +165 -0
- submine/backends/gspan/gspan.cpp +776 -0
- submine/backends/gspan/gspan.h +296 -0
- submine/backends/gspan/ismin.cpp +124 -0
- submine/backends/gspan/main.cpp +106 -0
- submine/backends/gspan/misc.cpp +177 -0
- submine/backends/gspan/python_bindings.cpp +133 -0
- submine/backends/sopagrami/cpp/CMakeLists.txt +44 -0
- submine/backends/sopagrami/cpp/include/alg.hpp +150 -0
- submine/backends/sopagrami/cpp/include/common/timer.hpp +18 -0
- submine/backends/sopagrami/cpp/src/alg.cpp +805 -0
- submine/backends/sopagrami/cpp/src/dump.cpp +262 -0
- submine/backends/sopagrami/cpp/src/main.cpp +94 -0
- submine/backends/sopagrami/cpp/src/python_bindings.cpp +123 -0
- submine/cli/__init__.py +6 -0
- submine/cli/main.py +87 -0
- submine/core/__init__.py +12 -0
- submine/core/graph.py +179 -0
- submine/core/result.py +121 -0
- submine/datasets/__init__.py +11 -0
- submine/datasets/loaders.py +145 -0
- submine/errors.py +41 -0
- submine/io/__init__.py +30 -0
- submine/io/common.py +173 -0
- submine/io/gexf.py +88 -0
- submine/io/gspan.py +268 -0
- submine/io/sopagrami.py +143 -0
- submine/io/transcode.py +147 -0
- submine/registry.py +8 -0
- submine/utils/__init__.py +6 -0
- submine/utils/checks.py +115 -0
- submine/utils/logging.py +41 -0
- submine-0.1.1.dist-info/METADATA +178 -0
- submine-0.1.1.dist-info/RECORD +47 -0
- submine-0.1.1.dist-info/WHEEL +6 -0
- submine-0.1.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,776 @@
|
|
|
1
|
+
/*
|
|
2
|
+
$Id: gspan.cpp,v 1.8 2004/05/21 09:27:17 taku-ku Exp $;
|
|
3
|
+
|
|
4
|
+
Copyright (C) 2004 Taku Kudo, All rights reserved.
|
|
5
|
+
This is free software with ABSOLUTELY NO WARRANTY.
|
|
6
|
+
|
|
7
|
+
This program is free software; you can redistribute it and/or modify
|
|
8
|
+
it under the terms of the GNU General Public License as published by
|
|
9
|
+
the Free Software Foundation; either version 2 of the License, or
|
|
10
|
+
(at your option) any later version.
|
|
11
|
+
|
|
12
|
+
This program is distributed in the hope that it will be useful,
|
|
13
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
GNU General Public License for more details.
|
|
16
|
+
|
|
17
|
+
You should have received a copy of the GNU General Public License
|
|
18
|
+
along with this program; if not, write to the Free Software
|
|
19
|
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
|
20
|
+
02111-1307, USA
|
|
21
|
+
*/
|
|
22
|
+
#include "gspan.h"
|
|
23
|
+
#include <iterator>
|
|
24
|
+
|
|
25
|
+
#include <cstdio>
|
|
26
|
+
#include <stdexcept>
|
|
27
|
+
|
|
28
|
+
#include <stdlib.h>
|
|
29
|
+
#include <thread>
|
|
30
|
+
#include <sstream>
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
namespace GSPAN {
|
|
35
|
+
|
|
36
|
+
/* Default constructor: a new miner starts in plain frequent-subgraph mode.
 * Boosting mode is switched on later by boost_setup().
 */
gSpan::gSpan ()
{
    boost = false;
}
|
|
40
|
+
|
|
41
|
+
// MATLAB/MEX support removed.
|
|
42
|
+
|
|
43
|
+
std::istream &gSpan::read (std::istream &is)
|
|
44
|
+
{
|
|
45
|
+
Graph g(directed);
|
|
46
|
+
while (true) {
|
|
47
|
+
g.read (is);
|
|
48
|
+
if (g.empty()) break;
|
|
49
|
+
TRANS.push_back (g);
|
|
50
|
+
}
|
|
51
|
+
return is;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
/* 2-class LPBoosting gain function
|
|
56
|
+
*/
|
|
57
|
+
double
|
|
58
|
+
gSpan::gain2 (Projected &projected, double y)
|
|
59
|
+
{
|
|
60
|
+
/* Compute actual gain (eq. (10) in [Dimiriz2002]) and Problem 1 in graph
|
|
61
|
+
* boost paper.
|
|
62
|
+
*
|
|
63
|
+
* For L samples x_i, calculate
|
|
64
|
+
* gain = \sum_{i=1}^{L} d_i y_i h(x_i)
|
|
65
|
+
* where h is implicitly defined by this current subgraph pattern.
|
|
66
|
+
*/
|
|
67
|
+
unsigned int oid = 0xffffffff;
|
|
68
|
+
double gainm = 0.0;
|
|
69
|
+
|
|
70
|
+
Projected::iterator cur = projected.begin();
|
|
71
|
+
for ( ; cur != projected.end() ; ++cur) {
|
|
72
|
+
if (oid != cur->id) {
|
|
73
|
+
/* A positive pattern: it appears in the graph.
|
|
74
|
+
*
|
|
75
|
+
* h_{<t,y>} = y, if t \subseteq x,
|
|
76
|
+
* = -y, otherwise.
|
|
77
|
+
*
|
|
78
|
+
* t is implicitly defined by projected.
|
|
79
|
+
* First, process all skipped graphs.
|
|
80
|
+
*/
|
|
81
|
+
for (unsigned int skipn = oid+1 ; skipn < cur->id ; ++skipn)
|
|
82
|
+
gainm += boostY[skipn]*boostWeights[skipn]*(-y);
|
|
83
|
+
|
|
84
|
+
/* Now process positive pattern.
|
|
85
|
+
* This is also the gain function for 1.5 LP boosting, as we have
|
|
86
|
+
* the constraint
|
|
87
|
+
* \sum_{n=1}^N \lambda_n h_j(x_{1,n})
|
|
88
|
+
* - \sum_{m=1}^M \mu_m h_j(x_{2,m}) <= \gamma.
|
|
89
|
+
* In order to maximize the violation of the current ensemble, we
|
|
90
|
+
* search for the classifier that maximizes the left side.
|
|
91
|
+
*/
|
|
92
|
+
gainm += boostY[cur->id]*boostWeights[cur->id]*y;
|
|
93
|
+
}
|
|
94
|
+
oid = cur->id;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/* All skipped graphs to the end.
|
|
98
|
+
*/
|
|
99
|
+
for (unsigned int skipn = oid+1 ; skipn < boostY.size() ; ++skipn)
|
|
100
|
+
gainm += boostY[skipn]*boostWeights[skipn]*(-y);
|
|
101
|
+
|
|
102
|
+
return (gainm);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
/* 1.5-class LPBoosting gain function
|
|
107
|
+
*/
|
|
108
|
+
double
|
|
109
|
+
gSpan::gain1d5 (Projected &projected, double y)
|
|
110
|
+
{
|
|
111
|
+
/* Compute actual gain (eq. (10) in [Dimiriz2002]) and Problem 1 in graph
|
|
112
|
+
* boost paper.
|
|
113
|
+
*
|
|
114
|
+
* For L samples x_i, calculate
|
|
115
|
+
* gain = \sum_{i=1}^{L} d_i y_i h(x_i)
|
|
116
|
+
* where h is implicitly defined by this current subgraph pattern. In the
|
|
117
|
+
* 1.5-class case, h(x_i) is zero if the pattern does not appear, hence we
|
|
118
|
+
* can skip those instances silently.
|
|
119
|
+
*/
|
|
120
|
+
unsigned int oid = 0xffffffff;
|
|
121
|
+
double gain = 0.0;
|
|
122
|
+
|
|
123
|
+
Projected::iterator cur = projected.begin();
|
|
124
|
+
for ( ; cur != projected.end() ; ++cur) {
|
|
125
|
+
if (oid != cur->id)
|
|
126
|
+
gain += boostY[cur->id]*boostWeights[cur->id]*y;
|
|
127
|
+
|
|
128
|
+
oid = cur->id;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
return (gain);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/* Gain bound for 1.5-class formulation
|
|
135
|
+
*/
|
|
136
|
+
double
|
|
137
|
+
gSpan::gainbound1d5 (Projected &projected)
|
|
138
|
+
{
|
|
139
|
+
double gain = 0.0;
|
|
140
|
+
unsigned int oid = 0xffffffff;
|
|
141
|
+
|
|
142
|
+
for (Projected::iterator cur = projected.begin() ;
|
|
143
|
+
cur != projected.end() ; ++cur)
|
|
144
|
+
{
|
|
145
|
+
if (oid != cur->id) {
|
|
146
|
+
oid = cur->id;
|
|
147
|
+
if (boostY[cur->id] <= 0.0)
|
|
148
|
+
continue;
|
|
149
|
+
|
|
150
|
+
gain += boostWeights[cur->id];
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
return (gain);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/* Gain bound for 2-class formulation
|
|
158
|
+
*/
|
|
159
|
+
double
|
|
160
|
+
gSpan::gainbound2 (Projected &projected)
|
|
161
|
+
{
|
|
162
|
+
/* Lemma 1, section 3.2 in the graph boosting paper. We automatically
|
|
163
|
+
* only enumerate those patterns which contain the subgraph 'projected'.
|
|
164
|
+
*
|
|
165
|
+
* <t^,y^> = argmax_{t \in F,y \in {+/- 1}} d_i y_i h_{<t,y>}, where
|
|
166
|
+
* F = \unionset_{i=1}^{L} { t | t \subseteq x_i }.
|
|
167
|
+
*/
|
|
168
|
+
unsigned int oid = 0xffffffff;
|
|
169
|
+
unsigned int size = 0;
|
|
170
|
+
double gain_pos = 0.0;
|
|
171
|
+
double gain_neg = 0.0;
|
|
172
|
+
|
|
173
|
+
for (Projected::iterator cur = projected.begin() ;
|
|
174
|
+
cur != projected.end() ; ++cur)
|
|
175
|
+
{
|
|
176
|
+
if (oid != cur->id) {
|
|
177
|
+
if (boostY[cur->id] <= 0.0)
|
|
178
|
+
gain_neg += boostWeights[cur->id];
|
|
179
|
+
else
|
|
180
|
+
gain_pos += boostWeights[cur->id];
|
|
181
|
+
|
|
182
|
+
++size;
|
|
183
|
+
}
|
|
184
|
+
oid = cur->id;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
/* The boostWeightSum has been precomputed as it is always the same for
|
|
188
|
+
* one run.
|
|
189
|
+
*/
|
|
190
|
+
gain_neg = 2.0*gain_neg + boostWeightSum;
|
|
191
|
+
gain_pos = 2.0*gain_pos - boostWeightSum;
|
|
192
|
+
|
|
193
|
+
if (gain_neg >= gain_pos)
|
|
194
|
+
return (gain_neg);
|
|
195
|
+
|
|
196
|
+
return (gain_pos);
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
std::map<unsigned int, unsigned int>
|
|
201
|
+
gSpan::support_counts (Projected &projected)
|
|
202
|
+
{
|
|
203
|
+
std::map<unsigned int, unsigned int> counts;
|
|
204
|
+
|
|
205
|
+
for (Projected::iterator cur = projected.begin() ;
|
|
206
|
+
cur != projected.end() ; ++cur)
|
|
207
|
+
{
|
|
208
|
+
counts[cur->id] += 1;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
return (counts);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
unsigned int
|
|
216
|
+
gSpan::support (Projected &projected)
|
|
217
|
+
{
|
|
218
|
+
unsigned int oid = 0xffffffff;
|
|
219
|
+
unsigned int size = 0;
|
|
220
|
+
|
|
221
|
+
for (Projected::iterator cur = projected.begin(); cur != projected.end(); ++cur) {
|
|
222
|
+
if (oid != cur->id) {
|
|
223
|
+
++size;
|
|
224
|
+
}
|
|
225
|
+
oid = cur->id;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
return size;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
/* Hand the current pattern (DFS_CODE) over to the list of best boosting
 * patterns.
 *
 * Patterns whose node count lies outside the [maxpat_min, maxpat_max] window
 * are ignored.  Otherwise the DFS code is converted into a Graph, the
 * embeddings are tallied per graph id, and everything is passed on to
 * report_boosting_inter().
 */
void gSpan::report_boosting (Projected &projected, unsigned int sup,
    double gain, double yval)
{
    if ((maxpat_max > maxpat_min && DFS_CODE.nodeCount () > maxpat_max) ||
        (maxpat_min > 0 && DFS_CODE.nodeCount () < maxpat_min))
    {
        return;
    }

    Graph pattern(directed);
    DFS_CODE.toGraph (pattern);

    // Per-graph embedding counts.  The tally for an id restarts whenever a
    // new run of that id begins.
    std::map<unsigned int, unsigned int> hits;
    unsigned int last_gid = 0xffffffff;

    for (const auto &emb : projected) {
        if (emb.id != last_gid)
            hits[emb.id] = 0;

        hits[emb.id] += 1;
        last_gid = emb.id;
    }

    report_boosting_inter (pattern, sup, gain, yval, hits);
}
|
|
255
|
+
|
|
256
|
+
/* Insert a scored pattern into the list of best boosting patterns.
 *
 * bestGraphs, bestGraphsY, bestGraphsGain and bestGraphsCounts are parallel
 * vectors kept in descending gain order.  The new entry is placed in front of
 * the first stored entry whose gain it reaches or exceeds; if the list then
 * holds more than boostN entries the last one is dropped.  ID is incremented
 * on every call.
 *
 * g        pattern graph
 * sup      support of the pattern (only used for the DEBUG trace)
 * gain     gain achieved by the pattern
 * yval     y value of the weak learner that achieves 'gain'
 * GYcounts per-graph occurrence counts of the pattern
 */
void gSpan::report_boosting_inter (Graph &g, unsigned int sup,
    double gain, double yval, std::map<unsigned int, unsigned int>& GYcounts)
{
    // Pure C++ build: no MATLAB-specific graph printer.

    /* Insertion sort based on the gain values.
     */
    unsigned int insertidx = 0;
    while (insertidx < bestGraphsGain.size () &&
           !(gain >= bestGraphsGain[insertidx]))
    {
        ++insertidx;
    }

    bestGraphs.insert (bestGraphs.begin() + insertidx, g);
    bestGraphsY.insert (bestGraphsY.begin() + insertidx, yval);
    bestGraphsGain.insert (bestGraphsGain.begin() + insertidx, gain);
    bestGraphsCounts.insert (bestGraphsCounts.begin() + insertidx, GYcounts);

#ifdef DEBUG
    // %u / %zu: 'sup' is unsigned and g.size() is a size_t.
    fprintf(stderr, " inserted good output graph, gain %lf, sup %u, Y %lf, new size %zu\n",
        gain, sup, yval, g.size());
#endif

    /* If the maximum allowed size is exceeded, prune it down.
     */
    if (bestGraphsGain.size () > boostN) {
#ifdef DEBUG
        fprintf(stderr, " one element removed at end with gain %lf\n",
            bestGraphsGain[bestGraphsGain.size()-1]);
#endif

        bestGraphs.pop_back ();
        bestGraphsY.pop_back ();
        bestGraphsGain.pop_back ();
        bestGraphsCounts.pop_back ();
    }

    ++ID;
}
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
/* Special report function for single node graphs.
|
|
298
|
+
*/
|
|
299
|
+
void gSpan::report_single (Graph &g, std::map<unsigned int, unsigned int>& ncount)
|
|
300
|
+
{
|
|
301
|
+
unsigned int sup = 0;
|
|
302
|
+
for (std::map<unsigned int, unsigned int>::iterator it = ncount.begin () ;
|
|
303
|
+
it != ncount.end () ; ++it)
|
|
304
|
+
{
|
|
305
|
+
sup += (*it).second;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
if (maxpat_max > maxpat_min && g.size () > maxpat_max)
|
|
309
|
+
return;
|
|
310
|
+
if (maxpat_min > 0 && g.size () < maxpat_min)
|
|
311
|
+
return;
|
|
312
|
+
|
|
313
|
+
if (enc == false) {
|
|
314
|
+
if (where == false)
|
|
315
|
+
*os << "t # " << ID << " * " << sup;
|
|
316
|
+
*os << '\n';
|
|
317
|
+
|
|
318
|
+
g.write (*os);
|
|
319
|
+
if (callback_) {
|
|
320
|
+
std::cerr << "pattern n=" << g.size()
|
|
321
|
+
<< " e0deg=" << (g.size() ? g[0].edge.size() : 0)
|
|
322
|
+
<< " sup=" << sup << "\n";
|
|
323
|
+
callback_(g, sup, nullptr);
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
*os << '\n';
|
|
327
|
+
} else {
|
|
328
|
+
// Encoded output mode historically relied on MATLAB glue in this fork.
|
|
329
|
+
// If you need encoded output, implement it here in pure C++.
|
|
330
|
+
throw std::runtime_error("report_single: encoded output is not supported in the pure C++ build");
|
|
331
|
+
}
|
|
332
|
+
++ID;
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
/* Report the frequent pattern currently held in DFS_CODE.
 *
 * projected  embeddings of the pattern
 * sup        support of the pattern
 *
 * Patterns outside the [maxpat_min, maxpat_max] node-count window are skipped.
 * Otherwise the pattern is converted into a Graph, the callback (if any) is
 * invoked with it, and the pattern is written to the output stream:
 *   - enc == false: "t # <ID> * <sup>" header followed by the graph,
 *   - enc == true : "<ID> sup [dfs code]" on one line,
 *   - where == true: the above is wrapped in <pattern>/<what>/<where> tags
 *     and followed by the ids of the supporting graphs.
 * ID is incremented afterwards.
 */
void gSpan::report(Projected &projected, unsigned int sup)
{
    if ((maxpat_max > maxpat_min && DFS_CODE.nodeCount() > maxpat_max) ||
        (maxpat_min > 0 && DFS_CODE.nodeCount() < maxpat_min))
    {
        return;
    }

    std::ostream &out = *os;

    if (where) {
        out << "<pattern>\n";
        out << "<id>" << ID << "</id>\n";
        out << "<support>" << sup << "</support>\n";
        out << "<what>";
    }

    // The Graph form is always built because the callback needs it, whatever
    // the stream format is.
    Graph pattern(directed);
    DFS_CODE.toGraph(pattern);

    if (callback_)
        callback_(pattern, sup, &projected);

    // What goes to the stream still depends on enc/where.
    if (enc) {
        if (!where) out << '<' << ID << "> " << sup << " [";
        DFS_CODE.write(out);
        if (!where) out << ']';
    } else {
        if (!where) out << "t # " << ID << " * " << sup;
        out << '\n';
        pattern.write(out);
    }

    if (where) {
        out << "</what>\n<where>";

        unsigned int last_gid = 0xffffffff;
        bool at_first = true;
        for (const auto &emb : projected) {
            if (emb.id != last_gid) {
                // Separate ids by blanks; nothing precedes the very first entry.
                if (!at_first) out << ' ';
                out << emb.id;
            }
            last_gid = emb.id;
            at_first = false;
        }

        out << "</where>\n</pattern>";
    }

    out << '\n';
    ++ID;
}
|
|
383
|
+
|
|
384
|
+
/* Recursive subgraph mining function (similar to subprocedure 1
|
|
385
|
+
* Subgraph_Mining in [Yan2002]).
|
|
386
|
+
*/
|
|
387
|
+
void gSpan::project (Projected &projected)
|
|
388
|
+
{
|
|
389
|
+
if (boost && (boostmax > 0 && boostseen > boostmax))
|
|
390
|
+
return;
|
|
391
|
+
|
|
392
|
+
/* Check if the pattern is frequent enough.
|
|
393
|
+
*/
|
|
394
|
+
unsigned int sup = support (projected);
|
|
395
|
+
if (sup < minsup)
|
|
396
|
+
return;
|
|
397
|
+
|
|
398
|
+
/* The minimal DFS code check is more expensive than the support check,
|
|
399
|
+
* hence it is done now, after checking the support.
|
|
400
|
+
*/
|
|
401
|
+
if (is_min () == false) {
|
|
402
|
+
// *os << "NOT MIN ["; DFS_CODE.write (*os); *os << "]" << std::endl;
|
|
403
|
+
return;
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
if (boost) {
|
|
407
|
+
/* Compute the gains for all classifiers we can build out of the current
|
|
408
|
+
* subgraph pattern (which is just two).
|
|
409
|
+
*/
|
|
410
|
+
double gain_ypos = 0.0; // Gain if y=1 in the weak learner
|
|
411
|
+
double gain_yneg = 0.0; // Gain if y=-1 in the weak learner
|
|
412
|
+
double gainmax = 0.0; // Maximum of the negative/positive gain
|
|
413
|
+
double yval = 1.0; // y that achieves the maximum gain
|
|
414
|
+
|
|
415
|
+
/* Compute gains. This differs a little for the 1.5-class and the
|
|
416
|
+
* 2-class case. (The gain function is replaced, as well).
|
|
417
|
+
*/
|
|
418
|
+
gain_ypos = (this->*gain) (projected, 1.0);
|
|
419
|
+
if (boostType == 1) {
|
|
420
|
+
gainmax = gain_ypos;
|
|
421
|
+
} else if (boostType == 2) {
|
|
422
|
+
gain_yneg = (this->*gain) (projected, -1.0);
|
|
423
|
+
|
|
424
|
+
if (gain_ypos >= gain_yneg) {
|
|
425
|
+
gainmax = gain_ypos;
|
|
426
|
+
} else {
|
|
427
|
+
gainmax = gain_yneg;
|
|
428
|
+
yval = -1.0;
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
/* Check if we have to replace the least best of our top performers.
|
|
433
|
+
*/
|
|
434
|
+
if (gainmax > boostTau) {
|
|
435
|
+
/* Update best gain so far. The best gain is also the tau bound.
|
|
436
|
+
*/
|
|
437
|
+
#ifdef DEBUG
|
|
438
|
+
fprintf(stderr, "boost gain observed with %lf, support %d\n", boostTau, sup);
|
|
439
|
+
#endif
|
|
440
|
+
report_boosting (projected, sup, gainmax, yval);
|
|
441
|
+
|
|
442
|
+
/* Update lower boosting gain bound in case we already exceeded
|
|
443
|
+
* the number of allowed patterns.
|
|
444
|
+
*/
|
|
445
|
+
if (bestGraphsGain.size () >= boostN) {
|
|
446
|
+
boostTau = bestGraphsGain[bestGraphsGain.size() - 1];
|
|
447
|
+
fprintf(stderr, " top gains: %lf ... %lf\n",
|
|
448
|
+
bestGraphsGain[0], boostTau);
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
if (boostseen % 1000 == 0) {
|
|
452
|
+
Graph g(directed);
|
|
453
|
+
DFS_CODE.toGraph (g);
|
|
454
|
+
|
|
455
|
+
fprintf(stderr, "%7u: gain %lf this.bound %lf "
|
|
456
|
+
"global.bound %lf DFS.size %zu (gsize %zu)\n", boostseen,
|
|
457
|
+
gainmax, (this->*gainbound) (projected), boostTau,
|
|
458
|
+
DFS_CODE.size(), g.size());
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
boostseen += 1;
|
|
462
|
+
} else {
|
|
463
|
+
// Output the frequent substructure
|
|
464
|
+
report (projected, sup);
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
/* In case we have a valid upper bound and our graph already exceeds it,
|
|
468
|
+
* return. Note: we do not check for equality as the DFS exploration may
|
|
469
|
+
* still add edges within an existing subgraph, without increasing the
|
|
470
|
+
* number of nodes.
|
|
471
|
+
*/
|
|
472
|
+
if (maxpat_max > maxpat_min && DFS_CODE.nodeCount () > maxpat_max)
|
|
473
|
+
return;
|
|
474
|
+
|
|
475
|
+
/* Compute gain bound for this pattern (projected) and only explore in
|
|
476
|
+
* case the bound allows this subgraph to be better. That is, if the
|
|
477
|
+
* bound lets no supergraph be better than all reported ones so far, we
|
|
478
|
+
* just return.
|
|
479
|
+
*/
|
|
480
|
+
if (boost) {
|
|
481
|
+
double gainb = (this->*gainbound) (projected);
|
|
482
|
+
#ifdef DEBUG
|
|
483
|
+
fprintf(stderr, " gain bound: %lf, gain min required: %lf\n", gainb, boostTau);
|
|
484
|
+
fprintf(stderr, " actual gain for this subgraph: %lf\n",
|
|
485
|
+
(this->*gain) (projected, 1.0));
|
|
486
|
+
#endif
|
|
487
|
+
|
|
488
|
+
if (gainb <= boostTau + 1e-8) {
|
|
489
|
+
#ifdef DEBUG
|
|
490
|
+
fprintf(stderr, " ==> return\n");
|
|
491
|
+
#endif
|
|
492
|
+
return;
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
/* We just outputted a frequent subgraph. As it is frequent enough, so
|
|
497
|
+
* might be its (n+1)-extension-graphs, hence we enumerate them all.
|
|
498
|
+
*/
|
|
499
|
+
const RMPath &rmpath = DFS_CODE.buildRMPath ();
|
|
500
|
+
int minlabel = DFS_CODE[0].fromlabel;
|
|
501
|
+
int maxtoc = DFS_CODE[rmpath[0]].to;
|
|
502
|
+
|
|
503
|
+
Projected_map3 new_fwd_root;
|
|
504
|
+
Projected_map2 new_bck_root;
|
|
505
|
+
EdgeList edges;
|
|
506
|
+
|
|
507
|
+
/* Enumerate all possible one edge extensions of the current substructure.
|
|
508
|
+
*/
|
|
509
|
+
for (unsigned int n = 0; n < projected.size(); ++n) {
|
|
510
|
+
|
|
511
|
+
unsigned int id = projected[n].id;
|
|
512
|
+
PDFS *cur = &projected[n];
|
|
513
|
+
History history (TRANS[id], cur);
|
|
514
|
+
|
|
515
|
+
// XXX: do we have to change something here for directed edges?
|
|
516
|
+
|
|
517
|
+
// backward
|
|
518
|
+
for (int i = (int)rmpath.size()-1; i >= 1; --i) {
|
|
519
|
+
Edge *e = get_backward (TRANS[id], history[rmpath[i]], history[rmpath[0]], history);
|
|
520
|
+
if (e)
|
|
521
|
+
new_bck_root[DFS_CODE[rmpath[i]].from][e->elabel].push (id, e, cur);
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
// pure forward
|
|
525
|
+
// FIXME: here we pass a too large e->to (== history[rmpath[0]]->to
|
|
526
|
+
// into get_forward_pure, such that the assertion fails.
|
|
527
|
+
//
|
|
528
|
+
// The problem is:
|
|
529
|
+
// history[rmpath[0]]->to > TRANS[id].size()
|
|
530
|
+
if (get_forward_pure (TRANS[id], history[rmpath[0]], minlabel, history, edges))
|
|
531
|
+
for (EdgeList::iterator it = edges.begin(); it != edges.end(); ++it)
|
|
532
|
+
new_fwd_root[maxtoc][(*it)->elabel][TRANS[id][(*it)->to].label].push (id, *it, cur);
|
|
533
|
+
|
|
534
|
+
// backtracked forward
|
|
535
|
+
for (int i = 0; i < (int)rmpath.size(); ++i)
|
|
536
|
+
if (get_forward_rmpath (TRANS[id], history[rmpath[i]], minlabel, history, edges))
|
|
537
|
+
for (EdgeList::iterator it = edges.begin(); it != edges.end(); ++it)
|
|
538
|
+
new_fwd_root[DFS_CODE[rmpath[i]].from][(*it)->elabel][TRANS[id][(*it)->to].label].push (id, *it, cur);
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
/* Test all extended substructures.
|
|
542
|
+
*/
|
|
543
|
+
// backward
|
|
544
|
+
for (Projected_iterator2 to = new_bck_root.begin(); to != new_bck_root.end(); ++to) {
|
|
545
|
+
for (Projected_iterator1 elabel = to->second.begin(); elabel != to->second.end(); ++elabel) {
|
|
546
|
+
DFS_CODE.push (maxtoc, to->first, -1, elabel->first, -1);
|
|
547
|
+
project (elabel->second);
|
|
548
|
+
DFS_CODE.pop();
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
// forward
|
|
553
|
+
for (Projected_riterator3 from = new_fwd_root.rbegin() ;
|
|
554
|
+
from != new_fwd_root.rend() ; ++from)
|
|
555
|
+
{
|
|
556
|
+
for (Projected_iterator2 elabel = from->second.begin() ;
|
|
557
|
+
elabel != from->second.end() ; ++elabel)
|
|
558
|
+
{
|
|
559
|
+
for (Projected_iterator1 tolabel = elabel->second.begin();
|
|
560
|
+
tolabel != elabel->second.end(); ++tolabel)
|
|
561
|
+
{
|
|
562
|
+
DFS_CODE.push (from->first, maxtoc+1, -1, elabel->first, tolabel->first);
|
|
563
|
+
project (tolabel->second);
|
|
564
|
+
DFS_CODE.pop ();
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
return;
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
void gSpan::boost_setup (unsigned int _boostN, double _boostTau,
|
|
573
|
+
unsigned int _boostmax,
|
|
574
|
+
std::vector<double>& _boostY,
|
|
575
|
+
std::vector<double>& _boostWeights,
|
|
576
|
+
int _boostType)
|
|
577
|
+
{
|
|
578
|
+
boost = true;
|
|
579
|
+
boostseen = 0;
|
|
580
|
+
boostmax = _boostmax;
|
|
581
|
+
boostN = _boostN;
|
|
582
|
+
boostTau = _boostTau;
|
|
583
|
+
boostY = _boostY;
|
|
584
|
+
boostWeights = _boostWeights;
|
|
585
|
+
boostType = _boostType;
|
|
586
|
+
|
|
587
|
+
/* Setup the corresponding function pointers for the gain function adn the
|
|
588
|
+
* gainbound function.
|
|
589
|
+
*/
|
|
590
|
+
if (boostType == 1) {
|
|
591
|
+
gain = &gSpan::gain1d5;
|
|
592
|
+
gainbound = &gSpan::gainbound1d5;
|
|
593
|
+
} else if (boostType == 2) {
|
|
594
|
+
gain = &gSpan::gain2;
|
|
595
|
+
gainbound = &gSpan::gainbound2;
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
/* Precompute the overall label*weight sum-balance.
|
|
599
|
+
*/
|
|
600
|
+
boostWeightSum = 0.0;
|
|
601
|
+
for (unsigned int n = 0 ; n < boostY.size() ; ++n)
|
|
602
|
+
boostWeightSum += boostWeights[n] * boostY[n];
|
|
603
|
+
|
|
604
|
+
bestGraphs.clear ();
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
// MATLAB/MEX entrypoint removed.
|
|
608
|
+
|
|
609
|
+
void gSpan::run (std::istream &is, std::ostream &_os,
|
|
610
|
+
unsigned int _minsup,
|
|
611
|
+
unsigned int _maxpat_min, unsigned int _maxpat_max,
|
|
612
|
+
bool _enc,
|
|
613
|
+
bool _where,
|
|
614
|
+
bool _directed)
|
|
615
|
+
{
|
|
616
|
+
os = &_os;
|
|
617
|
+
ID = 0;
|
|
618
|
+
minsup = _minsup;
|
|
619
|
+
maxpat_min = _maxpat_min;
|
|
620
|
+
maxpat_max = _maxpat_max;
|
|
621
|
+
enc = _enc;
|
|
622
|
+
where = _where;
|
|
623
|
+
directed = _directed;
|
|
624
|
+
boost = false;
|
|
625
|
+
|
|
626
|
+
read (is);
|
|
627
|
+
run_intern ();
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
void gSpan::run_intern (void)
|
|
631
|
+
{
|
|
632
|
+
/* In case 1 node subgraphs should also be mined for, do this as
|
|
633
|
+
* preprocessing step.
|
|
634
|
+
*/
|
|
635
|
+
if (maxpat_min <= 1) {
|
|
636
|
+
/* Do single node handling, as the normal gspan DFS code based processing
|
|
637
|
+
* cannot find subgraphs of size |subg|==1. Hence, we find frequent node
|
|
638
|
+
* labels explicitly.
|
|
639
|
+
*/
|
|
640
|
+
for (unsigned int id = 0; id < TRANS.size(); ++id) {
|
|
641
|
+
for (unsigned int nid = 0 ; nid < TRANS[id].size() ; ++nid) {
|
|
642
|
+
if (singleVertex[id][TRANS[id][nid].label] == 0) {
|
|
643
|
+
// number of graphs it appears in
|
|
644
|
+
singleVertexLabel[TRANS[id][nid].label] += 1;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
singleVertex[id][TRANS[id][nid].label] += 1;
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
/* All minimum support node labels are frequent 'subgraphs'.
|
|
651
|
+
* singleVertexLabel[nodelabel] gives the number of graphs it appears
|
|
652
|
+
* in.
|
|
653
|
+
*
|
|
654
|
+
* 1/1.5-class case: All nodelabels that do not appear at all have a
|
|
655
|
+
* gain of zero, hence we do not need to consider them.
|
|
656
|
+
*
|
|
657
|
+
* 2-class case: Not appearing nodelabels are counted negatively.
|
|
658
|
+
*/
|
|
659
|
+
for (std::map<unsigned int, unsigned int>::iterator it =
|
|
660
|
+
singleVertexLabel.begin () ; it != singleVertexLabel.end () ; ++it)
|
|
661
|
+
{
|
|
662
|
+
if ((*it).second < minsup)
|
|
663
|
+
continue;
|
|
664
|
+
|
|
665
|
+
unsigned int frequent_label = (*it).first;
|
|
666
|
+
|
|
667
|
+
/* Found a frequent node label, report it.
|
|
668
|
+
*/
|
|
669
|
+
Graph g(directed);
|
|
670
|
+
g.resize (1);
|
|
671
|
+
g[0].label = frequent_label;
|
|
672
|
+
|
|
673
|
+
/* [graph_id] = count for current substructure
|
|
674
|
+
*/
|
|
675
|
+
std::vector<unsigned int> counts (TRANS.size ());
|
|
676
|
+
for (std::map<unsigned int, std::map<unsigned int, unsigned int> >::iterator it2 =
|
|
677
|
+
singleVertex.begin () ; it2 != singleVertex.end () ; ++it2)
|
|
678
|
+
{
|
|
679
|
+
counts[(*it2).first] = (*it2).second[frequent_label];
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
if (boost) {
|
|
683
|
+
/* Calculate gain and yval. Here we do not use the normal
|
|
684
|
+
* gain function as there is no Projected/DFS_CODE there yet.
|
|
685
|
+
* Hence we need to make a distinction between the 1/1.5-class
|
|
686
|
+
* and the 2-class case here.
|
|
687
|
+
*/
|
|
688
|
+
double gainm = 0.0;
|
|
689
|
+
double gainm_pos = 0.0;
|
|
690
|
+
double gainm_neg = 0.0;
|
|
691
|
+
|
|
692
|
+
for (unsigned int cid = 0 ; cid < counts.size () ; ++cid) {
|
|
693
|
+
if (boostType == 1) {
|
|
694
|
+
// Only consider the positive instances
|
|
695
|
+
if (counts[cid] == 0)
|
|
696
|
+
continue;
|
|
697
|
+
|
|
698
|
+
gainm += boostY[cid]*boostWeights[cid]; // *1.0 (Y)
|
|
699
|
+
} else if (boostType == 2) {
|
|
700
|
+
double addfactor = 1.0;
|
|
701
|
+
|
|
702
|
+
if (counts[cid] == 0)
|
|
703
|
+
addfactor = -1.0; // negation: pattern does not exist
|
|
704
|
+
|
|
705
|
+
gainm_pos += addfactor*boostY[cid]*boostWeights[cid];
|
|
706
|
+
gainm_neg += -addfactor*boostY[cid]*boostWeights[cid];
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
double yval = 1.0;
|
|
710
|
+
|
|
711
|
+
if (boostType == 2) {
|
|
712
|
+
if (gainm_pos >= gainm_neg) {
|
|
713
|
+
gainm = gainm_pos;
|
|
714
|
+
} else {
|
|
715
|
+
gainm = gainm_neg;
|
|
716
|
+
yval = -1.0;
|
|
717
|
+
}
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
//#ifdef DEBUG
|
|
721
|
+
fprintf(stderr, " single node graph, node label %d, gain %lf, yval %lf\n",
|
|
722
|
+
frequent_label, gainm, yval);
|
|
723
|
+
//#endif
|
|
724
|
+
if (gainm > boostTau) {
|
|
725
|
+
/* Copy it into vector form
|
|
726
|
+
*/
|
|
727
|
+
std::map<unsigned int, unsigned int> gycounts;
|
|
728
|
+
for (unsigned int n = 0 ; n < counts.size () ; ++n)
|
|
729
|
+
gycounts[n] = counts[n];
|
|
730
|
+
|
|
731
|
+
report_boosting_inter (g, (*it).second, gainm, yval, gycounts);
|
|
732
|
+
}
|
|
733
|
+
} else {
|
|
734
|
+
std::map<unsigned int, unsigned int> gycounts;
|
|
735
|
+
for (unsigned int n = 0 ; n < counts.size () ; ++n)
|
|
736
|
+
gycounts[n] = counts[n];
|
|
737
|
+
|
|
738
|
+
report_single (g, gycounts);
|
|
739
|
+
}
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
EdgeList edges;
|
|
744
|
+
Projected_map3 root;
|
|
745
|
+
|
|
746
|
+
for (unsigned int id = 0; id < TRANS.size(); ++id) {
|
|
747
|
+
Graph &g = TRANS[id];
|
|
748
|
+
for (unsigned int from = 0; from < g.size() ; ++from) {
|
|
749
|
+
if (get_forward_root (g, g[from], edges)) {
|
|
750
|
+
for (EdgeList::iterator it = edges.begin(); it != edges.end(); ++it)
|
|
751
|
+
root[g[from].label][(*it)->elabel][g[(*it)->to].label].push (id, *it, 0);
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
for (Projected_iterator3 fromlabel = root.begin() ;
|
|
757
|
+
fromlabel != root.end() ; ++fromlabel)
|
|
758
|
+
{
|
|
759
|
+
for (Projected_iterator2 elabel = fromlabel->second.begin() ;
|
|
760
|
+
elabel != fromlabel->second.end() ; ++elabel)
|
|
761
|
+
{
|
|
762
|
+
for (Projected_iterator1 tolabel = elabel->second.begin();
|
|
763
|
+
tolabel != elabel->second.end(); ++tolabel)
|
|
764
|
+
{
|
|
765
|
+
/* Build the initial two-node graph. It will be grown
|
|
766
|
+
* recursively within project.
|
|
767
|
+
*/
|
|
768
|
+
DFS_CODE.push (0, 1, fromlabel->first, elabel->first, tolabel->first);
|
|
769
|
+
project (tolabel->second);
|
|
770
|
+
DFS_CODE.pop ();
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
}
|