see5-installer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/p-thresh.c
ADDED
@@ -0,0 +1,313 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Soften thresholds for continuous attributes */
|
30
|
+
/* ------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
|
39
|
+
/*************************************************************************/
|
40
|
+
/* */
|
41
|
+
/* Soften all thresholds for continuous attributes in tree T */
|
42
|
+
/* */
|
43
|
+
/*************************************************************************/
|
44
|
+
|
45
|
+
|
46
|
+
void SoftenThresh(Tree T)
|
47
|
+
/* ------------ */
|
48
|
+
{
|
49
|
+
ResubErrs(T, 0, MaxCase);
|
50
|
+
|
51
|
+
FindBounds(T, 0, MaxCase);
|
52
|
+
}
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
/*************************************************************************/
|
57
|
+
/* */
|
58
|
+
/* Find resubstitution errors for tree T */
|
59
|
+
/* */
|
60
|
+
/*************************************************************************/
|
61
|
+
|
62
|
+
|
63
|
+
void ResubErrs(Tree T, CaseNo Fp, CaseNo Lp)
|
64
|
+
/* --------- */
|
65
|
+
{
|
66
|
+
CaseNo i, Bp, Ep, Missing;
|
67
|
+
CaseCount Cases=0, KnownCases, BranchCases, MissingCases;
|
68
|
+
double Factor;
|
69
|
+
DiscrValue v;
|
70
|
+
Boolean PrevUnitWeights;
|
71
|
+
Attribute Att;
|
72
|
+
|
73
|
+
if ( ! T->NodeType )
|
74
|
+
{
|
75
|
+
T->Errors = T->Cases - T->ClassDist[T->Leaf];
|
76
|
+
return;
|
77
|
+
}
|
78
|
+
|
79
|
+
/* Estimate errors for each branch */
|
80
|
+
|
81
|
+
Att = T->Tested;
|
82
|
+
Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
|
83
|
+
|
84
|
+
if ( CostWeights )
|
85
|
+
{
|
86
|
+
MissingCases = SumNocostWeights(Fp, Ep);
|
87
|
+
KnownCases = SumNocostWeights(Ep+1, Lp);
|
88
|
+
}
|
89
|
+
else
|
90
|
+
{
|
91
|
+
MissingCases = CountCases(Fp, Ep);
|
92
|
+
KnownCases = Cases - MissingCases;
|
93
|
+
}
|
94
|
+
|
95
|
+
PrevUnitWeights = UnitWeights;
|
96
|
+
if ( Missing ) UnitWeights = false;
|
97
|
+
|
98
|
+
T->Errors = 0;
|
99
|
+
Bp = Fp;
|
100
|
+
|
101
|
+
ForEach(v, 1, T->Forks)
|
102
|
+
{
|
103
|
+
Ep = Group(v, Bp + Missing, Lp, T);
|
104
|
+
|
105
|
+
/* Bp -> first value in missing + remaining values
|
106
|
+
Ep -> last value in missing + current group */
|
107
|
+
|
108
|
+
BranchCases = CountCases(Bp + Missing, Ep);
|
109
|
+
|
110
|
+
Factor = ( ! Missing ? 0 :
|
111
|
+
! CostWeights ? BranchCases / KnownCases :
|
112
|
+
SumNocostWeights(Bp + Missing, Ep) / KnownCases );
|
113
|
+
|
114
|
+
if ( BranchCases + Factor * MissingCases >= MinLeaf )
|
115
|
+
{
|
116
|
+
if ( Missing )
|
117
|
+
{
|
118
|
+
ForEach(i, Bp, Bp + Missing - 1)
|
119
|
+
{
|
120
|
+
Weight(Case[i]) *= Factor;
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
124
|
+
ResubErrs(T->Branch[v], Bp, Ep);
|
125
|
+
|
126
|
+
T->Errors += T->Branch[v]->Errors;
|
127
|
+
|
128
|
+
/* Restore weights if changed */
|
129
|
+
|
130
|
+
if ( Missing )
|
131
|
+
{
|
132
|
+
for ( i = Ep ; i >= Bp ; i-- )
|
133
|
+
{
|
134
|
+
if ( Unknown(Case[i], Att) )
|
135
|
+
{
|
136
|
+
Weight(Case[i]) /= Factor;
|
137
|
+
Swap(i, Ep);
|
138
|
+
Ep--;
|
139
|
+
}
|
140
|
+
}
|
141
|
+
}
|
142
|
+
|
143
|
+
Bp = Ep+1;
|
144
|
+
}
|
145
|
+
}
|
146
|
+
|
147
|
+
UnitWeights = PrevUnitWeights;
|
148
|
+
}
|
149
|
+
|
150
|
+
|
151
|
+
|
152
|
+
/*************************************************************************/
|
153
|
+
/* */
|
154
|
+
/* Calculate upper and lower bounds for each test on a continuous */
|
155
|
+
/* attribute in tree T, using cases from Fp to Lp. */
|
156
|
+
/* */
|
157
|
+
/* The lower bound is set so that the error rate of the GT branch */
|
158
|
+
/* on the cases between the bound and the threshold is double that */
|
159
|
+
/* of the correct (LE) branch; the upper bound is set similarly. */
|
160
|
+
/* */
|
161
|
+
/*************************************************************************/
|
162
|
+
|
163
|
+
|
164
|
+
void FindBounds(Tree T, CaseNo Fp, CaseNo Lp)
|
165
|
+
/* -------- */
|
166
|
+
{
|
167
|
+
int v;
|
168
|
+
CaseNo i, j, Kp, Bp, Ap, Missing, SplitI;
|
169
|
+
CaseCount w, LEErrs, GTErrs, KnownCases, SE;
|
170
|
+
ClassNo RealClass;
|
171
|
+
Attribute Att;
|
172
|
+
Boolean PrevUnitWeights;
|
173
|
+
double Factor;
|
174
|
+
|
175
|
+
/* Stop when get to a leaf */
|
176
|
+
|
177
|
+
if ( ! T->NodeType ) return;
|
178
|
+
|
179
|
+
Kp = Group(0, Fp, Lp, T) + 1;
|
180
|
+
Missing = Kp - Fp;
|
181
|
+
|
182
|
+
Att = T->Tested;
|
183
|
+
KnownCases = CountCases(Kp, Lp);
|
184
|
+
|
185
|
+
/* Soften a threshold for a continuous attribute */
|
186
|
+
|
187
|
+
if ( T->NodeType == BrThresh )
|
188
|
+
{
|
189
|
+
Verbosity(1, fprintf(Of, "\nTest %s <> %g\n", AttName[Att], T->Cut))
|
190
|
+
|
191
|
+
/* Skip N/A values */
|
192
|
+
|
193
|
+
Ap = Group(1, Kp, Lp, T) + 1;
|
194
|
+
|
195
|
+
Quicksort(Ap, Lp, Att);
|
196
|
+
|
197
|
+
/* Locate cut point and overall errors of the LE and GT branches */
|
198
|
+
|
199
|
+
SplitI = Ap;
|
200
|
+
LEErrs = GTErrs = 0;
|
201
|
+
ForEach(i, Ap, Lp)
|
202
|
+
{
|
203
|
+
if ( CVal(Case[i], Att) <= T->Cut ) SplitI = i;
|
204
|
+
}
|
205
|
+
|
206
|
+
T->Mid = (CVal(Case[SplitI], Att) + CVal(Case[SplitI+1], Att)) / 2;
|
207
|
+
|
208
|
+
/* Consider cutoff points below and above the threshold.
|
209
|
+
The errors on the cases between the cutoff and the threshold
|
210
|
+
are computed for both the LE and GT branches. The additional
|
211
|
+
errors must be less than 0.5SE and, further, the errors
|
212
|
+
on the "other" branch must not exceed twice the errors
|
213
|
+
on the "real" branch, both after Laplace adjustment */
|
214
|
+
|
215
|
+
SE = sqrt(T->Errors * (KnownCases - T->Errors) / (KnownCases + 1E-3))
|
216
|
+
* 2;
|
217
|
+
|
218
|
+
LEErrs = GTErrs = 0;
|
219
|
+
j = SplitI;
|
220
|
+
for ( i = SplitI ; i > Ap ; i-- )
|
221
|
+
{
|
222
|
+
RealClass = Class(Case[i]);
|
223
|
+
|
224
|
+
w = Weight(Case[i]);
|
225
|
+
GTErrs += w * ( TreeClassify(Case[i], T->Branch[3]) != RealClass );
|
226
|
+
LEErrs += w * ( TreeClassify(Case[i], T->Branch[2]) != RealClass );
|
227
|
+
|
228
|
+
if ( CVal(Case[i-1], Att) < CVal(Case[i], Att) )
|
229
|
+
{
|
230
|
+
if ( GTErrs > 2 * LEErrs + 1 || GTErrs - LEErrs > 0.5 * SE )
|
231
|
+
{
|
232
|
+
break;
|
233
|
+
}
|
234
|
+
|
235
|
+
j = i-1;
|
236
|
+
}
|
237
|
+
}
|
238
|
+
T->Lower = Min(T->Mid, CVal(Case[j], Att));
|
239
|
+
|
240
|
+
LEErrs = GTErrs = 0;
|
241
|
+
j = SplitI+1;
|
242
|
+
for ( i = SplitI+1 ; i < Lp ; i++ )
|
243
|
+
{
|
244
|
+
RealClass = Class(Case[i]);
|
245
|
+
|
246
|
+
w = Weight(Case[i]);
|
247
|
+
LEErrs += w * ( TreeClassify(Case[i], T->Branch[2]) != RealClass );
|
248
|
+
GTErrs += w * ( TreeClassify(Case[i], T->Branch[3]) != RealClass );
|
249
|
+
|
250
|
+
if ( CVal(Case[i], Att) < CVal(Case[i+1], Att) )
|
251
|
+
{
|
252
|
+
if ( LEErrs > 2 * GTErrs + 1 || LEErrs - GTErrs > 0.5 * SE )
|
253
|
+
{
|
254
|
+
break;
|
255
|
+
}
|
256
|
+
|
257
|
+
j = i+1;
|
258
|
+
}
|
259
|
+
}
|
260
|
+
T->Upper = Max(T->Mid, CVal(Case[j], Att));
|
261
|
+
|
262
|
+
Verbosity(1,
|
263
|
+
fprintf(Of, "\tLower = %g, Upper = %g\n", T->Lower, T->Upper))
|
264
|
+
}
|
265
|
+
|
266
|
+
/* Recursively scan each branch */
|
267
|
+
|
268
|
+
PrevUnitWeights = UnitWeights;
|
269
|
+
if ( Missing > 0 ) UnitWeights = false;
|
270
|
+
|
271
|
+
Bp = Fp;
|
272
|
+
|
273
|
+
ForEach(v, 1, T->Forks)
|
274
|
+
{
|
275
|
+
Kp = Group(v, Bp + Missing, Lp, T);
|
276
|
+
|
277
|
+
/* Bp -> first value in missing + remaining values
|
278
|
+
Kp -> last value in missing + current group */
|
279
|
+
|
280
|
+
if ( Bp + Missing <= Kp &&
|
281
|
+
(Factor = CountCases(Bp + Missing, Kp) / KnownCases) > 1E-6 )
|
282
|
+
{
|
283
|
+
if ( Missing )
|
284
|
+
{
|
285
|
+
ForEach(i, Bp, Bp + Missing - 1)
|
286
|
+
{
|
287
|
+
Weight(Case[i]) *= Factor;
|
288
|
+
}
|
289
|
+
}
|
290
|
+
|
291
|
+
FindBounds(T->Branch[v], Bp, Kp);
|
292
|
+
|
293
|
+
/* Restore weights if changed */
|
294
|
+
|
295
|
+
if ( Missing )
|
296
|
+
{
|
297
|
+
for ( i = Kp ; i >= Bp ; i-- )
|
298
|
+
{
|
299
|
+
if ( Unknown(Case[i], Att) )
|
300
|
+
{
|
301
|
+
Weight(Case[i]) /= Factor;
|
302
|
+
Swap(i, Kp);
|
303
|
+
Kp--;
|
304
|
+
}
|
305
|
+
}
|
306
|
+
}
|
307
|
+
|
308
|
+
Bp = Kp+1;
|
309
|
+
}
|
310
|
+
}
|
311
|
+
|
312
|
+
UnitWeights = PrevUnitWeights;
|
313
|
+
}
|
data/ext/c5.0/prune.c
ADDED
@@ -0,0 +1,1069 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Prune a decision tree and predict its error rate */
|
30
|
+
/* ------------------------------------------------ */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
|
39
|
+
#define LocalVerbosity(x,s) if (Sh >= 0) {Verbosity(x,s)}
|
40
|
+
#define Intab(x) Indent(x, 0)
|
41
|
+
|
42
|
+
#define UPDATE 1 /* flag: change tree */
|
43
|
+
#define REGROW 2 /* regrow branches */
|
44
|
+
#define REPORTPROGRESS 4 /* original tree */
|
45
|
+
#define UNITWEIGHTS 8 /* UnitWeights is true*/
|
46
|
+
|
47
|
+
Set *PossibleValues;
|
48
|
+
|
49
|
+
double MaxExtraErrs, /* limit for global prune */
|
50
|
+
TotalExtraErrs; /* extra errors from ties */
|
51
|
+
Tree *XT; /* subtrees with lowest cost comp */
|
52
|
+
int NXT; /* number ditto */
|
53
|
+
float MinCC; /* cost compexity for XT */
|
54
|
+
Boolean RecalculateErrs; /* if missing values */
|
55
|
+
|
56
|
+
|
57
|
+
|
58
|
+
|
59
|
+
/*************************************************************************/
|
60
|
+
/* */
|
61
|
+
/* Prune tree T */
|
62
|
+
/* */
|
63
|
+
/*************************************************************************/
|
64
|
+
|
65
|
+
|
66
|
+
void Prune(Tree T)
|
67
|
+
/* ----- */
|
68
|
+
{
|
69
|
+
Attribute Att;
|
70
|
+
int i, Options;
|
71
|
+
Boolean Regrow;
|
72
|
+
|
73
|
+
Verbosity(2, fprintf(Of, "\n"))
|
74
|
+
|
75
|
+
Regrow = ( Trial == 0 || Now == WINNOWATTS );
|
76
|
+
|
77
|
+
/* Local pruning phase */
|
78
|
+
|
79
|
+
|
80
|
+
Options = ( Now == WINNOWATTS ? (UPDATE|REGROW) :
|
81
|
+
Regrow ? (UPDATE|REGROW|REPORTPROGRESS) :
|
82
|
+
(UPDATE|REPORTPROGRESS) );
|
83
|
+
if ( UnitWeights ) Options |= UNITWEIGHTS;
|
84
|
+
|
85
|
+
EstimateErrs(T, 0, MaxCase, 0, Options);
|
86
|
+
|
87
|
+
if ( MCost )
|
88
|
+
{
|
89
|
+
/* Remove any effects of WeightMul and reset leaf classes */
|
90
|
+
|
91
|
+
RestoreDistribs(T);
|
92
|
+
}
|
93
|
+
else
|
94
|
+
{
|
95
|
+
/* Insert information on parents and recalculate errors, noting
|
96
|
+
whether fractional cases might have appeared (for GlobalPrune) */
|
97
|
+
|
98
|
+
RecalculateErrs = false;
|
99
|
+
InsertParents(T, Nil);
|
100
|
+
|
101
|
+
/* Possible global pruning phase */
|
102
|
+
|
103
|
+
if ( GLOBAL && Now != WINNOWATTS )
|
104
|
+
{
|
105
|
+
GlobalPrune(T);
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
109
|
+
/* Remove impossible values from subsets and ordered splits.
|
110
|
+
First record possible values for discrete attributes */
|
111
|
+
|
112
|
+
PossibleValues = AllocZero(MaxAtt+1, Set);
|
113
|
+
ForEach(Att, 1, MaxAtt)
|
114
|
+
{
|
115
|
+
if ( Ordered(Att) || Discrete(Att) && SUBSET )
|
116
|
+
{
|
117
|
+
PossibleValues[Att] = AllocZero((MaxAttVal[Att]>>3)+1, Byte);
|
118
|
+
ForEach(i, 1, MaxAttVal[Att])
|
119
|
+
{
|
120
|
+
SetBit(i, PossibleValues[Att]);
|
121
|
+
}
|
122
|
+
}
|
123
|
+
}
|
124
|
+
|
125
|
+
CheckSubsets(T, true);
|
126
|
+
|
127
|
+
FreeVector((void **) PossibleValues, 1, MaxAtt); PossibleValues = Nil;
|
128
|
+
|
129
|
+
/* For multibranch splits, merge non-occurring values. For trees
|
130
|
+
(first boosting trial only), also merge leaves of same class */
|
131
|
+
|
132
|
+
if ( ! SUBSET )
|
133
|
+
{
|
134
|
+
CompressBranches(T);
|
135
|
+
}
|
136
|
+
}
|
137
|
+
|
138
|
+
|
139
|
+
|
140
|
+
/*************************************************************************/
|
141
|
+
/* */
|
142
|
+
/* Estimate the errors in a given subtree */
|
143
|
+
/* */
|
144
|
+
/*************************************************************************/
|
145
|
+
|
146
|
+
|
147
|
+
void EstimateErrs(Tree T, CaseNo Fp, CaseNo Lp, int Sh, int Flags)
|
148
|
+
/* ------------ */
|
149
|
+
{
|
150
|
+
CaseNo i, Bp, Ep, Missing;
|
151
|
+
CaseCount Cases=0, KnownCases, *BranchCases, MissingCases,
|
152
|
+
*SmallBranches, SmallBranchCases=0,
|
153
|
+
Errs, SaveErrs, TreeErrs, LeafErrs, ExtraLeafErrs=0, BestBrErrs;
|
154
|
+
double Factor, *LocalClassDist;
|
155
|
+
DiscrValue v, BestBr=0;
|
156
|
+
ClassNo c, BestClass=1;
|
157
|
+
int UnitWeights; /* local value */
|
158
|
+
Tree Br;
|
159
|
+
Attribute Att;
|
160
|
+
|
161
|
+
|
162
|
+
if ( Fp > Lp ) return;
|
163
|
+
|
164
|
+
UnitWeights = (Flags & UNITWEIGHTS);
|
165
|
+
|
166
|
+
LocalClassDist = Alloc(MaxClass+1, double);
|
167
|
+
|
168
|
+
FindClassFreq(LocalClassDist, Fp, Lp);
|
169
|
+
|
170
|
+
/* Record new class distribution if updating the tree */
|
171
|
+
|
172
|
+
if ( (Flags & UPDATE) )
|
173
|
+
{
|
174
|
+
ForEach(c, 1, MaxClass)
|
175
|
+
{
|
176
|
+
T->ClassDist[c] = LocalClassDist[c];
|
177
|
+
Cases += LocalClassDist[c];
|
178
|
+
|
179
|
+
if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;
|
180
|
+
}
|
181
|
+
}
|
182
|
+
else
|
183
|
+
{
|
184
|
+
ForEach(c, 1, MaxClass)
|
185
|
+
{
|
186
|
+
Cases += LocalClassDist[c];
|
187
|
+
|
188
|
+
if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;
|
189
|
+
}
|
190
|
+
}
|
191
|
+
|
192
|
+
LeafErrs = Cases - LocalClassDist[BestClass];
|
193
|
+
ExtraLeafErrs = ExtraErrs(Cases, LeafErrs, BestClass);
|
194
|
+
|
195
|
+
Free(LocalClassDist);
|
196
|
+
|
197
|
+
if ( (Flags & UPDATE) )
|
198
|
+
{
|
199
|
+
T->Cases = Cases;
|
200
|
+
T->Leaf = BestClass;
|
201
|
+
}
|
202
|
+
|
203
|
+
if ( ! T->NodeType ) /* leaf */
|
204
|
+
{
|
205
|
+
if ( (Flags & UPDATE) && (Flags & REPORTPROGRESS) &&
|
206
|
+
Now == SIMPLIFYTREE &&
|
207
|
+
T->Cases > 0 )
|
208
|
+
{
|
209
|
+
Progress(T->Cases);
|
210
|
+
}
|
211
|
+
|
212
|
+
T->Errors = LeafErrs + ExtraLeafErrs;
|
213
|
+
|
214
|
+
if ( (Flags & UPDATE) )
|
215
|
+
{
|
216
|
+
if ( Sh >= 0 )
|
217
|
+
{
|
218
|
+
LocalVerbosity(3,
|
219
|
+
Intab(Sh);
|
220
|
+
fprintf(Of, "%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
|
221
|
+
T->Cases, LeafErrs, T->Errors))
|
222
|
+
}
|
223
|
+
}
|
224
|
+
|
225
|
+
return;
|
226
|
+
}
|
227
|
+
|
228
|
+
/* Estimate errors for each branch */
|
229
|
+
|
230
|
+
Att = T->Tested;
|
231
|
+
Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
|
232
|
+
|
233
|
+
if ( CostWeights )
|
234
|
+
{
|
235
|
+
MissingCases = SumNocostWeights(Fp, Ep);
|
236
|
+
KnownCases = SumNocostWeights(Ep+1, Lp);
|
237
|
+
}
|
238
|
+
else
|
239
|
+
{
|
240
|
+
MissingCases = CountCases(Fp, Ep);
|
241
|
+
KnownCases = Cases - MissingCases;
|
242
|
+
}
|
243
|
+
|
244
|
+
SmallBranches = AllocZero(MaxClass+1, CaseCount);
|
245
|
+
BranchCases = Alloc(T->Forks+1, CaseCount);
|
246
|
+
|
247
|
+
if ( Missing ) UnitWeights = 0;
|
248
|
+
|
249
|
+
TreeErrs = 0;
|
250
|
+
Bp = Fp;
|
251
|
+
|
252
|
+
ForEach(v, 1, T->Forks)
|
253
|
+
{
|
254
|
+
Ep = Group(v, Bp + Missing, Lp, T);
|
255
|
+
|
256
|
+
/* Bp -> first value in missing + remaining values
|
257
|
+
Ep -> last value in missing + current group */
|
258
|
+
|
259
|
+
BranchCases[v] = CountCases(Bp + Missing, Ep);
|
260
|
+
|
261
|
+
Factor = ( ! Missing ? 0 :
|
262
|
+
! CostWeights ? BranchCases[v] / KnownCases :
|
263
|
+
SumNocostWeights(Bp + Missing, Ep) / KnownCases );
|
264
|
+
|
265
|
+
if ( (BranchCases[v] += Factor * MissingCases) >= MinLeaf )
|
266
|
+
{
|
267
|
+
if ( Missing )
|
268
|
+
{
|
269
|
+
ForEach(i, Bp, Bp + Missing - 1)
|
270
|
+
{
|
271
|
+
Weight(Case[i]) *= Factor;
|
272
|
+
}
|
273
|
+
}
|
274
|
+
|
275
|
+
EstimateErrs(T->Branch[v], Bp, Ep, Sh+1, ((Flags&7) | UnitWeights));
|
276
|
+
|
277
|
+
/* Group small branches together for error estimation */
|
278
|
+
|
279
|
+
if ( BranchCases[v] < MINITEMS )
|
280
|
+
{
|
281
|
+
ForEach(i, Bp, Ep)
|
282
|
+
{
|
283
|
+
SmallBranches[ Class(Case[i]) ] += Weight(Case[i]);
|
284
|
+
}
|
285
|
+
|
286
|
+
SmallBranchCases += BranchCases[v];
|
287
|
+
}
|
288
|
+
else
|
289
|
+
{
|
290
|
+
TreeErrs += T->Branch[v]->Errors;
|
291
|
+
}
|
292
|
+
|
293
|
+
/* Restore weights if changed */
|
294
|
+
|
295
|
+
if ( Missing )
|
296
|
+
{
|
297
|
+
for ( i = Ep ; i >= Bp ; i-- )
|
298
|
+
{
|
299
|
+
if ( Unknown(Case[i], Att) )
|
300
|
+
{
|
301
|
+
Weight(Case[i]) /= Factor;
|
302
|
+
Swap(i, Ep);
|
303
|
+
Ep--;
|
304
|
+
}
|
305
|
+
}
|
306
|
+
}
|
307
|
+
|
308
|
+
Bp = Ep+1;
|
309
|
+
}
|
310
|
+
}
|
311
|
+
|
312
|
+
/* Add error estimate from small branches, if any */
|
313
|
+
|
314
|
+
if ( SmallBranchCases )
|
315
|
+
{
|
316
|
+
BestClass = 1;
|
317
|
+
ForEach(c, 2, MaxClass)
|
318
|
+
{
|
319
|
+
if ( SmallBranches[c] > SmallBranches[BestClass] ) BestClass = c;
|
320
|
+
}
|
321
|
+
|
322
|
+
Errs = SmallBranchCases - SmallBranches[BestClass];
|
323
|
+
TreeErrs += Errs + ExtraErrs(SmallBranchCases, Errs, BestClass);
|
324
|
+
}
|
325
|
+
Free(SmallBranches);
|
326
|
+
Free(BranchCases);
|
327
|
+
|
328
|
+
if ( ! (Flags & UPDATE) )
|
329
|
+
{
|
330
|
+
T->Errors = TreeErrs;
|
331
|
+
return;
|
332
|
+
}
|
333
|
+
|
334
|
+
/* See how the largest candidate branch would perform. A branch
|
335
|
+
is a candidate if it is not a leaf, contains at least 10% of
|
336
|
+
the cases, and does not test the same continuous attribute.
|
337
|
+
This test is skipped for boosted trees */
|
338
|
+
|
339
|
+
ForEach(v, 1, T->Forks)
|
340
|
+
{
|
341
|
+
if ( ! T->Branch[v]->NodeType ||
|
342
|
+
T->Branch[v]->Cases < 0.1 * T->Cases ||
|
343
|
+
T->Branch[v]->Tested == Att && Continuous(Att) )
|
344
|
+
{
|
345
|
+
continue;
|
346
|
+
}
|
347
|
+
|
348
|
+
if ( ! BestBr || T->Branch[v]->Cases > T->Branch[BestBr]->Cases )
|
349
|
+
{
|
350
|
+
BestBr = v;
|
351
|
+
}
|
352
|
+
}
|
353
|
+
|
354
|
+
if ( BestBr )
|
355
|
+
{
|
356
|
+
SaveErrs = T->Branch[BestBr]->Errors;
|
357
|
+
EstimateErrs(T->Branch[BestBr], Fp, Lp, -1, 0);
|
358
|
+
BestBrErrs = T->Branch[BestBr]->Errors;
|
359
|
+
T->Branch[BestBr]->Errors = SaveErrs;
|
360
|
+
}
|
361
|
+
else
|
362
|
+
{
|
363
|
+
BestBrErrs = MaxCase+1;
|
364
|
+
}
|
365
|
+
|
366
|
+
LocalVerbosity(2,
|
367
|
+
Intab(Sh);
|
368
|
+
fprintf(Of, "%s: [%d%% N=%.2f tree=%.2f leaf=%.2f+%.2f",
|
369
|
+
AttName[T->Tested],
|
370
|
+
(int) ((TreeErrs * 100) / (T->Cases + 0.001)),
|
371
|
+
T->Cases, TreeErrs, LeafErrs, ExtraLeafErrs);
|
372
|
+
if ( BestBr )
|
373
|
+
{
|
374
|
+
fprintf(Of, " br[%d]=%.2f", BestBr, BestBrErrs);
|
375
|
+
}
|
376
|
+
fprintf(Of, "]\n"))
|
377
|
+
|
378
|
+
/* See whether tree should be replaced with leaf or best branch */
|
379
|
+
|
380
|
+
if ( LeafErrs + ExtraLeafErrs <= BestBrErrs + 0.1 &&
|
381
|
+
LeafErrs + ExtraLeafErrs <= TreeErrs + 0.1 )
|
382
|
+
{
|
383
|
+
LocalVerbosity(2,
|
384
|
+
Intab(Sh);
|
385
|
+
fprintf(Of, "Replaced with leaf %s\n", ClassName[T->Leaf]))
|
386
|
+
|
387
|
+
UnSprout(T);
|
388
|
+
T->Errors = LeafErrs + ExtraLeafErrs;
|
389
|
+
}
|
390
|
+
else
|
391
|
+
if ( BestBrErrs <= TreeErrs + 0.1 )
|
392
|
+
{
|
393
|
+
LocalVerbosity(2,
|
394
|
+
Intab(Sh);
|
395
|
+
fprintf(Of, "Replaced with branch %d\n", BestBr))
|
396
|
+
|
397
|
+
/* Free unused bits of tree */
|
398
|
+
|
399
|
+
ForEach(v, 1, T->Forks)
|
400
|
+
{
|
401
|
+
if ( v != BestBr ) FreeTree(T->Branch[v]);
|
402
|
+
}
|
403
|
+
Br = T->Branch[BestBr];
|
404
|
+
Free(T->Branch);
|
405
|
+
Free(T->ClassDist);
|
406
|
+
if ( T->NodeType == BrSubset )
|
407
|
+
{
|
408
|
+
FreeVector((void **) T->Subset, 1, T->Forks);
|
409
|
+
}
|
410
|
+
|
411
|
+
/* Copy the branch up */
|
412
|
+
|
413
|
+
memcpy((char *) T, (char *) Br, sizeof(TreeRec));
|
414
|
+
Free(Br);
|
415
|
+
|
416
|
+
Factor = T->Cases / Cases;
|
417
|
+
T->Cases = Cases;
|
418
|
+
|
419
|
+
/* If not within a rebuilt tree, not a cascaded test, and
|
420
|
+
sufficient new cases to justify the effort, rebuild the branch */
|
421
|
+
|
422
|
+
if ( T->NodeType && (Flags & REGROW) && Factor < 0.95 )
|
423
|
+
{
|
424
|
+
ForEach(v, 1, T->Forks)
|
425
|
+
{
|
426
|
+
FreeTree(T->Branch[v]); T->Branch[v] = Nil;
|
427
|
+
}
|
428
|
+
|
429
|
+
SetGlobalUnitWeights(Flags & UNITWEIGHTS);
|
430
|
+
|
431
|
+
Divide(T, Fp, Lp, 0);
|
432
|
+
}
|
433
|
+
|
434
|
+
EstimateErrs(T, Fp, Lp, Sh, UPDATE);
|
435
|
+
}
|
436
|
+
else
|
437
|
+
{
|
438
|
+
T->Errors = TreeErrs;
|
439
|
+
}
|
440
|
+
}
|
441
|
+
|
442
|
+
|
443
|
+
|
444
|
+
/*************************************************************************/
|
445
|
+
/* */
|
446
|
+
/* Phase 2 (global) pruning. */
|
447
|
+
/* Prune minimum cost complexity subtrees until "error" */
|
448
|
+
/* (measured by sum of branch errors) increases by 1SE */
|
449
|
+
/* */
|
450
|
+
/*************************************************************************/
|
451
|
+
|
452
|
+
|
453
|
+
void GlobalPrune(Tree T)
|
454
|
+
/* ----------- */
|
455
|
+
{
|
456
|
+
int DeltaLeaves, x;
|
457
|
+
double BaseErrs, DeltaErrs;
|
458
|
+
CaseNo i;
|
459
|
+
Tree ST;
|
460
|
+
|
461
|
+
/* If fractional cases may have been used, calculate errors
|
462
|
+
directly by checking training data */
|
463
|
+
|
464
|
+
if ( RecalculateErrs )
|
465
|
+
{
|
466
|
+
BaseErrs = 0;
|
467
|
+
ForEach(i, 0, MaxCase)
|
468
|
+
{
|
469
|
+
if ( TreeClassify(Case[i], T) != Class(Case[i]) )
|
470
|
+
{
|
471
|
+
BaseErrs += Weight(Case[i]);
|
472
|
+
}
|
473
|
+
}
|
474
|
+
}
|
475
|
+
else
|
476
|
+
{
|
477
|
+
BaseErrs = T->Errors;
|
478
|
+
}
|
479
|
+
|
480
|
+
XT = Alloc(T->Leaves, Tree);
|
481
|
+
|
482
|
+
/* Additional error limit set at 1SE */
|
483
|
+
|
484
|
+
MaxExtraErrs = sqrt(BaseErrs * (1 - BaseErrs / (MaxCase + 1)));
|
485
|
+
|
486
|
+
while ( MaxExtraErrs > 0 )
|
487
|
+
{
|
488
|
+
TotalExtraErrs = 0;
|
489
|
+
|
490
|
+
MinCC = 1E38;
|
491
|
+
NXT = 0;
|
492
|
+
|
493
|
+
/* Find all subtrees with lowest cost complexity */
|
494
|
+
|
495
|
+
FindMinCC(T);
|
496
|
+
|
497
|
+
Verbosity(2,
|
498
|
+
if ( NXT > 0 && TotalExtraErrs > MaxExtraErrs )
|
499
|
+
fprintf(Of, "%d tied with MinCC=%.3f; total extra errs %.1f\n",
|
500
|
+
NXT, MinCC, TotalExtraErrs))
|
501
|
+
|
502
|
+
if ( ! NXT || TotalExtraErrs > MaxExtraErrs ) break;
|
503
|
+
|
504
|
+
/* Make subtree into a leaf */
|
505
|
+
|
506
|
+
ForEach(x, 0, NXT-1)
|
507
|
+
{
|
508
|
+
ST = XT[x];
|
509
|
+
|
510
|
+
UnSprout(ST);
|
511
|
+
|
512
|
+
/* Update errors and leaves for ST and ancestors */
|
513
|
+
|
514
|
+
DeltaErrs = (ST->Cases - ST->ClassDist[ST->Leaf]) - ST->Errors;
|
515
|
+
DeltaLeaves = 1 - ST->Leaves;
|
516
|
+
while ( ST )
|
517
|
+
{
|
518
|
+
ST->Errors += DeltaErrs;
|
519
|
+
ST->Leaves += DeltaLeaves;
|
520
|
+
ST = ST->Parent;
|
521
|
+
}
|
522
|
+
|
523
|
+
MaxExtraErrs -= DeltaErrs;
|
524
|
+
|
525
|
+
Verbosity(2,
|
526
|
+
fprintf(Of, "global: %d leaves, %.1f errs\n",
|
527
|
+
DeltaLeaves, DeltaErrs))
|
528
|
+
}
|
529
|
+
Verbosity(2, fprintf(Of, "\tremaining=%.1f\n", MaxExtraErrs))
|
530
|
+
}
|
531
|
+
|
532
|
+
Free(XT);
|
533
|
+
}
|
534
|
+
|
535
|
+
|
536
|
+
|
537
|
+
/*************************************************************************/
|
538
|
+
/* */
|
539
|
+
/* Scan tree computing cost complexity of each subtree and */
|
540
|
+
/* record lowest in global variable XT */
|
541
|
+
/* */
|
542
|
+
/*************************************************************************/
|
543
|
+
|
544
|
+
|
545
|
+
void FindMinCC(Tree T)
|
546
|
+
/* --------- */
|
547
|
+
{
|
548
|
+
DiscrValue v;
|
549
|
+
double ExtraErrs, CC, SaveMinCC, SaveTotalExtraErrs;
|
550
|
+
int SaveNXT;
|
551
|
+
|
552
|
+
if ( T->NodeType )
|
553
|
+
{
|
554
|
+
/* Save current situation */
|
555
|
+
|
556
|
+
SaveTotalExtraErrs = TotalExtraErrs;
|
557
|
+
SaveMinCC = MinCC;
|
558
|
+
SaveNXT = NXT;
|
559
|
+
|
560
|
+
/* Scan subtrees */
|
561
|
+
|
562
|
+
ForEach(v, 1, T->Forks)
|
563
|
+
{
|
564
|
+
if ( T->Branch[v]->Cases > 0.1 )
|
565
|
+
{
|
566
|
+
FindMinCC(T->Branch[v]);
|
567
|
+
}
|
568
|
+
}
|
569
|
+
|
570
|
+
/* Compute CC for this subtree and check whether minimum */
|
571
|
+
|
572
|
+
ExtraErrs = (T->Cases - T->ClassDist[T->Leaf]) - T->Errors;
|
573
|
+
|
574
|
+
CC = ExtraErrs / (T->Leaves - 1);
|
575
|
+
|
576
|
+
if ( ExtraErrs <= MaxExtraErrs )
|
577
|
+
{
|
578
|
+
/* Have to be careful of ties in descendants, because
|
579
|
+
they would inflate TotalExtraErrs. Any such ties
|
580
|
+
should be discarded */
|
581
|
+
|
582
|
+
if ( CC < MinCC ||
|
583
|
+
CC <= MinCC && CC < SaveMinCC /* changed by descendants */ )
|
584
|
+
{
|
585
|
+
/* This is the first of a possible group of ties */
|
586
|
+
|
587
|
+
MinCC = CC;
|
588
|
+
NXT = 1;
|
589
|
+
XT[0] = T;
|
590
|
+
TotalExtraErrs = ExtraErrs;
|
591
|
+
}
|
592
|
+
else
|
593
|
+
if ( CC <= MinCC )
|
594
|
+
{
|
595
|
+
/* This is a tie. Discard any ties among descendants */
|
596
|
+
|
597
|
+
if ( NXT > SaveNXT )
|
598
|
+
{
|
599
|
+
TotalExtraErrs = SaveTotalExtraErrs;
|
600
|
+
NXT = SaveNXT;
|
601
|
+
}
|
602
|
+
|
603
|
+
XT[NXT++] = T;
|
604
|
+
TotalExtraErrs += ExtraErrs;
|
605
|
+
}
|
606
|
+
}
|
607
|
+
}
|
608
|
+
}
|
609
|
+
|
610
|
+
|
611
|
+
|
612
|
+
/*************************************************************************/
|
613
|
+
/* */
|
614
|
+
/* Annotate tree with information on parent and leaves */
|
615
|
+
/* */
|
616
|
+
/*************************************************************************/
|
617
|
+
|
618
|
+
|
619
|
+
void InsertParents(Tree T, Tree P)
|
620
|
+
/* ------------- */
|
621
|
+
{
|
622
|
+
DiscrValue v;
|
623
|
+
|
624
|
+
T->Parent = P;
|
625
|
+
T->Errors = T->Leaves = 0;
|
626
|
+
|
627
|
+
if ( T->NodeType )
|
628
|
+
{
|
629
|
+
ForEach(v, 1, T->Forks)
|
630
|
+
{
|
631
|
+
InsertParents(T->Branch[v], T);
|
632
|
+
T->Errors += T->Branch[v]->Errors;
|
633
|
+
T->Leaves += T->Branch[v]->Leaves;
|
634
|
+
}
|
635
|
+
|
636
|
+
if ( SomeMiss[T->Tested] ) RecalculateErrs = true;
|
637
|
+
}
|
638
|
+
else
|
639
|
+
if ( T->Cases > 1E-3 )
|
640
|
+
{
|
641
|
+
T->Errors = T->Cases - T->ClassDist[T->Leaf];
|
642
|
+
T->Leaves = 1;
|
643
|
+
}
|
644
|
+
}
|
645
|
+
|
646
|
+
|
647
|
+
|
648
|
+
/*************************************************************************/
|
649
|
+
/* */
|
650
|
+
/* Remove unnecessary subset tests on missing values */
|
651
|
+
/* */
|
652
|
+
/*************************************************************************/
|
653
|
+
|
654
|
+
|
655
|
+
void CheckSubsets(Tree T, Boolean PruneDefaults)
|
656
|
+
/* ------------ */
|
657
|
+
{
|
658
|
+
Set HoldValues;
|
659
|
+
int v, vv, x, Bytes, b, First, Any=0;
|
660
|
+
Attribute A;
|
661
|
+
Tree LeafBr;
|
662
|
+
ClassNo c;
|
663
|
+
|
664
|
+
if ( T->NodeType == BrSubset )
|
665
|
+
{
|
666
|
+
A = T->Tested;
|
667
|
+
|
668
|
+
Bytes = (MaxAttVal[A]>>3) + 1;
|
669
|
+
HoldValues = Alloc(Bytes, Byte);
|
670
|
+
|
671
|
+
/* For non-ordered attributes the last (default) branch contains
|
672
|
+
all values that do not appear in the data. See whether this
|
673
|
+
branch can be simplified or omitted */
|
674
|
+
|
675
|
+
if ( ! Ordered(A) && PruneDefaults )
|
676
|
+
{
|
677
|
+
ForEach(b, 0, Bytes-1)
|
678
|
+
{
|
679
|
+
T->Subset[T->Forks][b] &= PossibleValues[A][b];
|
680
|
+
Any |= T->Subset[T->Forks][b];
|
681
|
+
}
|
682
|
+
|
683
|
+
if ( ! Any )
|
684
|
+
{
|
685
|
+
FreeTree(T->Branch[T->Forks]);
|
686
|
+
Free(T->Subset[T->Forks]);
|
687
|
+
T->Forks--;
|
688
|
+
}
|
689
|
+
}
|
690
|
+
|
691
|
+
/* Process each subtree, leaving only values in branch subset */
|
692
|
+
|
693
|
+
CopyBits(Bytes, PossibleValues[A], HoldValues);
|
694
|
+
|
695
|
+
ForEach(v, 1, T->Forks)
|
696
|
+
{
|
697
|
+
/* Remove any impossible values from ordered subsets */
|
698
|
+
|
699
|
+
if ( Ordered(A) )
|
700
|
+
{
|
701
|
+
ForEach(vv, 1, MaxAttVal[A])
|
702
|
+
{
|
703
|
+
if ( In(vv, T->Subset[v]) && ! In(vv, HoldValues) )
|
704
|
+
{
|
705
|
+
ResetBit(vv, T->Subset[v]);
|
706
|
+
}
|
707
|
+
}
|
708
|
+
}
|
709
|
+
|
710
|
+
CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
|
711
|
+
|
712
|
+
CheckSubsets(T->Branch[v], PruneDefaults);
|
713
|
+
}
|
714
|
+
|
715
|
+
CopyBits(Bytes, HoldValues, PossibleValues[A]);
|
716
|
+
|
717
|
+
Free(HoldValues);
|
718
|
+
|
719
|
+
/* See whether branches other than N/A can be merged.
|
720
|
+
This cannot be done for ordered attributes since the
|
721
|
+
values in the subset represent an interval */
|
722
|
+
|
723
|
+
if ( ! Ordered(A) )
|
724
|
+
{
|
725
|
+
First = ( In(1, T->Subset[1]) ? 2 : 1 );
|
726
|
+
for ( v = First ; v < T->Forks ; v++ )
|
727
|
+
{
|
728
|
+
if ( T->Branch[v]->NodeType ) continue;
|
729
|
+
LeafBr = T->Branch[v];
|
730
|
+
|
731
|
+
/* Consider branches vv that could be merged with branch v */
|
732
|
+
|
733
|
+
for ( vv = v+1 ; vv <= T->Forks ; )
|
734
|
+
{
|
735
|
+
if ( ! T->Branch[vv]->NodeType &&
|
736
|
+
T->Branch[vv]->Leaf == LeafBr->Leaf &&
|
737
|
+
( PruneDefaults || T->Branch[vv]->Cases > 0 ) )
|
738
|
+
{
|
739
|
+
/* Branch vv can be merged with branch v */
|
740
|
+
|
741
|
+
if ( T->Branch[vv]->Cases )
|
742
|
+
{
|
743
|
+
/* Add class distribution from branch vv,
|
744
|
+
or replace if branch v has no cases */
|
745
|
+
|
746
|
+
ForEach(c, 1, MaxClass)
|
747
|
+
{
|
748
|
+
if ( ! LeafBr->Cases )
|
749
|
+
{
|
750
|
+
LeafBr->ClassDist[c] =
|
751
|
+
T->Branch[vv]->ClassDist[c];
|
752
|
+
}
|
753
|
+
else
|
754
|
+
{
|
755
|
+
LeafBr->ClassDist[c] +=
|
756
|
+
T->Branch[vv]->ClassDist[c];
|
757
|
+
}
|
758
|
+
}
|
759
|
+
LeafBr->Cases += T->Branch[vv]->Cases;
|
760
|
+
LeafBr->Errors += T->Branch[vv]->Errors;
|
761
|
+
}
|
762
|
+
|
763
|
+
/* Merge values and free branch vv */
|
764
|
+
|
765
|
+
ForEach(b, 0, Bytes-1)
|
766
|
+
{
|
767
|
+
T->Subset[v][b] |= T->Subset[vv][b];
|
768
|
+
}
|
769
|
+
FreeTree(T->Branch[vv]);
|
770
|
+
Free(T->Subset[vv]);
|
771
|
+
|
772
|
+
T->Forks--;
|
773
|
+
ForEach(x, vv, T->Forks)
|
774
|
+
{
|
775
|
+
T->Branch[x] = T->Branch[x+1];
|
776
|
+
T->Subset[x] = T->Subset[x+1];
|
777
|
+
}
|
778
|
+
}
|
779
|
+
else
|
780
|
+
{
|
781
|
+
vv++;
|
782
|
+
}
|
783
|
+
}
|
784
|
+
}
|
785
|
+
}
|
786
|
+
}
|
787
|
+
else
|
788
|
+
if ( T->NodeType )
|
789
|
+
{
|
790
|
+
ForEach(v, 1, T->Forks)
|
791
|
+
{
|
792
|
+
CheckSubsets(T->Branch[v], PruneDefaults);
|
793
|
+
}
|
794
|
+
}
|
795
|
+
}
|
796
|
+
|
797
|
+
|
798
|
+
|
799
|
+
/*************************************************************************/
|
800
|
+
/* */
|
801
|
+
/* Compute Coeff, used by RawExtraErrs() to adjust resubstitution */
|
802
|
+
/* error rate to upper limit of the confidence level. Coeff is */
|
803
|
+
/* the square of the number of standard deviations corresponding */
|
804
|
+
/* to the selected confidence level. (Taken from Documenta Geigy */
|
805
|
+
/* Scientific Tables (Sixth Edition), p185 (with modifications).) */
|
806
|
+
/* */
|
807
|
+
/*************************************************************************/
|
808
|
+
|
809
|
+
|
810
|
+
float Val[] = { 0, 0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
|
811
|
+
Dev[] = {4.0, 3.09, 2.58, 2.33, 1.65, 1.28, 0.84, 0.25, 0.00},
|
812
|
+
Coeff;
|
813
|
+
|
814
|
+
|
815
|
+
void InitialiseExtraErrs()
|
816
|
+
/* ------------------- */
|
817
|
+
{
|
818
|
+
int i=1;
|
819
|
+
|
820
|
+
/* Compute and retain the coefficient value, interpolating from
|
821
|
+
the values in Val and Dev */
|
822
|
+
|
823
|
+
while ( CF > Val[i] ) i++;
|
824
|
+
|
825
|
+
Coeff = Dev[i-1] +
|
826
|
+
(Dev[i] - Dev[i-1]) * (CF - Val[i-1]) /(Val[i] - Val[i-1]);
|
827
|
+
Coeff = Coeff * Coeff;
|
828
|
+
CF = Max(CF, 1E-6);
|
829
|
+
}
|
830
|
+
|
831
|
+
|
832
|
+
/*************************************************************************/
|
833
|
+
/* */
|
834
|
+
/* Calculate extra errors to correct the resubstitution error */
|
835
|
+
/* rate at a leaf with N cases, E errors, predicted class C. */
|
836
|
+
/* If CostWeights are used, N and E are normalised by removing */
|
837
|
+
/* the effects of cost weighting and then reapplying weights to */
|
838
|
+
/* the result. */
|
839
|
+
/* */
|
840
|
+
/*************************************************************************/
|
841
|
+
|
842
|
+
|
843
|
+
float ExtraErrs(CaseCount N, CaseCount E, ClassNo C)
|
844
|
+
/* --------- */
|
845
|
+
{
|
846
|
+
ClassNo EC;
|
847
|
+
CaseCount NormC, NormEC;
|
848
|
+
|
849
|
+
if ( ! CostWeights )
|
850
|
+
{
|
851
|
+
return RawExtraErrs(N, E);
|
852
|
+
}
|
853
|
+
|
854
|
+
EC = 3 - C; /* the other class */
|
855
|
+
NormC = (N - E) / WeightMul[C]; /* normalised cases of class C */
|
856
|
+
NormEC = E / WeightMul[EC]; /* ditto the other class */
|
857
|
+
|
858
|
+
return WeightMul[EC] * RawExtraErrs(NormC + NormEC, NormEC);
|
859
|
+
}
|
860
|
+
|
861
|
+
|
862
|
+
|
863
|
+
float RawExtraErrs(CaseCount N, CaseCount E)
|
864
|
+
/* ------------ */
|
865
|
+
{
|
866
|
+
float Val0, Pr;
|
867
|
+
|
868
|
+
if ( E < 1E-6 )
|
869
|
+
{
|
870
|
+
return N * (1 - exp(log(CF) / N));
|
871
|
+
}
|
872
|
+
else
|
873
|
+
if ( N > 1 && E < 0.9999 )
|
874
|
+
{
|
875
|
+
Val0 = N * (1 - exp(log(CF) / N));
|
876
|
+
return Val0 + E * (RawExtraErrs(N, 1.0) - Val0);
|
877
|
+
}
|
878
|
+
else
|
879
|
+
if ( E + 0.5 >= N )
|
880
|
+
{
|
881
|
+
return 0.67 * (N - E);
|
882
|
+
}
|
883
|
+
else
|
884
|
+
{
|
885
|
+
Pr = (E + 0.5 + Coeff/2
|
886
|
+
+ sqrt(Coeff * ((E + 0.5) * (1 - (E + 0.5)/N) + Coeff/4)) )
|
887
|
+
/ (N + Coeff);
|
888
|
+
return (N * Pr - E);
|
889
|
+
}
|
890
|
+
}
|
891
|
+
|
892
|
+
|
893
|
+
|
894
|
+
/*************************************************************************/
|
895
|
+
/* */
|
896
|
+
/* If there are differential misclassification costs, the weights */
|
897
|
+
/* may have been artificially adjusted. Fix the distributions so */
|
898
|
+
/* that they represent the "true" (possibly boosted) weights */
|
899
|
+
/* */
|
900
|
+
/*************************************************************************/
|
901
|
+
|
902
|
+
|
903
|
+
void RestoreDistribs(Tree T)
|
904
|
+
/* --------------- */
|
905
|
+
{
|
906
|
+
DiscrValue v;
|
907
|
+
ClassNo c;
|
908
|
+
|
909
|
+
if ( T->NodeType )
|
910
|
+
{
|
911
|
+
ForEach(v, 1, T->Forks)
|
912
|
+
{
|
913
|
+
RestoreDistribs(T->Branch[v]);
|
914
|
+
}
|
915
|
+
}
|
916
|
+
|
917
|
+
if ( T->Cases >= MinLeaf )
|
918
|
+
{
|
919
|
+
if ( CostWeights )
|
920
|
+
{
|
921
|
+
T->Cases = 0;
|
922
|
+
ForEach(c, 1, MaxClass)
|
923
|
+
{
|
924
|
+
ClassSum[c] = (T->ClassDist[c] /= WeightMul[c]);
|
925
|
+
T->Cases += T->ClassDist[c];
|
926
|
+
}
|
927
|
+
}
|
928
|
+
else
|
929
|
+
{
|
930
|
+
ForEach(c, 1, MaxClass)
|
931
|
+
{
|
932
|
+
ClassSum[c] = T->ClassDist[c];
|
933
|
+
}
|
934
|
+
}
|
935
|
+
|
936
|
+
T->Leaf = SelectClass(1, true);
|
937
|
+
T->Errors = T->Cases - T->ClassDist[T->Leaf];
|
938
|
+
}
|
939
|
+
}
|
940
|
+
|
941
|
+
|
942
|
+
|
943
|
+
/*************************************************************************/
|
944
|
+
/* */
|
945
|
+
/* See whether empty branches can be formed into subsets. */
|
946
|
+
/* For the first trial only, and when not generating rulesets, */
|
947
|
+
/* combine leaves with the same class. */
|
948
|
+
/* */
|
949
|
+
/*************************************************************************/
|
950
|
+
|
951
|
+
|
952
|
+
void CompressBranches(Tree T)
|
953
|
+
/* ---------------- */
|
954
|
+
{
|
955
|
+
DiscrValue v, vv, S=0, *LocalSet;
|
956
|
+
int Bytes;
|
957
|
+
Tree Br, *OldBranch;
|
958
|
+
ClassNo c;
|
959
|
+
Boolean EmptyOnly;
|
960
|
+
|
961
|
+
EmptyOnly = Trial || RULES;
|
962
|
+
|
963
|
+
if ( T->NodeType )
|
964
|
+
{
|
965
|
+
/* LocalSet[v] is the new branch number to which branch v belongs */
|
966
|
+
|
967
|
+
LocalSet = AllocZero(T->Forks+1, DiscrValue);
|
968
|
+
|
969
|
+
ForEach(v, 1, T->Forks)
|
970
|
+
{
|
971
|
+
Br = T->Branch[v];
|
972
|
+
CompressBranches(Br);
|
973
|
+
|
974
|
+
/* Don't check if compression impossible */
|
975
|
+
|
976
|
+
if ( v == 1 || T->Forks < 4 || Br->NodeType ||
|
977
|
+
EmptyOnly && Br->Cases >= MinLeaf )
|
978
|
+
{
|
979
|
+
vv = v + 1;
|
980
|
+
}
|
981
|
+
else
|
982
|
+
{
|
983
|
+
/* Check whether some previous branch is mergeable.
|
984
|
+
For Trial 0, leaves are mergeable if they are
|
985
|
+
both empty or both non-empty and have the same class;
|
986
|
+
for later trials, they must both be empty */
|
987
|
+
|
988
|
+
for ( vv = 2 ; vv < v ; vv++ )
|
989
|
+
{
|
990
|
+
if ( ! T->Branch[vv]->NodeType &&
|
991
|
+
( EmptyOnly ? T->Branch[vv]->Cases < MinLeaf :
|
992
|
+
( T->Branch[vv]->Cases < MinLeaf ) ==
|
993
|
+
( Br->Cases < MinLeaf ) &&
|
994
|
+
T->Branch[vv]->Leaf == Br->Leaf ) )
|
995
|
+
{
|
996
|
+
break;
|
997
|
+
}
|
998
|
+
}
|
999
|
+
}
|
1000
|
+
|
1001
|
+
/* If no merge was found, this becomes a new branch */
|
1002
|
+
|
1003
|
+
LocalSet[v] = ( vv < v ? LocalSet[vv] : ++S );
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
if ( S < T->Forks )
|
1007
|
+
{
|
1008
|
+
/* Compress! */
|
1009
|
+
|
1010
|
+
T->Subset = Alloc(S+1, Set);
|
1011
|
+
OldBranch = T->Branch;
|
1012
|
+
T->Branch = Alloc(S+1, Tree);
|
1013
|
+
|
1014
|
+
Bytes = (MaxAttVal[T->Tested]>>3) + 1;
|
1015
|
+
S = 0;
|
1016
|
+
|
1017
|
+
ForEach(v, 1, T->Forks)
|
1018
|
+
{
|
1019
|
+
if ( LocalSet[v] > S )
|
1020
|
+
{
|
1021
|
+
S++;
|
1022
|
+
Br = T->Branch[S] = OldBranch[v];
|
1023
|
+
if ( ! Br->ClassDist )
|
1024
|
+
{
|
1025
|
+
Br->ClassDist = AllocZero(MaxClass+1, CaseCount);
|
1026
|
+
}
|
1027
|
+
T->Subset[S] = AllocZero(Bytes, Byte);
|
1028
|
+
|
1029
|
+
/* Must include N/A even when no cases -- otherwise
|
1030
|
+
reader gets the branches muddled */
|
1031
|
+
|
1032
|
+
SetBit(v, T->Subset[S]);
|
1033
|
+
|
1034
|
+
ForEach(vv, v+1, T->Forks)
|
1035
|
+
{
|
1036
|
+
if ( LocalSet[vv] == S )
|
1037
|
+
{
|
1038
|
+
SetBit(vv, T->Subset[S]);
|
1039
|
+
|
1040
|
+
Br->Cases += OldBranch[vv]->Cases;
|
1041
|
+
Br->Errors += OldBranch[vv]->Errors;
|
1042
|
+
ForEach(c, 1, MaxClass)
|
1043
|
+
{
|
1044
|
+
Br->ClassDist[c] += OldBranch[vv]->ClassDist[c];
|
1045
|
+
}
|
1046
|
+
}
|
1047
|
+
}
|
1048
|
+
}
|
1049
|
+
else
|
1050
|
+
{
|
1051
|
+
FreeTree(OldBranch[v]);
|
1052
|
+
}
|
1053
|
+
}
|
1054
|
+
|
1055
|
+
T->NodeType = BrSubset;
|
1056
|
+
T->Forks = S;
|
1057
|
+
Free(OldBranch);
|
1058
|
+
}
|
1059
|
+
Free(LocalSet);
|
1060
|
+
}
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
|
1064
|
+
|
1065
|
+
void SetGlobalUnitWeights(int LocalFlag)
|
1066
|
+
/* -------------------- */
|
1067
|
+
{
|
1068
|
+
UnitWeights = ( LocalFlag != 0 );
|
1069
|
+
}
|