see5-installer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/contin.c
ADDED
@@ -0,0 +1,613 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Evaluation of a test on a continuous valued attribute */
|
30
|
+
/* ----------------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
#include "defns.i"
|
35
|
+
#include "extern.i"
|
36
|
+
|
37
|
+
#define PartInfo(n) (-(n)*Log((n)/GEnv.Cases))
|
38
|
+
|
39
|
+
|
40
|
+
/*************************************************************************/
|
41
|
+
/* */
|
42
|
+
/* Continuous attributes are treated as if they have possible */
|
43
|
+
/* values 0 (unknown), 1 (not applicable), 2 (less than cut) and */
|
44
|
+
/* 3 (greater than cut). */
|
45
|
+
/* This routine finds the best cut for cases Fp through Lp and */
|
46
|
+
/* sets Info[], Gain[] and Bar[] */
|
47
|
+
/* */
|
48
|
+
/*************************************************************************/
|
49
|
+
|
50
|
+
|
51
|
+
void EvalContinuousAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
|
52
|
+
/* ----------------- */
|
53
|
+
{
|
54
|
+
CaseNo i, j, BestI, Tries=0;
|
55
|
+
double LowInfo, LHInfo, LeastInfo=1E38,
|
56
|
+
w, BestGain, BestInfo, ThreshCost=1;
|
57
|
+
ClassNo c;
|
58
|
+
ContValue Interval;
|
59
|
+
|
60
|
+
Verbosity(3, fprintf(Of, "\tAtt %s\n", AttName[Att]))
|
61
|
+
|
62
|
+
Gain[Att] = None;
|
63
|
+
PrepareForContin(Att, Fp, Lp);
|
64
|
+
|
65
|
+
/* Special case when very few known values */
|
66
|
+
|
67
|
+
if ( GEnv.ApplicCases < 2 * MINITEMS )
|
68
|
+
{
|
69
|
+
Verbosity(2,
|
70
|
+
fprintf(Of, "\tAtt %s\tinsufficient cases with known values\n",
|
71
|
+
AttName[Att]))
|
72
|
+
return;
|
73
|
+
}
|
74
|
+
|
75
|
+
/* Try possible cuts between cases i and i+1, and determine the
|
76
|
+
information and gain of the split in each case */
|
77
|
+
|
78
|
+
/* We have to be wary of splitting a small number of cases off one end,
|
79
|
+
as this has little predictive power. The minimum split GEnv.MinSplit is
|
80
|
+
the maximum of MINITEMS or (the minimum of 25 and 10% of the cases
|
81
|
+
per class) */
|
82
|
+
|
83
|
+
GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;
|
84
|
+
if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;
|
85
|
+
if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;
|
86
|
+
|
87
|
+
/* Find first possible cut point and initialise scan parameters */
|
88
|
+
|
89
|
+
i = PrepareForScan(Lp);
|
90
|
+
|
91
|
+
/* Repeatedly check next possible cut */
|
92
|
+
|
93
|
+
for ( ; i <= GEnv.Ep ; i++ )
|
94
|
+
{
|
95
|
+
c = GEnv.SRec[i].C;
|
96
|
+
w = GEnv.SRec[i].W;
|
97
|
+
assert(c >= 1 && c <= MaxClass);
|
98
|
+
|
99
|
+
GEnv.LowCases += w;
|
100
|
+
GEnv.Freq[2][c] += w;
|
101
|
+
GEnv.Freq[3][c] -= w;
|
102
|
+
|
103
|
+
GEnv.HighVal = GEnv.SRec[i+1].V;
|
104
|
+
if ( GEnv.HighVal > GEnv.LowVal )
|
105
|
+
{
|
106
|
+
Tries++;
|
107
|
+
|
108
|
+
GEnv.LowClass = GEnv.HighClass;
|
109
|
+
GEnv.HighClass = GEnv.SRec[i+1].C;
|
110
|
+
for ( j = i+2 ;
|
111
|
+
GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;
|
112
|
+
j++ )
|
113
|
+
{
|
114
|
+
if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
|
115
|
+
}
|
116
|
+
|
117
|
+
if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )
|
118
|
+
{
|
119
|
+
LowInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass);
|
120
|
+
|
121
|
+
/* If cannot improve on best so far, count remaining
|
122
|
+
possible cuts and break */
|
123
|
+
|
124
|
+
if ( LowInfo >= LeastInfo )
|
125
|
+
{
|
126
|
+
for ( i++ ; i <= GEnv.Ep ; i++ )
|
127
|
+
{
|
128
|
+
if ( GEnv.SRec[i+1].V > GEnv.SRec[i].V )
|
129
|
+
{
|
130
|
+
Tries++;
|
131
|
+
}
|
132
|
+
}
|
133
|
+
break;
|
134
|
+
}
|
135
|
+
|
136
|
+
LHInfo = LowInfo + TotalInfo(GEnv.Freq[3], 1, MaxClass);
|
137
|
+
if ( LHInfo < LeastInfo )
|
138
|
+
{
|
139
|
+
LeastInfo = LHInfo;
|
140
|
+
BestI = i;
|
141
|
+
|
142
|
+
BestInfo = (GEnv.FixedSplitInfo
|
143
|
+
+ PartInfo(GEnv.LowCases)
|
144
|
+
+ PartInfo(GEnv.ApplicCases - GEnv.LowCases))
|
145
|
+
/ GEnv.Cases;
|
146
|
+
}
|
147
|
+
|
148
|
+
Verbosity(3,
|
149
|
+
{
|
150
|
+
fprintf(Of, "\t\tCut at %.3f (gain %.3f):",
|
151
|
+
(GEnv.LowVal + GEnv.HighVal) / 2,
|
152
|
+
(1 - GEnv.UnknownRate) *
|
153
|
+
(GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases));
|
154
|
+
PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);
|
155
|
+
})
|
156
|
+
}
|
157
|
+
|
158
|
+
GEnv.LowVal = GEnv.HighVal;
|
159
|
+
}
|
160
|
+
}
|
161
|
+
|
162
|
+
BestGain = (1 - GEnv.UnknownRate) *
|
163
|
+
(GEnv.BaseInfo - (GEnv.NAInfo + LeastInfo) / GEnv.KnownCases);
|
164
|
+
|
165
|
+
/* The threshold cost is the lesser of the cost of indicating the
|
166
|
+
cases to split between or the interval containing the split */
|
167
|
+
|
168
|
+
if ( BestGain > 0 )
|
169
|
+
{
|
170
|
+
Interval = (GEnv.SRec[Lp].V - GEnv.SRec[GEnv.Xp].V) /
|
171
|
+
(GEnv.SRec[BestI+1].V - GEnv.SRec[BestI].V);
|
172
|
+
ThreshCost = ( Interval < Tries ? Log(Interval) : Log(Tries) )
|
173
|
+
/ GEnv.Cases;
|
174
|
+
}
|
175
|
+
|
176
|
+
BestGain -= ThreshCost;
|
177
|
+
|
178
|
+
/* If a test on the attribute is able to make a gain,
|
179
|
+
set the best break point, gain and information */
|
180
|
+
|
181
|
+
if ( BestGain <= 0 )
|
182
|
+
{
|
183
|
+
Verbosity(2, fprintf(Of, "\tAtt %s\tno gain\n", AttName[Att]))
|
184
|
+
}
|
185
|
+
else
|
186
|
+
{
|
187
|
+
Gain[Att] = BestGain;
|
188
|
+
Info[Att] = BestInfo;
|
189
|
+
|
190
|
+
GEnv.LowVal = GEnv.SRec[BestI].V;
|
191
|
+
GEnv.HighVal = GEnv.SRec[BestI+1].V;
|
192
|
+
|
193
|
+
/* Set threshold, making sure that rounding problems do not
|
194
|
+
cause it to reach upper value */
|
195
|
+
|
196
|
+
if ( (Bar[Att] = (ContValue) (0.5 * (GEnv.LowVal + GEnv.HighVal)))
|
197
|
+
>= GEnv.HighVal )
|
198
|
+
{
|
199
|
+
Bar[Att] = GEnv.LowVal;
|
200
|
+
}
|
201
|
+
|
202
|
+
Verbosity(2,
|
203
|
+
fprintf(Of, "\tAtt %s\tcut=%.3f, inf %.3f, gain %.3f\n",
|
204
|
+
AttName[Att], Bar[Att], Info[Att], Gain[Att]))
|
205
|
+
}
|
206
|
+
}
|
207
|
+
|
208
|
+
|
209
|
+
|
210
|
+
/*************************************************************************/
|
211
|
+
/* */
|
212
|
+
/* Estimate max gain ratio available from any cut, using sample */
|
213
|
+
/* of SampleFrac of all cases */
|
214
|
+
/* */
|
215
|
+
/*************************************************************************/
|
216
|
+
|
217
|
+
|
218
|
+
void EstimateMaxGR(Attribute Att, CaseNo Fp, CaseNo Lp)
|
219
|
+
/* ------------- */
|
220
|
+
{
|
221
|
+
CaseNo i, j;
|
222
|
+
double LHInfo, w, SplitInfo, ThisGain, GR;
|
223
|
+
ClassNo c;
|
224
|
+
|
225
|
+
EstMaxGR[Att] = 0;
|
226
|
+
|
227
|
+
if ( Skip(Att) || Att == ClassAtt ) return;
|
228
|
+
|
229
|
+
PrepareForContin(Att, Fp, Lp);
|
230
|
+
|
231
|
+
/* Special case when very few known values */
|
232
|
+
|
233
|
+
if ( GEnv.ApplicCases < 2 * MINITEMS * SampleFrac )
|
234
|
+
{
|
235
|
+
return;
|
236
|
+
}
|
237
|
+
|
238
|
+
/* Try possible cuts between cases i and i+1. Use conservative
|
239
|
+
value of GEnv.MinSplit to allow for sampling */
|
240
|
+
|
241
|
+
GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;
|
242
|
+
if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;
|
243
|
+
if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;
|
244
|
+
|
245
|
+
GEnv.MinSplit *= SampleFrac * 0.33;
|
246
|
+
|
247
|
+
i = PrepareForScan(Lp);
|
248
|
+
|
249
|
+
/* Repeatedly check next possible cut */
|
250
|
+
|
251
|
+
for ( ; i <= GEnv.Ep ; i++ )
|
252
|
+
{
|
253
|
+
c = GEnv.SRec[i].C;
|
254
|
+
w = GEnv.SRec[i].W;
|
255
|
+
assert(c >= 1 && c <= MaxClass);
|
256
|
+
|
257
|
+
GEnv.LowCases += w;
|
258
|
+
GEnv.Freq[2][c] += w;
|
259
|
+
GEnv.Freq[3][c] -= w;
|
260
|
+
|
261
|
+
GEnv.HighVal = GEnv.SRec[i+1].V;
|
262
|
+
if ( GEnv.HighVal > GEnv.LowVal )
|
263
|
+
{
|
264
|
+
GEnv.LowClass = GEnv.HighClass;
|
265
|
+
GEnv.HighClass = GEnv.SRec[i+1].C;
|
266
|
+
for ( j = i+2 ;
|
267
|
+
GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;
|
268
|
+
j++ )
|
269
|
+
{
|
270
|
+
if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
|
271
|
+
}
|
272
|
+
|
273
|
+
if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )
|
274
|
+
{
|
275
|
+
LHInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass)
|
276
|
+
+ TotalInfo(GEnv.Freq[3], 1, MaxClass);
|
277
|
+
|
278
|
+
SplitInfo = (GEnv.FixedSplitInfo
|
279
|
+
+ PartInfo(GEnv.LowCases)
|
280
|
+
+ PartInfo(GEnv.ApplicCases - GEnv.LowCases)) / GEnv.Cases;
|
281
|
+
|
282
|
+
ThisGain = (1 - GEnv.UnknownRate) *
|
283
|
+
(GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases);
|
284
|
+
if ( ThisGain > Gain[Att] ) Gain[Att] = ThisGain;
|
285
|
+
|
286
|
+
/* Adjust GR to make it more conservative upper bound */
|
287
|
+
|
288
|
+
GR = (ThisGain + 1E-5) / SplitInfo;
|
289
|
+
if ( GR > EstMaxGR[Att] )
|
290
|
+
{
|
291
|
+
EstMaxGR[Att] = GR;
|
292
|
+
}
|
293
|
+
|
294
|
+
Verbosity(3,
|
295
|
+
{
|
296
|
+
fprintf(Of, "\t\tCut at %.3f (gain %.3f):",
|
297
|
+
(GEnv.LowVal + GEnv.HighVal) / 2, ThisGain);
|
298
|
+
PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);
|
299
|
+
})
|
300
|
+
}
|
301
|
+
|
302
|
+
GEnv.LowVal = GEnv.HighVal;
|
303
|
+
}
|
304
|
+
}
|
305
|
+
|
306
|
+
Verbosity(2,
|
307
|
+
fprintf(Of, "\tAtt %s: max GR estimate %.3f\n",
|
308
|
+
AttName[Att], EstMaxGR[Att]))
|
309
|
+
}
|
310
|
+
|
311
|
+
|
312
|
+
|
313
|
+
/*************************************************************************/
|
314
|
+
/* */
|
315
|
+
/* Routine to set some preparatory values used by both */
|
316
|
+
/* EvalContinuousAtt and EstimateMaxGR */
|
317
|
+
/* */
|
318
|
+
/*************************************************************************/
|
319
|
+
|
320
|
+
|
321
|
+
void PrepareForContin(Attribute Att, CaseNo Fp, CaseNo Lp)
|
322
|
+
/* ---------------- */
|
323
|
+
{
|
324
|
+
CaseNo i;
|
325
|
+
ClassNo c;
|
326
|
+
DiscrValue v;
|
327
|
+
|
328
|
+
/* Reset frequency tables */
|
329
|
+
|
330
|
+
ForEach(v, 0, 3)
|
331
|
+
{
|
332
|
+
ForEach(c, 1, MaxClass)
|
333
|
+
{
|
334
|
+
GEnv.Freq[v][c] = 0;
|
335
|
+
}
|
336
|
+
GEnv.ValFreq[v] = 0;
|
337
|
+
}
|
338
|
+
|
339
|
+
/* Omit and count unknown and N/A values */
|
340
|
+
|
341
|
+
GEnv.Cases = 0;
|
342
|
+
|
343
|
+
if ( SomeMiss[Att] || SomeNA[Att] )
|
344
|
+
{
|
345
|
+
GEnv.Xp = Lp+1;
|
346
|
+
|
347
|
+
ForEach(i, Fp, Lp)
|
348
|
+
{
|
349
|
+
assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
|
350
|
+
|
351
|
+
GEnv.Cases += Weight(Case[i]);
|
352
|
+
|
353
|
+
if ( Unknown(Case[i], Att) )
|
354
|
+
{
|
355
|
+
GEnv.Freq[ 0 ][ Class(Case[i]) ] += Weight(Case[i]);
|
356
|
+
}
|
357
|
+
else
|
358
|
+
if ( NotApplic(Case[i], Att) )
|
359
|
+
{
|
360
|
+
GEnv.Freq[ 1 ][ Class(Case[i]) ] += Weight(Case[i]);
|
361
|
+
}
|
362
|
+
else
|
363
|
+
{
|
364
|
+
GEnv.Freq[ 3 ][ Class(Case[i]) ] += Weight(Case[i]);
|
365
|
+
GEnv.Xp--;
|
366
|
+
GEnv.SRec[GEnv.Xp].V = CVal(Case[i], Att);
|
367
|
+
GEnv.SRec[GEnv.Xp].W = Weight(Case[i]);
|
368
|
+
GEnv.SRec[GEnv.Xp].C = Class(Case[i]);
|
369
|
+
}
|
370
|
+
}
|
371
|
+
|
372
|
+
ForEach(c, 1, MaxClass)
|
373
|
+
{
|
374
|
+
GEnv.ValFreq[0] += GEnv.Freq[0][c];
|
375
|
+
GEnv.ValFreq[1] += GEnv.Freq[1][c];
|
376
|
+
}
|
377
|
+
|
378
|
+
GEnv.NAInfo = TotalInfo(GEnv.Freq[1], 1, MaxClass);
|
379
|
+
GEnv.FixedSplitInfo = PartInfo(GEnv.ValFreq[0]) + PartInfo(GEnv.ValFreq[1]);
|
380
|
+
|
381
|
+
Verbosity(3, PrintDistribution(Att, 0, 1, GEnv.Freq, GEnv.ValFreq, true))
|
382
|
+
}
|
383
|
+
else
|
384
|
+
{
|
385
|
+
GEnv.Xp = Fp;
|
386
|
+
|
387
|
+
ForEach(i, Fp, Lp)
|
388
|
+
{
|
389
|
+
GEnv.SRec[i].V = CVal(Case[i], Att);
|
390
|
+
GEnv.SRec[i].W = Weight(Case[i]);
|
391
|
+
GEnv.SRec[i].C = Class(Case[i]);
|
392
|
+
|
393
|
+
GEnv.Freq[3][Class(Case[i])] += Weight(Case[i]);
|
394
|
+
}
|
395
|
+
|
396
|
+
ForEach(c, 1, MaxClass)
|
397
|
+
{
|
398
|
+
GEnv.Cases += GEnv.Freq[3][c];
|
399
|
+
}
|
400
|
+
|
401
|
+
GEnv.NAInfo = GEnv.FixedSplitInfo = 0;
|
402
|
+
}
|
403
|
+
|
404
|
+
GEnv.KnownCases = GEnv.Cases - GEnv.ValFreq[0];
|
405
|
+
GEnv.ApplicCases = GEnv.KnownCases - GEnv.ValFreq[1];
|
406
|
+
|
407
|
+
GEnv.UnknownRate = 1.0 - GEnv.KnownCases / GEnv.Cases;
|
408
|
+
|
409
|
+
Cachesort(GEnv.Xp, Lp, GEnv.SRec);
|
410
|
+
|
411
|
+
/* If unknowns or using sampling, must recompute base information */
|
412
|
+
|
413
|
+
if ( GEnv.ValFreq[0] > 0 || SampleFrac < 1 )
|
414
|
+
{
|
415
|
+
/* Determine base information using GEnv.Freq[0] as temp buffer */
|
416
|
+
|
417
|
+
ForEach(c, 1, MaxClass)
|
418
|
+
{
|
419
|
+
GEnv.Freq[0][c] = GEnv.Freq[1][c] + GEnv.Freq[3][c];
|
420
|
+
}
|
421
|
+
|
422
|
+
GEnv.BaseInfo = TotalInfo(GEnv.Freq[0], 1, MaxClass) / GEnv.KnownCases;
|
423
|
+
}
|
424
|
+
else
|
425
|
+
{
|
426
|
+
GEnv.BaseInfo = GlobalBaseInfo;
|
427
|
+
}
|
428
|
+
}
|
429
|
+
|
430
|
+
|
431
|
+
|
432
|
+
/*************************************************************************/
|
433
|
+
/* */
|
434
|
+
/* Set low and high bounds for scan and initial class */
|
435
|
+
/* (used by EvalContinuousAtt and EstimateMaxGR) */
|
436
|
+
/* */
|
437
|
+
/*************************************************************************/
|
438
|
+
|
439
|
+
|
440
|
+
CaseNo PrepareForScan(CaseNo Lp)
|
441
|
+
/* -------------- */
|
442
|
+
{
|
443
|
+
CaseNo i, j;
|
444
|
+
ClassNo c;
|
445
|
+
double w;
|
446
|
+
|
447
|
+
/* Find last possible split */
|
448
|
+
|
449
|
+
GEnv.HighCases = GEnv.LowCases = 0;
|
450
|
+
|
451
|
+
for ( GEnv.Ep = Lp ; GEnv.Ep >= GEnv.Xp && GEnv.HighCases < GEnv.MinSplit ; GEnv.Ep-- )
|
452
|
+
{
|
453
|
+
GEnv.HighCases += GEnv.SRec[GEnv.Ep].W;
|
454
|
+
}
|
455
|
+
|
456
|
+
/* Skip cases before first possible cut */
|
457
|
+
|
458
|
+
for ( i = GEnv.Xp ;
|
459
|
+
i <= GEnv.Ep &&
|
460
|
+
( GEnv.LowCases + GEnv.SRec[i].W < GEnv.MinSplit - 1E-5 ||
|
461
|
+
GEnv.SRec[i].V == GEnv.SRec[i+1].V ) ;
|
462
|
+
i++ )
|
463
|
+
{
|
464
|
+
c = GEnv.SRec[i].C;
|
465
|
+
w = GEnv.SRec[i].W;
|
466
|
+
assert(c >= 1 && c <= MaxClass);
|
467
|
+
|
468
|
+
GEnv.LowCases += w;
|
469
|
+
GEnv.Freq[2][c] += w;
|
470
|
+
GEnv.Freq[3][c] -= w;
|
471
|
+
}
|
472
|
+
|
473
|
+
/* Find the class key for the first interval */
|
474
|
+
|
475
|
+
GEnv.HighClass = GEnv.SRec[i].C;
|
476
|
+
for ( j = i-1; GEnv.HighClass && j >= GEnv.Xp ; j-- )
|
477
|
+
{
|
478
|
+
if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
|
479
|
+
}
|
480
|
+
assert(GEnv.HighClass <= MaxClass);
|
481
|
+
assert(j+1 >= GEnv.Xp);
|
482
|
+
|
483
|
+
GEnv.LowVal = GEnv.SRec[i].V;
|
484
|
+
|
485
|
+
return i;
|
486
|
+
}
|
487
|
+
|
488
|
+
|
489
|
+
|
490
|
+
/*************************************************************************/
|
491
|
+
/* */
|
492
|
+
/* Change a leaf into a test on a continuous attribute */
|
493
|
+
/* */
|
494
|
+
/*************************************************************************/
|
495
|
+
|
496
|
+
|
497
|
+
void ContinTest(Tree Node, Attribute Att)
|
498
|
+
/* ---------- */
|
499
|
+
{
|
500
|
+
Sprout(Node, 3);
|
501
|
+
|
502
|
+
Node->NodeType = BrThresh;
|
503
|
+
Node->Tested = Att;
|
504
|
+
Node->Cut =
|
505
|
+
Node->Lower =
|
506
|
+
Node->Upper = Bar[Att];
|
507
|
+
}
|
508
|
+
|
509
|
+
|
510
|
+
|
511
|
+
/*************************************************************************/
|
512
|
+
/* */
|
513
|
+
/* Adjust thresholds of all continuous attributes so that cuts */
|
514
|
+
/* are values that appear in the data */
|
515
|
+
/* */
|
516
|
+
/*************************************************************************/
|
517
|
+
|
518
|
+
|
519
|
+
void AdjustAllThresholds(Tree T)
|
520
|
+
/* ------------------- */
|
521
|
+
{
|
522
|
+
Attribute Att;
|
523
|
+
CaseNo Ep;
|
524
|
+
|
525
|
+
ForEach(Att, 1, MaxAtt)
|
526
|
+
{
|
527
|
+
if ( Continuous(Att) )
|
528
|
+
{
|
529
|
+
Ep = -1;
|
530
|
+
AdjustThresholds(T, Att, &Ep);
|
531
|
+
}
|
532
|
+
}
|
533
|
+
}
|
534
|
+
|
535
|
+
|
536
|
+
|
537
|
+
void AdjustThresholds(Tree T, Attribute Att, CaseNo *Ep)
|
538
|
+
/* ---------------- */
|
539
|
+
{
|
540
|
+
DiscrValue v;
|
541
|
+
CaseNo i;
|
542
|
+
|
543
|
+
if ( T->NodeType == BrThresh && T->Tested == Att )
|
544
|
+
{
|
545
|
+
if ( *Ep == -1 )
|
546
|
+
{
|
547
|
+
ForEach(i, 0, MaxCase)
|
548
|
+
{
|
549
|
+
if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )
|
550
|
+
{
|
551
|
+
(&GEnv)->SRec[++(*Ep)].V = CVal(Case[i], Att);
|
552
|
+
}
|
553
|
+
}
|
554
|
+
Cachesort(0, *Ep, (&GEnv)->SRec);
|
555
|
+
|
556
|
+
if ( PossibleCuts && Trial == 0 )
|
557
|
+
{
|
558
|
+
int Cuts=0;
|
559
|
+
|
560
|
+
ForEach(i, 1, *Ep)
|
561
|
+
{
|
562
|
+
if ( (&GEnv)->SRec[i].V != (&GEnv)->SRec[i-1].V ) Cuts++;
|
563
|
+
}
|
564
|
+
PossibleCuts[Att] = Cuts;
|
565
|
+
}
|
566
|
+
}
|
567
|
+
|
568
|
+
T->Cut = T->Lower = T->Upper = GreatestValueBelow(T->Cut, Ep);
|
569
|
+
}
|
570
|
+
|
571
|
+
if ( T->NodeType )
|
572
|
+
{
|
573
|
+
ForEach(v, 1, T->Forks)
|
574
|
+
{
|
575
|
+
AdjustThresholds(T->Branch[v], Att, Ep);
|
576
|
+
}
|
577
|
+
}
|
578
|
+
}
|
579
|
+
|
580
|
+
|
581
|
+
|
582
|
+
/*************************************************************************/
|
583
|
+
/* */
|
584
|
+
/* Return the greatest value of attribute Att below threshold Th */
|
585
|
+
/* (Assumes values of Att have been sorted.) */
|
586
|
+
/* */
|
587
|
+
/*************************************************************************/
|
588
|
+
|
589
|
+
|
590
|
+
ContValue GreatestValueBelow(ContValue Th, CaseNo *Ep)
|
591
|
+
/* ------------------ */
|
592
|
+
{
|
593
|
+
CaseNo Low, Mid, High;
|
594
|
+
|
595
|
+
Low = 0;
|
596
|
+
High = *Ep;
|
597
|
+
|
598
|
+
while ( Low < High )
|
599
|
+
{
|
600
|
+
Mid = (Low + High + 1) / 2;
|
601
|
+
|
602
|
+
if ( (&GEnv)->SRec[Mid].V > Th )
|
603
|
+
{
|
604
|
+
High = Mid - 1;
|
605
|
+
}
|
606
|
+
else
|
607
|
+
{
|
608
|
+
Low = Mid;
|
609
|
+
}
|
610
|
+
}
|
611
|
+
|
612
|
+
return (&GEnv)->SRec[Low].V;
|
613
|
+
}
|