see5-installer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/formtree.c
ADDED
@@ -0,0 +1,1158 @@
+/*************************************************************************/
+/* */
+/* Copyright 2010 Rulequest Research Pty Ltd. */
+/* */
+/* This file is part of C5.0 GPL Edition, a single-threaded version */
+/* of C5.0 release 2.07. */
+/* */
+/* C5.0 GPL Edition is free software: you can redistribute it and/or */
+/* modify it under the terms of the GNU General Public License as */
+/* published by the Free Software Foundation, either version 3 of the */
+/* License, or (at your option) any later version. */
+/* */
+/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
+/* General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
+/* */
+/* <http://www.gnu.org/licenses/>. */
+/* */
+/*************************************************************************/
+
+
+
+/*************************************************************************/
+/* */
+/* Central tree-forming algorithm */
+/* ------------------------------ */
+/* */
+/*************************************************************************/
+
+
+#include "defns.i"
+#include "extern.i"
+
+
+Boolean MultiVal, /* all atts have many values */
+        Subsample; /* use subsampling */
+float AvGainWt, /* weight of average gain in gain threshold */
+      MDLWt; /* weight of MDL threshold ditto */
+
+Attribute *DList=Nil; /* list of discrete atts */
+int NDList; /* number in list */
+
+DiscrValue MaxLeaves; /* target maximum tree size */
+
+#define SAMPLEUNIT 2000
+
+float ValThresh; /* minimum GR when evaluating sampled atts */
+Boolean Sampled; /* true if sampling used */
+
+Attribute *Waiting=Nil, /* attribute wait list */
+          NWaiting=0;
+
+
+
+
+/*************************************************************************/
+/* */
+/* Allocate space for tree tables */
+/* */
+/*************************************************************************/
+
+
+void InitialiseTreeData()
+/* ------------------ */
+{
+    DiscrValue v;
+    Attribute Att;
+    DiscrValue vMax;
+
+    Raw = AllocZero(TRIALS+1, Tree);
+    Pruned = AllocZero(TRIALS+1, Tree);
+
+    Tested = AllocZero(MaxAtt+1, Byte);
+
+    Gain = AllocZero(MaxAtt+1, float);
+    Info = AllocZero(MaxAtt+1, float);
+    Bar = AllocZero(MaxAtt+1, ContValue);
+
+    EstMaxGR = AllocZero(MaxAtt+1, float);
+
+    /* Data for subsets */
+
+    if ( SUBSET )
+    {
+        InitialiseBellNumbers();
+        Subset = Alloc(MaxAtt+1, Set *);
+
+        ForEach(Att, 1, MaxAtt)
+        {
+            if ( Discrete(Att) && Att != ClassAtt && ! Skip(Att) )
+            {
+                Subset[Att] = AllocZero(MaxAttVal[Att]+1, Set);
+                ForEach(v, 0, MaxAttVal[Att])
+                {
+                    Subset[Att][v] = Alloc((MaxAttVal[Att]>>3)+1, Byte);
+                }
+            }
+        }
+        Subsets = AllocZero(MaxAtt+1, int);
+    }
+
+    DList = Alloc(MaxAtt, Attribute);
+    NDList = 0;
+
+    DFreq = AllocZero(MaxAtt+1, double *);
+    ForEach(Att, 1, MaxAtt)
+    {
+        if ( Att == ClassAtt || Skip(Att) || ! Discrete(Att) ) continue;
+
+        DList[NDList++] = Att;
+
+        DFreq[Att] = Alloc(MaxClass * (MaxAttVal[Att]+1), double);
+    }
+
+    ClassFreq = AllocZero(MaxClass+1, double);
+    ClassSum = Alloc(MaxClass+1, float);
+
+    if ( BOOST )
+    {
+        Vote = Alloc(MaxClass+1, float);
+        TrialPred = Alloc(TRIALS, ClassNo);
+    }
+
+    if ( RULES )
+    {
+        MostSpec = Alloc(MaxClass+1, CRule);
+        PossibleCuts = Alloc(MaxAtt+1, int);
+    }
+
+    /* Check whether all attributes have many discrete values */
+
+    MultiVal = true;
+    if ( ! SUBSET )
+    {
+        for ( Att = 1 ; MultiVal && Att <= MaxAtt ; Att++ )
+        {
+            if ( ! Skip(Att) && Att != ClassAtt )
+            {
+                MultiVal = MaxAttVal[Att] >= 0.3 * (MaxCase + 1);
+            }
+        }
+    }
+
+    /* See whether there are continuous attributes for subsampling */
+
+    Subsample = false;
+
+    /* Set parameters for RawExtraErrs() */
+
+    InitialiseExtraErrs();
+
+    /* Set up environment */
+
+    Waiting = Alloc(MaxAtt+1, Attribute);
+
+    vMax = Max(3, MaxDiscrVal+1);
+
+    GEnv.Freq = Alloc(vMax+1, double *);
+    ForEach(v, 0, vMax)
+    {
+        GEnv.Freq[v] = Alloc(MaxClass+1, double);
+    }
+
+    GEnv.ValFreq = Alloc(vMax, double);
+
+    GEnv.ClassFreq = Alloc(MaxClass+1, double);
+
+    GEnv.SRec = Alloc(MaxCase+1, SortRec);
+
+    if ( SUBSET )
+    {
+        GEnv.SubsetInfo = Alloc(MaxDiscrVal+1, double);
+        GEnv.SubsetEntr = Alloc(MaxDiscrVal+1, double);
+
+        GEnv.MergeInfo = Alloc(MaxDiscrVal+1, double *);
+        GEnv.MergeEntr = Alloc(MaxDiscrVal+1, double *);
+        GEnv.WSubset = Alloc(MaxDiscrVal+1, Set);
+        ForEach(v, 1, MaxDiscrVal)
+        {
+            GEnv.MergeInfo[v] = Alloc(MaxDiscrVal+1, double);
+            GEnv.MergeEntr[v] = Alloc(MaxDiscrVal+1, double);
+            GEnv.WSubset[v] = Alloc((MaxDiscrVal>>3)+1, Byte);
+        }
+    }
+}
+
+
+void FreeTreeData()
+/* ------------ */
+{
+    Attribute Att;
+    DiscrValue vMax;
+
+    FreeUnlessNil(Raw); Raw = Nil;
+    FreeUnlessNil(Pruned); Pruned = Nil;
+
+    FreeUnlessNil(Tested); Tested = Nil;
+
+    FreeUnlessNil(Gain); Gain = Nil;
+    FreeUnlessNil(Info); Info = Nil;
+    FreeUnlessNil(Bar); Bar = Nil;
+
+    FreeUnlessNil(EstMaxGR); EstMaxGR = Nil;
+
+    if ( SUBSET )
+    {
+        FreeVector((void **) Bell, 1, MaxDiscrVal); Bell = Nil;
+
+        if ( Subset )
+        {
+            ForEach(Att, 1, MaxAtt)
+            {
+                if ( Subset[Att] )
+                {
+                    FreeVector((void **) Subset[Att], 0, MaxAttVal[Att]);
+                }
+            }
+            Free(Subset); Subset = Nil;
+            Free(Subsets); Subsets = Nil;
+        }
+    }
+
+    FreeUnlessNil(DList); DList = Nil;
+
+    if ( DFreq )
+    {
+        ForEach(Att, 1, MaxAtt)
+        {
+            FreeUnlessNil(DFreq[Att]);
+        }
+
+        Free(DFreq); DFreq = Nil;
+    }
+
+    FreeUnlessNil(ClassFreq); ClassFreq = Nil;
+    FreeUnlessNil(ClassSum); ClassSum = Nil;
+
+    FreeUnlessNil(Vote); Vote = Nil;
+    FreeUnlessNil(TrialPred); TrialPred = Nil;
+
+    FreeUnlessNil(MostSpec); MostSpec = Nil;
+    FreeUnlessNil(PossibleCuts); PossibleCuts = Nil;
+
+    vMax = Max(3, MaxDiscrVal+1);
+    FreeVector((void **) GEnv.Freq, 0, vMax);
+    Free(GEnv.ValFreq);
+    Free(GEnv.ClassFreq);
+    FreeUnlessNil(GEnv.SRec);
+
+    if ( GEnv.SubsetInfo )
+    {
+        Free(GEnv.SubsetInfo);
+        Free(GEnv.SubsetEntr);
+        FreeVector((void **) GEnv.MergeInfo, 1, MaxDiscrVal);
+        FreeVector((void **) GEnv.MergeEntr, 1, MaxDiscrVal);
+        FreeVector((void **) GEnv.WSubset, 1, MaxDiscrVal);
+    }
+
+    FreeUnlessNil(Waiting); Waiting = Nil;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Set threshold on minimum gain as follows: */
+/* * when forming winnowing tree: no minimum */
+/* * for small problems, AvGain (usual Gain Ratio) */
+/* * for large problems, discounted MDL */
+/* * for intermediate problems, interpolated */
+/* */
+/*************************************************************************/
+
+
+void SetMinGainThresh()
+/* ---------------- */
+{
+    float Frac;
+
+    /* Set AvGainWt and MDLWt */
+
+    if ( Now == WINNOWATTS )
+    {
+        AvGainWt = MDLWt = 0.0;
+    }
+    else
+    if ( (MaxCase+1) / MaxClass <= 500 )
+    {
+        AvGainWt = 1.0;
+        MDLWt = 0.0;
+    }
+    else
+    if ( (MaxCase+1) / MaxClass >= 1000 )
+    {
+        AvGainWt = 0.0;
+        MDLWt = 0.9;
+    }
+    else
+    {
+        Frac = ((MaxCase+1) / MaxClass - 500) / 500.0;
+
+        AvGainWt = 1 - Frac;
+        MDLWt = 0.9 * Frac;
+    }
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Build a decision tree for the cases Fp through Lp */
+/* */
+/* - if all cases are of the same class, the tree is a leaf labelled */
+/* with this class */
+/* */
+/* - for each attribute, calculate the potential information provided */
+/* by a test on the attribute (based on the probabilities of each */
+/* case having a particular value for the attribute), and the gain */
+/* in information that would result from a test on the attribute */
+/* (based on the probabilities of each case with a particular */
+/* value for the attribute being of a particular class) */
+/* */
+/* - on the basis of these figures, and depending on the current */
+/* selection criterion, find the best attribute to branch on. */
+/* Note: this version will not allow a split on an attribute */
+/* unless two or more subsets have at least MINITEMS cases */
+/* */
+/* - try branching and test whether the resulting tree is better than */
+/* forming a leaf */
+/* */
+/*************************************************************************/
+
+
+void FormTree(CaseNo Fp, CaseNo Lp, int Level, Tree *Result)
+/* -------- */
+{
+    CaseCount Cases=0, TreeErrs=0;
+    Attribute BestAtt;
+    ClassNo c, BestLeaf=1, Least=1;
+    Tree Node;
+    DiscrValue v;
+
+
+    assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);
+
+    /* Make a single pass through the cases to determine class frequencies
+       and value/class frequencies for all discrete attributes */
+
+    FindAllFreq(Fp, Lp);
+
+    /* Choose the best leaf and the least prevalent class */
+
+    ForEach(c, 2, MaxClass)
+    {
+        if ( ClassFreq[c] > ClassFreq[BestLeaf] )
+        {
+            BestLeaf = c;
+        }
+        else
+        if ( ClassFreq[c] > 0.1 && ClassFreq[c] < ClassFreq[Least] )
+        {
+            Least = c;
+        }
+    }
+
+    ForEach(c, 1, MaxClass)
+    {
+        Cases += ClassFreq[c];
+    }
+
+    MaxLeaves = ( LEAFRATIO > 0 ? rint(LEAFRATIO * Cases) : 1E6 );
+
+    *Result = Node =
+        Leaf(ClassFreq, BestLeaf, Cases, Cases - ClassFreq[BestLeaf]);
+
+    Verbosity(1,
+        fprintf(Of, "\n<%d> %d cases", Level, No(Fp,Lp));
+        if ( fabs(No(Fp,Lp) - Cases) >= 0.1 )
+        {
+            fprintf(Of, ", total weight %.1f", Cases);
+        }
+        fprintf(Of, "\n"))
+
+    /* Do not try to split if:
+       - all cases are of the same class
+       - there are not enough cases to split */
+
+    if ( ClassFreq[BestLeaf] >= 0.999 * Cases ||
+         Cases < 2 * MINITEMS ||
+         MaxLeaves < 2 )
+    {
+        if ( Now == FORMTREE ) Progress(Cases);
+        return;
+    }
+
+    /* Calculate base information */
+
+    GlobalBaseInfo = TotalInfo(ClassFreq, 1, MaxClass) / Cases;
+
+    /* Perform preliminary evaluation if using subsampling.
+       Must expect at least 10 of least prevalent class */
+
+    ValThresh = 0;
+    if ( Subsample && No(Fp, Lp) > 5 * MaxClass * SAMPLEUNIT &&
+         (ClassFreq[Least] * MaxClass * SAMPLEUNIT) / No(Fp, Lp) >= 10 )
+    {
+        SampleEstimate(Fp, Lp, Cases);
+        Sampled = true;
+    }
+    else
+    {
+        Sampled = false;
+    }
+
+    BestAtt = ChooseSplit(Fp, Lp, Cases, Sampled);
+
+    /* Decide whether to branch or not */
+
+    if ( BestAtt == None )
+    {
+        Verbosity(1, fprintf(Of, "\tno sensible splits\n"))
+        if ( Now == FORMTREE ) Progress(Cases);
+    }
+    else
+    {
+        Verbosity(1,
+            fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
+            if ( Continuous(BestAtt) )
+            {
+                fprintf(Of, " cut %.3f", Bar[BestAtt]);
+            }
+            fprintf(Of, " inf %.3f gain %.3f val %.3f\n",
+                Info[BestAtt], Gain[BestAtt], Gain[BestAtt] / Info[BestAtt]))
+
+        /* Build a node of the selected test */
+
+        if ( Discrete(BestAtt) )
+        {
+            if ( SUBSET && MaxAttVal[BestAtt] > 3 && ! Ordered(BestAtt) )
+            {
+                SubsetTest(Node, BestAtt);
+            }
+            else
+            {
+                DiscreteTest(Node, BestAtt);
+            }
+        }
+        else
+        {
+            ContinTest(Node, BestAtt);
+        }
+
+        /* Carry out the recursive divide-and-conquer */
+
+        ++Tested[BestAtt];
+
+        Divide(Node, Fp, Lp, Level);
+
+        --Tested[BestAtt];
+
+        /* See whether we would have been no worse off with a leaf */
+
+        ForEach(v, 1, Node->Forks)
+        {
+            TreeErrs += Node->Branch[v]->Errors;
+        }
+
+        if ( TreeErrs >= 0.999 * Node->Errors )
+        {
+            Verbosity(1,
+                fprintf(Of, "<%d> Collapse tree for %d cases to leaf %s\n",
+                    Level, No(Fp,Lp), ClassName[BestLeaf]))
+
+            UnSprout(Node);
+        }
+        else
+        {
+            Node->Errors = TreeErrs;
+        }
+    }
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Estimate Gain[] and Info[] using sample */
+/* */
+/*************************************************************************/
+
+
+void SampleEstimate(CaseNo Fp, CaseNo Lp, CaseCount Cases)
+/* -------------- */
+{
+    CaseNo SLp, SampleSize;
+    CaseCount NewCases;
+    Attribute Att;
+    float GR;
+
+    /* Phase 1: evaluate all discrete attributes and record best GR */
+
+    ForEach(Att, 1, MaxAtt)
+    {
+        Gain[Att] = None;
+
+        if ( Discrete(Att) )
+        {
+            EvalDiscrSplit(Att, Cases);
+
+            if ( Info[Att] > Epsilon &&
+                 (GR = Gain[Att] / Info[Att]) > ValThresh )
+            {
+                ValThresh = GR;
+            }
+        }
+    }
+
+    /* Phase 2: generate sample */
+
+    SampleSize = MaxClass * SAMPLEUNIT;
+    Sample(Fp, Lp, SampleSize);
+    SLp = Fp + SampleSize - 1;
+
+    /* Phase 3: evaluate continuous attributes using sample */
+
+    NewCases = CountCases(Fp, SLp);
+    SampleFrac = NewCases / Cases;
+    NWaiting = 0;
+
+    ForEach(Att, 1, MaxAtt)
+    {
+        if ( Continuous(Att) )
+        {
+            Waiting[NWaiting++] = Att;
+        }
+    }
+
+    ProcessQueue(Fp, SLp, NewCases);
+
+    SampleFrac = 1.0;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Sample N cases from cases Fp through Lp */
+/* */
+/*************************************************************************/
+
+
+void Sample(CaseNo Fp, CaseNo Lp, CaseNo N)
+/* ------ */
+{
+    CaseNo i, j;
+    double Interval;
+
+    Interval = No(Fp, Lp) / (double) N;
+
+    ForEach(i, 0, N-1)
+    {
+        j = (i + 0.5) * Interval;
+
+        assert(j >= 0 && Fp + j <= Lp);
+
+        Swap(Fp + i, Fp + j);
+    }
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Evaluate splits and choose best attribute to split on. */
+/* If Sampled, Gain[] and Info[] have been estimated on */
+/* sample and unlikely candidates are not evaluated on all cases */
+/* */
+/*************************************************************************/
+
+
+Attribute ChooseSplit(CaseNo Fp, CaseNo Lp, CaseCount Cases, Boolean Sampled)
+/* ----------- */
+{
+    Attribute Att;
+    int i, j;
+
+
+    /* For each available attribute, find the information and gain */
+
+    NWaiting = 0;
+
+    if ( Sampled )
+    {
+        /* If samples have been used, do not re-evaluate discrete atts
+           or atts that have low GR */
+
+        for ( Att = MaxAtt ; Att > 0 ; Att-- )
+        {
+            if ( ! Continuous(Att) ) continue;
+
+            if ( EstMaxGR[Att] >= ValThresh )
+            {
+                /* Add attributes in reverse order of estimated max GR */
+
+                for ( i = 0 ;
+                      i < NWaiting && EstMaxGR[Waiting[i]] < EstMaxGR[Att] ;
+                      i++ )
+                    ;
+
+                for ( j = NWaiting-1 ; j >= i ; j-- )
+                {
+                    Waiting[j+1] = Waiting[j];
+                }
+                NWaiting++;
+
+                Waiting[i] = Att;
+            }
+            else
+            {
+                /* Don't use -- attribute hasn't been fully evaluated.
+                   Leave Gain unchanged to get correct count for Possible */
+
+                Info[Att] = -1E6; /* negative so GR also negative */
+            }
+        }
+    }
+    else
+    {
+        for ( Att = MaxAtt ; Att > 0 ; Att-- )
+        {
+            Gain[Att] = None;
+
+            if ( Skip(Att) || Att == ClassAtt )
+            {
+                continue;
+            }
+
+            Waiting[NWaiting++] = Att;
+        }
+    }
+
+    ProcessQueue(Fp, Lp, Cases);
+
+    return FindBestAtt(Cases);
+}
+
+
+
+void ProcessQueue(CaseNo WFp, CaseNo WLp, CaseCount WCases)
+/* ------------ */
+{
+    Attribute Att;
+    float GR;
+
+    for ( ; NWaiting > 0 ; )
+    {
+        Att = Waiting[--NWaiting];
+
+        if ( Discrete(Att) )
+        {
+            EvalDiscrSplit(Att, WCases);
+        }
+        else
+        if ( SampleFrac < 1 )
+        {
+            EstimateMaxGR(Att, WFp, WLp);
+        }
+        else
+        if ( Sampled )
+        {
+            Info[Att] = -1E16;
+
+            if ( EstMaxGR[Att] > ValThresh )
+            {
+                EvalContinuousAtt(Att, WFp, WLp);
+
+                if ( Info[Att] > Epsilon &&
+                     (GR = Gain[Att] / Info[Att]) > ValThresh )
+                {
+                    if ( GR > ValThresh ) ValThresh = GR;
+                }
+            }
+        }
+        else
+        {
+            EvalContinuousAtt(Att, WFp, WLp);
+        }
+    }
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Adjust each attribute's gain to reflect choice and */
+/* select att with maximum GR */
+/* */
+/*************************************************************************/
+
+
+Attribute FindBestAtt(CaseCount Cases)
+/* ----------- */
+{
+    double BestVal, Val, MinGain=1E6, AvGain=0, MDL;
+    Attribute Att, BestAtt, Possible=0;
+    DiscrValue NBr, BestNBr=MaxDiscrVal+1;
+
+    ForEach(Att, 1, MaxAtt)
+    {
+        /* Update the number of possible attributes for splitting and
+           average gain (unless very many values) */
+
+        if ( Gain[Att] >= Epsilon &&
+             ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxCase + 1) ) )
+        {
+            Possible++;
+            AvGain += Gain[Att];
+        }
+        else
+        {
+            Gain[Att] = None;
+        }
+    }
+
+    /* Set threshold on minimum gain */
+
+    if ( ! Possible ) return None;
+
+    AvGain /= Possible;
+    MDL = Log(Possible) / Cases;
+    MinGain = AvGain * AvGainWt + MDL * MDLWt;
+
+    Verbosity(2,
+        fprintf(Of, "\tav gain=%.3f, MDL (%d) = %.3f, min=%.3f\n",
+            AvGain, Possible, MDL, MinGain))
+
+    /* Find best attribute according to Gain Ratio criterion subject
+       to threshold on minimum gain */
+
+    BestVal = -Epsilon;
+    BestAtt = None;
+
+    ForEach(Att, 1, MaxAtt)
+    {
+        if ( Gain[Att] >= 0.999 * MinGain && Info[Att] > 0 )
+        {
+            Val = Gain[Att] / Info[Att];
+            NBr = ( MaxAttVal[Att] <= 3 || Ordered(Att) ? 3 :
+                    SUBSET ? Subsets[Att] : MaxAttVal[Att] );
+
+            if ( Val > BestVal ||
+                 Val > 0.999 * BestVal &&
+                 ( NBr < BestNBr ||
+                   NBr == BestNBr && Gain[Att] > Gain[BestAtt] ) )
+            {
+                BestAtt = Att;
+                BestVal = Val;
+                BestNBr = NBr;
+            }
+        }
+    }
+
+    return BestAtt;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Evaluate split on Att */
+/* */
+/*************************************************************************/
+
+
+void EvalDiscrSplit(Attribute Att, CaseCount Cases)
+/* -------------- */
+{
+    DiscrValue v, NBr;
+
+    Gain[Att] = None;
+
+    if ( Skip(Att) || Att == ClassAtt ) return;
+
+    if ( Ordered(Att) )
+    {
+        EvalOrderedAtt(Att, Cases);
+        NBr = ( GEnv.ValFreq[1] > 0.5 ? 3 : 2 );
+    }
+    else
+    if ( SUBSET && MaxAttVal[Att] > 3 )
+    {
+        EvalSubset(Att, Cases);
+        NBr = Subsets[Att];
+    }
+    else
+    if ( ! Tested[Att] )
+    {
+        EvalDiscreteAtt(Att, Cases);
+
+        NBr = 0;
+        ForEach(v, 1, MaxAttVal[Att])
+        {
+            if ( GEnv.ValFreq[v] > 0.5 ) NBr++;
+        }
+    }
+    else
+    {
+        NBr = 0;
+    }
+
+    /* Check that this test will not give too many leaves */
+
+    if ( NBr > MaxLeaves + 1 )
+    {
+        Verbosity(2,
+            fprintf(Of, "\t(cancelled -- %d leaves, max %d)\n", NBr, MaxLeaves))
+
+        Gain[Att] = None;
+    }
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Form the subtrees for the given node */
+/* */
+/*************************************************************************/
+
+
+void Divide(Tree T, CaseNo Fp, CaseNo Lp, int Level)
+/* ------ */
+{
+    CaseNo Bp, Ep, Missing, Cases, i;
+    CaseCount KnownCases, MissingCases, BranchCases;
+    Attribute Att;
+    double Factor;
+    DiscrValue v;
+    Boolean PrevUnitWeights;
+
+    PrevUnitWeights = UnitWeights;
+
+    Att = T->Tested;
+    Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
+
+    KnownCases = T->Cases - (MissingCases = CountCases(Fp, Ep));
+
+    if ( Missing )
+    {
+        UnitWeights = false;
+
+        /* If using costs, must adjust branch factors to undo effects of
+           reweighting cases */
+
+        if ( CostWeights )
+        {
+            KnownCases = SumNocostWeights(Ep+1, Lp);
+        }
+
+        /* If there are many cases with missing values and many branches,
+           skip cases whose weight < 0.1 */
+
+        if ( (Cases = No(Fp,Lp)) > 1000 &&
+             Missing > 0.5 * Cases &&
+             T->Forks >= 10 )
+        {
+            ForEach(i, Fp, Ep)
+            {
+                if ( Weight(Case[i]) < 0.1 )
+                {
+                    Missing--;
+                    MissingCases -= Weight(Case[i]);
+                    Swap(Fp, i);
+                    Fp++;
+                }
+            }
+
+            assert(Missing >= 0);
+        }
+    }
+
+    Bp = Fp;
+    ForEach(v, 1, T->Forks)
+    {
+        Ep = Group(v, Bp + Missing, Lp, T);
+
+        assert(Bp + Missing <= Lp+1 && Ep <= Lp);
+
+        /* Bp -> first value in missing + remaining values
+           Ep -> last value in missing + current group */
+
+        BranchCases = CountCases(Bp + Missing, Ep);
+
+        Factor = ( ! Missing ? 0 :
+                   ! CostWeights ? BranchCases / KnownCases :
+                   SumNocostWeights(Bp + Missing, Ep) / KnownCases );
+
+        if ( BranchCases + Factor * MissingCases >= MinLeaf )
+        {
+            if ( Missing )
+            {
+                /* Adjust weights of cases with missing values */
+
+                ForEach(i, Bp, Bp + Missing - 1)
+                {
+                    Weight(Case[i]) *= Factor;
+                }
+            }
+
+            FormTree(Bp, Ep, Level+1, &T->Branch[v]);
+
+            /* Restore weights if changed */
+
+            if ( Missing )
+            {
+                for ( i = Ep ; i >= Bp ; i-- )
+                {
+                    if ( Unknown(Case[i], Att) )
+                    {
+                        Weight(Case[i]) /= Factor;
+                        Swap(i, Ep);
+                        Ep--;
+                    }
+                }
+            }
+
+            Bp = Ep+1;
+        }
+        else
+        {
+            T->Branch[v] = Leaf(Nil, T->Leaf, 0.0, 0.0);
+        }
+    }
+
+    UnitWeights = PrevUnitWeights;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Group together the cases corresponding to branch V of a test */
+/* and return the index of the last such */
+/* */
+/* Note: if V equals zero, group the unknown values */
+/* */
+/*************************************************************************/
+
+
+CaseNo Group(DiscrValue V, CaseNo Bp, CaseNo Ep, Tree TestNode)
+/* ----- */
+{
+    CaseNo i;
+    Attribute Att;
+    ContValue Thresh;
+    Set SS;
+
+    Att = TestNode->Tested;
+
+    if ( ! V )
+    {
+        /* Group together unknown values (if any) */
+
+        if ( SomeMiss[Att] )
+        {
+            ForEach(i, Bp, Ep)
+            {
+                if ( Unknown(Case[i], Att) )
+                {
+                    Swap(Bp, i);
+                    Bp++;
+                }
+            }
+        }
+    }
+    else /* skip non-existant N/A values */
+    if ( V != 1 || TestNode->NodeType == BrSubset || SomeNA[Att] )
+    {
+        /* Group cases on the value of attribute Att, and depending
+           on the type of branch */
+
+        switch ( TestNode->NodeType )
+        {
+            case BrDiscr:
+
+                ForEach(i, Bp, Ep)
+                {
+                    if ( DVal(Case[i], Att) == V )
+                    {
+                        Swap(Bp, i);
+                        Bp++;
+                    }
+                }
+                break;
+
+            case BrThresh:
+
+                Thresh = TestNode->Cut;
+                ForEach(i, Bp, Ep)
+                {
+                    if ( V == 1 ? NotApplic(Case[i], Att) :
+                         (CVal(Case[i], Att) <= Thresh) == (V == 2) )
+                    {
+                        Swap(Bp, i);
+                        Bp++;
+                    }
+                }
+                break;
+
+            case BrSubset:
+
+                SS = TestNode->Subset[V];
+                ForEach(i, Bp, Ep)
+                {
+                    if ( In(XDVal(Case[i], Att), SS) )
+                    {
+                        Swap(Bp, i);
+                        Bp++;
+                    }
+                }
+                break;
+        }
+    }
+
+    return Bp - 1;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Return the total weight of cases from Fp to Lp */
+/* */
+/*************************************************************************/
+
+
+CaseCount SumWeights(CaseNo Fp, CaseNo Lp)
+/* ---------- */
+{
+    double Sum=0.0;
+    CaseNo i;
+
+    assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);
+
+    ForEach(i, Fp, Lp)
+    {
+        Sum += Weight(Case[i]);
+    }
+
+    return Sum;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Special version to undo the weightings associated with costs */
+/* */
+/*************************************************************************/
+
+
+CaseCount SumNocostWeights(CaseNo Fp, CaseNo Lp)
+/* ---------------- */
+{
+    double Sum=0.0;
+    CaseNo i;
+
+    assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);
+
+    ForEach(i, Fp, Lp)
+    {
+        Sum += Weight(Case[i]) / WeightMul[Class(Case[i])];
+    }
+
+    return Sum;
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Generate class frequency distribution */
+/* */
+/*************************************************************************/
+
+
+void FindClassFreq(double *CF, CaseNo Fp, CaseNo Lp)
+/* ------------- */
+{
+    ClassNo c;
+    CaseNo i;
+
+    assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);
+
+    ForEach(c, 0, MaxClass)
+    {
+        CF[c] = 0;
+    }
+
+    ForEach(i, Fp, Lp)
+    {
+        assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
+
+        CF[ Class(Case[i]) ] += Weight(Case[i]);
+    }
+}
+
+
+
+/*************************************************************************/
+/* */
+/* Find all discrete frequencies */
+/* */
+/*************************************************************************/
+
+
+void FindAllFreq(CaseNo Fp, CaseNo Lp)
+/* ----------- */
+{
+    ClassNo c;
+    CaseNo i;
+    Attribute Att, a;
+    CaseCount w;
+    int x;
+
+    /* Zero all values */
+
+    ForEach(c, 0, MaxClass)
+    {
+        ClassFreq[c] = 0;
+    }
+
+    for ( a = 0 ; a < NDList ; a++ )
+    {
+        Att = DList[a];
+        for ( x = MaxClass * (MaxAttVal[Att]+1) - 1 ; x >= 0 ; x-- )
+        {
+            DFreq[Att][x] = 0;
+        }
+    }
+
+    /* Scan cases */
+
+    ForEach(i, Fp, Lp)
+    {
+        ClassFreq[ (c=Class(Case[i])) ] += (w=Weight(Case[i]));
+
+        for ( a = 0 ; a < NDList ; a++ )
+        {
+            Att = DList[a];
+            DFreq[Att][ MaxClass * XDVal(Case[i], Att) + (c-1) ] += w;
+        }
+    }
+}
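The split selection in this file (SetMinGainThresh and FindBestAtt) comes down to the gain-ratio criterion with a minimum-gain floor. As a reading aid only -- the following sketch is not part of this gem or of the C5.0 sources, and the attribute/class frequencies in it are invented -- the core computation for one discrete attribute looks like this:

/* Standalone sketch of the gain-ratio computation that FindBestAtt
   applies: gain = class entropy - entropy after the split,
   gain ratio = gain / split info.  Build with: cc sketch.c -lm */
#include <math.h>
#include <stdio.h>

static double Entropy(const double *n, int k, double total)
{
    double e = 0.0;
    for (int i = 0; i < k; i++)
    {
        if (n[i] > 0) e -= (n[i] / total) * log2(n[i] / total);
    }
    return e;
}

int main(void)
{
    /* Two classes, one discrete attribute with three values.
       freq[v][c] = total weight of cases with value v and class c. */
    double freq[3][2] = { {20, 5}, {10, 15}, {5, 25} };
    double classTotal[2] = {35, 45}, total = 80;

    double baseInfo = Entropy(classTotal, 2, total);

    double condInfo = 0, splitInfo = 0;
    for (int v = 0; v < 3; v++)
    {
        double nv = freq[v][0] + freq[v][1];
        condInfo  += (nv / total) * Entropy(freq[v], 2, nv);
        splitInfo -= (nv / total) * log2(nv / total);
    }

    double gain = baseInfo - condInfo;
    double gr   = ( splitInfo > 0 ? gain / splitInfo : 0 );

    printf("gain %.3f  split info %.3f  gain ratio %.3f\n",
           gain, splitInfo, gr);
    return 0;
}

The actual FindBestAtt above additionally discards attributes whose gain falls below the MinGain threshold interpolated in SetMinGainThresh, prefers tests with fewer branches when gain ratios are nearly equal, and defers to the subset, ordered-attribute, sampling and missing-value handling shown elsewhere in the diff.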