see5-installer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
@@ -0,0 +1,342 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of GritBot GPL Edition, a single-threaded version */
|
6
|
+
/* of GritBot release 2.01. */
|
7
|
+
/* */
|
8
|
+
/* GritBot GPL Edition is free software: you can redistribute it */
|
9
|
+
/* and/or modify it under the terms of the GNU General Public License */
|
10
|
+
/* as published by the Free Software Foundation, either version 3 of */
|
11
|
+
/* the License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* GritBot GPL Edition is distributed in the hope that it will be */
|
14
|
+
/* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
|
15
|
+
/* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
16
|
+
/* GNU General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with GritBot GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Routines to manage clusters */
|
30
|
+
/* --------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
|
39
|
+
/*************************************************************************/
|
40
|
+
/* */
|
41
|
+
/* Register a new cluster */
|
42
|
+
/* */
|
43
|
+
/*************************************************************************/
|
44
|
+
|
45
|
+
|
46
|
+
/*  Create a new cluster record, append it to the global Cluster table
    and return it.

    Expect, SD, Limit and GpSize are stored unchanged in the record;
    Anoms is used only to derive Frac = 1 - Anoms / GpSize.  The tested
    attribute is taken from ClassAtt and the cluster's conditions are
    filled in by SaveClustConds().  */

Clust NewClust(ContValue Expect, ContValue SD, ContValue Limit,
               CaseCount Anoms, CaseCount GpSize)
{
    Clust Fresh;

    /*  Extend the table by 1000 slots whenever it is full  */

    if ( NClust >= ClustSpace ) {
        Realloc(Cluster, (ClustSpace += 1000), Clust);
    }

    /*  Allocate the record and register it in the next free slot  */

    Fresh = Alloc(1, ClustRec);
    Cluster[NClust] = Fresh;
    NClust++;

    /*  Record the summary information for this cluster  */

    Fresh->Att    = ClassAtt;
    Fresh->GpSize = GpSize;
    Fresh->Frac   = 1 - Anoms / (float) GpSize;
    Fresh->Expect = Expect;
    Fresh->SD     = SD;
    Fresh->Limit  = Limit;

    /*  Capture the conditions implied by the tests currently in force  */

    SaveClustConds(Fresh);

    return Fresh;
}
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
/*************************************************************************/
|
78
|
+
/* */
|
79
|
+
/* Process current tests to determine cluster conditions. */
|
80
|
+
/* Different functions are called depending on the condition */
|
81
|
+
/* type. */
|
82
|
+
/* */
|
83
|
+
/*************************************************************************/
|
84
|
+
|
85
|
+
|
86
|
+
/*  Build the condition list of cluster C from the current test
    environment.

    One ClustCond is allocated for every attribute with a non-zero
    GEnv.Tested entry; each is filled in by the formatter matching the
    attribute's kind (continuous, ordered, subset or single value).  */

void SaveClustConds(Clust C)
{
    Attribute Att, Count=0;
    int       Slot=0;
    ClustCond *Dest;

    /*  First pass: how many attributes are currently tested?  */

    ForEach(Att, 1, MaxAtt) {
        if ( GEnv.Tested[Att] ) Count++;
    }

    C->NCond = Count;
    C->Cond  = Alloc(C->NCond, ClustCond);

    /*  Second pass: fill one condition per tested attribute, in
        attribute order  */

    ForEach(Att, 1, MaxAtt) {
        if ( ! GEnv.Tested[Att] ) continue;

        Dest = &C->Cond[Slot];

        if ( Continuous(Att) ) {
            FormatContinCond(Att, Dest);
        }
        else if ( Ordered(Att) ) {
            FormatOrderedCond(Att, Dest);
        }
        else if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 ) {
            /*  Subset conditions are used only with a continuous class
                attribute and more than three attribute values  */

            FormatSubsetCond(Att, Dest);
        }
        else {
            FormatValCond(Att, Dest);
        }

        Slot++;
    }
}
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
/*************************************************************************/
|
135
|
+
/* */
|
136
|
+
/* Test on a continuous att. Check all threshold tests */
|
137
|
+
/* and assemble lowest and highest possible value */
|
138
|
+
/* */
|
139
|
+
/*************************************************************************/
|
140
|
+
|
141
|
+
|
142
|
+
/*  Summarise all threshold tests on continuous attribute Att into *CC.

    Tests 0..GEnv.Level are scanned.  A test with branch 1 marks the
    condition CONT_NA (Low=1, High=0) and ends the scan; branch 2 adds
    CONT_LT and sets the upper bound to the test's cut; any other branch
    adds CONT_GT and sets the lower bound to the cut.  Later tests on the
    same attribute overwrite earlier bounds.  */

void FormatContinCond(Attribute Att, ClustCond *CC)
{
    ContValue Lower=-MARKER, Upper=MARKER;
    int       t, Kind=0;

    ForEach(t, 0, GEnv.Level) {
        if ( GEnv.Test[t].Att != Att ) continue;

        if ( GEnv.Test[t].Br == 1 ) {
            /*  Branch 1 overrides whatever was gathered so far  */

            Kind  = CONT_NA;
            Lower = 1;
            Upper = 0;
            break;
        }

        if ( GEnv.Test[t].Br == 2 ) {
            Kind |= CONT_LT;
            Upper = GEnv.Test[t].Cut;
        }
        else {
            Kind |= CONT_GT;
            Lower = GEnv.Test[t].Cut;
        }
    }

    CC->Att    = Att;
    CC->Type   = Kind;
    CC->Low    = Lower;
    CC->High   = Upper;
    CC->Values = Nil;
}
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
/*************************************************************************/
|
183
|
+
/* */
|
184
|
+
/* Test on an ordered discrete attribute (similar to above) */
|
185
|
+
/* */
|
186
|
+
/*************************************************************************/
|
187
|
+
|
188
|
+
|
189
|
+
/*  Summarise all tests on ordered discrete attribute Att into *CC.

    The range starts as 2..MaxAttVal[Att].  Scanning tests
    0..GEnv.Level: branch 1 yields a DISCR_VAL condition with
    Low=High=1 and ends the scan; branch 2 adds DISCR_LT and sets the
    upper value to the cut; any other branch adds DISCR_GT and sets the
    lower value to cut+1.  */

void FormatOrderedCond(Attribute Att, ClustCond *CC)
{
    DiscrValue First=2, Last=MaxAttVal[Att];
    int        t, Kind=0;

    ForEach(t, 0, GEnv.Level) {
        if ( GEnv.Test[t].Att != Att ) continue;

        if ( GEnv.Test[t].Br == 1 ) {
            /*  Branch 1 pins the condition to value 1  */

            Kind  = DISCR_VAL;
            First = Last = 1;
            break;
        }

        if ( GEnv.Test[t].Br == 2 ) {
            Kind |= DISCR_LT;
            Last  = GEnv.Test[t].Cut;
        }
        else {
            Kind |= DISCR_GT;
            First = GEnv.Test[t].Cut + 1;
        }
    }

    CC->Att    = Att;
    CC->Type   = Kind;
    CC->Low    = First;
    CC->High   = Last;
    CC->Values = Nil;
}
|
228
|
+
|
229
|
+
|
230
|
+
|
231
|
+
/*************************************************************************/
|
232
|
+
/* */
|
233
|
+
/* Subset test for a discrete attribute. All tests must be */
|
234
|
+
/* checked to determine the final subset values */
|
235
|
+
/* */
|
236
|
+
/*************************************************************************/
|
237
|
+
|
238
|
+
|
239
|
+
/*  Build a DISCR_SET condition for discrete attribute Att in *CC.

    GEnv.Possible[] is used as scratch space: value 1 starts excluded and
    values 2..MaxAttVal[Att] start included.  Each test on Att among
    tests 0..GEnv.Level then narrows the set: branch 1 leaves only value
    1 and ends the scan; otherwise a value survives only if it lies in
    the test's Left set and the branch is 2, or lies outside it and the
    branch is 3.  The surviving values are stored as a freshly allocated
    bit set in CC->Values.  */

void FormatSubsetCond(Attribute Att, ClustCond *CC)
{
    DiscrValue v, Wanted;
    int        t;

    CC->Att = Att;

    /*  Initial candidate set: every value from 2 upwards  */

    GEnv.Possible[1] = false;
    ForEach(v, 2, MaxAttVal[Att]) {
        GEnv.Possible[v] = true;
    }

    ForEach(t, 0, GEnv.Level) {
        if ( GEnv.Test[t].Att != Att ) continue;

        if ( GEnv.Test[t].Br == 1 ) {
            /*  Only value 1 remains possible  */

            GEnv.Possible[1] = true;
            ForEach(v, 2, MaxAttVal[Att]) {
                GEnv.Possible[v] = false;
            }
            break;
        }

        ForEach(v, 2, MaxAttVal[Att]) {
            /*  Branch that a case with value v would follow  */

            Wanted = ( In(v, GEnv.Test[t].Left) ? 2 : 3 );

            GEnv.Possible[v] =
                ( GEnv.Possible[v] && GEnv.Test[t].Br == Wanted );
        }
    }

    CC->Type   = DISCR_SET;
    CC->Low    = CC->High = 0;
    CC->Values = AllocZero((MaxAttVal[Att]>>3)+1, unsigned char);

    /*  Transfer the surviving values to the bit set  */

    ForEach(v, 1, MaxAttVal[Att]) {
        if ( GEnv.Possible[v] ) SetBit(v, CC->Values);
    }
}
|
290
|
+
|
291
|
+
|
292
|
+
|
293
|
+
/*************************************************************************/
|
294
|
+
/* */
|
295
|
+
/* Simple test on attribute value. There is no need to check */
|
296
|
+
/* more than one test since the first determines the tested value */
|
297
|
+
/* */
|
298
|
+
/*************************************************************************/
|
299
|
+
|
300
|
+
|
301
|
+
/*  Build a single-value (DISCR_VAL) condition for attribute Att in *CC.

    Only the first test on Att among tests 0..GEnv.Level is used; its
    branch number becomes both Low and High.  If no test mentions Att,
    *CC is left untouched.  */

void FormatValCond(Attribute Att, ClustCond *CC)
{
    int t;

    ForEach(t, 0, GEnv.Level) {
        if ( GEnv.Test[t].Att != Att ) continue;

        /*  First matching test decides the value; stop here  */

        CC->Att    = Att;
        CC->Type   = DISCR_VAL;
        CC->Low    = CC->High = GEnv.Test[t].Br;
        CC->Values = Nil;
        return;
    }
}
|
318
|
+
|
319
|
+
|
320
|
+
|
321
|
+
/*************************************************************************/
|
322
|
+
/* */
|
323
|
+
/* Free conditions stored in a cluster */
|
324
|
+
/* */
|
325
|
+
/*************************************************************************/
|
326
|
+
|
327
|
+
|
328
|
+
/*  Release cluster C together with its condition array and any value
    sets attached to individual conditions.  A Nil cluster is ignored.  */

void FreeClust(Clust C)
{
    int k;

    if ( ! C ) return;

    /*  Value sets first, then the array that holds them  */

    ForEach(k, 0, C->NCond-1) {
        FreeUnlessNil(C->Cond[k].Values);
    }

    FreeUnlessNil(C->Cond);
    Free(C);
}
|
@@ -0,0 +1,1269 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of GritBot GPL Edition, a single-threaded version */
|
6
|
+
/* of GritBot release 2.01. */
|
7
|
+
/* */
|
8
|
+
/* GritBot GPL Edition is free software: you can redistribute it */
|
9
|
+
/* and/or modify it under the terms of the GNU General Public License */
|
10
|
+
/* as published by the Free Software Foundation, either version 3 of */
|
11
|
+
/* the License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* GritBot GPL Edition is distributed in the hope that it will be */
|
14
|
+
/* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
|
15
|
+
/* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
16
|
+
/* GNU General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with GritBot GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
#ifndef INSPECT
|
28
|
+
/*************************************************************************/
|
29
|
+
/* */
|
30
|
+
/* Divide-and-Conquer generic routines */
|
31
|
+
/* ----------------------------------- */
|
32
|
+
/* */
|
33
|
+
/*************************************************************************/
|
34
|
+
|
35
|
+
|
36
|
+
#include "defns.i"
|
37
|
+
#include "extern.i"
|
38
|
+
|
39
|
+
|
40
|
+
/*************************************************************************/
|
41
|
+
/* */
|
42
|
+
/* Allocate space for all tables */
|
43
|
+
/* */
|
44
|
+
/*************************************************************************/
|
45
|
+
|
46
|
+
|
47
|
+
void InitialiseDAC()
|
48
|
+
/* ------------- */
|
49
|
+
{
|
50
|
+
DiscrValue v;
|
51
|
+
Attribute Att;
|
52
|
+
CaseNo i, MaxSampleSize;
|
53
|
+
extern SortPair *Pair;
|
54
|
+
|
55
|
+
UseLogs = AllocZero(MaxAtt+1, Boolean);
|
56
|
+
SomeMiss = AllocZero(MaxAtt+1, Boolean);
|
57
|
+
SomeNA = AllocZero(MaxAtt+1, Boolean);
|
58
|
+
LowTail = AllocZero(MaxAtt+1, ContValue);
|
59
|
+
HighTail = AllocZero(MaxAtt+1, ContValue);
|
60
|
+
|
61
|
+
LogCaseNo = Alloc(MaxCase+2, double);
|
62
|
+
LogCaseNo[0] = LogCaseNo[1] = 0;
|
63
|
+
ForEach(i, 2, MaxCase+1)
|
64
|
+
{
|
65
|
+
LogCaseNo[i] = log((double) i) / Log2;
|
66
|
+
}
|
67
|
+
|
68
|
+
/* Save random numbers for sampling */
|
69
|
+
|
70
|
+
MaxSampleSize = SAMPLEUNIT * Max(MaxDiscrVal, 5);
|
71
|
+
if ( MaxSampleSize > (MaxCase+1) / 2 + 1 )
|
72
|
+
{
|
73
|
+
MaxSampleSize = (MaxCase+1) / 2 + 1;
|
74
|
+
}
|
75
|
+
|
76
|
+
Rand = Alloc(MaxSampleSize, double);
|
77
|
+
ResetKR(1230);
|
78
|
+
ForEach(i, 0, MaxSampleSize-1)
|
79
|
+
{
|
80
|
+
Rand[i] = KRandom();
|
81
|
+
}
|
82
|
+
|
83
|
+
/* Compute prior probabilities for discrete values */
|
84
|
+
|
85
|
+
Prior = Alloc(MaxAtt+1, double *);
|
86
|
+
ForEach(Att, 1, MaxAtt)
|
87
|
+
{
|
88
|
+
if ( Discrete(Att) )
|
89
|
+
{
|
90
|
+
Prior[Att] = AllocZero(MaxAttVal[Att]+1, double);
|
91
|
+
|
92
|
+
ForEach(i, 0, MaxCase)
|
93
|
+
{
|
94
|
+
Prior[Att][XDVal(Case[i], Att)]++;
|
95
|
+
}
|
96
|
+
|
97
|
+
SomeMiss[Att] = ( Prior[Att][0] > 0 );
|
98
|
+
SomeNA[Att] = ( Prior[Att][1] > 0 );
|
99
|
+
|
100
|
+
ForEach(v, 0, MaxAttVal[Att])
|
101
|
+
{
|
102
|
+
Prior[Att][v] /= (double) (MaxCase+1);
|
103
|
+
}
|
104
|
+
}
|
105
|
+
}
|
106
|
+
|
107
|
+
/* Determine precision for continuous attributes */
|
108
|
+
|
109
|
+
Prec = AllocZero(MaxAtt+1, unsigned char);
|
110
|
+
ForEach(Att, 1, MaxAtt)
|
111
|
+
{
|
112
|
+
if ( ! Exclude(Att) && Continuous(Att) )
|
113
|
+
{
|
114
|
+
Prec[Att] = log(FracBase(Att)) / log(10.0) + 0.5;
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
Pair = Alloc(MaxCase+1, SortPair);
|
119
|
+
|
120
|
+
InitialiseEnvData();
|
121
|
+
}
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
void FreeDAC()
|
126
|
+
/* ------- */
|
127
|
+
{
|
128
|
+
extern SortPair *Pair;
|
129
|
+
|
130
|
+
FreeEnvData();
|
131
|
+
|
132
|
+
FreeUnlessNil(UseLogs); UseLogs = Nil;
|
133
|
+
FreeUnlessNil(SomeMiss); SomeMiss = Nil;
|
134
|
+
FreeUnlessNil(SomeNA); SomeNA = Nil;
|
135
|
+
FreeUnlessNil(LowTail); LowTail = Nil;
|
136
|
+
FreeUnlessNil(HighTail); HighTail = Nil;
|
137
|
+
FreeUnlessNil(LogCaseNo); LogCaseNo = Nil;
|
138
|
+
FreeUnlessNil(Rand); Rand = Nil;
|
139
|
+
FreeVector((void **) Prior, 1, MaxAtt); Prior = Nil;
|
140
|
+
FreeUnlessNil(Prec); Prec = Nil;
|
141
|
+
|
142
|
+
FreeUnlessNil(Pair); Pair = Nil;
|
143
|
+
}
|
144
|
+
|
145
|
+
|
146
|
+
|
147
|
+
/*************************************************************************/
|
148
|
+
/* */
|
149
|
+
/* Split cases Fp through Lp */
|
150
|
+
/* CondAtts is the current number of conditioning attributes */
|
151
|
+
/* */
|
152
|
+
/*************************************************************************/
|
153
|
+
|
154
|
+
|
155
|
+
/*  Try to split cases Fp..Lp below branch Br of Parent.

    *Result is Nil on return when no node is created: that happens when
    there are fewer than twice the minimum number of cases (CMINITEMS for
    a continuous class attribute, DMINITEMS otherwise) or, for a discrete
    class attribute, when all cases share one class value.  Otherwise a
    node obtained from Leaf(Parent, Br) is returned through *Result; if a
    best attribute is found the node is turned into a threshold, subset
    or discrete branch on it and the cases are passed to Divide().

    CondAtts is the current number of conditioning attributes; it is
    incremented before dividing when the chosen attribute has not been
    tested on the path to this node.  */

void Split(CaseNo Fp, CaseNo Lp, int CondAtts, Tree Parent, DiscrValue Br,
           Tree *Result)
{
    CaseNo     Cp;
    CaseCount  Cases;
    DiscrValue c;
    double     ClassVal, BestVal, Sum=0, SumSq=0;
    Attribute  Att, BestAtt;
    Tree       Node;
    int        SetBytes;

    *Result = Nil;

    /*  Rebuild the test context (level, tests, tested counts) for the
        path leading to this point  */

    ForEach(Att, 1, MaxAtt) {
        GEnv.Tested[Att] = 0;
    }

    RecoverContext(Parent, Br);

    Cases = No(Fp, Lp);
    Verbosity(1,
	fprintf(Of, "\n<%d> %d cases %d-%d\n", GEnv.Level, Cases, Fp, Lp);
	ShowContext(Fp))

    GEnv.FRAC = 1;

    /*  Determine PSD or base information.  This is only approximate,
        since missing values of the tested attributes are excluded  */

    if ( Continuous(ClassAtt) ) {
        if ( Cases < 2 * CMINITEMS ) {
            Progress(Cases);
            return;
        }

        ForEach(Cp, Fp, Lp) {
            ClassVal = CClass(Case[Cp]);
            Sum     += ClassVal;
            SumSq   += ClassVal * ClassVal;
        }

        GEnv.PSD = SDEstimate(Cases, Sum, SumSq);
    }
    else {
        if ( Cases < 2 * DMINITEMS ) {
            Progress(Cases);
            return;
        }

        /*  Nothing to do if every case has the same class value  */

        FindClassFrequencies(Fp, Lp);

        ForEach(c, 1, MaxAttVal[ClassAtt]) {
            if ( GEnv.ClassFreq[c] == Cases ) {
                Verbosity(1, fprintf(Of, "\tpure subset\n"))
                Progress(Cases);
                return;
            }
        }

        GEnv.BaseInfo =
            TotalInfo(GEnv.ClassFreq, 1, MaxAttVal[ClassAtt]) / Cases;
    }

    Node = Leaf(Parent, Br);
    *Result = Node;

    /*  Gather discrete-attribute statistics, then evaluate candidate
        splits -- with sampling when there are more than
        SAMPLEFACTOR * SampleSize cases  */

    DiscreteAttInfo(Fp, Lp, CondAtts);

    if ( Cases > SAMPLEFACTOR * SampleSize ) {
        ChooseSplitWithSampling(Fp, Lp, CondAtts);
    }
    else {
        ChooseSplit(Fp, Lp, CondAtts);
    }

    /*  Keep a copy of any sift entry produced and mark it consumed  */

    if ( SIFT && GEnv.SiftEntry && GEnv.SiftSize ) {
        Node->SiftEntry = strdup(GEnv.SiftEntry);
        GEnv.SiftSize = 0;
    }

    FindBestAtt(&BestAtt, &BestVal);

    /*  Without a usable attribute the node stays as it is  */

    if ( BestAtt == None ) {
        Verbosity(1, fprintf(Of, "\tno sensible splits\n"))

        Progress(Cases);
        return;
    }

    Verbosity(1,
	fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
	if ( Continuous(BestAtt) )
	{
	    fprintf(Of, " cut %.3f", GEnv.Bar[BestAtt]);
	}
	if ( ! Continuous(ClassAtt) )
	{
	    fprintf(Of, " val %.3f inf %.3f",
		    SplitVal(GEnv.Gain[BestAtt], GEnv.Info[BestAtt]),
		    GEnv.Info[BestAtt]);
	}
	fprintf(Of, " gain %.3f\n", GEnv.Gain[BestAtt]);)

    /*  Describe the test at this node  */

    Node->Tested = BestAtt;

    if ( Continuous(BestAtt) || Ordered(BestAtt) ) {
        Node->NodeType = BrThresh;
        Node->Forks    = 3;
        Node->Cut      = GEnv.Bar[BestAtt];
    }
    else if ( Continuous(ClassAtt) && MaxAttVal[BestAtt] > 3 ) {
        /*  The node keeps its own copy of the chosen value subset  */

        SetBytes = (MaxAttVal[BestAtt]>>3)+1;

        Node->NodeType = BrSubset;
        Node->Forks    = 3;
        Node->Left     = Alloc(SetBytes, unsigned char);
        memcpy(Node->Left, GEnv.Subset[BestAtt], SetBytes);
    }
    else {
        Node->NodeType = BrDiscr;
        Node->Forks    = MaxAttVal[BestAtt];
    }

    Node->Branch = Alloc(Node->Forks+1, Tree);

    /*  A newly tested attribute adds one conditioning attribute  */

    if ( ! GEnv.Tested[BestAtt] ) CondAtts++;

    /*  Carry out the recursive divide-and-conquer  */

    Divide(Node, Fp, Lp, CondAtts);
}
|
312
|
+
|
313
|
+
|
314
|
+
|
315
|
+
/*************************************************************************/
|
316
|
+
/* */
|
317
|
+
/* Recover information on level and tests from tree and parent */
|
318
|
+
/* */
|
319
|
+
/*************************************************************************/
|
320
|
+
|
321
|
+
|
322
|
+
/*  Rebuild the test context for branch Br of node T by walking up to
    the root.

    At the top (T is Nil) GEnv.Level is reset to 0.  On the way back
    down, each node's test is recorded with NoteTest() using the branch
    taken from it, the node's attribute has its GEnv.Tested count
    incremented, and GEnv.Level is incremented.  */

void RecoverContext(Tree T, DiscrValue Br)
{
    if ( ! T ) {
        GEnv.Level = 0;
        return;
    }

    /*  Ancestors first, so tests are noted in root-to-leaf order  */

    RecoverContext(T->Parent, T->Br);

    NoteTest(T->Tested, Br, T->Cut, T->Left);
    GEnv.Tested[T->Tested]++;
    GEnv.Level++;
}
|
338
|
+
|
339
|
+
|
340
|
+
|
341
|
+
/*************************************************************************/
|
342
|
+
/* */
|
343
|
+
/* Analyse all discrete attributes in one pass */
|
344
|
+
/* */
|
345
|
+
/*************************************************************************/
|
346
|
+
|
347
|
+
|
348
|
+
/*  Collect statistics for all usable discrete attributes in a single
    pass over cases Fp..Lp.

    An attribute is skipped if it is not discrete, is excluded, is the
    class attribute, or if CondAtts has reached MAXCONDATTS and the
    attribute has not yet been tested.  For a continuous class attribute
    the per-value frequency (DFreq[Att][v][0]), sum and sum of squares
    of the class value are accumulated; otherwise per-value class
    frequencies DFreq[Att][v][class] are accumulated.  */

void DiscreteAttInfo(CaseNo Fp, CaseNo Lp, int CondAtts)
{
    CaseNo     Cp;
    DiscrValue v, c;
    Attribute  Att;
    double     ClassVal;
    int        NDList=0, Last, d;

    /*  Select the attributes to analyse and clear their counters  */

    ForEach(Att, 1, MaxAtt) {
        if ( ! Discrete(Att) || Exclude(Att) || Att == ClassAtt ||
             ( CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] ) ) {
            continue;
        }

        GEnv.DList[NDList++] = Att;

        if ( Continuous(ClassAtt) ) {
            ForEach(v, 0, MaxAttVal[Att]) {
                GEnv.DValSum[Att][v] = GEnv.DValSumSq[Att][v] = 0;
                GEnv.DFreq[Att][v][0] = 0;	/* value frequency */
            }
        }
        else {
            ForEach(v, 0, MaxAttVal[Att]) {
                ForEach(c, 1, MaxAttVal[ClassAtt]) {
                    GEnv.DFreq[Att][v][c] = 0;
                }
            }
        }
    }

    /*  Nothing selected -- nothing to count  */

    if ( NDList == 0 ) return;

    Last = NDList - 1;

    /*  Single sweep over the cases, updating every selected attribute  */

    ForEach(Cp, Fp, Lp) {
        ForEach(d, 0, Last) {
            Att = GEnv.DList[d];
            v   = XDVal(Case[Cp], Att);

            if ( Continuous(ClassAtt) ) {
                ClassVal = CClass(Case[Cp]);

                GEnv.DFreq[Att][v][0]++;
                GEnv.DValSum[Att][v]   += ClassVal;
                GEnv.DValSumSq[Att][v] += ClassVal * ClassVal;
            }
            else {
                GEnv.DFreq[Att][v][ DClass(Case[Cp]) ]++;
            }
        }
    }
}
|
416
|
+
|
417
|
+
|
418
|
+
|
419
|
+
/*************************************************************************/
|
420
|
+
/* */
|
421
|
+
/* Choose split using a sample. There are three phases: */
|
422
|
+
/* - process discrete atts using all data */
|
423
|
+
/* - for continuous atts, find gain etc from two samples and */
|
424
|
+
/* record the better value */
|
425
|
+
/* - re-examine high-value continuous attributes using all cases */
|
426
|
+
/* */
|
427
|
+
/*************************************************************************/
|
428
|
+
|
429
|
+
|
430
|
+
/*  Evaluate candidate splits of cases Fp..Lp using sampling for
    continuous attributes.

    Phase 1 resets every gain to None and checks each eligible
    non-continuous attribute on all cases.  Phase 2 runs SampleScan()
    twice (the second time starting SampleSize cases further on) with
    GEnv.FRAC set to SampleSize / number of cases, then restores FRAC
    to 1.  Phase 3 finds the current best value and rechecks on all
    cases every non-discrete attribute whose sampled value exceeds 70%
    of it; the gain of the others with gain above Epsilon is reset to
    None.  */

void ChooseSplitWithSampling(CaseNo Fp, CaseNo Lp, int CondAtts)
{
    double    SampledVal, OldBestVal;
    Attribute Att, BestAtt;

    /*  Phase 1: non-continuous attributes, all data  */

    ForEach(Att, 1, MaxAtt) {
        GEnv.Gain[Att] = None;

        if ( Exclude(Att) || Att == ClassAtt || Continuous(Att) ||
             ( CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] ) ) {
            continue;
        }

        CheckSplit(Att, Fp, Lp);
    }

    /*  Phase 2: continuous attributes, two samples  */

    GEnv.FRAC = SampleSize / (double) No(Fp, Lp);

    SampleScan(Fp, Lp, CondAtts, false);
    SampleScan(Fp+SampleSize, Lp, CondAtts, true);

    GEnv.FRAC = 1;

    /*  Phase 3: re-examine promising continuous attributes (value at
        least 70% of the current best value) using all cases  */

    FindBestAtt(&BestAtt, &OldBestVal);

    if ( BestAtt == None ) return;

    Verbosity(2,
	fprintf(Of, "    Revisit threshold %.3f (%s)\n",
		    0.7 * OldBestVal, AttName[BestAtt]))

    ForEach(Att, 1, MaxAtt) {
        if ( Discrete(Att) || GEnv.Gain[Att] <= Epsilon ) continue;

        /*  Note the sampled value before discarding the sampled gain  */

        SampledVal = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);

        GEnv.Gain[Att] = None;

        if ( SampledVal > 0.7 * OldBestVal ) {
            CheckSplit(Att, Fp, Lp);
        }
    }
}
|
486
|
+
|
487
|
+
|
488
|
+
|
489
|
+
/*************************************************************************/
|
490
|
+
/* */
|
491
|
+
/* Estimate Gain etc of continuous attributes using sample */
|
492
|
+
/* */
|
493
|
+
/*************************************************************************/
|
494
|
+
|
495
|
+
|
496
|
+
/*  Estimate gain etc. of continuous attributes from a sample.

    Sample() is asked to place SampleSize cases at Fp..Fp+SampleSize-1;
    GEnv.PSD (continuous class) or GEnv.BaseInfo (discrete class) is
    temporarily replaced by the sample's own value and restored before
    returning.  Each eligible continuous attribute is then evaluated on
    the sample.  When Second is true, the Gain/Info/Bar values that were
    present before this scan are kept instead if their split value is
    greater than the new one.  */

void SampleScan(CaseNo Fp, CaseNo Lp, int CondAtts, Boolean Second)
{
    CaseNo    Cp, SLp;
    double    ClassVal, Sum=0, SumSq=0, KeepBaseInfo, KeepPSD,
              PrevBar, PrevInfo, PrevGain, PrevVal, NewVal;
    Attribute Att;

    /*  Remember the full-data base information and SD  */

    KeepBaseInfo = GEnv.BaseInfo;
    KeepPSD      = GEnv.PSD;

    /*  Generate sample in Fp ... Fp+SampleSize-1  */

    Sample(Fp, Lp, SampleSize);
    SLp = Fp + SampleSize - 1;

    /*  Determine sample PSD or base information  */

    if ( Continuous(ClassAtt) ) {
        ForEach(Cp, Fp, SLp) {
            ClassVal = CClass(Case[Cp]);
            Sum     += ClassVal;
            SumSq   += ClassVal * ClassVal;
        }

        GEnv.PSD = SDEstimate(SampleSize, Sum, SumSq);
    }
    else {
        FindClassFrequencies(Fp, SLp);
        GEnv.BaseInfo =
            TotalInfo(GEnv.ClassFreq, 1, MaxAttVal[ClassAtt]) / SampleSize;
    }

    /*  Evaluate each eligible continuous attribute on the sample  */

    ForEach(Att, 1, MaxAtt) {
        if ( Exclude(Att) || Att == ClassAtt || ! Continuous(Att) ||
             ( CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] ) ) {
            continue;
        }

        /*  Hold on to results from a possible earlier sample  */

        PrevInfo = GEnv.Info[Att];
        PrevGain = GEnv.Gain[Att];
        PrevBar  = GEnv.Bar[Att];

        GEnv.Gain[Att] = None;

        CheckSplit(Att, Fp, SLp);

        if ( ! Second ) continue;

        /*  Second sample: retain whichever result is better  */

        PrevVal = SplitVal(PrevGain, PrevInfo);
        NewVal  = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);

        if ( PrevVal > NewVal ) {
            GEnv.Gain[Att] = PrevGain;
            GEnv.Info[Att] = PrevInfo;
            GEnv.Bar[Att]  = PrevBar;
        }
    }

    /*  Put back the full-data base information and SD  */

    GEnv.BaseInfo = KeepBaseInfo;
    GEnv.PSD      = KeepPSD;
}
|
574
|
+
|
575
|
+
|
576
|
+
|
577
|
+
/*************************************************************************/
|
578
|
+
/* */
|
579
|
+
/* Sample N cases from Fp through Lp using tabulated random nos */
|
580
|
+
/* */
|
581
|
+
/*************************************************************************/
|
582
|
+
|
583
|
+
|
584
|
+
/*  Move a sample of N cases to positions Fp..Fp+N-1 using the
    tabulated numbers in Rand[].

    For the i-th draw, Rand[i] is scaled by the number of cases in
    Fp..Lp less i, truncated to an offset, and the cases at Fp+i and
    Fp+offset are passed to Swap().

    NOTE(review): the offset is applied relative to Fp rather than Fp+i,
    so a slot filled by an earlier draw can be swapped again -- confirm
    that this is intended.  */

void Sample(CaseNo Fp, CaseNo Lp, CaseCount N)
{
    CaseNo Drawn, Pick, Remaining;

    Remaining = No(Fp, Lp);

    ForEach(Drawn, 0, N-1) {
        /*  Scale by the current count, then shrink it for the next draw  */

        Pick = Rand[Drawn] * Remaining;
        Remaining--;

        Swap(Fp+Drawn, Fp+Pick);
    }
}
|
597
|
+
|
598
|
+
|
599
|
+
|
600
|
+
/*************************************************************************/
|
601
|
+
/* */
|
602
|
+
/* Choose a split using all cases */
|
603
|
+
/* */
|
604
|
+
/*************************************************************************/
|
605
|
+
|
606
|
+
|
607
|
+
/*  Evaluate candidate splits of cases Fp..Lp using all cases.

    Every attribute's gain is first reset to None.  Attributes that are
    excluded, are the class attribute, or are untested while CondAtts
    has reached MAXCONDATTS are then skipped; all others are passed to
    CheckSplit().  */

void ChooseSplit(CaseNo Fp, CaseNo Lp, int CondAtts)
{
    Attribute Att;

    GEnv.FRAC = 1;

    ForEach(Att, 1, MaxAtt) {
        GEnv.Gain[Att] = None;

        if ( Exclude(Att) || Att == ClassAtt ) continue;

        /*  No new conditioning attributes once the limit is reached  */

        if ( CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] ) continue;

        CheckSplit(Att, Fp, Lp);
    }
}
|
627
|
+
|
628
|
+
|
629
|
+
|
630
|
+
/*  Find the attribute with the greatest split value.

    *BestVal receives the largest SplitVal(Gain, Info) over attributes
    1..MaxAtt that exceeds Epsilon, and *BestAtt the first attribute
    achieving it.  If no value exceeds Epsilon, *BestAtt is None and
    *BestVal is Epsilon.  */

void FindBestAtt(Attribute *BestAtt, double *BestVal)
{
    Attribute Att, TopAtt=None;
    double    Val, TopVal=Epsilon;

    ForEach(Att, 1, MaxAtt) {
        Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);

        /*  Strict comparison: ties keep the earlier attribute  */

        if ( Val > TopVal ) {
            TopAtt = Att;
            TopVal = Val;
        }
    }

    *BestVal = TopVal;
    *BestAtt = TopAtt;
}
|
650
|
+
|
651
|
+
|
652
|
+
|
653
|
+
/*************************************************************************/
|
654
|
+
/* */
|
655
|
+
/* Evaluate a potential split */
|
656
|
+
/* */
|
657
|
+
/*************************************************************************/
|
658
|
+
|
659
|
+
|
660
|
+
/*  Evaluate attribute Att as a possible split of cases Fp..Lp.

    When SomeMiss[Att] is set, cases with a missing value of Att are
    moved to the front by SkipMissing() and left out of the evaluation.
    The evaluation routine is chosen from the kinds of Att and ClassAtt.
    A resulting gain above Epsilon is scaled by the fraction of cases
    that were evaluated, and the attribute's info is raised to at
    least 0.5.

    GEnv.Tested[Att] is incremented on entry and decremented on exit.  */

void CheckSplit(Attribute Att, CaseNo Fp, CaseNo Lp)
{
    CaseNo Known;

    GEnv.Tested[Att]++;

    /*  Removing missing values of Att makes the values of BaseInfo
        and PSD approximate only  */

    if ( SomeMiss[Att] )
    {
        Known = SkipMissing(Att, Fp, Lp);
    }
    else
    {
        Known = Fp;
    }

    /*  Choose the evaluation method from the kind of attribute and
        the kind of class attribute  */

    if ( Continuous(Att) )
    {
        /*  Continuous attribute  */

        if ( Continuous(ClassAtt) )
        {
            CEvalContinAtt(Att, Known, Lp);
        }
        else
        {
            DEvalContinAtt(Att, Known, Lp);
        }
    }
    else
    if ( Continuous(ClassAtt) )
    {
        /*  Discrete attribute, continuous class  */

        if ( MaxAttVal[Att] > 3 || GEnv.Tested[Att] <= 1 )
        {
            CEvalDiscrAtt(Att, Known, Lp);
        }
    }
    else
    if ( Ordered(Att) )
    {
        /*  Ordered discrete attribute, discrete class  */

        DEvalOrderedAtt(Att, Known, Lp);
    }
    else
    if ( GEnv.Tested[Att] <= 1 )
    {
        /*  Unordered discrete attribute, discrete class  */

        DEvalDiscrAtt(Att, Known, Lp);
    }

    if ( GEnv.Gain[Att] > Epsilon )
    {
        /*  Adjust for the cases set aside as missing  */

        GEnv.Gain[Att] *= No(Known, Lp) / (double) No(Fp, Lp);
        GEnv.Info[Att] = Max(GEnv.Info[Att], 0.5);
    }

    GEnv.Tested[Att]--;
}
|
717
|
+
|
718
|
+
|
719
|
+
|
720
|
+
/*************************************************************************/
|
721
|
+
/* */
|
722
|
+
/* Split cases Fp to Lp on attribute Att */
|
723
|
+
/* */
|
724
|
+
/*************************************************************************/
|
725
|
+
|
726
|
+
|
727
|
+
/*  Distribute cases Fp..Lp among the branches of Node's test and
    call Split() on each non-empty branch.

    Cases with an unknown value of the tested attribute are grouped at
    the front first (when SomeMiss[] is set for that attribute).  Their
    number is passed to Progress() and they are not given to any
    branch.  */

void Divide(Tree Node, CaseNo Fp, CaseNo Lp, int CondAtts)
{
    CaseNo     First, Next;
    DiscrValue Br;

    /*  Set aside unknown attribute values  */

    if ( SomeMiss[Node->Tested] )
    {
        Next = SkipMissing(Node->Tested, Fp, Lp);
    }
    else
    {
        Next = Fp;
    }

    Progress(Next - Fp);

    /*  Recursive divide and conquer: each branch takes its cases from
        the point where the previous branch stopped  */

    ForEach(Br, 1, Node->Forks)
    {
        First = Next;
        Next  = Group(Node->Tested, Br, First, Lp, Node->Cut, Node->Left);

        if ( Next > First )
        {
            Split(First, Next-1, CondAtts, Node, Br, &Node->Branch[Br]);
        }
    }
}
|
751
|
+
|
752
|
+
|
753
|
+
|
754
|
+
/*************************************************************************/
|
755
|
+
/* */
|
756
|
+
/* Group together missing values and return index of next case */
|
757
|
+
/* */
|
758
|
+
/*************************************************************************/
|
759
|
+
|
760
|
+
|
761
|
+
/*  Move every case in Fp..Lp whose value of Att is unknown to the
    front of the range.  Returns the index of the first case after
    the unknown group (Fp itself when there are none).  */

CaseNo SkipMissing(Attribute Att, CaseNo Fp, CaseNo Lp)
{
    CaseNo Scan, Next;

    Next = Fp;

    ForEach(Scan, Fp, Lp)
    {
        if ( ! Unknown(Case[Scan], Att) )
        {
            continue;
        }

        Swap(Next, Scan);
        Next++;
    }

    return Next;
}
|
777
|
+
|
778
|
+
|
779
|
+
|
780
|
+
|
781
|
+
/*************************************************************************/
|
782
|
+
/* */
|
783
|
+
/* Check groups formed by a potential test */
|
784
|
+
/* */
|
785
|
+
/*************************************************************************/
|
786
|
+
|
787
|
+
|
788
|
+
/*  Form the groups that a potential test on Att would produce over
    cases Fp..Lp and look for outliers in each non-empty group.

    For every branch 1..Forks the matching cases are gathered with
    Group().  If any are found, the test is recorded with NoteTest()
    and the group is handed to FindContinOutliers() when the class
    attribute is continuous, or to FindDiscrOutliers() otherwise.
    FindDiscrOutliers() also receives FT[branch], or Nil if FT is
    null.  */

void CheckPotentialClusters(Attribute Att, DiscrValue Forks,
                            CaseNo Fp, CaseNo Lp, ContValue Cut, Set S,
                            CaseNo **FT)
{
    CaseNo     First, Next;
    DiscrValue Br;

    First = Fp;

    ForEach(Br, 1, Forks)
    {
        Next = Group(Att, Br, First, Lp, Cut, S);

        /*  Nothing to do for an empty branch  */

        if ( Next <= First )
        {
            continue;
        }

        NoteTest(Att, Br, Cut, S);

        if ( Continuous(ClassAtt) )
        {
            FindContinOutliers(First, Next-1, false);
        }
        else
        {
            FindDiscrOutliers(First, Next-1, ( FT ? FT[Br] : Nil ));
        }

        First = Next;
    }
}
|
817
|
+
|
818
|
+
|
819
|
+
|
820
|
+
/*************************************************************************/
|
821
|
+
/* */
|
822
|
+
/* Print context information for DAC */
|
823
|
+
/* */
|
824
|
+
/*************************************************************************/
|
825
|
+
|
826
|
+
|
827
|
+
/*  Print the conditions that describe the context of case i.

    A temporary cluster record for the class attribute is filled in by
    SaveClustConds(), which is called with GEnv.Level decremented for
    the duration of the call.  Each saved condition is printed by the
    routine matching its attribute: continuous range, ordered range,
    subset of values (continuous class and more than three attribute
    values), or single value.  The subset storage and the condition
    array are released before returning.  */

void ShowContext(CaseNo i)
{
    ClustRec  Rec;
    Clust     C = &Rec;
    Attribute Att;
    int       CondNo;

    C->Att = ClassAtt;

    GEnv.Level--;
    SaveClustConds(C);
    GEnv.Level++;

    ForEach(CondNo, 0, C->NCond-1)
    {
        Att = C->Cond[CondNo].Att;

        if ( Continuous(Att) )
        {
            PrintContinCond(Att, C->Cond[CondNo].Low, C->Cond[CondNo].High,
                            i);
        }
        else
        if ( Ordered(Att) )
        {
            PrintOrderedCond(Att, (int) C->Cond[CondNo].Low,
                             (int) C->Cond[CondNo].High, i);
        }
        else
        if ( Continuous(C->Att) && MaxAttVal[Att] > 3 )
        {
            /*  Subset condition: the value set is released once printed  */

            PrintSubsetCond(Att, C->Cond[CondNo].Values, i);
            FreeUnlessNil(C->Cond[CondNo].Values);
        }
        else
        {
            PrintValCond(Att, (int) C->Cond[CondNo].Low);
        }
    }

    Free(C->Cond);
}
|
868
|
+
|
869
|
+
|
870
|
+
|
871
|
+
/*************************************************************************/
|
872
|
+
/* */
|
873
|
+
/* Construct a leaf in a given node */
|
874
|
+
/* */
|
875
|
+
/*************************************************************************/
|
876
|
+
|
877
|
+
|
878
|
+
/*  Allocate a zeroed tree node with node type 0 and record its
    parent and the branch Br by which it hangs from that parent.
    Returns the new node.  */

Tree Leaf(Tree Parent, DiscrValue Br)
{
    Tree New = AllocZero(1, TreeRec);

    New->NodeType = 0;
    New->Br       = Br;
    New->Parent   = Parent;

    return New;
}
|
891
|
+
|
892
|
+
|
893
|
+
|
894
|
+
/*  Free the tree rooted at T, which lies at depth Level.

    LastLevel is lowered to Level-2 when Level is positive and
    LastLevel is at least Level-1.  If the node carries a sift entry
    and SIFT is set, the node's context is recovered, the pending
    conditions are written by OutputConditions(), and the entry is
    written to Sf; the entry is freed either way.  For a node with
    nonzero NodeType the branches are released recursively, then the
    subset (BrSubset nodes only) and the branch vector.  A null T is
    ignored.  */

void ReleaseTree(Tree T, int Level)
{
    DiscrValue Br;

    if ( ! T )
    {
        return;
    }

    if ( Level > 0 && LastLevel >= Level - 1 )
    {
        LastLevel = Level - 2;
    }

    /*  Possible sift entry  */

    if ( T->SiftEntry )
    {
        if ( SIFT )
        {
            RecoverContext(T->Parent, T->Br);
            OutputConditions();
            fprintf(Sf, "%s", T->SiftEntry);
        }

        Free(T->SiftEntry);
    }

    /*  Subtrees and per-test storage exist only when NodeType is nonzero  */

    if ( T->NodeType )
    {
        ForEach(Br, 1, T->Forks)
        {
            ReleaseTree(T->Branch[Br], Level+1);
        }

        if ( T->NodeType == BrSubset )
        {
            FreeUnlessNil(T->Left);
        }

        Free(T->Branch);
    }

    Free(T);
}
|
933
|
+
|
934
|
+
|
935
|
+
|
936
|
+
void OutputConditions()
|
937
|
+
/* ---------------- */
|
938
|
+
{
|
939
|
+
Attribute Att;
|
940
|
+
int i, CType, b, Bytes;
|
941
|
+
DiscrValue Br;
|
942
|
+
|
943
|
+
if ( ! TargetSaved )
|
944
|
+
{
|
945
|
+
fprintf(Sf, "1 %d\n", ClassAtt);
|
946
|
+
TargetSaved = true;
|
947
|
+
}
|
948
|
+
|
949
|
+
if ( GEnv.Level < 0 ) return;
|
950
|
+
|
951
|
+
/* Save all conditions since last saved */
|
952
|
+
|
953
|
+
ForEach(i, LastLevel+1, GEnv.Level-1)
|
954
|
+
{
|
955
|
+
Att = GEnv.Test[i].Att;
|
956
|
+
Br = GEnv.Test[i].Br;
|
957
|
+
|
958
|
+
/* Determine condition type */
|
959
|
+
|
960
|
+
CType = ( Br == 1 ? 11 :
|
961
|
+
Continuous(Att) || Ordered(Att) ? 12 :
|
962
|
+
Continuous(ClassAtt) && MaxAttVal[Att] > 3 ? 13 : 11 );
|
963
|
+
|
964
|
+
fprintf(Sf, "%d %d %d %d", CType, i, Att, Br);
|
965
|
+
|
966
|
+
/* Don't need to save anything else if this branch is 1 (N/A)
|
967
|
+
or if test is on two-valued discrete att */
|
968
|
+
|
969
|
+
if ( Br != 1 )
|
970
|
+
{
|
971
|
+
if ( Continuous(Att) || Ordered(Att) )
|
972
|
+
{
|
973
|
+
fprintf(Sf, " %.8g", GEnv.Test[i].Cut);
|
974
|
+
}
|
975
|
+
else
|
976
|
+
if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
|
977
|
+
{
|
978
|
+
/* Print subset of values */
|
979
|
+
|
980
|
+
Bytes = (MaxAttVal[Att]>>3) + 1;
|
981
|
+
|
982
|
+
ForEach(b, 0, Bytes-1)
|
983
|
+
{
|
984
|
+
fprintf(Sf, " %x", GEnv.Test[i].Left[b]);
|
985
|
+
}
|
986
|
+
}
|
987
|
+
}
|
988
|
+
|
989
|
+
fprintf(Sf, "\n");
|
990
|
+
}
|
991
|
+
|
992
|
+
LastLevel = GEnv.Level-1;
|
993
|
+
}
|
994
|
+
|
995
|
+
|
996
|
+
|
997
|
+
/*************************************************************************/
|
998
|
+
/* */
|
999
|
+
/* Set up environment */
|
1000
|
+
/* */
|
1001
|
+
/*************************************************************************/
|
1002
|
+
|
1003
|
+
|
1004
|
+
void InitialiseEnvData()
|
1005
|
+
/* ----------------- */
|
1006
|
+
{
|
1007
|
+
DiscrValue v;
|
1008
|
+
Attribute Att;
|
1009
|
+
|
1010
|
+
GEnv.ValFreq = Alloc(MaxDiscrVal+1, CaseCount);
|
1011
|
+
GEnv.ClassFreq = Alloc(MaxDiscrVal+1, CaseCount);
|
1012
|
+
GEnv.ValSum = Alloc(MaxDiscrVal+1, double);
|
1013
|
+
GEnv.ValSumSq = Alloc(MaxDiscrVal+1, double);
|
1014
|
+
GEnv.Left = Alloc(MaxDiscrVal+1, Boolean);
|
1015
|
+
GEnv.Possible = Alloc(MaxDiscrVal+1, Boolean);
|
1016
|
+
GEnv.Tested = AllocZero(MaxAtt+1, int);
|
1017
|
+
GEnv.Gain = AllocZero(MaxAtt+1, double);
|
1018
|
+
GEnv.Info = AllocZero(MaxAtt+1, double);
|
1019
|
+
GEnv.Bar = AllocZero(MaxAtt+1, ContValue);
|
1020
|
+
|
1021
|
+
GEnv.Subset = AllocZero(MaxAtt+1, Set);
|
1022
|
+
GEnv.Subset[0] = Alloc((MaxDiscrVal>>3)+1, unsigned char); /* caveats */
|
1023
|
+
ForEach(Att, 1, MaxAtt)
|
1024
|
+
{
|
1025
|
+
if ( Discrete(Att) )
|
1026
|
+
{
|
1027
|
+
GEnv.Subset[Att] = Alloc((MaxAttVal[Att]>>3)+1, unsigned char);
|
1028
|
+
}
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
/* Freq[] is one longer than apparently necessary to allow for
|
1032
|
+
the extra slot needed by EvalOrderedAtt() */
|
1033
|
+
|
1034
|
+
GEnv.Freq = AllocZero(MaxDiscrVal+2, CaseCount *);
|
1035
|
+
ForEach(v, 0, MaxDiscrVal+1)
|
1036
|
+
{
|
1037
|
+
GEnv.Freq[v] = AllocZero(MaxDiscrVal+1, CaseCount);
|
1038
|
+
}
|
1039
|
+
|
1040
|
+
GEnv.BestFreq = AllocZero(4, CaseCount *);
|
1041
|
+
ForEach(v, 0, 3)
|
1042
|
+
{
|
1043
|
+
GEnv.BestFreq[v] = AllocZero(MaxDiscrVal+1, CaseCount);
|
1044
|
+
}
|
1045
|
+
|
1046
|
+
GEnv.DList = Alloc(MaxAtt+1, Attribute);
|
1047
|
+
GEnv.DFreq = Alloc(MaxAtt+1, CaseCount **);
|
1048
|
+
GEnv.DValSum = Alloc(MaxAtt+1, double *);
|
1049
|
+
GEnv.DValSumSq = Alloc(MaxAtt+1, double *);
|
1050
|
+
ForEach(Att, 1, MaxAtt)
|
1051
|
+
{
|
1052
|
+
if ( Exclude(Att) || ! Discrete(Att) ) continue;
|
1053
|
+
|
1054
|
+
GEnv.DFreq[Att] = Alloc(MaxAttVal[Att]+1, CaseCount *);
|
1055
|
+
ForEach(v, 0, MaxAttVal[Att])
|
1056
|
+
{
|
1057
|
+
GEnv.DFreq[Att][v] = Alloc(MaxDiscrVal+1, CaseCount);
|
1058
|
+
}
|
1059
|
+
GEnv.DValSum[Att] = Alloc(MaxDiscrVal+1, double);
|
1060
|
+
GEnv.DValSumSq[Att] = Alloc(MaxDiscrVal+1, double);
|
1061
|
+
}
|
1062
|
+
}
|
1063
|
+
|
1064
|
+
|
1065
|
+
|
1066
|
+
/*************************************************************************/
|
1067
|
+
/* */
|
1068
|
+
/* Clean up environment */
|
1069
|
+
/* */
|
1070
|
+
/*************************************************************************/
|
1071
|
+
|
1072
|
+
|
1073
|
+
void FreeEnvData()
|
1074
|
+
/* ------------ */
|
1075
|
+
{
|
1076
|
+
Attribute Att;
|
1077
|
+
int i;
|
1078
|
+
|
1079
|
+
if ( ! GEnv.ValFreq ) return;
|
1080
|
+
|
1081
|
+
FreeUnlessNil(GEnv.ValFreq);
|
1082
|
+
FreeUnlessNil(GEnv.ClassFreq);
|
1083
|
+
FreeUnlessNil(GEnv.ValSum);
|
1084
|
+
FreeUnlessNil(GEnv.ValSumSq);
|
1085
|
+
FreeUnlessNil(GEnv.Left);
|
1086
|
+
FreeUnlessNil(GEnv.Possible);
|
1087
|
+
FreeUnlessNil(GEnv.Tested);
|
1088
|
+
FreeUnlessNil(GEnv.Gain);
|
1089
|
+
FreeUnlessNil(GEnv.Info);
|
1090
|
+
FreeUnlessNil(GEnv.Bar);
|
1091
|
+
|
1092
|
+
if ( GEnv.Test )
|
1093
|
+
{
|
1094
|
+
ForEach(i, 0, GEnv.MaxLevel-1)
|
1095
|
+
{
|
1096
|
+
FreeUnlessNil(GEnv.Test[i].Left);
|
1097
|
+
}
|
1098
|
+
|
1099
|
+
Free(GEnv.Test);
|
1100
|
+
}
|
1101
|
+
|
1102
|
+
FreeUnlessNil(GEnv.Subset[0]);
|
1103
|
+
ForEach(Att, 1, MaxAtt)
|
1104
|
+
{
|
1105
|
+
if ( Discrete(Att) )
|
1106
|
+
{
|
1107
|
+
FreeUnlessNil(GEnv.Subset[Att]);
|
1108
|
+
}
|
1109
|
+
}
|
1110
|
+
FreeUnlessNil(GEnv.Subset);
|
1111
|
+
|
1112
|
+
FreeVector((void **) GEnv.Freq, 0, MaxDiscrVal+1);
|
1113
|
+
FreeVector((void **) GEnv.BestFreq, 0, 3);
|
1114
|
+
|
1115
|
+
ForEach(Att, 1, MaxAtt)
|
1116
|
+
{
|
1117
|
+
if ( ! GEnv.DFreq[Att] ) continue;
|
1118
|
+
|
1119
|
+
FreeVector((void **)GEnv.DFreq[Att], 0, MaxAttVal[Att]);
|
1120
|
+
Free(GEnv.DValSum[Att]);
|
1121
|
+
Free(GEnv.DValSumSq[Att]);
|
1122
|
+
}
|
1123
|
+
Free(GEnv.DFreq);
|
1124
|
+
Free(GEnv.DValSum);
|
1125
|
+
Free(GEnv.DValSumSq);
|
1126
|
+
Free(GEnv.DList);
|
1127
|
+
|
1128
|
+
FreeUnlessNil(GEnv.SiftEntry);
|
1129
|
+
}
|
1130
|
+
#endif
|
1131
|
+
|
1132
|
+
|
1133
|
+
|
1134
|
+
/*************************************************************************/
|
1135
|
+
/* */
|
1136
|
+
/* Test[] contains a stack of current tests. Add a new test */
|
1137
|
+
/* for the current level */
|
1138
|
+
/* */
|
1139
|
+
/*************************************************************************/
|
1140
|
+
|
1141
|
+
|
1142
|
+
/*  Record a test at the current level of the GEnv.Test[] stack.

    If GEnv.Level has reached GEnv.MaxLevel the stack is first grown by
    100 entries (allocated afresh when MaxLevel is zero), and each new
    entry is given a Left set of (MaxDiscrVal>>3)+1 bytes.  The
    attribute, branch and cut are then stored in entry GEnv.Level.  If
    Left is non-null, its first (MaxAttVal[Att]>>3)+1 bytes are copied
    into the entry's Left set.  */

void NoteTest(Attribute Att, DiscrValue Br, ContValue Cut, Set Left)
{
    int      Extra = 100, New;
    TestRec *Slot;

    /*  Check space for tests  */

    if ( GEnv.Level >= GEnv.MaxLevel )
    {
        if ( GEnv.MaxLevel )
        {
            Realloc(GEnv.Test, GEnv.MaxLevel+Extra, TestRec);
        }
        else
        {
            GEnv.Test = Alloc(Extra, TestRec);
        }

        /*  Give each added entry its own subset storage  */

        ForEach(New, 0, Extra-1)
        {
            GEnv.Test[GEnv.MaxLevel+New].Left =
                Alloc((MaxDiscrVal>>3)+1, unsigned char);
        }

        GEnv.MaxLevel += Extra;
    }

    Slot = &GEnv.Test[GEnv.Level];

    Slot->Att = Att;
    Slot->Br  = Br;
    Slot->Cut = Cut;

    if ( Left )
    {
        memcpy(Slot->Left, Left, (MaxAttVal[Att]>>3)+1);
    }
}
|
1177
|
+
|
1178
|
+
|
1179
|
+
|
1180
|
+
/*************************************************************************/
|
1181
|
+
/* */
|
1182
|
+
/* Group together the cases corresponding to branch V of a test */
|
1183
|
+
/* and return the index of the case following the last */
|
1184
|
+
/* */
|
1185
|
+
/*************************************************************************/
|
1186
|
+
|
1187
|
+
|
1188
|
+
CaseNo Group(Attribute Att, DiscrValue V, CaseNo Fp, CaseNo Lp,
|
1189
|
+
ContValue Cut, Set Left)
|
1190
|
+
/* ----- */
|
1191
|
+
{
|
1192
|
+
CaseNo i;
|
1193
|
+
|
1194
|
+
/* Group cases on the value of attribute Att, perhaps depending
|
1195
|
+
on the type of split */
|
1196
|
+
|
1197
|
+
if ( V == 1 )
|
1198
|
+
{
|
1199
|
+
/* Group all non-applicable values. Don't even try if
|
1200
|
+
this attribute doesn't have N/A values */
|
1201
|
+
|
1202
|
+
if ( SomeNA[Att] )
|
1203
|
+
{
|
1204
|
+
ForEach(i, Fp, Lp)
|
1205
|
+
{
|
1206
|
+
if ( NotApplic(Case[i], Att) )
|
1207
|
+
{
|
1208
|
+
Swap(Fp, i);
|
1209
|
+
Fp++;
|
1210
|
+
}
|
1211
|
+
}
|
1212
|
+
}
|
1213
|
+
}
|
1214
|
+
else
|
1215
|
+
if ( Continuous(Att) )
|
1216
|
+
{
|
1217
|
+
ForEach(i, Fp, Lp)
|
1218
|
+
{
|
1219
|
+
if ( ! Unknown(Case[i], Att) &&
|
1220
|
+
! NotApplic(Case[i], Att) &&
|
1221
|
+
(CVal(Case[i], Att) <= Cut) == (V == 2) )
|
1222
|
+
{
|
1223
|
+
Swap(Fp, i);
|
1224
|
+
Fp++;
|
1225
|
+
}
|
1226
|
+
}
|
1227
|
+
}
|
1228
|
+
else
|
1229
|
+
if ( Ordered(Att) && Att != ClassAtt )
|
1230
|
+
{
|
1231
|
+
ForEach(i, Fp, Lp)
|
1232
|
+
{
|
1233
|
+
if ( ! Unknown(Case[i], Att) &&
|
1234
|
+
! NotApplic(Case[i], Att) &&
|
1235
|
+
(XDVal(Case[i], Att) <= Cut + 0.1) == (V == 2) )
|
1236
|
+
{
|
1237
|
+
Swap(Fp, i);
|
1238
|
+
Fp++;
|
1239
|
+
}
|
1240
|
+
}
|
1241
|
+
}
|
1242
|
+
else
|
1243
|
+
if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
|
1244
|
+
{
|
1245
|
+
ForEach(i, Fp, Lp)
|
1246
|
+
{
|
1247
|
+
if ( ! Unknown(Case[i], Att) &&
|
1248
|
+
! NotApplic(Case[i], Att) &&
|
1249
|
+
(In(XDVal(Case[i], Att), Left) != 0) == (V == 2) )
|
1250
|
+
{
|
1251
|
+
Swap(Fp, i);
|
1252
|
+
Fp++;
|
1253
|
+
}
|
1254
|
+
}
|
1255
|
+
}
|
1256
|
+
else
|
1257
|
+
{
|
1258
|
+
ForEach(i, Fp, Lp)
|
1259
|
+
{
|
1260
|
+
if ( XDVal(Case[i], Att) == V )
|
1261
|
+
{
|
1262
|
+
Swap(Fp, i);
|
1263
|
+
Fp++;
|
1264
|
+
}
|
1265
|
+
}
|
1266
|
+
}
|
1267
|
+
|
1268
|
+
return Fp;
|
1269
|
+
}
|