see5-installer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/c50.c
ADDED
@@ -0,0 +1,330 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Main routine, C5.0 */
|
30
|
+
/* ------------------ */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
#include <signal.h>
|
38
|
+
|
39
|
+
#include <sys/unistd.h>
|
40
|
+
#include <sys/time.h>
|
41
|
+
#include <sys/resource.h>
|
42
|
+
|
43
|
+
#define SetFOpt(V) V = strtod(OptArg, &EndPtr);\
|
44
|
+
if ( ! EndPtr || *EndPtr != '\00' ) break;\
|
45
|
+
ArgOK = true
|
46
|
+
#define SetIOpt(V) V = strtol(OptArg, &EndPtr, 10);\
|
47
|
+
if ( ! EndPtr || *EndPtr != '\00' ) break;\
|
48
|
+
ArgOK = true
|
49
|
+
|
50
|
+
|
51
|
+
int main(int Argc, char *Argv[])
|
52
|
+
/* ---- */
|
53
|
+
{
|
54
|
+
int o;
|
55
|
+
extern String OptArg, Option;
|
56
|
+
char *EndPtr;
|
57
|
+
Boolean FirstTime=true, ArgOK;
|
58
|
+
double StartTime;
|
59
|
+
FILE *F;
|
60
|
+
CaseNo SaveMaxCase;
|
61
|
+
Attribute Att;
|
62
|
+
|
63
|
+
struct rlimit RL;
|
64
|
+
|
65
|
+
/* Make sure there is a largish runtime stack */
|
66
|
+
|
67
|
+
getrlimit(RLIMIT_STACK, &RL);
|
68
|
+
|
69
|
+
RL.rlim_cur = Max(RL.rlim_cur, 20 * 1024 * 1024);
|
70
|
+
|
71
|
+
if ( RL.rlim_max > 0 ) /* -1 if unlimited */
|
72
|
+
{
|
73
|
+
RL.rlim_cur = Min(RL.rlim_max, RL.rlim_cur);
|
74
|
+
}
|
75
|
+
|
76
|
+
setrlimit(RLIMIT_STACK, &RL);
|
77
|
+
|
78
|
+
|
79
|
+
/* Check for output to be saved to a file */
|
80
|
+
|
81
|
+
if ( Argc > 2 && ! strcmp(Argv[Argc-2], "-o") )
|
82
|
+
{
|
83
|
+
Of = fopen(Argv[Argc-1], "w");
|
84
|
+
Argc -= 2;
|
85
|
+
}
|
86
|
+
|
87
|
+
if ( ! Of )
|
88
|
+
{
|
89
|
+
Of = stdout;
|
90
|
+
}
|
91
|
+
|
92
|
+
KRInit = time(0) & 07777;
|
93
|
+
|
94
|
+
PrintHeader("");
|
95
|
+
|
96
|
+
/* Process options */
|
97
|
+
|
98
|
+
while ( (o = ProcessOption(Argc, Argv, "f+bpv+t+sm+c+S+I+ru+egX+wh")) )
|
99
|
+
{
|
100
|
+
if ( FirstTime )
|
101
|
+
{
|
102
|
+
fprintf(Of, T_OptHeader);
|
103
|
+
FirstTime = false;
|
104
|
+
}
|
105
|
+
|
106
|
+
ArgOK = false;
|
107
|
+
|
108
|
+
switch (o)
|
109
|
+
{
|
110
|
+
case 'f': FileStem = OptArg;
|
111
|
+
fprintf(Of, T_OptApplication, FileStem);
|
112
|
+
ArgOK = true;
|
113
|
+
break;
|
114
|
+
case 'b': BOOST = true;
|
115
|
+
fprintf(Of, T_OptBoost);
|
116
|
+
if ( TRIALS == 1 ) TRIALS = 10;
|
117
|
+
ArgOK = true;
|
118
|
+
break;
|
119
|
+
case 'p': PROBTHRESH = true;
|
120
|
+
fprintf(Of, T_OptProbThresh);
|
121
|
+
ArgOK = true;
|
122
|
+
break;
|
123
|
+
#ifdef VerbOpt
|
124
|
+
case 'v': SetIOpt(VERBOSITY);
|
125
|
+
fprintf(Of, "\tVerbosity level %d\n", VERBOSITY);
|
126
|
+
ArgOK = true;
|
127
|
+
break;
|
128
|
+
#endif
|
129
|
+
case 't': SetIOpt(TRIALS);
|
130
|
+
fprintf(Of, T_OptTrials, TRIALS);
|
131
|
+
Check(TRIALS, 3, 1000);
|
132
|
+
BOOST = true;
|
133
|
+
break;
|
134
|
+
case 's': SUBSET = true;
|
135
|
+
fprintf(Of, T_OptSubsets);
|
136
|
+
ArgOK = true;
|
137
|
+
break;
|
138
|
+
case 'm': SetFOpt(MINITEMS);
|
139
|
+
fprintf(Of, T_OptMinCases, MINITEMS);
|
140
|
+
Check(MINITEMS, 1, 1000000);
|
141
|
+
break;
|
142
|
+
case 'c': SetFOpt(CF);
|
143
|
+
fprintf(Of, T_OptCF, CF);
|
144
|
+
Check(CF, 0, 100);
|
145
|
+
CF /= 100;
|
146
|
+
break;
|
147
|
+
case 'r': RULES = true;
|
148
|
+
fprintf(Of, T_OptRules);
|
149
|
+
ArgOK = true;
|
150
|
+
break;
|
151
|
+
case 'S': SetFOpt(SAMPLE);
|
152
|
+
fprintf(Of, T_OptSampling, SAMPLE);
|
153
|
+
Check(SAMPLE, 0.1, 99.9);
|
154
|
+
SAMPLE /= 100;
|
155
|
+
break;
|
156
|
+
case 'I': SetIOpt(KRInit);
|
157
|
+
fprintf(Of, T_OptSeed, KRInit);
|
158
|
+
KRInit = KRInit & 07777;
|
159
|
+
break;
|
160
|
+
case 'u': SetIOpt(UTILITY);
|
161
|
+
fprintf(Of, T_OptUtility, UTILITY);
|
162
|
+
Check(UTILITY, 2, 10000);
|
163
|
+
RULES = true;
|
164
|
+
break;
|
165
|
+
case 'e': NOCOSTS = true;
|
166
|
+
fprintf(Of, T_OptNoCosts);
|
167
|
+
ArgOK = true;
|
168
|
+
break;
|
169
|
+
case 'w': WINNOW = true;
|
170
|
+
fprintf(Of, T_OptWinnow);
|
171
|
+
ArgOK = true;
|
172
|
+
break;
|
173
|
+
case 'g': GLOBAL = false;
|
174
|
+
fprintf(Of, T_OptNoGlobal);
|
175
|
+
ArgOK = true;
|
176
|
+
break;
|
177
|
+
case 'X': SetIOpt(FOLDS);
|
178
|
+
fprintf(Of, T_OptXval, FOLDS);
|
179
|
+
Check(FOLDS, 2, 1000);
|
180
|
+
XVAL = true;
|
181
|
+
break;
|
182
|
+
}
|
183
|
+
|
184
|
+
if ( ! ArgOK )
|
185
|
+
{
|
186
|
+
if ( o != 'h' )
|
187
|
+
{
|
188
|
+
fprintf(Of, T_UnregnizedOpt,
|
189
|
+
Option,
|
190
|
+
( ! OptArg || OptArg == Option+2 ? "" : OptArg ));
|
191
|
+
fprintf(Of, T_SummaryOpts);
|
192
|
+
}
|
193
|
+
fprintf(Of, T_ListOpts);
|
194
|
+
Goodbye(1);
|
195
|
+
}
|
196
|
+
}
|
197
|
+
|
198
|
+
if ( UTILITY && BOOST )
|
199
|
+
{
|
200
|
+
fprintf(Of, T_UBWarn);
|
201
|
+
}
|
202
|
+
|
203
|
+
StartTime = ExecTime();
|
204
|
+
|
205
|
+
/* Get information on training data */
|
206
|
+
|
207
|
+
if ( ! (F = GetFile(".names", "r")) ) Error(NOFILE, "", "");
|
208
|
+
GetNames(F);
|
209
|
+
|
210
|
+
if ( ClassAtt )
|
211
|
+
{
|
212
|
+
fprintf(Of, T_ClassVar, AttName[ClassAtt]);
|
213
|
+
}
|
214
|
+
|
215
|
+
NotifyStage(READDATA);
|
216
|
+
Progress(-1.0);
|
217
|
+
|
218
|
+
/* Allocate space for SomeMiss[] and SomeNA[] */
|
219
|
+
|
220
|
+
SomeMiss = AllocZero(MaxAtt+1, Boolean);
|
221
|
+
SomeNA = AllocZero(MaxAtt+1, Boolean);
|
222
|
+
|
223
|
+
/* Read data file */
|
224
|
+
|
225
|
+
if ( ! (F = GetFile(".data", "r")) ) Error(NOFILE, "", "");
|
226
|
+
GetData(F, true, false);
|
227
|
+
fprintf(Of, TX_ReadData(MaxCase+1, MaxAtt, FileStem));
|
228
|
+
|
229
|
+
if ( XVAL && (F = GetFile(".test", "r")) )
|
230
|
+
{
|
231
|
+
SaveMaxCase = MaxCase;
|
232
|
+
GetData(F, false, false);
|
233
|
+
fprintf(Of, TX_ReadTest(MaxCase-SaveMaxCase, FileStem));
|
234
|
+
}
|
235
|
+
|
236
|
+
/* Check whether case weight attribute appears */
|
237
|
+
|
238
|
+
if ( CWtAtt )
|
239
|
+
{
|
240
|
+
fprintf(Of, T_CWtAtt);
|
241
|
+
}
|
242
|
+
|
243
|
+
if ( ! NOCOSTS && (F = GetFile(".costs", "r")) )
|
244
|
+
{
|
245
|
+
GetMCosts(F);
|
246
|
+
if ( MCost )
|
247
|
+
{
|
248
|
+
fprintf(Of, T_ReadCosts, FileStem);
|
249
|
+
}
|
250
|
+
}
|
251
|
+
|
252
|
+
/* Note any attribute exclusions/inclusions */
|
253
|
+
|
254
|
+
if ( AttExIn )
|
255
|
+
{
|
256
|
+
fprintf(Of, "%s", ( AttExIn == -1 ? T_AttributesOut : T_AttributesIn ));
|
257
|
+
|
258
|
+
ForEach(Att, 1, MaxAtt)
|
259
|
+
{
|
260
|
+
if ( Att != ClassAtt &&
|
261
|
+
Att != CWtAtt &&
|
262
|
+
( StatBit(Att, SKIP) > 0 ) == ( AttExIn == -1 ) )
|
263
|
+
{
|
264
|
+
fprintf(Of, " %s\n", AttName[Att]);
|
265
|
+
}
|
266
|
+
}
|
267
|
+
}
|
268
|
+
|
269
|
+
/* Build decision trees */
|
270
|
+
|
271
|
+
if ( ! BOOST )
|
272
|
+
{
|
273
|
+
TRIALS = 1;
|
274
|
+
}
|
275
|
+
|
276
|
+
InitialiseTreeData();
|
277
|
+
if ( RULES )
|
278
|
+
{
|
279
|
+
RuleSet = AllocZero(TRIALS+1, CRuleSet);
|
280
|
+
}
|
281
|
+
|
282
|
+
if ( WINNOW )
|
283
|
+
{
|
284
|
+
NotifyStage(WINNOWATTS);
|
285
|
+
Progress(-MaxAtt);
|
286
|
+
WinnowAtts();
|
287
|
+
}
|
288
|
+
|
289
|
+
if ( XVAL )
|
290
|
+
{
|
291
|
+
CrossVal();
|
292
|
+
}
|
293
|
+
else
|
294
|
+
{
|
295
|
+
ConstructClassifiers();
|
296
|
+
|
297
|
+
/* Evaluation */
|
298
|
+
|
299
|
+
fprintf(Of, T_EvalTrain, MaxCase+1);
|
300
|
+
|
301
|
+
NotifyStage(EVALTRAIN);
|
302
|
+
Progress(-TRIALS * (MaxCase+1.0));
|
303
|
+
|
304
|
+
Evaluate(CMINFO | USAGEINFO);
|
305
|
+
|
306
|
+
if ( (F = GetFile(( SAMPLE ? ".data" : ".test" ), "r")) )
|
307
|
+
{
|
308
|
+
NotifyStage(READTEST);
|
309
|
+
fprintf(Of, "\n");
|
310
|
+
|
311
|
+
FreeData();
|
312
|
+
GetData(F, false, false);
|
313
|
+
|
314
|
+
fprintf(Of, T_EvalTest, MaxCase+1);
|
315
|
+
|
316
|
+
NotifyStage(EVALTEST);
|
317
|
+
Progress(-TRIALS * (MaxCase+1.0));
|
318
|
+
|
319
|
+
Evaluate(CMINFO);
|
320
|
+
}
|
321
|
+
}
|
322
|
+
|
323
|
+
fprintf(Of, T_Time, ExecTime() - StartTime);
|
324
|
+
|
325
|
+
#ifdef VerbOpt
|
326
|
+
Cleanup();
|
327
|
+
#endif
|
328
|
+
|
329
|
+
return 0;
|
330
|
+
}
|
data/ext/c5.0/classify.c
ADDED
@@ -0,0 +1,700 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Determine the class of a case from a decision tree or ruleset */
|
30
|
+
/* */
|
31
|
+
/*************************************************************************/
|
32
|
+
|
33
|
+
|
34
|
+
#include "defns.i"
|
35
|
+
#include "extern.i"
|
36
|
+
|
37
|
+
|
38
|
+
/* Local data used by MarkActive and RuleClassify.
|
39
|
+
Note: Active is never deallocated, just grows as required */
|
40
|
+
|
41
|
+
RuleNo *Active=Nil, /* rules that fire while classifying case */
|
42
|
+
NActive, /* number ditto */
|
43
|
+
ActiveSpace=0; /* space allocated */
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
/*************************************************************************/
|
48
|
+
/* */
|
49
|
+
/* Classify a case using a decision tree */
|
50
|
+
/* */
|
51
|
+
/*************************************************************************/
|
52
|
+
|
53
|
+
|
54
|
+
ClassNo TreeClassify(DataRec Case, Tree DecisionTree)
|
55
|
+
/* ------------ */
|
56
|
+
{
|
57
|
+
ClassNo c;
|
58
|
+
|
59
|
+
ForEach(c, 0, MaxClass)
|
60
|
+
{
|
61
|
+
ClassSum[c] = 0;
|
62
|
+
}
|
63
|
+
|
64
|
+
FindLeaf(Case, DecisionTree, Nil, 1.0);
|
65
|
+
|
66
|
+
return SelectClass(1, (Boolean)(MCost != Nil));
|
67
|
+
}
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
/*************************************************************************/
|
72
|
+
/* */
|
73
|
+
/* Classify a case using the given subtree. */
|
74
|
+
/* Adjust the value ClassSum for each class */
|
75
|
+
/* */
|
76
|
+
/*************************************************************************/
|
77
|
+
|
78
|
+
|
79
|
+
void FindLeaf(DataRec Case, Tree T, Tree PT, float Fraction)
|
80
|
+
/* -------- */
|
81
|
+
{
|
82
|
+
DiscrValue v, Dv;
|
83
|
+
ClassNo c;
|
84
|
+
float NewFrac, BrWt[4];
|
85
|
+
|
86
|
+
/* Special case for winnowing cycles */
|
87
|
+
|
88
|
+
if ( T->NodeType && Skip(T->Tested) )
|
89
|
+
{
|
90
|
+
FollowAllBranches(Case, T, Fraction);
|
91
|
+
return;
|
92
|
+
}
|
93
|
+
|
94
|
+
if ( T->NodeType && Tested )
|
95
|
+
{
|
96
|
+
Tested[T->Tested] = true; /* for usage */
|
97
|
+
}
|
98
|
+
|
99
|
+
switch ( T->NodeType )
|
100
|
+
{
|
101
|
+
case 0: /* leaf */
|
102
|
+
|
103
|
+
LeafUpdate:
|
104
|
+
|
105
|
+
/* Use parent node if effectively no cases at this node */
|
106
|
+
|
107
|
+
if ( T->Cases < Epsilon )
|
108
|
+
{
|
109
|
+
T = PT;
|
110
|
+
}
|
111
|
+
|
112
|
+
/* Update from all classes */
|
113
|
+
|
114
|
+
ForEach(c, 1, MaxClass)
|
115
|
+
{
|
116
|
+
ClassSum[c] += Fraction * T->ClassDist[c] / T->Cases;
|
117
|
+
}
|
118
|
+
|
119
|
+
return;
|
120
|
+
|
121
|
+
case BrDiscr: /* test of discrete attribute */
|
122
|
+
|
123
|
+
Dv = DVal(Case, T->Tested); /* > MaxAttVal if unknown */
|
124
|
+
|
125
|
+
if ( Dv <= T->Forks ) /* Make sure not new discrete value */
|
126
|
+
{
|
127
|
+
FindLeaf(Case, T->Branch[Dv], T, Fraction);
|
128
|
+
}
|
129
|
+
else
|
130
|
+
{
|
131
|
+
FollowAllBranches(Case, T, Fraction);
|
132
|
+
}
|
133
|
+
|
134
|
+
return;
|
135
|
+
|
136
|
+
case BrThresh: /* test of continuous attribute */
|
137
|
+
|
138
|
+
if ( Unknown(Case, T->Tested) )
|
139
|
+
{
|
140
|
+
FollowAllBranches(Case, T, Fraction);
|
141
|
+
}
|
142
|
+
else
|
143
|
+
if ( NotApplic(Case, T->Tested) )
|
144
|
+
{
|
145
|
+
FindLeaf(Case, T->Branch[1], T, Fraction);
|
146
|
+
}
|
147
|
+
else
|
148
|
+
{
|
149
|
+
/* Find weights for <= and > branches, interpolating if
|
150
|
+
probabilistic thresholds are used */
|
151
|
+
|
152
|
+
BrWt[2] = Interpolate(T, CVal(Case, T->Tested));
|
153
|
+
BrWt[3] = 1 - BrWt[2];
|
154
|
+
|
155
|
+
ForEach(v, 2, 3)
|
156
|
+
{
|
157
|
+
if ( (NewFrac = Fraction * BrWt[v]) >= 0.01 )
|
158
|
+
{
|
159
|
+
FindLeaf(Case, T->Branch[v], T, NewFrac);
|
160
|
+
}
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
return;
|
165
|
+
|
166
|
+
case BrSubset: /* subset test on discrete attribute */
|
167
|
+
|
168
|
+
Dv = DVal(Case, T->Tested); /* > MaxAttVal if unknown */
|
169
|
+
|
170
|
+
if ( Dv <= MaxAttVal[T->Tested] )
|
171
|
+
{
|
172
|
+
ForEach(v, 1, T->Forks)
|
173
|
+
{
|
174
|
+
if ( In(Dv, T->Subset[v]) )
|
175
|
+
{
|
176
|
+
FindLeaf(Case, T->Branch[v], T, Fraction);
|
177
|
+
|
178
|
+
return;
|
179
|
+
}
|
180
|
+
}
|
181
|
+
|
182
|
+
/* Value not found in any subset -- treat as leaf */
|
183
|
+
|
184
|
+
goto LeafUpdate;
|
185
|
+
}
|
186
|
+
else
|
187
|
+
{
|
188
|
+
FollowAllBranches(Case, T, Fraction);
|
189
|
+
}
|
190
|
+
}
|
191
|
+
}
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
/*************************************************************************/
|
196
|
+
/* */
|
197
|
+
/* Follow all branches from a node, weighting them in proportion */
|
198
|
+
/* to the number of training cases they contain */
|
199
|
+
/* */
|
200
|
+
/*************************************************************************/
|
201
|
+
|
202
|
+
|
203
|
+
void FollowAllBranches(DataRec Case, Tree T, float Fraction)
|
204
|
+
/* ----------------- */
|
205
|
+
{
|
206
|
+
DiscrValue v;
|
207
|
+
|
208
|
+
ForEach(v, 1, T->Forks)
|
209
|
+
{
|
210
|
+
if ( T->Branch[v]->Cases > Epsilon )
|
211
|
+
{
|
212
|
+
FindLeaf(Case, T->Branch[v], T,
|
213
|
+
(Fraction * T->Branch[v]->Cases) / T->Cases);
|
214
|
+
}
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
/*************************************************************************/
|
221
|
+
/* */
|
222
|
+
/* Classify a case using a ruleset */
|
223
|
+
/* */
|
224
|
+
/*************************************************************************/
|
225
|
+
|
226
|
+
|
227
|
+
ClassNo RuleClassify(DataRec Case, CRuleSet RS)
|
228
|
+
/* ------------ */
|
229
|
+
{
|
230
|
+
ClassNo c, Best;
|
231
|
+
float TotWeight=0;
|
232
|
+
int a, u=1, d;
|
233
|
+
CRule R;
|
234
|
+
RuleNo r;
|
235
|
+
|
236
|
+
ForEach(c, 0, MaxClass)
|
237
|
+
{
|
238
|
+
ClassSum[c] = 0;
|
239
|
+
MostSpec[c] = Nil;
|
240
|
+
}
|
241
|
+
|
242
|
+
/* Find active rules */
|
243
|
+
|
244
|
+
NActive = 0;
|
245
|
+
|
246
|
+
if ( RS->RT )
|
247
|
+
{
|
248
|
+
MarkActive(RS->RT, Case);
|
249
|
+
}
|
250
|
+
else
|
251
|
+
{
|
252
|
+
ForEach(r, 1, RS->SNRules)
|
253
|
+
{
|
254
|
+
R = RS->SRule[r];
|
255
|
+
|
256
|
+
if ( Matches(R, Case) )
|
257
|
+
{
|
258
|
+
Active[NActive++] = r;
|
259
|
+
}
|
260
|
+
}
|
261
|
+
}
|
262
|
+
|
263
|
+
/* Must sort rules if using utility bands */
|
264
|
+
|
265
|
+
if ( UtilBand )
|
266
|
+
{
|
267
|
+
SortActive();
|
268
|
+
}
|
269
|
+
|
270
|
+
/* Vote active rules */
|
271
|
+
|
272
|
+
ForEach(a, 0, NActive-1)
|
273
|
+
{
|
274
|
+
r = Active[a];
|
275
|
+
R = RS->SRule[r];
|
276
|
+
|
277
|
+
if ( Tested )
|
278
|
+
{
|
279
|
+
ForEach(d, 1, R->Size)
|
280
|
+
{
|
281
|
+
Tested[R->Lhs[d]->Tested] = true; /* for usage */
|
282
|
+
}
|
283
|
+
}
|
284
|
+
if ( UtilBand ) CheckUtilityBand(&u, r, Class(Case), RS->SDefault);
|
285
|
+
ClassSum[R->Rhs] += R->Vote;
|
286
|
+
TotWeight += 1000.0;
|
287
|
+
|
288
|
+
/* Check whether this is the most specific rule for this class;
|
289
|
+
resolve ties in favor of rule with higher vote */
|
290
|
+
|
291
|
+
if ( ! MostSpec[R->Rhs] ||
|
292
|
+
R->Cover < MostSpec[R->Rhs]->Cover ||
|
293
|
+
( R->Cover == MostSpec[R->Rhs]->Cover &&
|
294
|
+
R->Vote > MostSpec[R->Rhs]->Vote ) )
|
295
|
+
{
|
296
|
+
MostSpec[R->Rhs] = R;
|
297
|
+
}
|
298
|
+
}
|
299
|
+
|
300
|
+
/* Flush any remaining utility bands */
|
301
|
+
|
302
|
+
if ( UtilBand )
|
303
|
+
{
|
304
|
+
CheckUtilityBand(&u, RS->SNRules+1, Class(Case), RS->SDefault);
|
305
|
+
}
|
306
|
+
|
307
|
+
/* Check for default and normalise ClassSum */
|
308
|
+
|
309
|
+
if ( ! TotWeight )
|
310
|
+
{
|
311
|
+
Confidence = 0.5;
|
312
|
+
return RS->SDefault;
|
313
|
+
}
|
314
|
+
|
315
|
+
ForEach(c, 1, MaxClass)
|
316
|
+
{
|
317
|
+
ClassSum[c] /= TotWeight;
|
318
|
+
}
|
319
|
+
|
320
|
+
Best = SelectClass(RS->SDefault, false);
|
321
|
+
|
322
|
+
/* Set Confidence to the vote for the most specific rule of class Best */
|
323
|
+
|
324
|
+
Confidence = MostSpec[Best]->Vote / 1000.0;
|
325
|
+
|
326
|
+
return Best;
|
327
|
+
}
|
328
|
+
|
329
|
+
|
330
|
+
|
331
|
+
/*************************************************************************/
|
332
|
+
/* */
|
333
|
+
/* Determine outcome of a test on a case. */
|
334
|
+
/* Return -1 if value of tested attribute is unknown */
|
335
|
+
/* */
|
336
|
+
/*************************************************************************/
|
337
|
+
|
338
|
+
|
339
|
+
int FindOutcome(DataRec Case, Condition OneCond)
|
340
|
+
/* ----------- */
|
341
|
+
{
|
342
|
+
DiscrValue v, Outcome;
|
343
|
+
Attribute Att;
|
344
|
+
|
345
|
+
Att = OneCond->Tested;
|
346
|
+
|
347
|
+
/* Determine the outcome of this test on this case */
|
348
|
+
|
349
|
+
switch ( OneCond->NodeType )
|
350
|
+
{
|
351
|
+
case BrDiscr: /* test of discrete attribute */
|
352
|
+
|
353
|
+
v = XDVal(Case, Att);
|
354
|
+
Outcome = ( v == 0 ? -1 : v );
|
355
|
+
break;
|
356
|
+
|
357
|
+
case BrThresh: /* test of continuous attribute */
|
358
|
+
|
359
|
+
Outcome = ( Unknown(Case, Att) ? -1 :
|
360
|
+
NotApplic(Case, Att) ? 1 :
|
361
|
+
CVal(Case, Att) <= OneCond->Cut ? 2 : 3 );
|
362
|
+
break;
|
363
|
+
|
364
|
+
case BrSubset: /* subset test on discrete attribute */
|
365
|
+
|
366
|
+
v = XDVal(Case, Att);
|
367
|
+
Outcome = ( v <= MaxAttVal[Att] && In(v, OneCond->Subset) ?
|
368
|
+
OneCond->TestValue : 0 );
|
369
|
+
}
|
370
|
+
|
371
|
+
return Outcome;
|
372
|
+
}
|
373
|
+
|
374
|
+
|
375
|
+
|
376
|
+
/*************************************************************************/
|
377
|
+
/* */
|
378
|
+
/* Determine whether a case satisfies a condition */
|
379
|
+
/* */
|
380
|
+
/*************************************************************************/
|
381
|
+
|
382
|
+
|
383
|
+
Boolean Satisfies(DataRec Case, Condition OneCond)
|
384
|
+
/* --------- */
|
385
|
+
{
|
386
|
+
return ( FindOutcome(Case, OneCond) == OneCond->TestValue );
|
387
|
+
}
|
388
|
+
|
389
|
+
|
390
|
+
|
391
|
+
/*************************************************************************/
|
392
|
+
/* */
|
393
|
+
/* Determine whether a case satisfies all conditions of a rule */
|
394
|
+
/* */
|
395
|
+
/*************************************************************************/
|
396
|
+
|
397
|
+
|
398
|
+
Boolean Matches(CRule R, DataRec Case)
|
399
|
+
/* ------- */
|
400
|
+
{
|
401
|
+
int d;
|
402
|
+
|
403
|
+
ForEach(d, 1, R->Size)
|
404
|
+
{
|
405
|
+
if ( ! Satisfies(Case, R->Lhs[d]) )
|
406
|
+
{
|
407
|
+
return false;
|
408
|
+
}
|
409
|
+
}
|
410
|
+
|
411
|
+
return true;
|
412
|
+
}
|
413
|
+
|
414
|
+
|
415
|
+
|
416
|
+
/*************************************************************************/
|
417
|
+
/* */
|
418
|
+
/* Make sure that Active[] has space for at least N rules */
|
419
|
+
/* */
|
420
|
+
/*************************************************************************/
|
421
|
+
|
422
|
+
|
423
|
+
void CheckActiveSpace(int N)
|
424
|
+
/* ---------------- */
|
425
|
+
{
|
426
|
+
if ( ActiveSpace <= N )
|
427
|
+
{
|
428
|
+
Realloc(Active, (ActiveSpace=N+1), RuleNo);
|
429
|
+
}
|
430
|
+
}
|
431
|
+
|
432
|
+
|
433
|
+
|
434
|
+
/*************************************************************************/
|
435
|
+
/* */
|
436
|
+
/* Use RT to enter active rules in Active[] */
|
437
|
+
/* */
|
438
|
+
/*************************************************************************/
|
439
|
+
|
440
|
+
|
441
|
+
void MarkActive(RuleTree RT, DataRec Case)
|
442
|
+
/* ---------- */
|
443
|
+
{
|
444
|
+
DiscrValue v;
|
445
|
+
int ri;
|
446
|
+
RuleNo r;
|
447
|
+
|
448
|
+
if ( ! RT ) return;
|
449
|
+
|
450
|
+
/* Enter any rules satisfied at this node */
|
451
|
+
|
452
|
+
if ( RT->Fire )
|
453
|
+
{
|
454
|
+
for ( ri = 0 ; (r = RT->Fire[ri]) ; ri++ )
|
455
|
+
{
|
456
|
+
Active[NActive++] = r;
|
457
|
+
}
|
458
|
+
}
|
459
|
+
|
460
|
+
if ( ! RT->Branch ) return;
|
461
|
+
|
462
|
+
/* Explore subtree for rules that include condition at this node */
|
463
|
+
|
464
|
+
if ( (v = FindOutcome(Case, RT->CondTest)) > 0 && v <= RT->Forks )
|
465
|
+
{
|
466
|
+
MarkActive(RT->Branch[v], Case);
|
467
|
+
}
|
468
|
+
|
469
|
+
/* Explore default subtree for rules that do not include condition */
|
470
|
+
|
471
|
+
MarkActive(RT->Branch[0], Case);
|
472
|
+
}
|
473
|
+
|
474
|
+
|
475
|
+
|
476
|
+
/*************************************************************************/
|
477
|
+
/* */
|
478
|
+
/* Sort active rules for utility band error rates */
|
479
|
+
/* */
|
480
|
+
/*************************************************************************/
|
481
|
+
|
482
|
+
|
483
|
+
void SortActive()
|
484
|
+
/* ---------- */
|
485
|
+
{
|
486
|
+
RuleNo r;
|
487
|
+
int a, aa, aLow;
|
488
|
+
|
489
|
+
ForEach(a, 0, NActive-1)
|
490
|
+
{
|
491
|
+
aLow = a;
|
492
|
+
|
493
|
+
ForEach(aa, a+1, NActive-1)
|
494
|
+
{
|
495
|
+
if ( Active[aa] < Active[aLow] ) aLow = aa;
|
496
|
+
}
|
497
|
+
|
498
|
+
r = Active[a];
|
499
|
+
Active[a] = Active[aLow];
|
500
|
+
Active[aLow] = r;
|
501
|
+
}
|
502
|
+
}
|
503
|
+
|
504
|
+
|
505
|
+
|
506
|
+
/*************************************************************************/
|
507
|
+
/* */
|
508
|
+
/* Update utility band error rates for all bands before rule r */
|
509
|
+
/* that have not been competed yet. Update current band. */
|
510
|
+
/* */
|
511
|
+
/*************************************************************************/
|
512
|
+
|
513
|
+
|
514
|
+
void CheckUtilityBand(int *u, RuleNo r, ClassNo Actual, ClassNo Default)
|
515
|
+
/* ---------------- */
|
516
|
+
{
|
517
|
+
ClassNo c;
|
518
|
+
|
519
|
+
while ( *u < UTILITY && r > UtilBand[*u] )
|
520
|
+
{
|
521
|
+
c = SelectClass(Default, false);
|
522
|
+
if ( c != Actual )
|
523
|
+
{
|
524
|
+
UtilErr[*u]++;
|
525
|
+
if ( MCost ) UtilCost[*u] += MCost[c][Actual];
|
526
|
+
}
|
527
|
+
|
528
|
+
(*u)++;
|
529
|
+
}
|
530
|
+
}
|
531
|
+
|
532
|
+
|
533
|
+
|
534
|
+
/*************************************************************************/
|
535
|
+
/* */
|
536
|
+
/* Classify a case using boosted tree or rule sequence. */
|
537
|
+
/* Global variable Default must have been set prior to call */
|
538
|
+
/* */
|
539
|
+
/* Note: boosting with costs is complicated. With trees, */
|
540
|
+
/* complete class distributions are accumulated and then a class */
|
541
|
+
/* selected to minimize expected cost. This cannot be done with */
|
542
|
+
/* rulesets since a single ruleset does not give a reliable */
|
543
|
+
/* class distribution; instead, the votes from all cost-adjusted */
|
544
|
+
/* rulesets are combined without reference to costs. */
|
545
|
+
/* */
|
546
|
+
/*************************************************************************/
|
547
|
+
|
548
|
+
|
549
|
+
ClassNo BoostClassify(DataRec Case, int MaxTrial)
|
550
|
+
/* ------------- */
|
551
|
+
{
|
552
|
+
ClassNo c, Best;
|
553
|
+
int t;
|
554
|
+
float Total=0;
|
555
|
+
|
556
|
+
ForEach(c, 1, MaxClass)
|
557
|
+
{
|
558
|
+
Vote[c] = 0;
|
559
|
+
}
|
560
|
+
|
561
|
+
ForEach(t, 0, MaxTrial)
|
562
|
+
{
|
563
|
+
Best = ( RULES ? RuleClassify(Case, RuleSet[t]) :
|
564
|
+
TreeClassify(Case, Pruned[t]) );
|
565
|
+
|
566
|
+
Vote[Best] += Confidence;
|
567
|
+
Total += Confidence;
|
568
|
+
|
569
|
+
TrialPred[t] = Best;
|
570
|
+
}
|
571
|
+
|
572
|
+
/* Copy votes into ClassSum */
|
573
|
+
|
574
|
+
ForEach(c, 1, MaxClass)
|
575
|
+
{
|
576
|
+
ClassSum[c] = Vote[c] / Total;
|
577
|
+
}
|
578
|
+
|
579
|
+
return SelectClass(Default, false);
|
580
|
+
}
|
581
|
+
|
582
|
+
|
583
|
+
|
584
|
+
/*************************************************************************/
|
585
|
+
/* */
|
586
|
+
/* Select the best class to return. Take misclassification costs */
|
587
|
+
/* into account if they are defined. */
|
588
|
+
/* */
|
589
|
+
/*************************************************************************/
|
590
|
+
|
591
|
+
|
592
|
+
ClassNo SelectClass(ClassNo Default, Boolean UseCosts)
|
593
|
+
/* ----------- */
|
594
|
+
{
|
595
|
+
ClassNo c, cc, BestClass;
|
596
|
+
float ExpCost, BestCost=1E38, TotCost=0;
|
597
|
+
|
598
|
+
BestClass = Default;
|
599
|
+
|
600
|
+
if ( UseCosts )
|
601
|
+
{
|
602
|
+
ForEach(c, 1, MaxClass)
|
603
|
+
{
|
604
|
+
ExpCost = 0;
|
605
|
+
ForEach(cc, 1, MaxClass)
|
606
|
+
{
|
607
|
+
if ( cc == c ) continue;
|
608
|
+
ExpCost += ClassSum[cc] * MCost[c][cc];
|
609
|
+
}
|
610
|
+
|
611
|
+
TotCost += ExpCost;
|
612
|
+
|
613
|
+
if ( ExpCost < BestCost )
|
614
|
+
{
|
615
|
+
BestClass = c;
|
616
|
+
BestCost = ExpCost;
|
617
|
+
}
|
618
|
+
}
|
619
|
+
|
620
|
+
Confidence = 1 - BestCost / TotCost;
|
621
|
+
}
|
622
|
+
else
|
623
|
+
{
|
624
|
+
ForEach(c, 1, MaxClass)
|
625
|
+
{
|
626
|
+
if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c;
|
627
|
+
}
|
628
|
+
|
629
|
+
Confidence = ClassSum[BestClass];
|
630
|
+
}
|
631
|
+
|
632
|
+
return BestClass;
|
633
|
+
}
|
634
|
+
|
635
|
+
|
636
|
+
|
637
|
+
/*************************************************************************/
|
638
|
+
/* */
|
639
|
+
/* General classification routine */
|
640
|
+
/* */
|
641
|
+
/*************************************************************************/
|
642
|
+
|
643
|
+
|
644
|
+
ClassNo Classify(DataRec Case)
|
645
|
+
/* -------- */
|
646
|
+
{
|
647
|
+
|
648
|
+
return ( TRIALS > 1 ? BoostClassify(Case, TRIALS-1) :
|
649
|
+
RULES ? RuleClassify(Case, RuleSet[0]) :
|
650
|
+
TreeClassify(Case, Pruned[0]) );
|
651
|
+
}
|
652
|
+
|
653
|
+
|
654
|
+
|
655
|
+
/*************************************************************************/
|
656
|
+
/* */
|
657
|
+
/* Interpolate a single value between Lower, Mid and Upper */
|
658
|
+
/* (All these have the same value unless using probabilistic */
|
659
|
+
/* thresholds.) */
|
660
|
+
/* */
|
661
|
+
/*************************************************************************/
|
662
|
+
|
663
|
+
|
664
|
+
float Interpolate(Tree T, ContValue Val)
|
665
|
+
/* ----------- */
|
666
|
+
{
|
667
|
+
return ( Val <= T->Lower ? 1.0 :
|
668
|
+
Val >= T->Upper ? 0.0 :
|
669
|
+
Val <= T->Mid ?
|
670
|
+
1 - 0.5 * (Val - T->Lower) / (T->Mid - T->Lower + 1E-6) :
|
671
|
+
0.5 - 0.5 * (Val - T->Mid) / (T->Upper - T->Mid + 1E-6) );
|
672
|
+
}
|
673
|
+
|
674
|
+
|
675
|
+
|
676
|
+
/*************************************************************************/
|
677
|
+
/* */
|
678
|
+
/* Free data structures for one classifier */
|
679
|
+
/* */
|
680
|
+
/*************************************************************************/
|
681
|
+
|
682
|
+
|
683
|
+
void FreeClassifier(int Trial)
|
684
|
+
/* -------------- */
|
685
|
+
{
|
686
|
+
if ( Raw )
|
687
|
+
{
|
688
|
+
FreeTree(Raw[Trial]); Raw[Trial] = Nil;
|
689
|
+
}
|
690
|
+
|
691
|
+
if ( Pruned )
|
692
|
+
{
|
693
|
+
FreeTree(Pruned[Trial]); Pruned[Trial] = Nil;
|
694
|
+
}
|
695
|
+
|
696
|
+
if ( RULES && RuleSet && RuleSet[Trial] )
|
697
|
+
{
|
698
|
+
FreeRules(RuleSet[Trial]); RuleSet[Trial] = Nil;
|
699
|
+
}
|
700
|
+
}
|