see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/c50.c ADDED
@@ -0,0 +1,330 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Main routine, C5.0 */
30
+ /* ------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+ #include <signal.h>
38
+
39
+ #include <sys/unistd.h>
40
+ #include <sys/time.h>
41
+ #include <sys/resource.h>
42
+
43
/*  Helpers for parsing the numeric value of a command-line option.

    Both macros must be expanded directly inside a `case' of the option
    switch in main(): they rely on the surrounding scope for OptArg
    (text of the value), EndPtr (scratch char *) and ArgOK (success
    flag), and they use `break' to leave the switch with ArgOK untouched
    when the text is not a complete number.  Because of that `break'
    they cannot be wrapped in do { } while (0).

    A value is rejected when nothing was consumed (EndPtr == OptArg, e.g.
    an empty string) or when characters remain after the number.  */

#define SetFOpt(V)	V = strtod(OptArg, &EndPtr);\
			if ( ! EndPtr || EndPtr == OptArg ||\
			     *EndPtr != '\00' ) break;\
			ArgOK = true
#define SetIOpt(V)	V = strtol(OptArg, &EndPtr, 10);\
			if ( ! EndPtr || EndPtr == OptArg ||\
			     *EndPtr != '\00' ) break;\
			ArgOK = true
49
+
50
+
51
+ int main(int Argc, char *Argv[])
52
+ /* ---- */
53
+ {
54
+ int o;
55
+ extern String OptArg, Option;
56
+ char *EndPtr;
57
+ Boolean FirstTime=true, ArgOK;
58
+ double StartTime;
59
+ FILE *F;
60
+ CaseNo SaveMaxCase;
61
+ Attribute Att;
62
+
63
+ struct rlimit RL;
64
+
65
+ /* Make sure there is a largish runtime stack */
66
+
67
+ getrlimit(RLIMIT_STACK, &RL);
68
+
69
+ RL.rlim_cur = Max(RL.rlim_cur, 20 * 1024 * 1024);
70
+
71
+ if ( RL.rlim_max > 0 ) /* -1 if unlimited */
72
+ {
73
+ RL.rlim_cur = Min(RL.rlim_max, RL.rlim_cur);
74
+ }
75
+
76
+ setrlimit(RLIMIT_STACK, &RL);
77
+
78
+
79
+ /* Check for output to be saved to a file */
80
+
81
+ if ( Argc > 2 && ! strcmp(Argv[Argc-2], "-o") )
82
+ {
83
+ Of = fopen(Argv[Argc-1], "w");
84
+ Argc -= 2;
85
+ }
86
+
87
+ if ( ! Of )
88
+ {
89
+ Of = stdout;
90
+ }
91
+
92
+ KRInit = time(0) & 07777;
93
+
94
+ PrintHeader("");
95
+
96
+ /* Process options */
97
+
98
+ while ( (o = ProcessOption(Argc, Argv, "f+bpv+t+sm+c+S+I+ru+egX+wh")) )
99
+ {
100
+ if ( FirstTime )
101
+ {
102
+ fprintf(Of, T_OptHeader);
103
+ FirstTime = false;
104
+ }
105
+
106
+ ArgOK = false;
107
+
108
+ switch (o)
109
+ {
110
+ case 'f': FileStem = OptArg;
111
+ fprintf(Of, T_OptApplication, FileStem);
112
+ ArgOK = true;
113
+ break;
114
+ case 'b': BOOST = true;
115
+ fprintf(Of, T_OptBoost);
116
+ if ( TRIALS == 1 ) TRIALS = 10;
117
+ ArgOK = true;
118
+ break;
119
+ case 'p': PROBTHRESH = true;
120
+ fprintf(Of, T_OptProbThresh);
121
+ ArgOK = true;
122
+ break;
123
+ #ifdef VerbOpt
124
+ case 'v': SetIOpt(VERBOSITY);
125
+ fprintf(Of, "\tVerbosity level %d\n", VERBOSITY);
126
+ ArgOK = true;
127
+ break;
128
+ #endif
129
+ case 't': SetIOpt(TRIALS);
130
+ fprintf(Of, T_OptTrials, TRIALS);
131
+ Check(TRIALS, 3, 1000);
132
+ BOOST = true;
133
+ break;
134
+ case 's': SUBSET = true;
135
+ fprintf(Of, T_OptSubsets);
136
+ ArgOK = true;
137
+ break;
138
+ case 'm': SetFOpt(MINITEMS);
139
+ fprintf(Of, T_OptMinCases, MINITEMS);
140
+ Check(MINITEMS, 1, 1000000);
141
+ break;
142
+ case 'c': SetFOpt(CF);
143
+ fprintf(Of, T_OptCF, CF);
144
+ Check(CF, 0, 100);
145
+ CF /= 100;
146
+ break;
147
+ case 'r': RULES = true;
148
+ fprintf(Of, T_OptRules);
149
+ ArgOK = true;
150
+ break;
151
+ case 'S': SetFOpt(SAMPLE);
152
+ fprintf(Of, T_OptSampling, SAMPLE);
153
+ Check(SAMPLE, 0.1, 99.9);
154
+ SAMPLE /= 100;
155
+ break;
156
+ case 'I': SetIOpt(KRInit);
157
+ fprintf(Of, T_OptSeed, KRInit);
158
+ KRInit = KRInit & 07777;
159
+ break;
160
+ case 'u': SetIOpt(UTILITY);
161
+ fprintf(Of, T_OptUtility, UTILITY);
162
+ Check(UTILITY, 2, 10000);
163
+ RULES = true;
164
+ break;
165
+ case 'e': NOCOSTS = true;
166
+ fprintf(Of, T_OptNoCosts);
167
+ ArgOK = true;
168
+ break;
169
+ case 'w': WINNOW = true;
170
+ fprintf(Of, T_OptWinnow);
171
+ ArgOK = true;
172
+ break;
173
+ case 'g': GLOBAL = false;
174
+ fprintf(Of, T_OptNoGlobal);
175
+ ArgOK = true;
176
+ break;
177
+ case 'X': SetIOpt(FOLDS);
178
+ fprintf(Of, T_OptXval, FOLDS);
179
+ Check(FOLDS, 2, 1000);
180
+ XVAL = true;
181
+ break;
182
+ }
183
+
184
+ if ( ! ArgOK )
185
+ {
186
+ if ( o != 'h' )
187
+ {
188
+ fprintf(Of, T_UnregnizedOpt,
189
+ Option,
190
+ ( ! OptArg || OptArg == Option+2 ? "" : OptArg ));
191
+ fprintf(Of, T_SummaryOpts);
192
+ }
193
+ fprintf(Of, T_ListOpts);
194
+ Goodbye(1);
195
+ }
196
+ }
197
+
198
+ if ( UTILITY && BOOST )
199
+ {
200
+ fprintf(Of, T_UBWarn);
201
+ }
202
+
203
+ StartTime = ExecTime();
204
+
205
+ /* Get information on training data */
206
+
207
+ if ( ! (F = GetFile(".names", "r")) ) Error(NOFILE, "", "");
208
+ GetNames(F);
209
+
210
+ if ( ClassAtt )
211
+ {
212
+ fprintf(Of, T_ClassVar, AttName[ClassAtt]);
213
+ }
214
+
215
+ NotifyStage(READDATA);
216
+ Progress(-1.0);
217
+
218
+ /* Allocate space for SomeMiss[] and SomeNA[] */
219
+
220
+ SomeMiss = AllocZero(MaxAtt+1, Boolean);
221
+ SomeNA = AllocZero(MaxAtt+1, Boolean);
222
+
223
+ /* Read data file */
224
+
225
+ if ( ! (F = GetFile(".data", "r")) ) Error(NOFILE, "", "");
226
+ GetData(F, true, false);
227
+ fprintf(Of, TX_ReadData(MaxCase+1, MaxAtt, FileStem));
228
+
229
+ if ( XVAL && (F = GetFile(".test", "r")) )
230
+ {
231
+ SaveMaxCase = MaxCase;
232
+ GetData(F, false, false);
233
+ fprintf(Of, TX_ReadTest(MaxCase-SaveMaxCase, FileStem));
234
+ }
235
+
236
+ /* Check whether case weight attribute appears */
237
+
238
+ if ( CWtAtt )
239
+ {
240
+ fprintf(Of, T_CWtAtt);
241
+ }
242
+
243
+ if ( ! NOCOSTS && (F = GetFile(".costs", "r")) )
244
+ {
245
+ GetMCosts(F);
246
+ if ( MCost )
247
+ {
248
+ fprintf(Of, T_ReadCosts, FileStem);
249
+ }
250
+ }
251
+
252
+ /* Note any attribute exclusions/inclusions */
253
+
254
+ if ( AttExIn )
255
+ {
256
+ fprintf(Of, "%s", ( AttExIn == -1 ? T_AttributesOut : T_AttributesIn ));
257
+
258
+ ForEach(Att, 1, MaxAtt)
259
+ {
260
+ if ( Att != ClassAtt &&
261
+ Att != CWtAtt &&
262
+ ( StatBit(Att, SKIP) > 0 ) == ( AttExIn == -1 ) )
263
+ {
264
+ fprintf(Of, " %s\n", AttName[Att]);
265
+ }
266
+ }
267
+ }
268
+
269
+ /* Build decision trees */
270
+
271
+ if ( ! BOOST )
272
+ {
273
+ TRIALS = 1;
274
+ }
275
+
276
+ InitialiseTreeData();
277
+ if ( RULES )
278
+ {
279
+ RuleSet = AllocZero(TRIALS+1, CRuleSet);
280
+ }
281
+
282
+ if ( WINNOW )
283
+ {
284
+ NotifyStage(WINNOWATTS);
285
+ Progress(-MaxAtt);
286
+ WinnowAtts();
287
+ }
288
+
289
+ if ( XVAL )
290
+ {
291
+ CrossVal();
292
+ }
293
+ else
294
+ {
295
+ ConstructClassifiers();
296
+
297
+ /* Evaluation */
298
+
299
+ fprintf(Of, T_EvalTrain, MaxCase+1);
300
+
301
+ NotifyStage(EVALTRAIN);
302
+ Progress(-TRIALS * (MaxCase+1.0));
303
+
304
+ Evaluate(CMINFO | USAGEINFO);
305
+
306
+ if ( (F = GetFile(( SAMPLE ? ".data" : ".test" ), "r")) )
307
+ {
308
+ NotifyStage(READTEST);
309
+ fprintf(Of, "\n");
310
+
311
+ FreeData();
312
+ GetData(F, false, false);
313
+
314
+ fprintf(Of, T_EvalTest, MaxCase+1);
315
+
316
+ NotifyStage(EVALTEST);
317
+ Progress(-TRIALS * (MaxCase+1.0));
318
+
319
+ Evaluate(CMINFO);
320
+ }
321
+ }
322
+
323
+ fprintf(Of, T_Time, ExecTime() - StartTime);
324
+
325
+ #ifdef VerbOpt
326
+ Cleanup();
327
+ #endif
328
+
329
+ return 0;
330
+ }
@@ -0,0 +1,700 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Determine the class of a case from a decision tree or ruleset */
30
+ /* */
31
+ /*************************************************************************/
32
+
33
+
34
+ #include "defns.i"
35
+ #include "extern.i"
36
+
37
+
38
+ /* Local data used by MarkActive and RuleClassify.
39
+ Note: Active is never deallocated, just grows as required */
40
+
41
+ RuleNo *Active=Nil, /* rules that fire while classifying case */
42
+ NActive, /* number ditto */
43
+ ActiveSpace=0; /* space allocated */
44
+
45
+
46
+
47
+ /*************************************************************************/
48
+ /* */
49
+ /* Classify a case using a decision tree */
50
+ /* */
51
+ /*************************************************************************/
52
+
53
+
54
+ ClassNo TreeClassify(DataRec Case, Tree DecisionTree)
55
+ /* ------------ */
56
+ {
57
+ ClassNo c;
58
+
59
+ ForEach(c, 0, MaxClass)
60
+ {
61
+ ClassSum[c] = 0;
62
+ }
63
+
64
+ FindLeaf(Case, DecisionTree, Nil, 1.0);
65
+
66
+ return SelectClass(1, (Boolean)(MCost != Nil));
67
+ }
68
+
69
+
70
+
71
+ /*************************************************************************/
72
+ /* */
73
+ /* Classify a case using the given subtree. */
74
+ /* Adjust the value ClassSum for each class */
75
+ /* */
76
+ /*************************************************************************/
77
+
78
+
79
+ void FindLeaf(DataRec Case, Tree T, Tree PT, float Fraction)
80
+ /* -------- */
81
+ {
82
+ DiscrValue v, Dv;
83
+ ClassNo c;
84
+ float NewFrac, BrWt[4];
85
+
86
+ /* Special case for winnowing cycles */
87
+
88
+ if ( T->NodeType && Skip(T->Tested) )
89
+ {
90
+ FollowAllBranches(Case, T, Fraction);
91
+ return;
92
+ }
93
+
94
+ if ( T->NodeType && Tested )
95
+ {
96
+ Tested[T->Tested] = true; /* for usage */
97
+ }
98
+
99
+ switch ( T->NodeType )
100
+ {
101
+ case 0: /* leaf */
102
+
103
+ LeafUpdate:
104
+
105
+ /* Use parent node if effectively no cases at this node */
106
+
107
+ if ( T->Cases < Epsilon )
108
+ {
109
+ T = PT;
110
+ }
111
+
112
+ /* Update from all classes */
113
+
114
+ ForEach(c, 1, MaxClass)
115
+ {
116
+ ClassSum[c] += Fraction * T->ClassDist[c] / T->Cases;
117
+ }
118
+
119
+ return;
120
+
121
+ case BrDiscr: /* test of discrete attribute */
122
+
123
+ Dv = DVal(Case, T->Tested); /* > MaxAttVal if unknown */
124
+
125
+ if ( Dv <= T->Forks ) /* Make sure not new discrete value */
126
+ {
127
+ FindLeaf(Case, T->Branch[Dv], T, Fraction);
128
+ }
129
+ else
130
+ {
131
+ FollowAllBranches(Case, T, Fraction);
132
+ }
133
+
134
+ return;
135
+
136
+ case BrThresh: /* test of continuous attribute */
137
+
138
+ if ( Unknown(Case, T->Tested) )
139
+ {
140
+ FollowAllBranches(Case, T, Fraction);
141
+ }
142
+ else
143
+ if ( NotApplic(Case, T->Tested) )
144
+ {
145
+ FindLeaf(Case, T->Branch[1], T, Fraction);
146
+ }
147
+ else
148
+ {
149
+ /* Find weights for <= and > branches, interpolating if
150
+ probabilistic thresholds are used */
151
+
152
+ BrWt[2] = Interpolate(T, CVal(Case, T->Tested));
153
+ BrWt[3] = 1 - BrWt[2];
154
+
155
+ ForEach(v, 2, 3)
156
+ {
157
+ if ( (NewFrac = Fraction * BrWt[v]) >= 0.01 )
158
+ {
159
+ FindLeaf(Case, T->Branch[v], T, NewFrac);
160
+ }
161
+ }
162
+ }
163
+
164
+ return;
165
+
166
+ case BrSubset: /* subset test on discrete attribute */
167
+
168
+ Dv = DVal(Case, T->Tested); /* > MaxAttVal if unknown */
169
+
170
+ if ( Dv <= MaxAttVal[T->Tested] )
171
+ {
172
+ ForEach(v, 1, T->Forks)
173
+ {
174
+ if ( In(Dv, T->Subset[v]) )
175
+ {
176
+ FindLeaf(Case, T->Branch[v], T, Fraction);
177
+
178
+ return;
179
+ }
180
+ }
181
+
182
+ /* Value not found in any subset -- treat as leaf */
183
+
184
+ goto LeafUpdate;
185
+ }
186
+ else
187
+ {
188
+ FollowAllBranches(Case, T, Fraction);
189
+ }
190
+ }
191
+ }
192
+
193
+
194
+
195
+ /*************************************************************************/
196
+ /* */
197
+ /* Follow all branches from a node, weighting them in proportion */
198
+ /* to the number of training cases they contain */
199
+ /* */
200
+ /*************************************************************************/
201
+
202
+
203
+ void FollowAllBranches(DataRec Case, Tree T, float Fraction)
204
+ /* ----------------- */
205
+ {
206
+ DiscrValue v;
207
+
208
+ ForEach(v, 1, T->Forks)
209
+ {
210
+ if ( T->Branch[v]->Cases > Epsilon )
211
+ {
212
+ FindLeaf(Case, T->Branch[v], T,
213
+ (Fraction * T->Branch[v]->Cases) / T->Cases);
214
+ }
215
+ }
216
+ }
217
+
218
+
219
+
220
+ /*************************************************************************/
221
+ /* */
222
+ /* Classify a case using a ruleset */
223
+ /* */
224
+ /*************************************************************************/
225
+
226
+
227
+ ClassNo RuleClassify(DataRec Case, CRuleSet RS)
228
+ /* ------------ */
229
+ {
230
+ ClassNo c, Best;
231
+ float TotWeight=0;
232
+ int a, u=1, d;
233
+ CRule R;
234
+ RuleNo r;
235
+
236
+ ForEach(c, 0, MaxClass)
237
+ {
238
+ ClassSum[c] = 0;
239
+ MostSpec[c] = Nil;
240
+ }
241
+
242
+ /* Find active rules */
243
+
244
+ NActive = 0;
245
+
246
+ if ( RS->RT )
247
+ {
248
+ MarkActive(RS->RT, Case);
249
+ }
250
+ else
251
+ {
252
+ ForEach(r, 1, RS->SNRules)
253
+ {
254
+ R = RS->SRule[r];
255
+
256
+ if ( Matches(R, Case) )
257
+ {
258
+ Active[NActive++] = r;
259
+ }
260
+ }
261
+ }
262
+
263
+ /* Must sort rules if using utility bands */
264
+
265
+ if ( UtilBand )
266
+ {
267
+ SortActive();
268
+ }
269
+
270
+ /* Vote active rules */
271
+
272
+ ForEach(a, 0, NActive-1)
273
+ {
274
+ r = Active[a];
275
+ R = RS->SRule[r];
276
+
277
+ if ( Tested )
278
+ {
279
+ ForEach(d, 1, R->Size)
280
+ {
281
+ Tested[R->Lhs[d]->Tested] = true; /* for usage */
282
+ }
283
+ }
284
+ if ( UtilBand ) CheckUtilityBand(&u, r, Class(Case), RS->SDefault);
285
+ ClassSum[R->Rhs] += R->Vote;
286
+ TotWeight += 1000.0;
287
+
288
+ /* Check whether this is the most specific rule for this class;
289
+ resolve ties in favor of rule with higher vote */
290
+
291
+ if ( ! MostSpec[R->Rhs] ||
292
+ R->Cover < MostSpec[R->Rhs]->Cover ||
293
+ ( R->Cover == MostSpec[R->Rhs]->Cover &&
294
+ R->Vote > MostSpec[R->Rhs]->Vote ) )
295
+ {
296
+ MostSpec[R->Rhs] = R;
297
+ }
298
+ }
299
+
300
+ /* Flush any remaining utility bands */
301
+
302
+ if ( UtilBand )
303
+ {
304
+ CheckUtilityBand(&u, RS->SNRules+1, Class(Case), RS->SDefault);
305
+ }
306
+
307
+ /* Check for default and normalise ClassSum */
308
+
309
+ if ( ! TotWeight )
310
+ {
311
+ Confidence = 0.5;
312
+ return RS->SDefault;
313
+ }
314
+
315
+ ForEach(c, 1, MaxClass)
316
+ {
317
+ ClassSum[c] /= TotWeight;
318
+ }
319
+
320
+ Best = SelectClass(RS->SDefault, false);
321
+
322
+ /* Set Confidence to the vote for the most specific rule of class Best */
323
+
324
+ Confidence = MostSpec[Best]->Vote / 1000.0;
325
+
326
+ return Best;
327
+ }
328
+
329
+
330
+
331
+ /*************************************************************************/
332
+ /* */
333
+ /* Determine outcome of a test on a case. */
334
+ /* Return -1 if value of tested attribute is unknown */
335
+ /* */
336
+ /*************************************************************************/
337
+
338
+
339
+ int FindOutcome(DataRec Case, Condition OneCond)
340
+ /* ----------- */
341
+ {
342
+ DiscrValue v, Outcome;
343
+ Attribute Att;
344
+
345
+ Att = OneCond->Tested;
346
+
347
+ /* Determine the outcome of this test on this case */
348
+
349
+ switch ( OneCond->NodeType )
350
+ {
351
+ case BrDiscr: /* test of discrete attribute */
352
+
353
+ v = XDVal(Case, Att);
354
+ Outcome = ( v == 0 ? -1 : v );
355
+ break;
356
+
357
+ case BrThresh: /* test of continuous attribute */
358
+
359
+ Outcome = ( Unknown(Case, Att) ? -1 :
360
+ NotApplic(Case, Att) ? 1 :
361
+ CVal(Case, Att) <= OneCond->Cut ? 2 : 3 );
362
+ break;
363
+
364
+ case BrSubset: /* subset test on discrete attribute */
365
+
366
+ v = XDVal(Case, Att);
367
+ Outcome = ( v <= MaxAttVal[Att] && In(v, OneCond->Subset) ?
368
+ OneCond->TestValue : 0 );
369
+ }
370
+
371
+ return Outcome;
372
+ }
373
+
374
+
375
+
376
+ /*************************************************************************/
377
+ /* */
378
+ /* Determine whether a case satisfies a condition */
379
+ /* */
380
+ /*************************************************************************/
381
+
382
+
383
+ Boolean Satisfies(DataRec Case, Condition OneCond)
384
+ /* --------- */
385
+ {
386
+ return ( FindOutcome(Case, OneCond) == OneCond->TestValue );
387
+ }
388
+
389
+
390
+
391
+ /*************************************************************************/
392
+ /* */
393
+ /* Determine whether a case satisfies all conditions of a rule */
394
+ /* */
395
+ /*************************************************************************/
396
+
397
+
398
+ Boolean Matches(CRule R, DataRec Case)
399
+ /* ------- */
400
+ {
401
+ int d;
402
+
403
+ ForEach(d, 1, R->Size)
404
+ {
405
+ if ( ! Satisfies(Case, R->Lhs[d]) )
406
+ {
407
+ return false;
408
+ }
409
+ }
410
+
411
+ return true;
412
+ }
413
+
414
+
415
+
416
+ /*************************************************************************/
417
+ /* */
418
+ /* Make sure that Active[] has space for at least N rules */
419
+ /* */
420
+ /*************************************************************************/
421
+
422
+
423
+ void CheckActiveSpace(int N)
424
+ /* ---------------- */
425
+ {
426
+ if ( ActiveSpace <= N )
427
+ {
428
+ Realloc(Active, (ActiveSpace=N+1), RuleNo);
429
+ }
430
+ }
431
+
432
+
433
+
434
+ /*************************************************************************/
435
+ /* */
436
+ /* Use RT to enter active rules in Active[] */
437
+ /* */
438
+ /*************************************************************************/
439
+
440
+
441
+ void MarkActive(RuleTree RT, DataRec Case)
442
+ /* ---------- */
443
+ {
444
+ DiscrValue v;
445
+ int ri;
446
+ RuleNo r;
447
+
448
+ if ( ! RT ) return;
449
+
450
+ /* Enter any rules satisfied at this node */
451
+
452
+ if ( RT->Fire )
453
+ {
454
+ for ( ri = 0 ; (r = RT->Fire[ri]) ; ri++ )
455
+ {
456
+ Active[NActive++] = r;
457
+ }
458
+ }
459
+
460
+ if ( ! RT->Branch ) return;
461
+
462
+ /* Explore subtree for rules that include condition at this node */
463
+
464
+ if ( (v = FindOutcome(Case, RT->CondTest)) > 0 && v <= RT->Forks )
465
+ {
466
+ MarkActive(RT->Branch[v], Case);
467
+ }
468
+
469
+ /* Explore default subtree for rules that do not include condition */
470
+
471
+ MarkActive(RT->Branch[0], Case);
472
+ }
473
+
474
+
475
+
476
+ /*************************************************************************/
477
+ /* */
478
+ /* Sort active rules for utility band error rates */
479
+ /* */
480
+ /*************************************************************************/
481
+
482
+
483
+ void SortActive()
484
+ /* ---------- */
485
+ {
486
+ RuleNo r;
487
+ int a, aa, aLow;
488
+
489
+ ForEach(a, 0, NActive-1)
490
+ {
491
+ aLow = a;
492
+
493
+ ForEach(aa, a+1, NActive-1)
494
+ {
495
+ if ( Active[aa] < Active[aLow] ) aLow = aa;
496
+ }
497
+
498
+ r = Active[a];
499
+ Active[a] = Active[aLow];
500
+ Active[aLow] = r;
501
+ }
502
+ }
503
+
504
+
505
+
506
+ /*************************************************************************/
507
+ /* */
508
+ /* Update utility band error rates for all bands before rule r */
509
+ /* that have not been completed yet. Update current band. */
510
+ /* */
511
+ /*************************************************************************/
512
+
513
+
514
+ void CheckUtilityBand(int *u, RuleNo r, ClassNo Actual, ClassNo Default)
515
+ /* ---------------- */
516
+ {
517
+ ClassNo c;
518
+
519
+ while ( *u < UTILITY && r > UtilBand[*u] )
520
+ {
521
+ c = SelectClass(Default, false);
522
+ if ( c != Actual )
523
+ {
524
+ UtilErr[*u]++;
525
+ if ( MCost ) UtilCost[*u] += MCost[c][Actual];
526
+ }
527
+
528
+ (*u)++;
529
+ }
530
+ }
531
+
532
+
533
+
534
+ /*************************************************************************/
535
+ /* */
536
+ /* Classify a case using boosted tree or rule sequence. */
537
+ /* Global variable Default must have been set prior to call */
538
+ /* */
539
+ /* Note: boosting with costs is complicated. With trees, */
540
+ /* complete class distributions are accumulated and then a class */
541
+ /* selected to minimize expected cost. This cannot be done with */
542
+ /* rulesets since a single ruleset does not give a reliable */
543
+ /* class distribution; instead, the votes from all cost-adjusted */
544
+ /* rulesets are combined without reference to costs. */
545
+ /* */
546
+ /*************************************************************************/
547
+
548
+
549
+ ClassNo BoostClassify(DataRec Case, int MaxTrial)
550
+ /* ------------- */
551
+ {
552
+ ClassNo c, Best;
553
+ int t;
554
+ float Total=0;
555
+
556
+ ForEach(c, 1, MaxClass)
557
+ {
558
+ Vote[c] = 0;
559
+ }
560
+
561
+ ForEach(t, 0, MaxTrial)
562
+ {
563
+ Best = ( RULES ? RuleClassify(Case, RuleSet[t]) :
564
+ TreeClassify(Case, Pruned[t]) );
565
+
566
+ Vote[Best] += Confidence;
567
+ Total += Confidence;
568
+
569
+ TrialPred[t] = Best;
570
+ }
571
+
572
+ /* Copy votes into ClassSum */
573
+
574
+ ForEach(c, 1, MaxClass)
575
+ {
576
+ ClassSum[c] = Vote[c] / Total;
577
+ }
578
+
579
+ return SelectClass(Default, false);
580
+ }
581
+
582
+
583
+
584
+ /*************************************************************************/
585
+ /* */
586
+ /* Select the best class to return. Take misclassification costs */
587
+ /* into account if they are defined. */
588
+ /* */
589
+ /*************************************************************************/
590
+
591
+
592
+ ClassNo SelectClass(ClassNo Default, Boolean UseCosts)
593
+ /* ----------- */
594
+ {
595
+ ClassNo c, cc, BestClass;
596
+ float ExpCost, BestCost=1E38, TotCost=0;
597
+
598
+ BestClass = Default;
599
+
600
+ if ( UseCosts )
601
+ {
602
+ ForEach(c, 1, MaxClass)
603
+ {
604
+ ExpCost = 0;
605
+ ForEach(cc, 1, MaxClass)
606
+ {
607
+ if ( cc == c ) continue;
608
+ ExpCost += ClassSum[cc] * MCost[c][cc];
609
+ }
610
+
611
+ TotCost += ExpCost;
612
+
613
+ if ( ExpCost < BestCost )
614
+ {
615
+ BestClass = c;
616
+ BestCost = ExpCost;
617
+ }
618
+ }
619
+
620
+ Confidence = 1 - BestCost / TotCost;
621
+ }
622
+ else
623
+ {
624
+ ForEach(c, 1, MaxClass)
625
+ {
626
+ if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c;
627
+ }
628
+
629
+ Confidence = ClassSum[BestClass];
630
+ }
631
+
632
+ return BestClass;
633
+ }
634
+
635
+
636
+
637
+ /*************************************************************************/
638
+ /* */
639
+ /* General classification routine */
640
+ /* */
641
+ /*************************************************************************/
642
+
643
+
644
+ ClassNo Classify(DataRec Case)
645
+ /* -------- */
646
+ {
647
+
648
+ return ( TRIALS > 1 ? BoostClassify(Case, TRIALS-1) :
649
+ RULES ? RuleClassify(Case, RuleSet[0]) :
650
+ TreeClassify(Case, Pruned[0]) );
651
+ }
652
+
653
+
654
+
655
+ /*************************************************************************/
656
+ /* */
657
+ /* Interpolate a single value between Lower, Mid and Upper */
658
+ /* (All these have the same value unless using probabilistic */
659
+ /* thresholds.) */
660
+ /* */
661
+ /*************************************************************************/
662
+
663
+
664
+ float Interpolate(Tree T, ContValue Val)
665
+ /* ----------- */
666
+ {
667
+ return ( Val <= T->Lower ? 1.0 :
668
+ Val >= T->Upper ? 0.0 :
669
+ Val <= T->Mid ?
670
+ 1 - 0.5 * (Val - T->Lower) / (T->Mid - T->Lower + 1E-6) :
671
+ 0.5 - 0.5 * (Val - T->Mid) / (T->Upper - T->Mid + 1E-6) );
672
+ }
673
+
674
+
675
+
676
+ /*************************************************************************/
677
+ /* */
678
+ /* Free data structures for one classifier */
679
+ /* */
680
+ /*************************************************************************/
681
+
682
+
683
+ void FreeClassifier(int Trial)
684
+ /* -------------- */
685
+ {
686
+ if ( Raw )
687
+ {
688
+ FreeTree(Raw[Trial]); Raw[Trial] = Nil;
689
+ }
690
+
691
+ if ( Pruned )
692
+ {
693
+ FreeTree(Pruned[Trial]); Pruned[Trial] = Nil;
694
+ }
695
+
696
+ if ( RULES && RuleSet && RuleSet[Trial] )
697
+ {
698
+ FreeRules(RuleSet[Trial]); RuleSet[Trial] = Nil;
699
+ }
700
+ }