see5-installer 0.1.0

Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/confmat.c
@@ -0,0 +1,195 @@
+ /*************************************************************************/
+ /* */
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
+ /* */
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
+ /* of C5.0 release 2.07. */
+ /* */
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
+ /* modify it under the terms of the GNU General Public License as */
+ /* published by the Free Software Foundation, either version 3 of the */
+ /* License, or (at your option) any later version. */
+ /* */
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
+ /* General Public License for more details. */
+ /* */
+ /* You should have received a copy of the GNU General Public License */
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
+ /* */
+ /* <http://www.gnu.org/licenses/>. */
+ /* */
+ /*************************************************************************/
+
+
+
+ /*************************************************************************/
+ /* */
+ /* Routine for printing confusion matrices and attribute usage */
+ /* ----------------------------------------------------------- */
+ /* */
+ /*************************************************************************/
+
+ #include "defns.i"
+ #include "extern.i"
+
+
+ void PrintConfusionMatrix(CaseNo *ConfusionMat)
+ /* -------------------- */
+ {
+     int Row, Col, Entry, EntryWidth=10000;
+
+     /* For more than 20 classes, use summary instead */
+
+     if ( MaxClass > 20 )
+     {
+         PrintErrorBreakdown(ConfusionMat);
+         return;
+     }
+
+     /* Find maximum entry width in chars */
+
+     ForEach(Row, 1, MaxClass)
+     {
+         ForEach(Col, 1, MaxClass)
+         {
+             EntryWidth = Max(EntryWidth, ConfusionMat[Row*(MaxClass+1) + Col]);
+         }
+     }
+
+     EntryWidth = floor(Log(EntryWidth + 100.0) / Log(10.0)) + 2;
+
+     /* Print the heading, then each row */
+
+     fprintf(Of, "\n\n\t");
+     ForEach(Col, 1, MaxClass)
+     {
+         fprintf(Of, "%*s(%c)", EntryWidth-3, " ", 'a' + Col-1);
+     }
+
+     fprintf(Of, " <-" T_classified_as "\n\t");
+     ForEach(Col, 1, MaxClass)
+     {
+         fprintf(Of, "%*.*s", EntryWidth, EntryWidth-2, "----------");
+     }
+     fprintf(Of, "\n");
+
+     ForEach(Row, 1, MaxClass)
+     {
+         fprintf(Of, "\t");
+         ForEach(Col, 1, MaxClass)
+         {
+             if ( (Entry = ConfusionMat[Row*(MaxClass+1) + Col]) )
+             {
+                 fprintf(Of, " %*d", EntryWidth-1, Entry);
+             }
+             else
+             {
+                 fprintf(Of, "%*s", EntryWidth, " ");
+             }
+         }
+         fprintf(Of, " (%c): " T_class " %s\n", 'a' + Row-1, ClassName[Row]);
+     }
+ }
+
+
+
+ void PrintErrorBreakdown(CaseNo *ConfusionMat)
+ /* ------------------- */
+ {
+     CaseNo *TruePos, *FalsePos, *FalseNeg, Entry;
+     int Row, Col, EntryWidth=100000, ClassWidth=5;
+
+     TruePos = AllocZero(MaxClass+1, CaseNo);
+     FalsePos = AllocZero(MaxClass+1, CaseNo);
+     FalseNeg = AllocZero(MaxClass+1, CaseNo);
+
+     ForEach(Row, 1, MaxClass)
+     {
+         ForEach(Col, 1, MaxClass)
+         {
+             Entry = ConfusionMat[Row*(MaxClass+1) + Col];
+
+             if ( Col == Row )
+             {
+                 TruePos[Row] += Entry;
+             }
+             else
+             {
+                 FalseNeg[Row] += Entry;
+                 FalsePos[Col] += Entry;
+             }
+         }
+
+         EntryWidth = Max(EntryWidth, TruePos[Row] + FalseNeg[Row]);
+         ClassWidth = Max(ClassWidth, strlen(ClassName[Row]));
+     }
+
+     EntryWidth = floor(Log(EntryWidth + 100.0) / Log(10.0)) + 2;
+
+     /* Print heading (tricky spacing) */
+
+     fprintf(Of, "\n\n\t %-*s %*s %*s %*s\n\t %*s %*s %*s %*s\n",
+             ClassWidth, "Class",
+             EntryWidth, "Cases",
+             EntryWidth, "False",
+             EntryWidth, "False",
+             ClassWidth, "",
+             EntryWidth, "",
+             EntryWidth, "Pos",
+             EntryWidth, "Neg");
+     fprintf(Of, "\t %-*s %*s %*s %*s\n",
+             ClassWidth, "-----",
+             EntryWidth, "-----",
+             EntryWidth, "-----",
+             EntryWidth, "-----");
+
+     ForEach(Row, 1, MaxClass)
+     {
+         fprintf(Of, "\t %-*s %*d %*d %*d\n",
+                 ClassWidth, ClassName[Row],
+                 EntryWidth, TruePos[Row] + FalseNeg[Row],
+                 EntryWidth, FalsePos[Row],
+                 EntryWidth, FalseNeg[Row]);
+     }
+
+     Free(TruePos);
+     Free(FalsePos);
+     Free(FalseNeg);
+ }
+
+
+
+ void PrintUsageInfo(CaseNo *Usage)
+ /* -------------- */
+ {
+     Attribute Att, Best;
+     float Tests;
+     Boolean First=true;
+
+     Tests = Max(1, MaxCase+1);
+
+     while ( true )
+     {
+         Best = 0;
+
+         ForEach(Att, 1, MaxAtt)
+         {
+             if ( Usage[Att] > Usage[Best] ) Best = Att;
+         }
+
+         if ( ! Best || Usage[Best] < 0.01 * Tests ) break;
+
+         if ( First )
+         {
+             fprintf(Of, T_Usage);
+             First = false;
+         }
+
+         fprintf(Of, "\t%7d%% %s\n",
+                 (int) ((100 * Usage[Best]) / Tests + 0.5), AttName[Best]);
+
+         Usage[Best] = 0;
+     }
+ }
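Editorial note: PrintConfusionMatrix and PrintErrorBreakdown above take the confusion matrix as a single flattened block of (MaxClass+1)*(MaxClass+1) counters, indexed as Row*(MaxClass+1)+Col, where the row is the true class and the column is the predicted class (index 0 is unused because classes are numbered from 1). The standalone sketch below is not part of the gem; the class count and the real/predicted labels are made up purely to illustrate that layout.

/* Illustrative only: flattened confusion-matrix layout as used above,
   with hypothetical labels; not part of the gem's sources. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int MaxClass = 3;                       /* three classes, labelled 1..3 */
    int *Confusion = calloc((MaxClass + 1) * (MaxClass + 1), sizeof(int));

    /* hypothetical (real, predicted) pairs */
    int Real[] = {1, 1, 2, 2, 3, 3, 3};
    int Pred[] = {1, 2, 2, 2, 3, 1, 3};

    for (int i = 0; i < 7; i++)
    {
        Confusion[Real[i] * (MaxClass + 1) + Pred[i]]++;
    }

    /* row r, column c holds the number of class-r cases predicted as class c */
    for (int r = 1; r <= MaxClass; r++)
    {
        for (int c = 1; c <= MaxClass; c++)
        {
            printf("%4d", Confusion[r * (MaxClass + 1) + c]);
        }
        putchar('\n');
    }

    free(Confusion);
    return 0;
}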
data/ext/c5.0/construct.c
@@ -0,0 +1,853 @@
+ /*************************************************************************/
+ /* */
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
+ /* */
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
+ /* of C5.0 release 2.07. */
+ /* */
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
+ /* modify it under the terms of the GNU General Public License as */
+ /* published by the Free Software Foundation, either version 3 of the */
+ /* License, or (at your option) any later version. */
+ /* */
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
+ /* General Public License for more details. */
+ /* */
+ /* You should have received a copy of the GNU General Public License */
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
+ /* */
+ /* <http://www.gnu.org/licenses/>. */
+ /* */
+ /*************************************************************************/
+
+
+
+ /*************************************************************************/
+ /* */
+ /* Manage construction of classifiers, including boosting */
+ /* ------------------------------------------------------ */
+ /* */
+ /* C5.0 uses a modified form of boosting as follows: */
+ /* * Multiplicative weight adjustment is used for cases that are */
+ /* classified correctly, but misclassified cases use a form */
+ /* of additive weight adjustment. */
+ /* * In later boosting trials, cases that cannot possibly be */
+ /* classified correctly are dropped. (This follows Freund's */
+ /* "Brown-Boost" approach, since the number of trials is known.) */
+ /* * The voting weight of a boosted classifier is determined by */
+ /* the confidence of its classification, based on the boosted */
+ /* weights of the training cases. */
+ /* */
+ /* Variable misclassification costs are also supported. When */
+ /* there are two classes, the misclassification costs are used */
+ /* to reweight the training cases, and the reweighting is reversed */
+ /* after the classifier is constructed. When classifying a case, */
+ /* the estimated class probabilities and cost matrix are used to */
+ /* determine the predicted class with lowest expected cost. */
+ /* */
+ /*************************************************************************/
+
+
+ #include "defns.i"
+ #include "extern.i"
+
+ /*************************************************************************/
+ /* */
+ /* Grow single tree or sequence of boosted trees */
+ /* */
+ /*************************************************************************/
+
+
+ void ConstructClassifiers()
+ /* -------------------- */
+ {
+     CaseNo i, Errs, Cases, Bp, Excl=0;
+     double ErrWt, ExclWt=0, OKWt, ExtraErrWt, NFact, MinWt=1.0, a, b;
+     ClassNo c, Pred, Real, Best;
+     static ClassNo *Wrong=Nil;
+     int BaseLeaves;
+     Boolean NoStructure, CheckExcl;
+     float *BVote;
+
+     /* Clean up after possible interrupt */
+
+     FreeUnlessNil(Wrong);
+
+     Wrong = Alloc(MaxCase+1, ClassNo);
+
+     if ( TRIALS > 1 )
+     {
+         /* BVoteBlock contains each case's class votes */
+
+         BVoteBlock = AllocZero((MaxCase+1) * (MaxClass+1), float);
+     }
+
+     /* Preserve original case order */
+
+     SaveCase = Alloc(MaxCase+1, DataRec);
+     memcpy(SaveCase, Case, (MaxCase+1) * sizeof(DataRec));
+
+     /* If using case weighting, find average */
+
+     if ( CWtAtt )
+     {
+         SetAvCWt();
+     }
+
+     InitialiseWeights();
+
+     /* Adjust minimum weight if using cost weighting */
+
+     if ( CostWeights )
+     {
+         ForEach(c, 1, MaxClass)
+         {
+             if ( WeightMul[c] < MinWt ) MinWt = WeightMul[c];
+         }
+     }
+
+     LEAFRATIO = Bp = 0;
+     SetMinGainThresh();
+
+     /* Main loop for growing the sequence of boosted classifiers */
+
+     ForEach(Trial, 0, TRIALS-1 )
+     {
+         if ( TRIALS > 1 )
+         {
+             fprintf(Of, "\n----- " F_Trial " %d: -----\n", Trial);
+         }
+
+         NotifyStage(FORMTREE);
+         Progress(-(MaxCase+1.0));
+
+         /* Update count here in case tree construction is interrupted */
+
+         MaxTree = Trial;
+         Raw[MaxTree] = Pruned[MaxTree] = Nil;
+         if ( RULES ) RuleSet[MaxTree] = Nil;
+
+         memset(Tested, 0, MaxAtt+1); /* reset tested attributes */
+
+         FormTree(Bp, MaxCase, 0, &Raw[Trial]);
+
+         /* Prune the raw tree to minimise expected misclassification cost */
+
+         Verbosity(1, if ( ! RULES ) PrintTree(Raw[Trial], "Before pruning:"))
+
+         NotifyStage(SIMPLIFYTREE);
+         Progress(-(MaxCase+1));
+
+         /* If still need raw tree, copy it; otherwise set initial
+            pruned tree to raw tree */
+
+         if ( VERBOSITY && ! RULES )
+         {
+             Pruned[Trial] = CopyTree(Raw[Trial]);
+             if ( MCost )
+             {
+                 RestoreDistribs(Raw[Trial]);
+             }
+         }
+         else
+         {
+             Pruned[Trial] = Raw[Trial];
+             Raw[Trial] = Nil;
+         }
+
+         memcpy(Case, SaveCase, (MaxCase+1) * sizeof(DataRec)); /* restore */
+
+         Prune(Pruned[Trial]);
+
+         AdjustAllThresholds(Pruned[Trial]);
+
+         /* Record tree parameters for later */
+
+         if ( ! Trial )
+         {
+             BaseLeaves = ( RULES || SUBSET ? TreeSize(Pruned[0]) :
+                            ExpandedLeafCount(Pruned[0]) );
+         }
+         NoStructure = ! Pruned[Trial]->NodeType;
+
+         if ( PROBTHRESH )
+         {
+             SoftenThresh(Pruned[Trial]);
+         }
+
+         memcpy(Case, SaveCase, (MaxCase+1) * sizeof(DataRec)); /* restore */
+
+         if ( RULES )
+         {
+             RuleSet[Trial] = FormRules(Pruned[Trial]);
+             NoStructure |= ! RuleSet[Trial]->SNRules;
+
+             PrintRules(RuleSet[Trial], T_Rules);
+             fprintf(Of, "\n" T_Default_class ": %s\n",
+                     ClassName[RuleSet[Trial]->SDefault]);
+
+             FreeTree(Pruned[Trial]); Pruned[Trial] = Nil;
+         }
+         else
+         {
+             PrintTree(Pruned[Trial], T_Tree);
+         }
+
+         if ( Trial == TRIALS-1 ) continue;
+
+         /* Check errors, adjust boost voting, and shift dropped cases
+            to the front */
+
+         ErrWt = Errs = OKWt = Bp = 0;
+         CheckExcl = ( Trial+1 > TRIALS / 2.0 );
+
+         ForEach(i, 0, MaxCase)
+         {
+             /* Has this case been dropped already? */
+
+             if ( Weight(Case[i]) <= 0 )
+             {
+                 Case[i] = Case[Bp];
+                 Wrong[i] = Wrong[Bp];
+                 Bp++;
+                 continue;
+             }
+
+             Pred = ( RULES ? RuleClassify(Case[i], RuleSet[Trial]) :
+                      TreeClassify(Case[i], Pruned[Trial]) );
+
+             Real = Class(Case[i]);
+
+             /* Update boosting votes for this case. (Note that cases
+                must have been reset to their original order.) */
+
+             BVote = BVoteBlock + i * (MaxClass+1);
+             BVote[Pred] += Confidence;
+
+             Best = BVote[0];
+             if ( BVote[Pred] > BVote[Best] ) BVote[0] = Best = Pred;
+
+             if ( CheckExcl )
+             {
+                 /* Check whether this case should be dropped because
+                    the vote for the correct class cannot be increased
+                    sufficiently in the remaining trials */
+
+                 if ( BVote[Best] > BVote[Real] + (TRIALS-1) - Trial )
+                 {
+                     Excl++;
+                     ExclWt += Weight(Case[i]);
+
+                     Weight(Case[i]) = 0;
+                     Case[i] = Case[Bp];
+                     Wrong[i] = Wrong[Bp];
+                     Bp++;
+
+                     continue;
+                 }
+             }
+
+             if ( Pred != Real )
+             {
+                 Wrong[i] = Pred;
+                 ErrWt += Weight(Case[i]);
+                 Errs++;
+             }
+             else
+             {
+                 Wrong[i] = 0;
+                 OKWt += Weight(Case[i]);
+             }
+         }
+
+         Cases = (MaxCase+1) - Excl;
+
+         /* Special termination conditions */
+
+         if ( ErrWt < 0.1 )
+         {
+             TRIALS = Trial + 1;
+             fprintf(Of, TX_Reduced1(TRIALS), TRIALS);
+         }
+         else
+         if ( Trial && NoStructure || ErrWt / Cases >= 0.49 )
+         {
+             TRIALS = ( Trial ? Trial : 1 );
+             fprintf(Of, TX_Reduced2(TRIALS), TRIALS);
+         }
+         else
+         {
+             /* Adjust case weights. Total weight of misclassified cases
+                set to midway between current weight and half total weight.
+                Take account of any dropped cases */
+
+             ExtraErrWt = 0.25 * (OKWt - ErrWt); /* half */
+             a = (OKWt - ExtraErrWt) / OKWt;
+             b = ExtraErrWt / Errs;
+
+             /* Normalise so that sum of weights equals number of cases */
+
+             NFact = Cases / (OKWt + ErrWt);
+
+             MinWt *= a * NFact;
+
+             ForEach(i, Bp, MaxCase)
+             {
+                 if ( Wrong[i] )
+                 {
+                     Weight(Case[i]) = NFact * (Weight(Case[i]) + b);
+                 }
+                 else
+                 {
+                     Weight(Case[i]) *= NFact * a;
+
+                     /* Necessary for accumulated arithmetic errors */
+
+                     if ( Weight(Case[i]) < 1E-3 ) Weight(Case[i]) = 1E-3;
+                 }
+             }
+
+             /* Set the leaf ratio for subsequent boosting trials.
+                The idea is to constrain the tree to roughly the size
+                of the initial tree by limiting the number of leaves
+                per training case. This limitation is not strict
+                since even a tiny number of cases can give a leaf */
+
+             if ( Trial == 0 )
+             {
+                 LEAFRATIO = 1.1 * BaseLeaves / (MaxCase + 1.0);
+             }
+
+             /* Trim cases for larger datasets */
+
+             if ( MaxCase > 4000 && MinWt <= 0.2 )
+             {
+                 a = 0;
+                 ForEach(i, Bp, MaxCase)
+                 {
+                     if ( Weight(Case[i]) <= MinWt + 1E-3 )
+                     {
+                         a += Weight(Case[i]);
+                         Swap(i, Bp);
+                         Bp++;
+                     }
+                 }
+             }
+         }
+
+         UnitWeights = false;
+     }
+
+     FreeUnlessNil(SaveCase); SaveCase = Nil;
+
+     /* Decide whether boosting should be abandoned */
+
+     if ( BOOST && TRIALS <= 2 )
+     {
+         fprintf(Of, T_Abandoned);
+         TRIALS = 1;
+     }
+
+     /* Save trees or rulesets */
+
+     if ( ! XVAL )
+     {
+         if ( ! RULES )
+         {
+             ForEach(Trial, 0, TRIALS-1)
+             {
+                 SaveTree(Pruned[Trial], ".tree");
+             }
+         }
+         else
+         {
+             ForEach(Trial, 0, TRIALS-1)
+             {
+                 SaveRules(RuleSet[Trial], ".rules");
+             }
+         }
+
+         fclose(TRf);
+     }
+     TRf = 0;
+
+     Free(Wrong); Wrong = Nil;
+     FreeUnlessNil(BVoteBlock); BVoteBlock = Nil;
+ }
+
+
+
+ /*************************************************************************/
+ /* */
+ /* Initialise the weight of each case */
+ /* */
+ /*************************************************************************/
+
+
+ void InitialiseWeights()
+ /* ----------------- */
+ {
+     CaseNo i;
+
+     if ( CostWeights )
+     {
+         /* Make weights proportional to average error cost */
+
+         ForEach(i, 0, MaxCase)
+         {
+             Weight(Case[i]) = WeightMul[Class(Case[i])];
+         }
+         UnitWeights = false;
+     }
+     else
+     {
+         ForEach(i, 0, MaxCase)
+         {
+             Weight(Case[i]) = 1.0;
+         }
+         UnitWeights = true;
+     }
+
+     /* Adjust when using case weights */
+
+     if ( CWtAtt )
+     {
+         ForEach(i, 0, MaxCase)
+         {
+             Weight(Case[i]) *= RelCWt(Case[i]);
+         }
+         UnitWeights = false;
+     }
+ }
+
+
+
+ /*************************************************************************/
+ /* */
+ /* Determine average case weight, ignoring cases with unknown, */
+ /* non-applicable, or negative values of CWtAtt. */
+ /* */
+ /*************************************************************************/
+
+
+ void SetAvCWt()
+ /* -------- */
+ {
+     CaseNo i, NCWt=0;
+     ContValue CWt;
+
+     AvCWt = 0;
+     ForEach(i, 0, MaxCase)
+     {
+         if ( ! NotApplic(Case[i], CWtAtt) && ! Unknown(Case[i], CWtAtt) &&
+              (CWt = CVal(Case[i], CWtAtt)) > 0 )
+         {
+             NCWt++;
+             AvCWt += CWt;
+         }
+     }
+
+     AvCWt = ( NCWt > 0 ? AvCWt / NCWt : 1 );
+ }
+
+
+
+ /*************************************************************************/
+ /* */
+ /* Print report of errors for each of the trials */
+ /* */
+ /*************************************************************************/
+
+ char *Multi[] = { F_Trial,
+                   F_UTrial,
+                   "" },
+
+      *StdR[] = { " Before Pruning ",
+                  " ---------------- ",
+                  " " F_SizeErrors " " },
+
+      *StdP[] = { " " F_DecisionTree16 " ",
+                  " ---------------- ",
+                  " " F_SizeErrors " " },
+
+      *StdPC[] = { " " F_DecisionTree23 " ",
+                   " ----------------------- ",
+                   " " F_SizeErrorsCost " " },
+
+      *Extra[] = { " " F_Rules16,
+                   " ----------------",
+                   " " F_NoErrors },
+
+      *ExtraC[] = { " " F_Rules23,
+                    " -----------------------",
+                    " " F_NoErrorsCost };
+
+
+ void Evaluate(int Flags)
+ /* -------- */
+ {
+     if ( TRIALS == 1 )
+     {
+         EvaluateSingle(Flags);
+     }
+     else
+     {
+         EvaluateBoost(Flags);
+     }
+ }
+
+
+
+ void EvaluateSingle(int Flags)
+ /* -------------- */
+ {
+     ClassNo RealClass, PredClass;
+     int x, u, SaveUtility;
+     CaseNo *ConfusionMat, *Usage, i, RawErrs=0, Errs=0;
+     double ECost=0, Tests;
+     Boolean CMInfo, UsageInfo;
+
+     CMInfo = Flags & CMINFO;
+     UsageInfo = Flags & USAGEINFO;
+
+     if ( CMInfo )
+     {
+         ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);
+     }
+
+     if ( UsageInfo )
+     {
+         Usage = AllocZero(MaxAtt+1, CaseNo);
+     }
+
+     Tests = Max(MaxCase+1, 1); /* in case no useful test data! */
+
+     if ( UTILITY && RULES )
+     {
+         SaveUtility = UTILITY;
+
+         UTILITY = Min(UTILITY, RuleSet[0]->SNRules);
+
+         UtilErr = AllocZero(UTILITY, int);
+         UtilBand = Alloc(UTILITY, int);
+         if ( MCost )
+         {
+             UtilCost = AllocZero(UTILITY, double);
+         }
+
+         ForEach(u, 1, UTILITY-1)
+         {
+             UtilBand[u] = rint(RuleSet[0]->SNRules * u / (float) UTILITY);
+         }
+     }
+
+     fprintf(Of, "\n");
+     ForEach(x, 0, 2)
+     {
+         putc('\t', Of);
+         if ( RULES )
+         {
+             fprintf(Of, "%s", ( MCost ? ExtraC[x] : Extra[x] ));
+         }
+         else
+         {
+             Verbosity(1, fprintf(Of, "%s", StdR[x]))
+             fprintf(Of, "%s", ( MCost ? StdPC[x] : StdP[x] ));
+         }
+         putc('\n', Of);
+     }
+     putc('\n', Of);
+
+     ForEach(i, 0, MaxCase)
+     {
+         RealClass = Class(Case[i]);
+         assert(RealClass > 0 && RealClass <= MaxClass);
+
+         memset(Tested, 0, MaxAtt+1); /* for usage */
+
+         if ( RULES )
+         {
+             PredClass = RuleClassify(Case[i], RuleSet[0]);
+         }
+         else
+         {
+             Verbosity(1,
+                 PredClass = TreeClassify(Case[i], Raw[0]);
+                 if ( PredClass != RealClass )
+                 {
+                     RawErrs++;
+                 })
+
+             PredClass = TreeClassify(Case[i], Pruned[0]);
+         }
+         assert(PredClass > 0 && PredClass <= MaxClass);
+
+         if ( PredClass != RealClass )
+         {
+             Errs++;
+             if ( MCost ) ECost += MCost[PredClass][RealClass];
+         }
+
+         if ( CMInfo )
+         {
+             ConfusionMat[RealClass*(MaxClass+1)+PredClass]++;
+         }
+
+         if ( UsageInfo )
+         {
+             RecordAttUsage(Case[i], Usage);
+         }
+     }
+
+     putc('\t', Of);
+
+     if ( RULES )
+     {
+         fprintf(Of, " %4d %4d(%4.1f%%)",
+                 RuleSet[0]->SNRules, Errs, 100 * Errs / Tests);
+     }
+     else
+     {
+         /* Results for unpruned tree */
+
+         Verbosity(1,
+         {
+             fprintf(Of, " %4d %4d(%4.1f%%) ",
+                     TreeSize(Raw[0]), RawErrs, 100 * RawErrs / Tests);
+         })
+
+         /* Results for pruned tree */
+
+         fprintf(Of, " %4d %4d(%4.1f%%)",
+                 TreeSize(Pruned[0]), Errs, 100 * Errs / Tests);
+     }
+
+     if ( MCost )
+     {
+         fprintf(Of, "%7.2f", ECost / Tests);
+     }
+
+     fprintf(Of, " <<\n");
+
+     if ( CMInfo )
+     {
+         PrintConfusionMatrix(ConfusionMat);
+         Free(ConfusionMat);
+     }
+
+     if ( UsageInfo )
+     {
+         PrintUsageInfo(Usage);
+         Free(Usage);
+     }
+
+     if ( UtilErr )
+     {
+         if ( ! XVAL )
+         {
+             fprintf(Of, "\n" T_Rule_utility_summary ":\n\n"
+                         "\t" F_Rules "\t " F_Errors "%s\n"
+                         "\t" F_URules "\t " F_UErrors "%s\n",
+                     ( MCost ? " " F_Cost : "" ),
+                     ( MCost ? " " F_UCost : "" ));
+
+             ForEach(u, 1, UTILITY-1)
+             {
+                 fprintf(Of, "\t%s%d\t %4d(%4.1f%%)",
+                         ( UtilBand[u] == 1 ? "" : "1-" ), UtilBand[u],
+                         UtilErr[u], 100 * UtilErr[u] / Tests);
+                 if ( MCost )
+                 {
+                     fprintf(Of, "%7.2f", UtilCost[u] / Tests);
+                 }
+                 fprintf(Of, "\n");
+             }
+         }
+
+         Free(UtilErr); UtilErr = Nil;
+         FreeUnlessNil(UtilCost); UtilCost = Nil;
+         Free(UtilBand); UtilBand = Nil;
+
+         UTILITY = SaveUtility;
+     }
+ }
+
+
+
+ void EvaluateBoost(int Flags)
+ /* ------------- */
+ {
+     ClassNo RealClass, PredClass;
+     int t;
+     CaseNo *ConfusionMat, *Usage, i, *Errs, BoostErrs=0;
+     double *ECost, BoostECost=0, Tests;
+     Boolean CMInfo, UsageInfo;
+
+     CMInfo = Flags & CMINFO;
+     UsageInfo = Flags & USAGEINFO;
+
+     if ( CMInfo )
+     {
+         ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);
+     }
+
+     if ( UsageInfo )
+     {
+         Usage = AllocZero(MaxAtt+1, CaseNo);
+     }
+
+     Tests = Max(MaxCase+1, 1); /* in case no useful test data! */
+     Errs = AllocZero(TRIALS, CaseNo);
+     ECost = AllocZero(TRIALS, double);
+
+     fprintf(Of, "\n");
+     ForEach(t, 0, 2)
+     {
+         fprintf(Of, "%s\t", Multi[t]);
+         if ( RULES )
+         {
+             fprintf(Of, "%s", ( MCost ? ExtraC[t] : Extra[t] ));
+         }
+         else
+         {
+             fprintf(Of, "%s", ( MCost ? StdPC[t] : StdP[t] ));
+         }
+         putc('\n', Of);
+     }
+     putc('\n', Of);
+
+     /* Set global default class for boosting */
+
+     Default = ( RULES ? RuleSet[0]->SDefault : Pruned[0]->Leaf );
+
+     ForEach(i, 0, MaxCase)
+     {
+         RealClass = Class(Case[i]);
+
+         memset(Tested, 0, MaxAtt+1); /* for usage */
+
+         PredClass = BoostClassify(Case[i], TRIALS-1);
+         if ( PredClass != RealClass )
+         {
+             BoostErrs++;
+             if ( MCost ) BoostECost += MCost[PredClass][RealClass];
+         }
+
+         if ( CMInfo )
+         {
+             ConfusionMat[RealClass*(MaxClass+1)+PredClass]++;
+         }
+
+         if ( UsageInfo )
+         {
+             RecordAttUsage(Case[i], Usage);
+         }
+
+         /* Keep track of results for each trial */
+
+         ForEach(t, 0, TRIALS-1)
+         {
+             if ( TrialPred[t] != RealClass )
+             {
+                 Errs[t]++;
+                 if ( MCost ) ECost[t] += MCost[TrialPred[t]][RealClass];
+             }
+         }
+     }
+
+     /* Print results for individual trials */
+
+     ForEach(t, 0, TRIALS-1)
+     {
+         fprintf(Of, "%4d\t", t);
+
+         if ( RULES )
+         {
+             fprintf(Of, " %4d %4d(%4.1f%%)",
+                     RuleSet[t]->SNRules, Errs[t], 100 * Errs[t] / Tests);
+         }
+         else
+         {
+             fprintf(Of, " %4d %4d(%4.1f%%)",
+                     TreeSize(Pruned[t]), Errs[t], 100 * Errs[t] / Tests);
+         }
+
+         if ( MCost )
+         {
+             fprintf(Of, "%7.2f", ECost[t] / Tests);
+         }
+
+         putc('\n', Of);
+     }
+
+     /* Print boosted results */
+
+     if ( RULES )
+     {
+         fprintf(Of, F_Boost "\t %9d(%4.1f%%)",
+                 BoostErrs, 100 * BoostErrs / Tests);
+     }
+     else
+     {
+         fprintf(Of, F_Boost "\t %4d(%4.1f%%)",
+                 BoostErrs, 100 * BoostErrs / Tests);
+     }
+
+     if ( MCost )
+     {
+         fprintf(Of, "%7.2f", BoostECost / Tests);
+     }
+
+     fprintf(Of, " <<\n");
+
+     if ( CMInfo )
+     {
+         PrintConfusionMatrix(ConfusionMat);
+         Free(ConfusionMat);
+     }
+
+     if ( UsageInfo )
+     {
+         PrintUsageInfo(Usage);
+         Free(Usage);
+     }
+
+     Free(Errs);
+     Free(ECost);
+ }
+
+
+
+ /*************************************************************************/
+ /* */
+ /* Record atts used when classifying last case */
+ /* */
+ /*************************************************************************/
+
+
+ void RecordAttUsage(DataRec Case, int *Usage)
+ /* -------------- */
+ {
+     Attribute Att;
+     int i;
+
+     /* Scan backwards to allow for information from defined attributes */
+
+     for ( Att = MaxAtt ; Att > 0 ; Att-- )
+     {
+         if ( Tested[Att] && ! Unknown(Case, Att) )
+         {
+             Usage[Att]++;
+
+             if ( AttDef[Att] )
+             {
+                 ForEach(i, 1, AttDefUses[Att][0])
+                 {
+                     Tested[AttDefUses[Att][i]] = true;
+                 }
+             }
+         }
+     }
+ }
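Editorial note: the weight adjustment described in the header comment of construct.c and applied inside ConstructClassifiers works as follows: a quarter of the difference between the total weight of correctly classified cases (OKWt) and of misclassified cases (ErrWt) is moved onto the errors; correct cases are scaled by the multiplicative factor a = (OKWt - ExtraErrWt) / OKWt, each misclassified case gets the additive bump b = ExtraErrWt / Errs, and NFact = Cases / (OKWt + ErrWt) renormalises so the weights again sum to the number of remaining cases. The standalone sketch below is not part of the gem; it uses a made-up weight vector and error pattern just to run one round of that arithmetic.

/* Illustrative only: one round of the boosting weight update used in
   ConstructClassifiers above, on hypothetical data; not part of the gem. */
#include <stdio.h>

int main(void)
{
    double Weight[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    int    Wrong[]  = {0, 1, 0, 0, 1, 0};   /* hypothetical: cases 1 and 4 misclassified */
    int    Cases = 6, Errs = 0, i;
    double OKWt = 0, ErrWt = 0, ExtraErrWt, a, b, NFact;

    for (i = 0; i < Cases; i++)
    {
        if (Wrong[i]) { ErrWt += Weight[i]; Errs++; }
        else          { OKWt  += Weight[i]; }
    }

    /* as in construct.c: shift a quarter of the weight difference onto the errors */
    ExtraErrWt = 0.25 * (OKWt - ErrWt);
    a = (OKWt - ExtraErrWt) / OKWt;          /* multiplicative factor for correct cases */
    b = ExtraErrWt / Errs;                   /* additive bump for each misclassified case */
    NFact = Cases / (OKWt + ErrWt);          /* renormalise so weights sum to Cases */

    for (i = 0; i < Cases; i++)
    {
        Weight[i] = Wrong[i] ? NFact * (Weight[i] + b) : NFact * a * Weight[i];
        printf("case %d: %.3f\n", i, Weight[i]);
    }
    return 0;
}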