see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,313 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Soften thresholds for continuous attributes */
30
+ /* ------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ /*************************************************************************/
40
+ /* */
41
+ /* Soften all thresholds for continuous attributes in tree T */
42
+ /* */
43
+ /*************************************************************************/
44
+
45
+
46
+ void SoftenThresh(Tree T)
47
+ /* ------------ */
48
+ {
49
+ ResubErrs(T, 0, MaxCase);
50
+
51
+ FindBounds(T, 0, MaxCase);
52
+ }
53
+
54
+
55
+
56
+ /*************************************************************************/
57
+ /* */
58
+ /* Find resubstitution errors for tree T */
59
+ /* */
60
+ /*************************************************************************/
61
+
62
+
63
+ void ResubErrs(Tree T, CaseNo Fp, CaseNo Lp)
64
+ /* --------- */
65
+ {
66
+ CaseNo i, Bp, Ep, Missing;
67
+ CaseCount Cases=0, KnownCases, BranchCases, MissingCases;
68
+ double Factor;
69
+ DiscrValue v;
70
+ Boolean PrevUnitWeights;
71
+ Attribute Att;
72
+
73
+ if ( ! T->NodeType )
74
+ {
75
+ T->Errors = T->Cases - T->ClassDist[T->Leaf];
76
+ return;
77
+ }
78
+
79
+ /* Estimate errors for each branch */
80
+
81
+ Att = T->Tested;
82
+ Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
83
+
84
+ if ( CostWeights )
85
+ {
86
+ MissingCases = SumNocostWeights(Fp, Ep);
87
+ KnownCases = SumNocostWeights(Ep+1, Lp);
88
+ }
89
+ else
90
+ {
91
+ MissingCases = CountCases(Fp, Ep);
92
+ KnownCases = Cases - MissingCases;
93
+ }
94
+
95
+ PrevUnitWeights = UnitWeights;
96
+ if ( Missing ) UnitWeights = false;
97
+
98
+ T->Errors = 0;
99
+ Bp = Fp;
100
+
101
+ ForEach(v, 1, T->Forks)
102
+ {
103
+ Ep = Group(v, Bp + Missing, Lp, T);
104
+
105
+ /* Bp -> first value in missing + remaining values
106
+ Ep -> last value in missing + current group */
107
+
108
+ BranchCases = CountCases(Bp + Missing, Ep);
109
+
110
+ Factor = ( ! Missing ? 0 :
111
+ ! CostWeights ? BranchCases / KnownCases :
112
+ SumNocostWeights(Bp + Missing, Ep) / KnownCases );
113
+
114
+ if ( BranchCases + Factor * MissingCases >= MinLeaf )
115
+ {
116
+ if ( Missing )
117
+ {
118
+ ForEach(i, Bp, Bp + Missing - 1)
119
+ {
120
+ Weight(Case[i]) *= Factor;
121
+ }
122
+ }
123
+
124
+ ResubErrs(T->Branch[v], Bp, Ep);
125
+
126
+ T->Errors += T->Branch[v]->Errors;
127
+
128
+ /* Restore weights if changed */
129
+
130
+ if ( Missing )
131
+ {
132
+ for ( i = Ep ; i >= Bp ; i-- )
133
+ {
134
+ if ( Unknown(Case[i], Att) )
135
+ {
136
+ Weight(Case[i]) /= Factor;
137
+ Swap(i, Ep);
138
+ Ep--;
139
+ }
140
+ }
141
+ }
142
+
143
+ Bp = Ep+1;
144
+ }
145
+ }
146
+
147
+ UnitWeights = PrevUnitWeights;
148
+ }
149
+
150
+
151
+
152
+ /*************************************************************************/
153
+ /* */
154
+ /* Calculate upper and lower bounds for each test on a continuous */
155
+ /* attribute in tree T, using cases from Fp to Lp. */
156
+ /* */
157
+ /* The lower bound is set so that the error rate of the GT branch */
158
+ /* on the cases between the bound and the threshold is double that */
159
+ /* of the correct (LE) branch; the upper bound is set similarly. */
160
+ /* */
161
+ /*************************************************************************/
162
+
163
+
164
+ void FindBounds(Tree T, CaseNo Fp, CaseNo Lp)
165
+ /* Sets T->Mid, T->Lower and T->Upper at every BrThresh node of T, working from cases Fp..Lp, then recurses into each branch. Case weights and the global UnitWeights are changed temporarily and restored. */
166
+ {
167
+ int v;
168
+ CaseNo i, j, Kp, Bp, Ap, Missing, SplitI;
169
+ CaseCount w, LEErrs, GTErrs, KnownCases, SE;
170
+ ClassNo RealClass;
171
+ Attribute Att;
172
+ Boolean PrevUnitWeights;
173
+ double Factor;
174
+
175
+ /* Stop on reaching a leaf: there is no test to soften */
176
+
177
+ if ( ! T->NodeType ) return;
178
+ /* Kp is one past the index returned by Group(0, ...); the Missing = Kp - Fp cases before it are presumably those with unknown Att -- confirm against Group() */
179
+ Kp = Group(0, Fp, Lp, T) + 1;
180
+ Missing = Kp - Fp;
181
+
182
+ Att = T->Tested;
183
+ KnownCases = CountCases(Kp, Lp); /* count of cases Kp..Lp */
184
+
185
+ /* Soften a threshold for a continuous attribute */
186
+
187
+ if ( T->NodeType == BrThresh )
188
+ {
189
+ Verbosity(1, fprintf(Of, "\nTest %s <> %g\n", AttName[Att], T->Cut))
190
+
191
+ /* Skip N/A values: Ap is one past the index returned by Group(1, ...) */
192
+
193
+ Ap = Group(1, Kp, Lp, T) + 1;
194
+
195
+ Quicksort(Ap, Lp, Att); /* presumably orders cases Ap..Lp by ascending Att, as the scans below assume -- confirm */
196
+
197
+ /* Locate cut point and overall errors of the LE and GT branches */
198
+
199
+ SplitI = Ap;
200
+ LEErrs = GTErrs = 0; /* both are reset again before each scan below */
201
+ ForEach(i, Ap, Lp)
202
+ {
203
+ if ( CVal(Case[i], Att) <= T->Cut ) SplitI = i; /* ends as the last i in Ap..Lp with value <= Cut */
204
+ }
205
+ /* NOTE(review): the next line reads Case[SplitI+1]; if no case in Ap..Lp exceeds T->Cut then SplitI == Lp and the index is Lp+1 -- confirm that this is always in range and meaningful */
206
+ T->Mid = (CVal(Case[SplitI], Att) + CVal(Case[SplitI+1], Att)) / 2;
207
+
208
+ /* Consider cutoff points below and above the threshold.
209
+ The errors on the cases between the cutoff and the threshold
210
+ are computed for both the LE and GT branches. The additional
211
+ errors must be less than 0.5SE and, further, the errors
212
+ on the "other" branch must not exceed twice the errors
213
+ on the "real" branch, both after Laplace adjustment */
214
+
215
+ SE = sqrt(T->Errors * (KnownCases - T->Errors) / (KnownCases + 1E-3))
216
+ * 2;
217
+ /* Downward scan from SplitI towards Ap: j records the lowest accepted cutoff index */
218
+ LEErrs = GTErrs = 0;
219
+ j = SplitI;
220
+ for ( i = SplitI ; i > Ap ; i-- )
221
+ {
222
+ RealClass = Class(Case[i]);
223
+ /* Branch[3] and Branch[2] are taken to be the GT and LE branches, going by the variable names */
224
+ w = Weight(Case[i]);
225
+ GTErrs += w * ( TreeClassify(Case[i], T->Branch[3]) != RealClass );
226
+ LEErrs += w * ( TreeClassify(Case[i], T->Branch[2]) != RealClass );
227
+ /* A cutoff is only considered where the attribute value changes */
228
+ if ( CVal(Case[i-1], Att) < CVal(Case[i], Att) )
229
+ {
230
+ if ( GTErrs > 2 * LEErrs + 1 || GTErrs - LEErrs > 0.5 * SE )
231
+ {
232
+ break;
233
+ }
234
+
235
+ j = i-1;
236
+ }
237
+ }
238
+ T->Lower = Min(T->Mid, CVal(Case[j], Att));
239
+ /* Upward scan from SplitI+1; stops before Lp because it compares case i with case i+1 */
240
+ LEErrs = GTErrs = 0;
241
+ j = SplitI+1;
242
+ for ( i = SplitI+1 ; i < Lp ; i++ )
243
+ {
244
+ RealClass = Class(Case[i]);
245
+
246
+ w = Weight(Case[i]);
247
+ LEErrs += w * ( TreeClassify(Case[i], T->Branch[2]) != RealClass );
248
+ GTErrs += w * ( TreeClassify(Case[i], T->Branch[3]) != RealClass );
249
+
250
+ if ( CVal(Case[i], Att) < CVal(Case[i+1], Att) )
251
+ {
252
+ if ( LEErrs > 2 * GTErrs + 1 || LEErrs - GTErrs > 0.5 * SE )
253
+ {
254
+ break;
255
+ }
256
+
257
+ j = i+1;
258
+ }
259
+ }
260
+ T->Upper = Max(T->Mid, CVal(Case[j], Att)); /* NOTE(review): j starts at SplitI+1, so the same Lp+1 concern as for T->Mid applies -- confirm */
261
+
262
+ Verbosity(1,
263
+ fprintf(Of, "\tLower = %g, Upper = %g\n", T->Lower, T->Upper))
264
+ }
265
+
266
+ /* Recursively scan each branch */
267
+
268
+ PrevUnitWeights = UnitWeights;
269
+ if ( Missing > 0 ) UnitWeights = false; /* weights are scaled by Factor below */
270
+
271
+ Bp = Fp;
272
+
273
+ ForEach(v, 1, T->Forks)
274
+ {
275
+ Kp = Group(v, Bp + Missing, Lp, T);
276
+
277
+ /* Bp -> first value in missing + remaining values
278
+ Kp -> last value in missing + current group */
279
+ /* Recurse only if the branch received cases and its share of the known cases exceeds 1E-6 */
280
+ if ( Bp + Missing <= Kp &&
281
+ (Factor = CountCases(Bp + Missing, Kp) / KnownCases) > 1E-6 )
282
+ {
283
+ if ( Missing )
284
+ {
285
+ ForEach(i, Bp, Bp + Missing - 1)
286
+ {
287
+ Weight(Case[i]) *= Factor;
288
+ }
289
+ }
290
+
291
+ FindBounds(T->Branch[v], Bp, Kp);
292
+
293
+ /* Restore weights if changed. Scanning back from Kp, each case with unknown Att has its weight divided by Factor and is swapped to position Kp, which is then decremented */
294
+
295
+ if ( Missing )
296
+ {
297
+ for ( i = Kp ; i >= Bp ; i-- )
298
+ {
299
+ if ( Unknown(Case[i], Att) )
300
+ {
301
+ Weight(Case[i]) /= Factor;
302
+ Swap(i, Kp);
303
+ Kp--;
304
+ }
305
+ }
306
+ }
307
+
308
+ Bp = Kp+1; /* next branch starts just after this branch's known cases */
309
+ }
310
+ }
311
+
312
+ UnitWeights = PrevUnitWeights;
313
+ }
data/ext/c5.0/prune.c ADDED
@@ -0,0 +1,1069 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Prune a decision tree and predict its error rate */
30
+ /* ------------------------------------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ #define LocalVerbosity(x,s) if (Sh >= 0) {Verbosity(x,s)}
40
+ #define Intab(x) Indent(x, 0)
41
+ /* Bit flags combined in the Flags argument of EstimateErrs(); LocalVerbosity above relies on a variable Sh being in scope and is silent when Sh is negative */
42
+ #define UPDATE 1 /* flag: change tree */
43
+ #define REGROW 2 /* regrow branches */
44
+ #define REPORTPROGRESS 4 /* report progress at leaves (original tree) */
45
+ #define UNITWEIGHTS 8 /* UnitWeights is true */
46
+
47
+ Set *PossibleValues; /* per-attribute bit sets of values still possible; allocated and freed in Prune() */
48
+
49
+ double MaxExtraErrs, /* limit for global prune */
50
+ TotalExtraErrs; /* extra errors from ties */
51
+ Tree *XT; /* subtrees with lowest cost complexity */
52
+ int NXT; /* number of entries in use in XT */
53
+ float MinCC; /* cost complexity for XT */
54
+ Boolean RecalculateErrs; /* set by InsertParents() when a tested attribute has SomeMiss[] set */
55
+
56
+
57
+
58
+
59
+ /*************************************************************************/
60
+ /* */
61
+ /* Prune tree T */
62
+ /* */
63
+ /*************************************************************************/
64
+
65
+
66
+ void Prune(Tree T)
67
+ /* ----- */
68
+ {
69
+ Attribute Att;
70
+ int i, Options;
71
+ Boolean Regrow;
72
+
73
+ Verbosity(2, fprintf(Of, "\n"))
74
+
75
+ Regrow = ( Trial == 0 || Now == WINNOWATTS );
76
+
77
+ /* Local pruning phase */
78
+
79
+
80
+ Options = ( Now == WINNOWATTS ? (UPDATE|REGROW) :
81
+ Regrow ? (UPDATE|REGROW|REPORTPROGRESS) :
82
+ (UPDATE|REPORTPROGRESS) );
83
+ if ( UnitWeights ) Options |= UNITWEIGHTS;
84
+
85
+ EstimateErrs(T, 0, MaxCase, 0, Options);
86
+
87
+ if ( MCost )
88
+ {
89
+ /* Remove any effects of WeightMul and reset leaf classes */
90
+
91
+ RestoreDistribs(T);
92
+ }
93
+ else
94
+ {
95
+ /* Insert information on parents and recalculate errors, noting
96
+ whether fractional cases might have appeared (for GlobalPrune) */
97
+
98
+ RecalculateErrs = false;
99
+ InsertParents(T, Nil);
100
+
101
+ /* Possible global pruning phase */
102
+
103
+ if ( GLOBAL && Now != WINNOWATTS )
104
+ {
105
+ GlobalPrune(T);
106
+ }
107
+ }
108
+
109
+ /* Remove impossible values from subsets and ordered splits.
110
+ First record possible values for discrete attributes */
111
+
112
+ PossibleValues = AllocZero(MaxAtt+1, Set);
113
+ ForEach(Att, 1, MaxAtt)
114
+ {
115
+ if ( Ordered(Att) || Discrete(Att) && SUBSET )
116
+ {
117
+ PossibleValues[Att] = AllocZero((MaxAttVal[Att]>>3)+1, Byte);
118
+ ForEach(i, 1, MaxAttVal[Att])
119
+ {
120
+ SetBit(i, PossibleValues[Att]);
121
+ }
122
+ }
123
+ }
124
+
125
+ CheckSubsets(T, true);
126
+
127
+ FreeVector((void **) PossibleValues, 1, MaxAtt); PossibleValues = Nil;
128
+
129
+ /* For multibranch splits, merge non-occurring values. For trees
130
+ (first boosting trial only), also merge leaves of same class */
131
+
132
+ if ( ! SUBSET )
133
+ {
134
+ CompressBranches(T);
135
+ }
136
+ }
137
+
138
+
139
+
140
+ /*************************************************************************/
141
+ /* */
142
+ /* Estimate the errors in a given subtree */
143
+ /* */
144
+ /*************************************************************************/
145
+
146
+
147
+ void EstimateErrs(Tree T, CaseNo Fp, CaseNo Lp, int Sh, int Flags)
148
+ /* Estimates the errors of subtree T on cases Fp..Lp and stores the result in T->Errors. Sh is the indent depth for verbose output (a negative value silences LocalVerbosity). Flags combines UPDATE, REGROW, REPORTPROGRESS and UNITWEIGHTS; with UPDATE set the node's class distribution is rewritten and the node may be replaced by a leaf or by its largest candidate branch. */
149
+ {
150
+ CaseNo i, Bp, Ep, Missing;
151
+ CaseCount Cases=0, KnownCases, *BranchCases, MissingCases,
152
+ *SmallBranches, SmallBranchCases=0,
153
+ Errs, SaveErrs, TreeErrs, LeafErrs, ExtraLeafErrs=0, BestBrErrs;
154
+ double Factor, *LocalClassDist;
155
+ DiscrValue v, BestBr=0;
156
+ ClassNo c, BestClass=1;
157
+ int UnitWeights; /* local value: hides the global UnitWeights for the rest of this function */
158
+ Tree Br;
159
+ Attribute Att;
160
+
161
+
162
+ if ( Fp > Lp ) return; /* empty case range: nothing is changed */
163
+
164
+ UnitWeights = (Flags & UNITWEIGHTS);
165
+
166
+ LocalClassDist = Alloc(MaxClass+1, double);
167
+
168
+ FindClassFreq(LocalClassDist, Fp, Lp);
169
+
170
+ /* Record new class distribution if updating the tree. Either way, Cases accumulates the total over classes and BestClass the class with the largest entry */
171
+
172
+ if ( (Flags & UPDATE) )
173
+ {
174
+ ForEach(c, 1, MaxClass)
175
+ {
176
+ T->ClassDist[c] = LocalClassDist[c];
177
+ Cases += LocalClassDist[c];
178
+
179
+ if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;
180
+ }
181
+ }
182
+ else
183
+ {
184
+ ForEach(c, 1, MaxClass)
185
+ {
186
+ Cases += LocalClassDist[c];
187
+
188
+ if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;
189
+ }
190
+ }
191
+ /* Errors if this node were a leaf predicting BestClass, plus the ExtraErrs() correction */
192
+ LeafErrs = Cases - LocalClassDist[BestClass];
193
+ ExtraLeafErrs = ExtraErrs(Cases, LeafErrs, BestClass);
194
+
195
+ Free(LocalClassDist);
196
+
197
+ if ( (Flags & UPDATE) )
198
+ {
199
+ T->Cases = Cases;
200
+ T->Leaf = BestClass;
201
+ }
202
+
203
+ if ( ! T->NodeType ) /* leaf */
204
+ {
205
+ if ( (Flags & UPDATE) && (Flags & REPORTPROGRESS) &&
206
+ Now == SIMPLIFYTREE &&
207
+ T->Cases > 0 )
208
+ {
209
+ Progress(T->Cases);
210
+ }
211
+
212
+ T->Errors = LeafErrs + ExtraLeafErrs;
213
+
214
+ if ( (Flags & UPDATE) )
215
+ {
216
+ if ( Sh >= 0 )
217
+ {
218
+ LocalVerbosity(3,
219
+ Intab(Sh);
220
+ fprintf(Of, "%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
221
+ T->Cases, LeafErrs, T->Errors))
222
+ }
223
+ }
224
+
225
+ return;
226
+ }
227
+
228
+ /* Estimate errors for each branch */
229
+
230
+ Att = T->Tested;
231
+ Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1; /* Ep = index returned by Group(0, ...); Missing = number of cases Fp..Ep */
232
+
233
+ if ( CostWeights )
234
+ {
235
+ MissingCases = SumNocostWeights(Fp, Ep);
236
+ KnownCases = SumNocostWeights(Ep+1, Lp);
237
+ }
238
+ else
239
+ {
240
+ MissingCases = CountCases(Fp, Ep);
241
+ KnownCases = Cases - MissingCases;
242
+ }
243
+
244
+ SmallBranches = AllocZero(MaxClass+1, CaseCount); /* per-class weight pooled from small branches */
245
+ BranchCases = Alloc(T->Forks+1, CaseCount); /* indexed by branch number 1..T->Forks */
246
+
247
+ if ( Missing ) UnitWeights = 0; /* weights are scaled below, so the recursive calls get no UNITWEIGHTS bit */
248
+
249
+ TreeErrs = 0;
250
+ Bp = Fp;
251
+
252
+ ForEach(v, 1, T->Forks)
253
+ {
254
+ Ep = Group(v, Bp + Missing, Lp, T);
255
+
256
+ /* Bp -> first value in missing + remaining values
257
+ Ep -> last value in missing + current group */
258
+
259
+ BranchCases[v] = CountCases(Bp + Missing, Ep);
260
+ /* Factor is this branch's share of the known cases (0 when nothing is missing) */
261
+ Factor = ( ! Missing ? 0 :
262
+ ! CostWeights ? BranchCases[v] / KnownCases :
263
+ SumNocostWeights(Bp + Missing, Ep) / KnownCases );
264
+
265
+ if ( (BranchCases[v] += Factor * MissingCases) >= MinLeaf ) /* BranchCases[v] now includes its share of the missing cases */
266
+ {
267
+ if ( Missing )
268
+ {
269
+ ForEach(i, Bp, Bp + Missing - 1)
270
+ {
271
+ Weight(Case[i]) *= Factor;
272
+ }
273
+ }
274
+
275
+ EstimateErrs(T->Branch[v], Bp, Ep, Sh+1, ((Flags&7) | UnitWeights)); /* Flags&7 keeps UPDATE|REGROW|REPORTPROGRESS; the local UnitWeights supplies the UNITWEIGHTS bit or 0 */
276
+
277
+ /* Group small branches together for error estimation */
278
+
279
+ if ( BranchCases[v] < MINITEMS )
280
+ {
281
+ ForEach(i, Bp, Ep)
282
+ {
283
+ SmallBranches[ Class(Case[i]) ] += Weight(Case[i]);
284
+ }
285
+
286
+ SmallBranchCases += BranchCases[v];
287
+ }
288
+ else
289
+ {
290
+ TreeErrs += T->Branch[v]->Errors;
291
+ }
292
+
293
+ /* Restore weights if changed. Scanning back from Ep, each case with unknown Att has its weight divided by Factor and is swapped to position Ep, which is then decremented */
294
+
295
+ if ( Missing )
296
+ {
297
+ for ( i = Ep ; i >= Bp ; i-- )
298
+ {
299
+ if ( Unknown(Case[i], Att) )
300
+ {
301
+ Weight(Case[i]) /= Factor;
302
+ Swap(i, Ep);
303
+ Ep--;
304
+ }
305
+ }
306
+ }
307
+
308
+ Bp = Ep+1; /* next branch starts just after this branch's known cases */
309
+ }
310
+ }
311
+
312
+ /* Add error estimate from small branches, if any */
313
+
314
+ if ( SmallBranchCases )
315
+ {
316
+ BestClass = 1; /* BestClass is reused for the pooled small branches; T->Leaf was already set above */
317
+ ForEach(c, 2, MaxClass)
318
+ {
319
+ if ( SmallBranches[c] > SmallBranches[BestClass] ) BestClass = c;
320
+ }
321
+
322
+ Errs = SmallBranchCases - SmallBranches[BestClass];
323
+ TreeErrs += Errs + ExtraErrs(SmallBranchCases, Errs, BestClass);
324
+ }
325
+ Free(SmallBranches);
326
+ Free(BranchCases);
327
+ /* Without UPDATE this call only measures: record the estimate and stop before any pruning */
328
+ if ( ! (Flags & UPDATE) )
329
+ {
330
+ T->Errors = TreeErrs;
331
+ return;
332
+ }
333
+
334
+ /* See how the largest candidate branch would perform. A branch
335
+ is a candidate if it is not a leaf, contains at least 10% of
336
+ the cases, and does not test the same continuous attribute.
337
+ This test is skipped for boosted trees */
338
+
339
+ ForEach(v, 1, T->Forks)
340
+ {
341
+ if ( ! T->Branch[v]->NodeType ||
342
+ T->Branch[v]->Cases < 0.1 * T->Cases ||
343
+ T->Branch[v]->Tested == Att && Continuous(Att) )
344
+ {
345
+ continue;
346
+ }
347
+
348
+ if ( ! BestBr || T->Branch[v]->Cases > T->Branch[BestBr]->Cases )
349
+ {
350
+ BestBr = v;
351
+ }
352
+ }
353
+ /* Trial run of the candidate branch on all cases Fp..Lp: Flags 0 means no UPDATE, Sh -1 silences output; the branch's own Errors value is saved and put back */
354
+ if ( BestBr )
355
+ {
356
+ SaveErrs = T->Branch[BestBr]->Errors;
357
+ EstimateErrs(T->Branch[BestBr], Fp, Lp, -1, 0);
358
+ BestBrErrs = T->Branch[BestBr]->Errors;
359
+ T->Branch[BestBr]->Errors = SaveErrs;
360
+ }
361
+ else
362
+ {
363
+ BestBrErrs = MaxCase+1; /* large value, presumably so the branch-replacement test below fails when there is no candidate -- confirm it always exceeds TreeErrs + 0.1 */
364
+ }
365
+
366
+ LocalVerbosity(2,
367
+ Intab(Sh);
368
+ fprintf(Of, "%s: [%d%% N=%.2f tree=%.2f leaf=%.2f+%.2f",
369
+ AttName[T->Tested],
370
+ (int) ((TreeErrs * 100) / (T->Cases + 0.001)),
371
+ T->Cases, TreeErrs, LeafErrs, ExtraLeafErrs);
372
+ if ( BestBr )
373
+ {
374
+ fprintf(Of, " br[%d]=%.2f", BestBr, BestBrErrs);
375
+ }
376
+ fprintf(Of, "]\n"))
377
+
378
+ /* See whether tree should be replaced with leaf or best branch. A leaf is preferred when it is within 0.1 errors of both alternatives */
379
+
380
+ if ( LeafErrs + ExtraLeafErrs <= BestBrErrs + 0.1 &&
381
+ LeafErrs + ExtraLeafErrs <= TreeErrs + 0.1 )
382
+ {
383
+ LocalVerbosity(2,
384
+ Intab(Sh);
385
+ fprintf(Of, "Replaced with leaf %s\n", ClassName[T->Leaf]))
386
+
387
+ UnSprout(T);
388
+ T->Errors = LeafErrs + ExtraLeafErrs;
389
+ }
390
+ else
391
+ if ( BestBrErrs <= TreeErrs + 0.1 )
392
+ {
393
+ LocalVerbosity(2,
394
+ Intab(Sh);
395
+ fprintf(Of, "Replaced with branch %d\n", BestBr))
396
+
397
+ /* Free unused bits of tree */
398
+
399
+ ForEach(v, 1, T->Forks)
400
+ {
401
+ if ( v != BestBr ) FreeTree(T->Branch[v]);
402
+ }
403
+ Br = T->Branch[BestBr];
404
+ Free(T->Branch);
405
+ Free(T->ClassDist);
406
+ if ( T->NodeType == BrSubset )
407
+ {
408
+ FreeVector((void **) T->Subset, 1, T->Forks);
409
+ }
410
+
411
+ /* Copy the branch up: T takes over the contents of Br and the Br record itself is freed */
412
+
413
+ memcpy((char *) T, (char *) Br, sizeof(TreeRec));
414
+ Free(Br);
415
+
416
+ Factor = T->Cases / Cases; /* cases the promoted branch had, as a fraction of this node's cases */
417
+ T->Cases = Cases;
418
+
419
+ /* If not within a rebuilt tree, not a cascaded test, and
420
+ sufficient new cases to justify the effort, rebuild the branch */
421
+
422
+ if ( T->NodeType && (Flags & REGROW) && Factor < 0.95 )
423
+ {
424
+ ForEach(v, 1, T->Forks)
425
+ {
426
+ FreeTree(T->Branch[v]); T->Branch[v] = Nil;
427
+ }
428
+
429
+ SetGlobalUnitWeights(Flags & UNITWEIGHTS);
430
+
431
+ Divide(T, Fp, Lp, 0);
432
+ }
433
+
434
+ EstimateErrs(T, Fp, Lp, Sh, UPDATE); /* re-estimate the promoted node on all cases; only UPDATE is passed, so REGROW, REPORTPROGRESS and UNITWEIGHTS are off */
435
+ }
436
+ else
437
+ {
438
+ T->Errors = TreeErrs;
439
+ }
440
+ }
441
+
442
+
443
+
444
+ /*************************************************************************/
445
+ /* */
446
+ /* Phase 2 (global) pruning. */
447
+ /* Prune minimum cost complexity subtrees until "error" */
448
+ /* (measured by sum of branch errors) increases by 1SE */
449
+ /* */
450
+ /*************************************************************************/
451
+
452
+
453
+ void GlobalPrune(Tree T)
454
+ /* ----------- */
455
+ {
456
+ int DeltaLeaves, x;
457
+ double BaseErrs, DeltaErrs;
458
+ CaseNo i;
459
+ Tree ST;
460
+
461
+ /* If fractional cases may have been used, calculate errors
462
+ directly by checking training data */
463
+
464
+ if ( RecalculateErrs )
465
+ {
466
+ BaseErrs = 0;
467
+ ForEach(i, 0, MaxCase)
468
+ {
469
+ if ( TreeClassify(Case[i], T) != Class(Case[i]) )
470
+ {
471
+ BaseErrs += Weight(Case[i]);
472
+ }
473
+ }
474
+ }
475
+ else
476
+ {
477
+ BaseErrs = T->Errors;
478
+ }
479
+
480
+ XT = Alloc(T->Leaves, Tree);
481
+
482
+ /* Additional error limit set at 1SE */
483
+
484
+ MaxExtraErrs = sqrt(BaseErrs * (1 - BaseErrs / (MaxCase + 1)));
485
+
486
+ while ( MaxExtraErrs > 0 )
487
+ {
488
+ TotalExtraErrs = 0;
489
+
490
+ MinCC = 1E38;
491
+ NXT = 0;
492
+
493
+ /* Find all subtrees with lowest cost complexity */
494
+
495
+ FindMinCC(T);
496
+
497
+ Verbosity(2,
498
+ if ( NXT > 0 && TotalExtraErrs > MaxExtraErrs )
499
+ fprintf(Of, "%d tied with MinCC=%.3f; total extra errs %.1f\n",
500
+ NXT, MinCC, TotalExtraErrs))
501
+
502
+ if ( ! NXT || TotalExtraErrs > MaxExtraErrs ) break;
503
+
504
+ /* Make subtree into a leaf */
505
+
506
+ ForEach(x, 0, NXT-1)
507
+ {
508
+ ST = XT[x];
509
+
510
+ UnSprout(ST);
511
+
512
+ /* Update errors and leaves for ST and ancestors */
513
+
514
+ DeltaErrs = (ST->Cases - ST->ClassDist[ST->Leaf]) - ST->Errors;
515
+ DeltaLeaves = 1 - ST->Leaves;
516
+ while ( ST )
517
+ {
518
+ ST->Errors += DeltaErrs;
519
+ ST->Leaves += DeltaLeaves;
520
+ ST = ST->Parent;
521
+ }
522
+
523
+ MaxExtraErrs -= DeltaErrs;
524
+
525
+ Verbosity(2,
526
+ fprintf(Of, "global: %d leaves, %.1f errs\n",
527
+ DeltaLeaves, DeltaErrs))
528
+ }
529
+ Verbosity(2, fprintf(Of, "\tremaining=%.1f\n", MaxExtraErrs))
530
+ }
531
+
532
+ Free(XT);
533
+ }
534
+
535
+
536
+
537
+ /*************************************************************************/
538
+ /* */
539
+ /* Scan tree computing cost complexity of each subtree and */
540
+ /* record lowest in global variable XT */
541
+ /* */
542
+ /*************************************************************************/
543
+
544
+
545
+ void FindMinCC(Tree T)
546
+ /* --------- */
547
+ {
548
+ DiscrValue v;
549
+ double ExtraErrs, CC, SaveMinCC, SaveTotalExtraErrs;
550
+ int SaveNXT;
551
+
552
+ if ( T->NodeType )
553
+ {
554
+ /* Save current situation */
555
+
556
+ SaveTotalExtraErrs = TotalExtraErrs;
557
+ SaveMinCC = MinCC;
558
+ SaveNXT = NXT;
559
+
560
+ /* Scan subtrees */
561
+
562
+ ForEach(v, 1, T->Forks)
563
+ {
564
+ if ( T->Branch[v]->Cases > 0.1 )
565
+ {
566
+ FindMinCC(T->Branch[v]);
567
+ }
568
+ }
569
+
570
+ /* Compute CC for this subtree and check whether minimum */
571
+
572
+ ExtraErrs = (T->Cases - T->ClassDist[T->Leaf]) - T->Errors;
573
+
574
+ CC = ExtraErrs / (T->Leaves - 1);
575
+
576
+ if ( ExtraErrs <= MaxExtraErrs )
577
+ {
578
+ /* Have to be careful of ties in descendants, because
579
+ they would inflate TotalExtraErrs. Any such ties
580
+ should be discarded */
581
+
582
+ if ( CC < MinCC ||
583
+ CC <= MinCC && CC < SaveMinCC /* changed by descendants */ )
584
+ {
585
+ /* This is the first of a possible group of ties */
586
+
587
+ MinCC = CC;
588
+ NXT = 1;
589
+ XT[0] = T;
590
+ TotalExtraErrs = ExtraErrs;
591
+ }
592
+ else
593
+ if ( CC <= MinCC )
594
+ {
595
+ /* This is a tie. Discard any ties among descendants */
596
+
597
+ if ( NXT > SaveNXT )
598
+ {
599
+ TotalExtraErrs = SaveTotalExtraErrs;
600
+ NXT = SaveNXT;
601
+ }
602
+
603
+ XT[NXT++] = T;
604
+ TotalExtraErrs += ExtraErrs;
605
+ }
606
+ }
607
+ }
608
+ }
609
+
610
+
611
+
612
+ /*************************************************************************/
613
+ /* */
614
+ /* Annotate tree with information on parent and leaves */
615
+ /* */
616
+ /*************************************************************************/
617
+
618
+
619
+ void InsertParents(Tree T, Tree P)
620
+ /* ------------- */
621
+ {
622
+ DiscrValue v;
623
+
624
+ T->Parent = P;
625
+ T->Errors = T->Leaves = 0;
626
+
627
+ if ( T->NodeType )
628
+ {
629
+ ForEach(v, 1, T->Forks)
630
+ {
631
+ InsertParents(T->Branch[v], T);
632
+ T->Errors += T->Branch[v]->Errors;
633
+ T->Leaves += T->Branch[v]->Leaves;
634
+ }
635
+
636
+ if ( SomeMiss[T->Tested] ) RecalculateErrs = true;
637
+ }
638
+ else
639
+ if ( T->Cases > 1E-3 )
640
+ {
641
+ T->Errors = T->Cases - T->ClassDist[T->Leaf];
642
+ T->Leaves = 1;
643
+ }
644
+ }
645
+
646
+
647
+
648
+ /*************************************************************************/
649
+ /* */
650
+ /* Remove unnecessary subset tests on missing values */
651
+ /* */
652
+ /*************************************************************************/
653
+
654
+
655
+ void CheckSubsets(Tree T, Boolean PruneDefaults)
656
+ /* Walks tree T. At each BrSubset node it trims branch subsets against PossibleValues, may drop an empty last (default) branch when PruneDefaults is set, recurses with PossibleValues narrowed to each branch's subset, and finally merges leaf branches that predict the same class. Other test nodes are simply recursed into. */
657
+ {
658
+ Set HoldValues;
659
+ int v, vv, x, Bytes, b, First, Any=0;
660
+ Attribute A;
661
+ Tree LeafBr;
662
+ ClassNo c;
663
+
664
+ if ( T->NodeType == BrSubset )
665
+ {
666
+ A = T->Tested;
667
+
668
+ Bytes = (MaxAttVal[A]>>3) + 1; /* one bit per value: size of each subset in bytes */
669
+ HoldValues = Alloc(Bytes, Byte);
670
+
671
+ /* For non-ordered attributes the last (default) branch contains
672
+ all values that do not appear in the data. See whether this
673
+ branch can be simplified or omitted */
674
+
675
+ if ( ! Ordered(A) && PruneDefaults )
676
+ {
677
+ ForEach(b, 0, Bytes-1)
678
+ {
679
+ T->Subset[T->Forks][b] &= PossibleValues[A][b];
680
+ Any |= T->Subset[T->Forks][b];
681
+ }
682
+ /* No possible value is left in the default branch, so remove the branch */
683
+ if ( ! Any )
684
+ {
685
+ FreeTree(T->Branch[T->Forks]);
686
+ Free(T->Subset[T->Forks]);
687
+ T->Forks--;
688
+ }
689
+ }
690
+
691
+ /* Process each subtree, leaving only values in branch subset. PossibleValues[A] is saved in HoldValues and restored after the loop */
692
+
693
+ CopyBits(Bytes, PossibleValues[A], HoldValues);
694
+
695
+ ForEach(v, 1, T->Forks)
696
+ {
697
+ /* Remove any impossible values from ordered subsets */
698
+
699
+ if ( Ordered(A) )
700
+ {
701
+ ForEach(vv, 1, MaxAttVal[A])
702
+ {
703
+ if ( In(vv, T->Subset[v]) && ! In(vv, HoldValues) )
704
+ {
705
+ ResetBit(vv, T->Subset[v]);
706
+ }
707
+ }
708
+ }
709
+ /* Within branch v only the values in its subset remain possible for A */
710
+ CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
711
+
712
+ CheckSubsets(T->Branch[v], PruneDefaults);
713
+ }
714
+
715
+ CopyBits(Bytes, HoldValues, PossibleValues[A]);
716
+
717
+ Free(HoldValues);
718
+
719
+ /* See whether branches other than N/A can be merged.
720
+ This cannot be done for ordered attributes since the
721
+ values in the subset represent an interval */
722
+
723
+ if ( ! Ordered(A) )
724
+ {
725
+ First = ( In(1, T->Subset[1]) ? 2 : 1 ); /* skip branch 1 when its subset contains value 1 (presumably the N/A value, per the comment above -- confirm) */
726
+ for ( v = First ; v < T->Forks ; v++ )
727
+ {
728
+ if ( T->Branch[v]->NodeType ) continue; /* only leaf branches are merge targets */
729
+ LeafBr = T->Branch[v];
730
+
731
+ /* Consider branches vv that could be merged with branch v. vv advances only when no merge happens, because a merge shifts the later branches down into position vv */
732
+
733
+ for ( vv = v+1 ; vv <= T->Forks ; )
734
+ {
735
+ if ( ! T->Branch[vv]->NodeType &&
736
+ T->Branch[vv]->Leaf == LeafBr->Leaf &&
737
+ ( PruneDefaults || T->Branch[vv]->Cases > 0 ) )
738
+ {
739
+ /* Branch vv can be merged with branch v */
740
+
741
+ if ( T->Branch[vv]->Cases )
742
+ {
743
+ /* Add class distribution from branch vv,
744
+ or replace if branch v has no cases */
745
+
746
+ ForEach(c, 1, MaxClass)
747
+ {
748
+ if ( ! LeafBr->Cases )
749
+ {
750
+ LeafBr->ClassDist[c] =
751
+ T->Branch[vv]->ClassDist[c];
752
+ }
753
+ else
754
+ {
755
+ LeafBr->ClassDist[c] +=
756
+ T->Branch[vv]->ClassDist[c];
757
+ }
758
+ }
759
+ LeafBr->Cases += T->Branch[vv]->Cases;
760
+ LeafBr->Errors += T->Branch[vv]->Errors;
761
+ }
762
+
763
+ /* Merge values and free branch vv */
764
+
765
+ ForEach(b, 0, Bytes-1)
766
+ {
767
+ T->Subset[v][b] |= T->Subset[vv][b];
768
+ }
769
+ FreeTree(T->Branch[vv]);
770
+ Free(T->Subset[vv]);
771
+ /* Close the gap: shift branches vv+1.. down by one */
772
+ T->Forks--;
773
+ ForEach(x, vv, T->Forks)
774
+ {
775
+ T->Branch[x] = T->Branch[x+1];
776
+ T->Subset[x] = T->Subset[x+1];
777
+ }
778
+ }
779
+ else
780
+ {
781
+ vv++;
782
+ }
783
+ }
784
+ }
785
+ }
786
+ }
787
+ else
788
+ if ( T->NodeType )
789
+ {
790
+ ForEach(v, 1, T->Forks)
791
+ {
792
+ CheckSubsets(T->Branch[v], PruneDefaults);
793
+ }
794
+ }
795
+ }
796
+
797
+
798
+
799
/*************************************************************************/
/*                                                                       */
/*  Tables relating a confidence level (Val) to the corresponding        */
/*  number of standard deviations (Dev).  InitialiseExtraErrs()          */
/*  interpolates between adjacent entries and stores the square of       */
/*  the result in Coeff; RawExtraErrs() uses Coeff to adjust the         */
/*  resubstitution error rate to the upper limit of the confidence       */
/*  level.  (Taken from Documenta Geigy Scientific Tables (Sixth         */
/*  Edition), p185 (with modifications).)                                */
/*                                                                       */
/*************************************************************************/


/*  Confidence levels, in increasing order  */

float Val[] = { 0, 0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00};

/*  Standard deviations matching each entry of Val, in decreasing order  */

float Dev[] = {4.0, 3.09, 2.58, 2.33, 1.65, 1.28, 0.84, 0.25, 0.00};

/*  Squared number of standard deviations for the selected level  */

float Coeff;
813
+
814
+
815
+ void InitialiseExtraErrs()
816
+ /* ------------------- */
817
+ {
818
+ int i=1;
819
+
820
+ /* Compute and retain the coefficient value, interpolating from
821
+ the values in Val and Dev */
822
+
823
+ while ( CF > Val[i] ) i++;
824
+
825
+ Coeff = Dev[i-1] +
826
+ (Dev[i] - Dev[i-1]) * (CF - Val[i-1]) /(Val[i] - Val[i-1]);
827
+ Coeff = Coeff * Coeff;
828
+ CF = Max(CF, 1E-6);
829
+ }
830
+
831
+
832
+ /*************************************************************************/
833
+ /* */
834
+ /* Calculate extra errors to correct the resubstitution error */
835
+ /* rate at a leaf with N cases, E errors, predicted class C. */
836
+ /* If CostWeights are used, N and E are normalised by removing */
837
+ /* the effects of cost weighting and then reapplying weights to */
838
+ /* the result. */
839
+ /* */
840
+ /*************************************************************************/
841
+
842
+
843
+ float ExtraErrs(CaseCount N, CaseCount E, ClassNo C)
844
+ /* --------- */
845
+ {
846
+ ClassNo EC;
847
+ CaseCount NormC, NormEC;
848
+
849
+ if ( ! CostWeights )
850
+ {
851
+ return RawExtraErrs(N, E);
852
+ }
853
+
854
+ EC = 3 - C; /* the other class */
855
+ NormC = (N - E) / WeightMul[C]; /* normalised cases of class C */
856
+ NormEC = E / WeightMul[EC]; /* ditto the other class */
857
+
858
+ return WeightMul[EC] * RawExtraErrs(NormC + NormEC, NormEC);
859
+ }
860
+
861
+
862
+
863
+ float RawExtraErrs(CaseCount N, CaseCount E)
864
+ /* ------------ */
865
+ {
866
+ float Val0, Pr;
867
+
868
+ if ( E < 1E-6 )
869
+ {
870
+ return N * (1 - exp(log(CF) / N));
871
+ }
872
+ else
873
+ if ( N > 1 && E < 0.9999 )
874
+ {
875
+ Val0 = N * (1 - exp(log(CF) / N));
876
+ return Val0 + E * (RawExtraErrs(N, 1.0) - Val0);
877
+ }
878
+ else
879
+ if ( E + 0.5 >= N )
880
+ {
881
+ return 0.67 * (N - E);
882
+ }
883
+ else
884
+ {
885
+ Pr = (E + 0.5 + Coeff/2
886
+ + sqrt(Coeff * ((E + 0.5) * (1 - (E + 0.5)/N) + Coeff/4)) )
887
+ / (N + Coeff);
888
+ return (N * Pr - E);
889
+ }
890
+ }
891
+
892
+
893
+
894
+ /*************************************************************************/
895
+ /* */
896
+ /* If there are differential misclassification costs, the weights */
897
+ /* may have been artificially adjusted. Fix the distributions so */
898
+ /* that they represent the "true" (possibly boosted) weights */
899
+ /* */
900
+ /*************************************************************************/
901
+
902
+
903
+ void RestoreDistribs(Tree T)
904
+ /* --------------- */
905
+ {
906
+ DiscrValue v;
907
+ ClassNo c;
908
+
909
+ if ( T->NodeType )
910
+ {
911
+ ForEach(v, 1, T->Forks)
912
+ {
913
+ RestoreDistribs(T->Branch[v]);
914
+ }
915
+ }
916
+
917
+ if ( T->Cases >= MinLeaf )
918
+ {
919
+ if ( CostWeights )
920
+ {
921
+ T->Cases = 0;
922
+ ForEach(c, 1, MaxClass)
923
+ {
924
+ ClassSum[c] = (T->ClassDist[c] /= WeightMul[c]);
925
+ T->Cases += T->ClassDist[c];
926
+ }
927
+ }
928
+ else
929
+ {
930
+ ForEach(c, 1, MaxClass)
931
+ {
932
+ ClassSum[c] = T->ClassDist[c];
933
+ }
934
+ }
935
+
936
+ T->Leaf = SelectClass(1, true);
937
+ T->Errors = T->Cases - T->ClassDist[T->Leaf];
938
+ }
939
+ }
940
+
941
+
942
+
943
+ /*************************************************************************/
944
+ /* */
945
+ /* See whether empty branches can be formed into subsets. */
946
+ /* For the first trial only, and when not generating rulesets, */
947
+ /* combine leaves with the same class. */
948
+ /* */
949
+ /*************************************************************************/
950
+
951
+
952
+ void CompressBranches(Tree T)
953
+ /* ---------------- */
954
+ {
955
+ DiscrValue v, vv, S=0, *LocalSet;
956
+ int Bytes;
957
+ Tree Br, *OldBranch;
958
+ ClassNo c;
959
+ Boolean EmptyOnly;
960
+
961
+ EmptyOnly = Trial || RULES;
962
+
963
+ if ( T->NodeType )
964
+ {
965
+ /* LocalSet[v] is the new branch number to which branch v belongs */
966
+
967
+ LocalSet = AllocZero(T->Forks+1, DiscrValue);
968
+
969
+ ForEach(v, 1, T->Forks)
970
+ {
971
+ Br = T->Branch[v];
972
+ CompressBranches(Br);
973
+
974
+ /* Don't check if compression impossible */
975
+
976
+ if ( v == 1 || T->Forks < 4 || Br->NodeType ||
977
+ EmptyOnly && Br->Cases >= MinLeaf )
978
+ {
979
+ vv = v + 1;
980
+ }
981
+ else
982
+ {
983
+ /* Check whether some previous branch is mergeable.
984
+ For Trial 0, leaves are mergeable if they are
985
+ both empty or both non-empty and have the same class;
986
+ for later trials, they must both be empty */
987
+
988
+ for ( vv = 2 ; vv < v ; vv++ )
989
+ {
990
+ if ( ! T->Branch[vv]->NodeType &&
991
+ ( EmptyOnly ? T->Branch[vv]->Cases < MinLeaf :
992
+ ( T->Branch[vv]->Cases < MinLeaf ) ==
993
+ ( Br->Cases < MinLeaf ) &&
994
+ T->Branch[vv]->Leaf == Br->Leaf ) )
995
+ {
996
+ break;
997
+ }
998
+ }
999
+ }
1000
+
1001
+ /* If no merge was found, this becomes a new branch */
1002
+
1003
+ LocalSet[v] = ( vv < v ? LocalSet[vv] : ++S );
1004
+ }
1005
+
1006
+ if ( S < T->Forks )
1007
+ {
1008
+ /* Compress! */
1009
+
1010
+ T->Subset = Alloc(S+1, Set);
1011
+ OldBranch = T->Branch;
1012
+ T->Branch = Alloc(S+1, Tree);
1013
+
1014
+ Bytes = (MaxAttVal[T->Tested]>>3) + 1;
1015
+ S = 0;
1016
+
1017
+ ForEach(v, 1, T->Forks)
1018
+ {
1019
+ if ( LocalSet[v] > S )
1020
+ {
1021
+ S++;
1022
+ Br = T->Branch[S] = OldBranch[v];
1023
+ if ( ! Br->ClassDist )
1024
+ {
1025
+ Br->ClassDist = AllocZero(MaxClass+1, CaseCount);
1026
+ }
1027
+ T->Subset[S] = AllocZero(Bytes, Byte);
1028
+
1029
+ /* Must include N/A even when no cases -- otherwise
1030
+ reader gets the branches muddled */
1031
+
1032
+ SetBit(v, T->Subset[S]);
1033
+
1034
+ ForEach(vv, v+1, T->Forks)
1035
+ {
1036
+ if ( LocalSet[vv] == S )
1037
+ {
1038
+ SetBit(vv, T->Subset[S]);
1039
+
1040
+ Br->Cases += OldBranch[vv]->Cases;
1041
+ Br->Errors += OldBranch[vv]->Errors;
1042
+ ForEach(c, 1, MaxClass)
1043
+ {
1044
+ Br->ClassDist[c] += OldBranch[vv]->ClassDist[c];
1045
+ }
1046
+ }
1047
+ }
1048
+ }
1049
+ else
1050
+ {
1051
+ FreeTree(OldBranch[v]);
1052
+ }
1053
+ }
1054
+
1055
+ T->NodeType = BrSubset;
1056
+ T->Forks = S;
1057
+ Free(OldBranch);
1058
+ }
1059
+ Free(LocalSet);
1060
+ }
1061
+ }
1062
+
1063
+
1064
+
1065
+ void SetGlobalUnitWeights(int LocalFlag)
1066
+ /* -------------------- */
1067
+ {
1068
+ UnitWeights = ( LocalFlag != 0 );
1069
+ }