see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/sort.c ADDED
@@ -0,0 +1,156 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Sorting utilities */
30
+ /* ----------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+ #define SwapSRec(a,b) {Xab=SRec[a]; SRec[a]=SRec[b]; SRec[b]=Xab;}
39
+
40
+
41
+ /*************************************************************************/
42
+ /* */
43
+ /* Sort elements Fp to Lp of SRec */
44
+ /* */
45
+ /*************************************************************************/
46
+
47
+
48
+ void Cachesort(CaseNo Fp, CaseNo Lp, SortRec *SRec)
49
+ /* --------- */
50
+ {
51
+ CaseNo i, Middle, High;
52
+ ContValue Thresh, Val;
53
+ SortRec Xab;
54
+
55
+
56
+ while ( Fp < Lp )
57
+ {
58
+ Thresh = SRec[(Fp+Lp) / 2].V;
59
+
60
+ /* Divide elements into three groups:
61
+ Fp .. Middle-1: values < Thresh
62
+ Middle .. High: values = Thresh
63
+ High+1 .. Lp: values > Thresh */
64
+
65
+ for ( Middle = Fp ; SRec[Middle].V < Thresh ; Middle++ )
66
+ ;
67
+
68
+ for ( High = Lp ; SRec[High].V > Thresh ; High-- )
69
+ ;
70
+
71
+ for ( i = Middle ; i <= High ; )
72
+ {
73
+ if ( (Val = SRec[i].V) < Thresh )
74
+ {
75
+ SwapSRec(Middle, i);
76
+ Middle++;
77
+ i++;
78
+ }
79
+ else
80
+ if ( Val > Thresh )
81
+ {
82
+ SwapSRec(High, i);
83
+ High--;
84
+ }
85
+ else
86
+ {
87
+ i++;
88
+ }
89
+ }
90
+
91
+ /* Sort the first group */
92
+
93
+ Cachesort(Fp, Middle-1, SRec);
94
+
95
+ /* Continue with the last group */
96
+
97
+ Fp = High+1;
98
+ }
99
+ }
100
+
101
+
102
+
103
+ /*************************************************************************/
104
+ /* */
105
+ /* Sort cases from Fp to Lp on attribute Att */
106
+ /* */
107
+ /*************************************************************************/
108
+
109
+
110
+ void Quicksort(CaseNo Fp, CaseNo Lp, Attribute Att)
111
+ /* --------- */
112
+ {
113
+ CaseNo i, Middle, High;
114
+ ContValue Thresh, Val;
115
+
116
+ if ( Fp < Lp )
117
+ {
118
+ Thresh = CVal(Case[(Fp+Lp) / 2], Att);
119
+
120
+ /* Divide cases into three groups:
121
+ Fp .. Middle-1: values < Thresh
122
+ Middle .. High: values = Thresh
123
+ High+1 .. Lp: values > Thresh */
124
+
125
+ for ( Middle = Fp ; CVal(Case[Middle], Att) < Thresh ; Middle++ )
126
+ ;
127
+
128
+ for ( High = Lp ; CVal(Case[High], Att) > Thresh ; High-- )
129
+ ;
130
+
131
+ for ( i = Middle ; i <= High ; )
132
+ {
133
+ if ( (Val = CVal(Case[i], Att)) < Thresh )
134
+ {
135
+ Swap(Middle, i);
136
+ Middle++;
137
+ i++;
138
+ }
139
+ else
140
+ if ( Val > Thresh )
141
+ {
142
+ Swap(High, i);
143
+ High--;
144
+ }
145
+ else
146
+ {
147
+ i++;
148
+ }
149
+ }
150
+
151
+ /* Sort the first and third groups */
152
+
153
+ Quicksort(Fp, Middle-1, Att);
154
+ Quicksort(High+1, Lp, Att);
155
+ }
156
+ }
data/ext/c5.0/subset.c ADDED
@@ -0,0 +1,599 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Evaluation of discrete attribute subsets */
30
+ /* ---------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+
40
+ /*************************************************************************/
41
+ /* */
42
+ /* Set up tables for subsets */
43
+ /* */
44
+ /*************************************************************************/
45
+
46
+
47
+ void InitialiseBellNumbers()
48
+ /* --------------------- */
49
+ {
50
+ DiscrValue n, k;
51
+
52
+ /* Table of Bell numbers (used for subset test penalties) */
53
+
54
+ Bell = AllocZero(MaxDiscrVal+1, double *);
55
+ ForEach(n, 1, MaxDiscrVal)
56
+ {
57
+ Bell[n] = AllocZero(n+1, double);
58
+ ForEach(k, 1, n)
59
+ {
60
+ Bell[n][k] = ( k == 1 || k == n ? 1 :
61
+ Bell[n-1][k-1] + k * Bell[n-1][k] );
62
+ }
63
+ }
64
+ }
65
+
66
+
67
+
68
+ /*************************************************************************/
69
+ /* */
70
+ /* Evaluate subsetting a discrete attribute and form the chosen */
71
+ /* subsets Subset[Att][], setting Subsets[Att] to the number of */
72
+ /* subsets, and the Info[] and Gain[] of a test on the attribute */
73
+ /* */
74
+ /*************************************************************************/
75
+
76
+
77
+ void EvalSubset(Attribute Att, CaseCount Cases)
78
+ /* ---------- */
79
+ {
80
+ DiscrValue V1, V2, V3, BestV1, BestV2, InitialBlocks, First=1, Prelim=0;
81
+ ClassNo c;
82
+ double BaseInfo, ThisGain, ThisInfo, Penalty, UnknownRate,
83
+ Val, BestVal, BestGain, BestInfo, PrevGain, PrevInfo;
84
+ int MissingValues=0;
85
+ CaseCount KnownCases;
86
+ Boolean Better;
87
+
88
+ /* First compute Freq[][], ValFreq[], base info, and the gain
89
+ and total info of a split on discrete attribute Att */
90
+
91
+ SetDiscrFreq(Att);
92
+
93
+ GEnv.ReasonableSubsets = 0;
94
+ ForEach(c, 1, MaxAttVal[Att])
95
+ {
96
+ if ( GEnv.ValFreq[c] >= MINITEMS ) GEnv.ReasonableSubsets++;
97
+ }
98
+
99
+ if ( ! GEnv.ReasonableSubsets )
100
+ {
101
+ Verbosity(2,
102
+ fprintf(Of, "\tAtt %s: poor initial split\n", AttName[Att]))
103
+
104
+ return;
105
+ }
106
+
107
+ KnownCases = Cases - GEnv.ValFreq[0];
108
+ UnknownRate = GEnv.ValFreq[0] / Cases;
109
+
110
+ BaseInfo = ( ! GEnv.ValFreq[0] ? GlobalBaseInfo :
111
+ DiscrKnownBaseInfo(KnownCases, MaxAttVal[Att]) );
112
+
113
+ PrevGain = ComputeGain(BaseInfo, UnknownRate, MaxAttVal[Att], KnownCases);
114
+ PrevInfo = TotalInfo(GEnv.ValFreq, 0, MaxAttVal[Att]) / Cases;
115
+ BestVal = PrevGain / PrevInfo;
116
+
117
+ Verbosity(2, fprintf(Of, "\tAtt %s", AttName[Att]))
118
+ Verbosity(3, PrintDistribution(Att, 0, MaxAttVal[Att], GEnv.Freq,
119
+ GEnv.ValFreq, true))
120
+ Verbosity(2,
121
+ fprintf(Of, "\tinitial inf %.3f, gain %.3f, val=%.3f\n",
122
+ PrevInfo, PrevGain, BestVal))
123
+
124
+ /* Eliminate unrepresented attribute values from Freq[] and ValFreq[]
125
+ and form a separate subset for each represented attribute value.
126
+ Unrepresented N/A values are ignored */
127
+
128
+ GEnv.Bytes = (MaxAttVal[Att]>>3) + 1;
129
+ ClearBits(GEnv.Bytes, Subset[Att][0]);
130
+
131
+ GEnv.Blocks = 0;
132
+ ForEach(V1, 1, MaxAttVal[Att])
133
+ {
134
+ if ( GEnv.ValFreq[V1] > Epsilon || V1 == 1 && SomeNA[Att] )
135
+ {
136
+ if ( ++GEnv.Blocks < V1 )
137
+ {
138
+ GEnv.ValFreq[GEnv.Blocks] = GEnv.ValFreq[V1];
139
+ ForEach(c, 1, MaxClass)
140
+ {
141
+ GEnv.Freq[GEnv.Blocks][c] = GEnv.Freq[V1][c];
142
+ }
143
+ }
144
+ ClearBits(GEnv.Bytes, GEnv.WSubset[GEnv.Blocks]);
145
+ SetBit(V1, GEnv.WSubset[GEnv.Blocks]);
146
+ CopyBits(GEnv.Bytes, GEnv.WSubset[GEnv.Blocks],
147
+ Subset[Att][GEnv.Blocks]);
148
+
149
+ /* Cannot merge N/A values with other blocks */
150
+
151
+ if ( V1 == 1 ) First = 2;
152
+ }
153
+ else
154
+ if ( V1 != 1 )
155
+ {
156
+ SetBit(V1, Subset[Att][0]);
157
+ MissingValues++;
158
+ }
159
+ }
160
+
161
+ /* Set n-way branch as initial test */
162
+
163
+ Gain[Att] = PrevGain;
164
+ Info[Att] = PrevInfo;
165
+ Subsets[Att] = InitialBlocks = GEnv.Blocks;
166
+
167
+ /* As a preliminary step, merge values with identical distributions */
168
+
169
+ ForEach(V1, First, GEnv.Blocks-1)
170
+ {
171
+ ForEach(V2, V1+1, GEnv.Blocks)
172
+ {
173
+ if ( SameDistribution(V1, V2) )
174
+ {
175
+ Prelim = V1;
176
+ AddBlock(V1, V2);
177
+ }
178
+ }
179
+
180
+ /* Eliminate any merged values */
181
+
182
+ if ( Prelim == V1 )
183
+ {
184
+ V3 = V1;
185
+
186
+ ForEach(V2, V1+1, GEnv.Blocks)
187
+ {
188
+ if ( GEnv.ValFreq[V2] && ++V3 != V2 )
189
+ {
190
+ MoveBlock(V3, V2);
191
+ }
192
+ }
193
+
194
+ GEnv.Blocks = V3;
195
+ }
196
+ }
197
+
198
+ if ( Prelim )
199
+ {
200
+ PrevInfo = TotalInfo(GEnv.ValFreq, 0, GEnv.Blocks) / Cases;
201
+
202
+ Penalty = ( finite(Bell[InitialBlocks][GEnv.Blocks]) ?
203
+ Log(Bell[InitialBlocks][GEnv.Blocks]) :
204
+ (InitialBlocks-GEnv.Blocks+1) * Log(GEnv.Blocks) );
205
+
206
+ Val = (PrevGain - Penalty / Cases) / PrevInfo;
207
+ Better = ( GEnv.Blocks >= 2 && GEnv.ReasonableSubsets >= 2 &&
208
+ Val >= BestVal );
209
+
210
+ Verbosity(2,
211
+ {
212
+ fprintf(Of, "\tprelim merges -> inf %.3f, gain %.3f, val %.3f%s%s",
213
+ PrevInfo, PrevGain, Val,
214
+ ( Better ? " **" : "" ),
215
+ (VERBOSITY > 2 ? "" : "\n" ));
216
+ Verbosity(3, PrintDistribution(Att, 0, GEnv.Blocks, GEnv.Freq,
217
+ GEnv.ValFreq, false))
218
+ })
219
+
220
+ if ( Better )
221
+ {
222
+ Subsets[Att] = GEnv.Blocks;
223
+
224
+ ForEach(V1, 1, GEnv.Blocks)
225
+ {
226
+ CopyBits(GEnv.Bytes, GEnv.WSubset[V1], Subset[Att][V1]);
227
+ }
228
+
229
+ Info[Att] = PrevInfo;
230
+ Gain[Att] = PrevGain - Penalty / KnownCases;
231
+ BestVal = Val;
232
+ }
233
+ }
234
+
235
+ /* Determine initial information and entropy values */
236
+
237
+ ForEach(V1, 1, GEnv.Blocks)
238
+ {
239
+ GEnv.SubsetInfo[V1] = -GEnv.ValFreq[V1] * Log(GEnv.ValFreq[V1] / Cases);
240
+ GEnv.SubsetEntr[V1] = TotalInfo(GEnv.Freq[V1], 1, MaxClass);
241
+ }
242
+
243
+ ForEach(V1, First, GEnv.Blocks-1)
244
+ {
245
+ ForEach(V2, V1+1, GEnv.Blocks)
246
+ {
247
+ EvaluatePair(V1, V2, Cases);
248
+ }
249
+ }
250
+
251
+ /* Examine possible pair mergers and hill-climb */
252
+
253
+ while ( GEnv.Blocks > 2 )
254
+ {
255
+ BestV1 = 0;
256
+ BestGain = -Epsilon;
257
+
258
+ /* For each possible pair of values, calculate the gain and
259
+ total info of a split in which they are treated as one.
260
+ Keep track of the pair with the best gain. */
261
+
262
+ ForEach(V1, First, GEnv.Blocks-1)
263
+ {
264
+ ForEach(V2, V1+1, GEnv.Blocks)
265
+ {
266
+ if ( GEnv.ReasonableSubsets == 2 &&
267
+ GEnv.ValFreq[V1] >= MINITEMS-Epsilon &&
268
+ GEnv.ValFreq[V2] >= MINITEMS-Epsilon )
269
+ {
270
+ continue;
271
+ }
272
+
273
+ ThisGain = PrevGain -
274
+ ((1-UnknownRate) / KnownCases) *
275
+ (GEnv.MergeEntr[V1][V2] -
276
+ (GEnv.SubsetEntr[V1] + GEnv.SubsetEntr[V2]));
277
+ ThisInfo = PrevInfo + (GEnv.MergeInfo[V1][V2] -
278
+ (GEnv.SubsetInfo[V1] + GEnv.SubsetInfo[V2])) / Cases;
279
+ Verbosity(3,
280
+ fprintf(Of, "\t combine %d %d info %.3f gain %.3f\n",
281
+ V1, V2, ThisInfo, ThisGain))
282
+
283
+ /* See whether this merge has the best gain so far */
284
+
285
+ if ( ThisGain > BestGain+Epsilon )
286
+ {
287
+ BestGain = ThisGain;
288
+ BestInfo = ThisInfo;
289
+ BestV1 = V1;
290
+ BestV2 = V2;
291
+ }
292
+ }
293
+ }
294
+
295
+ if ( ! BestV1 ) break;
296
+
297
+ PrevGain = BestGain;
298
+ PrevInfo = BestInfo;
299
+
300
+ /* Determine penalty as log of Bell number. If number is too
301
+ large, use an approximation of log */
302
+
303
+ Penalty = ( finite(Bell[InitialBlocks][GEnv.Blocks-1]) ?
304
+ Log(Bell[InitialBlocks][GEnv.Blocks-1]) :
305
+ (InitialBlocks-GEnv.Blocks+1) * Log(GEnv.Blocks-1) );
306
+
307
+ Val = (BestGain - Penalty / Cases) / BestInfo;
308
+
309
+ Merge(BestV1, BestV2, Cases);
310
+
311
+ Verbosity(2,
312
+ fprintf(Of, "\tform subset ");
313
+ PrintSubset(Att, GEnv.WSubset[BestV1]);
314
+ fprintf(Of, ": %d subsets, inf %.3f, gain %.3f, val %.3f%s\n",
315
+ GEnv.Blocks, BestInfo, BestGain, Val,
316
+ ( Val > BestVal ? " **" : "" ));
317
+ Verbosity(3,
318
+ PrintDistribution(Att, 0, GEnv.Blocks, GEnv.Freq, GEnv.ValFreq,
319
+ false))
320
+ )
321
+
322
+ if ( Val >= BestVal )
323
+ {
324
+ Subsets[Att] = GEnv.Blocks;
325
+
326
+ ForEach(V1, 1, GEnv.Blocks)
327
+ {
328
+ CopyBits(GEnv.Bytes, GEnv.WSubset[V1], Subset[Att][V1]);
329
+ }
330
+
331
+ Info[Att] = BestInfo;
332
+ Gain[Att] = BestGain - Penalty / KnownCases;
333
+ BestVal = Val;
334
+ }
335
+ }
336
+
337
+ /* Add missing values as another branch */
338
+
339
+ if ( MissingValues )
340
+ {
341
+ Subsets[Att]++;
342
+ CopyBits(GEnv.Bytes, Subset[Att][0], Subset[Att][Subsets[Att]]);
343
+ }
344
+
345
+ Verbosity(2,
346
+ fprintf(Of, "\tfinal inf %.3f, gain %.3f, val=%.3f\n",
347
+ Info[Att], Gain[Att], Gain[Att] / (Info[Att] + 1E-3)))
348
+ }
349
+
350
+
351
+
352
+ /*************************************************************************/
353
+ /* */
354
+ /* Combine the distribution figures of subsets x and y. */
355
+ /* Update Freq, ValFreq, SubsetInfo, SubsetEntr, MergeInfo, and */
356
+ /* MergeEntr. */
357
+ /* */
358
+ /*************************************************************************/
359
+
360
+
361
+ void Merge(DiscrValue x, DiscrValue y, CaseCount Cases)
362
+ /* ----- */
363
+ {
364
+ ClassNo c;
365
+ double Entr=0;
366
+ CaseCount KnownCases=0;
367
+ int R, C;
368
+
369
+ AddBlock(x, y);
370
+
371
+ ForEach(c, 1, MaxClass)
372
+ {
373
+ Entr -= GEnv.Freq[x][c] * Log(GEnv.Freq[x][c]);
374
+ KnownCases += GEnv.Freq[x][c];
375
+ }
376
+
377
+ GEnv.SubsetInfo[x] = - GEnv.ValFreq[x] * Log(GEnv.ValFreq[x] / Cases);
378
+ GEnv.SubsetEntr[x] = Entr + KnownCases * Log(KnownCases);
379
+
380
+ /* Eliminate y from working blocks */
381
+
382
+ ForEach(R, y, GEnv.Blocks-1)
383
+ {
384
+ MoveBlock(R, R+1);
385
+
386
+ GEnv.SubsetInfo[R] = GEnv.SubsetInfo[R+1];
387
+ GEnv.SubsetEntr[R] = GEnv.SubsetEntr[R+1];
388
+
389
+ ForEach(C, 1, GEnv.Blocks)
390
+ {
391
+ GEnv.MergeInfo[R][C] = GEnv.MergeInfo[R+1][C];
392
+ GEnv.MergeEntr[R][C] = GEnv.MergeEntr[R+1][C];
393
+ }
394
+ }
395
+
396
+ ForEach(C, y, GEnv.Blocks-1)
397
+ {
398
+ ForEach(R, 1, GEnv.Blocks-1)
399
+ {
400
+ GEnv.MergeInfo[R][C] = GEnv.MergeInfo[R][C+1];
401
+ GEnv.MergeEntr[R][C] = GEnv.MergeEntr[R][C+1];
402
+ }
403
+ }
404
+ GEnv.Blocks--;
405
+
406
+ /* Update information for newly-merged block */
407
+
408
+ ForEach(C, 1, GEnv.Blocks)
409
+ {
410
+ if ( C != x ) EvaluatePair(x, C, Cases);
411
+ }
412
+ }
413
+
414
+
415
+
416
+ /*************************************************************************/
417
+ /* */
418
+ /* Calculate the effect of merging subsets x and y */
419
+ /* */
420
+ /*************************************************************************/
421
+
422
+
423
+ void EvaluatePair(DiscrValue x, DiscrValue y, CaseCount Cases)
424
+ /* ------------ */
425
+ {
426
+ ClassNo c;
427
+ double Entr=0;
428
+ CaseCount KnownCases=0, F;
429
+
430
+ if ( y < x )
431
+ {
432
+ c = y;
433
+ y = x;
434
+ x = c;
435
+ }
436
+
437
+ F = GEnv.ValFreq[x] + GEnv.ValFreq[y];
438
+ GEnv.MergeInfo[x][y] = - F * Log(F / Cases);
439
+
440
+ ForEach(c, 1, MaxClass)
441
+ {
442
+ F = GEnv.Freq[x][c] + GEnv.Freq[y][c];
443
+ Entr -= F * Log(F);
444
+ KnownCases += F;
445
+ }
446
+ GEnv.MergeEntr[x][y] = Entr + KnownCases * Log(KnownCases);
447
+ }
448
+
449
+
450
+
451
+ /*************************************************************************/
452
+ /* */
453
+ /* Check whether two values have same class distribution */
454
+ /* */
455
+ /*************************************************************************/
456
+
457
+
458
+ Boolean SameDistribution(DiscrValue V1, DiscrValue V2)
459
+ /* ---------------- */
460
+ {
461
+ ClassNo c;
462
+ CaseCount D1, D2;
463
+
464
+ D1 = GEnv.ValFreq[V1];
465
+ D2 = GEnv.ValFreq[V2];
466
+
467
+ ForEach(c, 1, MaxClass)
468
+ {
469
+ if ( fabs(GEnv.Freq[V1][c] / D1 - GEnv.Freq[V2][c] / D2) > 0.001 )
470
+ {
471
+ return false;
472
+ }
473
+ }
474
+
475
+ return true;
476
+ }
477
+
478
+
479
+
480
+ /*************************************************************************/
481
+ /* */
482
+ /* Add frequency and subset information from block V2 to V1 */
483
+ /* */
484
+ /*************************************************************************/
485
+
486
+
487
+ void AddBlock(DiscrValue V1, DiscrValue V2)
488
+ /* -------- */
489
+ {
490
+ ClassNo c;
491
+ int b;
492
+
493
+ if ( GEnv.ValFreq[V1] >= MINITEMS-Epsilon &&
494
+ GEnv.ValFreq[V2] >= MINITEMS-Epsilon )
495
+ {
496
+ GEnv.ReasonableSubsets--;
497
+ }
498
+ else
499
+ if ( GEnv.ValFreq[V1] < MINITEMS-Epsilon &&
500
+ GEnv.ValFreq[V2] < MINITEMS-Epsilon &&
501
+ GEnv.ValFreq[V1] + GEnv.ValFreq[V2] >= MINITEMS-Epsilon )
502
+ {
503
+ GEnv.ReasonableSubsets++;
504
+ }
505
+
506
+ ForEach(c, 1, MaxClass)
507
+ {
508
+ GEnv.Freq[V1][c] += GEnv.Freq[V2][c];
509
+ }
510
+ GEnv.ValFreq[V1] += GEnv.ValFreq[V2];
511
+ GEnv.ValFreq[V2] = 0;
512
+ ForEach(b, 0, GEnv.Bytes-1)
513
+ {
514
+ GEnv.WSubset[V1][b] |= GEnv.WSubset[V2][b];
515
+ }
516
+ }
517
+
518
+
519
+
520
+ /*************************************************************************/
521
+ /* */
522
+ /* Move frequency and subset information from block V2 to V1 */
523
+ /* */
524
+ /*************************************************************************/
525
+
526
+
527
+ void MoveBlock(DiscrValue V1, DiscrValue V2)
528
+ /* --------- */
529
+ {
530
+ ClassNo c;
531
+
532
+ ForEach(c, 1, MaxClass)
533
+ {
534
+ GEnv.Freq[V1][c] = GEnv.Freq[V2][c];
535
+ }
536
+ GEnv.ValFreq[V1] = GEnv.ValFreq[V2];
537
+ CopyBits(GEnv.Bytes, GEnv.WSubset[V2], GEnv.WSubset[V1]);
538
+ }
539
+
540
+
541
+
542
+ /*************************************************************************/
543
+ /* */
544
+ /* Print the values of attribute Att which are in the subset Ss */
545
+ /* */
546
+ /*************************************************************************/
547
+
548
+
549
+ void PrintSubset(Attribute Att, Set Ss)
550
+ /* ----------- */
551
+ {
552
+ DiscrValue V1;
553
+ Boolean First=true;
554
+
555
+ ForEach(V1, 1, MaxAttVal[Att])
556
+ {
557
+ if ( In(V1, Ss) )
558
+ {
559
+ if ( First )
560
+ {
561
+ First = false;
562
+ }
563
+ else
564
+ {
565
+ fprintf(Of, ", ");
566
+ }
567
+
568
+ fprintf(Of, "%s", AttValName[Att][V1]);
569
+ }
570
+ }
571
+ }
572
+
573
+
574
+
575
+ /*************************************************************************/
576
+ /* */
577
+ /* Construct and return a node for a test on a subset of values */
578
+ /* */
579
+ /*************************************************************************/
580
+
581
+
582
+ void SubsetTest(Tree Node, Attribute Att)
583
+ /* ----------- */
584
+ {
585
+ int S, Bytes;
586
+
587
+ Sprout(Node, Subsets[Att]);
588
+
589
+ Node->NodeType = BrSubset;
590
+ Node->Tested = Att;
591
+
592
+ Bytes = (MaxAttVal[Att]>>3) + 1;
593
+ Node->Subset = AllocZero(Subsets[Att]+1, Set);
594
+ ForEach(S, 1, Node->Forks)
595
+ {
596
+ Node->Subset[S] = Alloc(Bytes, Byte);
597
+ CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
598
+ }
599
+ }