see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,342 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Routines to manage clusters */
30
+ /* --------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ /*************************************************************************/
40
+ /* */
41
+ /* Register a new cluster */
42
+ /* */
43
+ /*************************************************************************/
44
+
45
+
46
+ Clust NewClust(ContValue Expect, ContValue SD, ContValue Limit,
47
+ CaseCount Anoms, CaseCount GpSize)
48
+ /* -------- */
49
+ {
50
+ Clust C;
51
+
52
+ /* Make sure we have room for another */
53
+
54
+ if ( NClust >= ClustSpace )
55
+ {
56
+ Realloc(Cluster, (ClustSpace += 1000), Clust);
57
+ }
58
+
59
+ C = Cluster[NClust++] = Alloc(1, ClustRec);
60
+
61
+ /* Save cluster information */
62
+
63
+ C->Att = ClassAtt;
64
+ C->Expect = Expect;
65
+ C->SD = SD;
66
+ C->Limit = Limit;
67
+ C->GpSize = GpSize;
68
+ C->Frac = 1 - Anoms / (float) GpSize;
69
+
70
+ SaveClustConds(C);
71
+
72
+ return C;
73
+ }
74
+
75
+
76
+
77
+ /*************************************************************************/
78
+ /* */
79
+ /* Process current tests to determine cluster conditions. */
80
+ /* Different functions are called depending on the condition */
81
+ /* type. */
82
+ /* */
83
+ /*************************************************************************/
84
+
85
+
86
+ void SaveClustConds(Clust C)
87
+ /* -------------- */
88
+ {
89
+ Attribute Att, NAtts=0;
90
+ int NC=0;
91
+
92
+ /* Count attributes tested */
93
+
94
+ ForEach(Att, 1, MaxAtt)
95
+ {
96
+ if ( GEnv.Tested[Att] ) NAtts++;
97
+ }
98
+
99
+ C->NCond = NAtts;
100
+ C->Cond = Alloc(C->NCond, ClustCond);
101
+
102
+ /* Format tests on each attribute */
103
+
104
+ ForEach(Att, 1, MaxAtt)
105
+ {
106
+ if ( GEnv.Tested[Att] )
107
+ {
108
+ if ( Continuous(Att) )
109
+ {
110
+ FormatContinCond(Att, &C->Cond[NC]);
111
+ }
112
+ else
113
+ if ( Ordered(Att) )
114
+ {
115
+ FormatOrderedCond(Att, &C->Cond[NC]);
116
+ }
117
+ else
118
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
119
+ {
120
+ FormatSubsetCond(Att, &C->Cond[NC]);
121
+ }
122
+ else
123
+ {
124
+ FormatValCond(Att, &C->Cond[NC]);
125
+ }
126
+
127
+ NC++;
128
+ }
129
+ }
130
+ }
131
+
132
+
133
+
134
+ /*************************************************************************/
135
+ /* */
136
+ /* Test on a continuous att. Check all threshold tests */
137
+ /* and assemble lowest and highest possible value */
138
+ /* */
139
+ /*************************************************************************/
140
+
141
+
142
+ void FormatContinCond(Attribute Att, ClustCond *CC)
143
+ /* ---------------- */
144
+ {
145
+ ContValue Lo=-MARKER, Hi=MARKER;
146
+ int i, Type=0;
147
+
148
+ ForEach(i, 0, GEnv.Level)
149
+ {
150
+ if ( GEnv.Test[i].Att == Att )
151
+ {
152
+ if ( GEnv.Test[i].Br == 1 )
153
+ {
154
+ Type = CONT_NA;
155
+ Lo = 1;
156
+ Hi = 0;
157
+ break;
158
+ }
159
+ else
160
+ if ( GEnv.Test[i].Br == 2 )
161
+ {
162
+ Type |= CONT_LT;
163
+ Hi = GEnv.Test[i].Cut;
164
+ }
165
+ else
166
+ {
167
+ Type |= CONT_GT;
168
+ Lo = GEnv.Test[i].Cut;
169
+ }
170
+ }
171
+ }
172
+
173
+ CC->Type = Type;
174
+ CC->Att = Att;
175
+ CC->Low = Lo;
176
+ CC->High = Hi;
177
+ CC->Values = Nil;
178
+ }
179
+
180
+
181
+
182
+ /*************************************************************************/
183
+ /* */
184
+ /* Test on an ordered discrete attribute (similar to above) */
185
+ /* */
186
+ /*************************************************************************/
187
+
188
+
189
+ void FormatOrderedCond(Attribute Att, ClustCond *CC)
190
+ /* ----------------- */
191
+ {
192
+ DiscrValue Lo, Hi;
193
+ int i, Type=0;
194
+
195
+ Lo = 2;
196
+ Hi = MaxAttVal[Att];
197
+
198
+ ForEach(i, 0, GEnv.Level)
199
+ {
200
+ if ( GEnv.Test[i].Att == Att )
201
+ {
202
+ if ( GEnv.Test[i].Br == 1 )
203
+ {
204
+ Type = DISCR_VAL;
205
+ Lo = Hi = 1;
206
+ break;
207
+ }
208
+ else
209
+ if ( GEnv.Test[i].Br == 2 )
210
+ {
211
+ Type |= DISCR_LT;
212
+ Hi = GEnv.Test[i].Cut;
213
+ }
214
+ else
215
+ {
216
+ Type |= DISCR_GT;
217
+ Lo = GEnv.Test[i].Cut + 1;
218
+ }
219
+ }
220
+ }
221
+
222
+ CC->Type = Type;
223
+ CC->Att = Att;
224
+ CC->Low = Lo;
225
+ CC->High = Hi;
226
+ CC->Values = Nil;
227
+ }
228
+
229
+
230
+
231
+ /*************************************************************************/
232
+ /* */
233
+ /* Subset test for a discrete attribute. All tests must be */
234
+ /* checked to determine the final subset values */
235
+ /* */
236
+ /*************************************************************************/
237
+
238
+
239
+ void FormatSubsetCond(Attribute Att, ClustCond *CC)
240
+ /* ---------------- */
241
+ {
242
+ DiscrValue v;
243
+ int i;
244
+
245
+ CC->Att = Att;
246
+
247
+ GEnv.Possible[1] = false;
248
+ ForEach(v, 2, MaxAttVal[Att])
249
+ {
250
+ GEnv.Possible[v] = true;
251
+ }
252
+
253
+ ForEach(i, 0, GEnv.Level)
254
+ {
255
+ if ( GEnv.Test[i].Att == Att )
256
+ {
257
+ if ( GEnv.Test[i].Br == 1 )
258
+ {
259
+ GEnv.Possible[1] = true;
260
+ ForEach(v, 2, MaxAttVal[Att])
261
+ {
262
+ GEnv.Possible[v] = false;
263
+ }
264
+ break;
265
+ }
266
+ else
267
+ ForEach(v, 2, MaxAttVal[Att])
268
+ {
269
+ if ( In(v, GEnv.Test[i].Left) )
270
+ {
271
+ GEnv.Possible[v] = GEnv.Possible[v] && ( GEnv.Test[i].Br == 2 );
272
+ }
273
+ else
274
+ {
275
+ GEnv.Possible[v] = GEnv.Possible[v] && ( GEnv.Test[i].Br == 3 );
276
+ }
277
+ }
278
+ }
279
+ }
280
+
281
+ CC->Type = DISCR_SET;
282
+ CC->Low = CC->High = 0;
283
+ CC->Values = AllocZero((MaxAttVal[Att]>>3)+1, unsigned char);
284
+
285
+ ForEach(v, 1, MaxAttVal[Att])
286
+ {
287
+ if ( GEnv.Possible[v] ) SetBit(v, CC->Values);
288
+ }
289
+ }
290
+
291
+
292
+
293
+ /*************************************************************************/
294
+ /* */
295
+ /* Simple test on attribute value. There is no need to check */
296
+ /* more than one test since the first determines the tested value */
297
+ /* */
298
+ /*************************************************************************/
299
+
300
+
301
+ void FormatValCond(Attribute Att, ClustCond *CC)
302
+ /* ------------- */
303
+ {
304
+ int i;
305
+
306
+ ForEach(i, 0, GEnv.Level)
307
+ {
308
+ if ( GEnv.Test[i].Att == Att )
309
+ {
310
+ CC->Type = DISCR_VAL;
311
+ CC->Att = Att;
312
+ CC->Low = CC->High = GEnv.Test[i].Br;
313
+ CC->Values = Nil;
314
+ return;
315
+ }
316
+ }
317
+ }
318
+
319
+
320
+
321
+ /*************************************************************************/
322
+ /* */
323
+ /* Free conditions stored in a cluster */
324
+ /* */
325
+ /*************************************************************************/
326
+
327
+
328
+ void FreeClust(Clust C)
329
+ /* --------- */
330
+ {
331
+ int d;
332
+
333
+ if ( C )
334
+ {
335
+ ForEach(d, 0, C->NCond-1)
336
+ {
337
+ FreeUnlessNil(C->Cond[d].Values);
338
+ }
339
+ FreeUnlessNil(C->Cond);
340
+ Free(C);
341
+ }
342
+ }
@@ -0,0 +1,1269 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ #ifndef INSPECT
28
+ /*************************************************************************/
29
+ /* */
30
+ /* Divide-and-Conquer generic routines */
31
+ /* ----------------------------------- */
32
+ /* */
33
+ /*************************************************************************/
34
+
35
+
36
+ #include "defns.i"
37
+ #include "extern.i"
38
+
39
+
40
+ /*************************************************************************/
41
+ /* */
42
+ /* Allocate space for all tables */
43
+ /* */
44
+ /*************************************************************************/
45
+
46
+
47
+ void InitialiseDAC()
48
+ /* ------------- */
49
+ {
50
+ DiscrValue v;
51
+ Attribute Att;
52
+ CaseNo i, MaxSampleSize;
53
+ extern SortPair *Pair;
54
+
55
+ UseLogs = AllocZero(MaxAtt+1, Boolean);
56
+ SomeMiss = AllocZero(MaxAtt+1, Boolean);
57
+ SomeNA = AllocZero(MaxAtt+1, Boolean);
58
+ LowTail = AllocZero(MaxAtt+1, ContValue);
59
+ HighTail = AllocZero(MaxAtt+1, ContValue);
60
+
61
+ LogCaseNo = Alloc(MaxCase+2, double);
62
+ LogCaseNo[0] = LogCaseNo[1] = 0;
63
+ ForEach(i, 2, MaxCase+1)
64
+ {
65
+ LogCaseNo[i] = log((double) i) / Log2;
66
+ }
67
+
68
+ /* Save random numbers for sampling */
69
+
70
+ MaxSampleSize = SAMPLEUNIT * Max(MaxDiscrVal, 5);
71
+ if ( MaxSampleSize > (MaxCase+1) / 2 + 1 )
72
+ {
73
+ MaxSampleSize = (MaxCase+1) / 2 + 1;
74
+ }
75
+
76
+ Rand = Alloc(MaxSampleSize, double);
77
+ ResetKR(1230);
78
+ ForEach(i, 0, MaxSampleSize-1)
79
+ {
80
+ Rand[i] = KRandom();
81
+ }
82
+
83
+ /* Compute prior probabilities for discrete values */
84
+
85
+ Prior = Alloc(MaxAtt+1, double *);
86
+ ForEach(Att, 1, MaxAtt)
87
+ {
88
+ if ( Discrete(Att) )
89
+ {
90
+ Prior[Att] = AllocZero(MaxAttVal[Att]+1, double);
91
+
92
+ ForEach(i, 0, MaxCase)
93
+ {
94
+ Prior[Att][XDVal(Case[i], Att)]++;
95
+ }
96
+
97
+ SomeMiss[Att] = ( Prior[Att][0] > 0 );
98
+ SomeNA[Att] = ( Prior[Att][1] > 0 );
99
+
100
+ ForEach(v, 0, MaxAttVal[Att])
101
+ {
102
+ Prior[Att][v] /= (double) (MaxCase+1);
103
+ }
104
+ }
105
+ }
106
+
107
+ /* Determine precision for continuous attributes */
108
+
109
+ Prec = AllocZero(MaxAtt+1, unsigned char);
110
+ ForEach(Att, 1, MaxAtt)
111
+ {
112
+ if ( ! Exclude(Att) && Continuous(Att) )
113
+ {
114
+ Prec[Att] = log(FracBase(Att)) / log(10.0) + 0.5;
115
+ }
116
+ }
117
+
118
+ Pair = Alloc(MaxCase+1, SortPair);
119
+
120
+ InitialiseEnvData();
121
+ }
122
+
123
+
124
+
125
+ void FreeDAC()
126
+ /* ------- */
127
+ {
128
+ extern SortPair *Pair;
129
+
130
+ FreeEnvData();
131
+
132
+ FreeUnlessNil(UseLogs); UseLogs = Nil;
133
+ FreeUnlessNil(SomeMiss); SomeMiss = Nil;
134
+ FreeUnlessNil(SomeNA); SomeNA = Nil;
135
+ FreeUnlessNil(LowTail); LowTail = Nil;
136
+ FreeUnlessNil(HighTail); HighTail = Nil;
137
+ FreeUnlessNil(LogCaseNo); LogCaseNo = Nil;
138
+ FreeUnlessNil(Rand); Rand = Nil;
139
+ FreeVector((void **) Prior, 1, MaxAtt); Prior = Nil;
140
+ FreeUnlessNil(Prec); Prec = Nil;
141
+
142
+ FreeUnlessNil(Pair); Pair = Nil;
143
+ }
144
+
145
+
146
+
147
+ /*************************************************************************/
148
+ /* */
149
+ /* Split cases Fp through Lp */
150
+ /* CondAtts is the current number of conditioning attributes */
151
+ /* */
152
+ /*************************************************************************/
153
+
154
+
155
+ void Split(CaseNo Fp, CaseNo Lp, int CondAtts, Tree Parent, DiscrValue Br,
156
+ Tree *Result)
157
+ /* ----- */
158
+ {
159
+ CaseNo i;
160
+ CaseCount Cases;
161
+ DiscrValue v;
162
+ double Val, Sum=0, SumSq=0;
163
+ Attribute Att, BestAtt;
164
+ Tree Node;
165
+
166
+
167
+ *Result = Nil;
168
+
169
+ /* Recover info about tests to this point */
170
+
171
+ ForEach(Att, 1, MaxAtt)
172
+ {
173
+ GEnv.Tested[Att] = 0;
174
+ }
175
+
176
+ RecoverContext(Parent, Br);
177
+
178
+ Cases = No(Fp, Lp);
179
+ Verbosity(1,
180
+ fprintf(Of, "\n<%d> %d cases %d-%d\n", GEnv.Level, Cases, Fp, Lp);
181
+ ShowContext(Fp))
182
+
183
+ GEnv.FRAC = 1;
184
+
185
+ /* Determine PSD and base information. This is only approximate, since
186
+ missing values of the tested attributes are excluded. */
187
+
188
+ if ( Continuous(ClassAtt) )
189
+ {
190
+ if ( Cases < 2 * CMINITEMS )
191
+ {
192
+ Progress(Cases);
193
+ return;
194
+ }
195
+
196
+ ForEach(i, Fp, Lp)
197
+ {
198
+ Val = CClass(Case[i]);
199
+ Sum += Val;
200
+ SumSq += Val * Val;
201
+ }
202
+ GEnv.PSD = SDEstimate(Cases, Sum, SumSq);
203
+ }
204
+ else
205
+ {
206
+ if ( Cases < 2 * DMINITEMS )
207
+ {
208
+ Progress(Cases);
209
+ return;
210
+ }
211
+
212
+ /* Check for pure leaf */
213
+
214
+ FindClassFrequencies(Fp, Lp);
215
+
216
+ ForEach(v, 1, MaxAttVal[ClassAtt])
217
+ {
218
+ if ( GEnv.ClassFreq[v] == Cases )
219
+ {
220
+ Verbosity(1, fprintf(Of, "\tpure subset\n"))
221
+ Progress(Cases);
222
+ return;
223
+ }
224
+ }
225
+
226
+ GEnv.BaseInfo =
227
+ TotalInfo(GEnv.ClassFreq, 1, MaxAttVal[ClassAtt]) / Cases;
228
+ }
229
+
230
+ *Result = Node = Leaf(Parent, Br);
231
+
232
+ /* Find the best attribute split, using sampling if the number
233
+ of cases is at least the minimum multiple of the sample size.
234
+ Start by collecting info on discrete attributes */
235
+
236
+ DiscreteAttInfo(Fp, Lp, CondAtts);
237
+
238
+ if ( Cases > SAMPLEFACTOR * SampleSize )
239
+ {
240
+ ChooseSplitWithSampling(Fp, Lp, CondAtts);
241
+ }
242
+ else
243
+ {
244
+ ChooseSplit(Fp, Lp, CondAtts);
245
+ }
246
+
247
+ /* Save any sift entry */
248
+
249
+ if ( SIFT && GEnv.SiftEntry && GEnv.SiftSize )
250
+ {
251
+ Node->SiftEntry = strdup(GEnv.SiftEntry);
252
+ GEnv.SiftSize = 0;
253
+ }
254
+
255
+ FindBestAtt(&BestAtt, &Val);
256
+
257
+ /* Decide whether to branch or not */
258
+
259
+ if ( BestAtt == None )
260
+ {
261
+ Verbosity(1, fprintf(Of, "\tno sensible splits\n"))
262
+
263
+ Progress(Cases);
264
+ }
265
+ else
266
+ {
267
+ Verbosity(1,
268
+ fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
269
+ if ( Continuous(BestAtt) )
270
+ {
271
+ fprintf(Of, " cut %.3f", GEnv.Bar[BestAtt]);
272
+ }
273
+ if ( ! Continuous(ClassAtt) )
274
+ {
275
+ fprintf(Of, " val %.3f inf %.3f",
276
+ SplitVal(GEnv.Gain[BestAtt], GEnv.Info[BestAtt]),
277
+ GEnv.Info[BestAtt]);
278
+ }
279
+ fprintf(Of, " gain %.3f\n", GEnv.Gain[BestAtt]);)
280
+
281
+ /* Carry out the recursive divide-and-conquer */
282
+
283
+ Node->Tested = BestAtt;
284
+
285
+ if ( Continuous(BestAtt) || Ordered(BestAtt) )
286
+ {
287
+ Node->NodeType = BrThresh;
288
+ Node->Forks = 3;
289
+ Node->Cut = GEnv.Bar[BestAtt];
290
+ }
291
+ else
292
+ if ( Continuous(ClassAtt) && MaxAttVal[BestAtt] > 3 )
293
+ {
294
+ Node->NodeType = BrSubset;
295
+ Node->Forks = 3;
296
+ Node->Left = Alloc((MaxAttVal[BestAtt]>>3)+1, unsigned char);
297
+ memcpy(Node->Left, GEnv.Subset[BestAtt], (MaxAttVal[BestAtt]>>3)+1);
298
+ }
299
+ else
300
+ {
301
+ Node->NodeType = BrDiscr;
302
+ Node->Forks = MaxAttVal[BestAtt];
303
+ }
304
+
305
+ Node->Branch = Alloc(Node->Forks+1, Tree);
306
+
307
+ if ( ! GEnv.Tested[BestAtt] ) CondAtts++;
308
+
309
+ Divide(Node, Fp, Lp, CondAtts);
310
+ }
311
+ }
312
+
313
+
314
+
315
+ /*************************************************************************/
316
+ /* */
317
+ /* Recover information on level and tests from tree and parent */
318
+ /* */
319
+ /*************************************************************************/
320
+
321
+
322
+ void RecoverContext(Tree T, DiscrValue Br)
323
+ /* -------------- */
324
+ {
325
+ if ( T )
326
+ {
327
+ RecoverContext(T->Parent, T->Br);
328
+
329
+ NoteTest(T->Tested, Br, T->Cut, T->Left);
330
+ GEnv.Tested[T->Tested]++;
331
+ GEnv.Level++;
332
+ }
333
+ else
334
+ {
335
+ GEnv.Level = 0;
336
+ }
337
+ }
338
+
339
+
340
+
341
+ /*************************************************************************/
342
+ /* */
343
+ /* Analyse all discrete attributes in one pass */
344
+ /* */
345
+ /*************************************************************************/
346
+
347
+
348
+ void DiscreteAttInfo(CaseNo Fp, CaseNo Lp, int CondAtts)
349
+ /* --------------- */
350
+ {
351
+ CaseNo i;
352
+ DiscrValue v, c;
353
+ Attribute Att;
354
+ double Val;
355
+ int NDList=0, dl;
356
+
357
+ /* Initialise counts etc and prepare list of attributes */
358
+
359
+ ForEach(Att, 1, MaxAtt)
360
+ {
361
+ if ( ! Discrete(Att) || Exclude(Att) || Att == ClassAtt ||
362
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
363
+ {
364
+ continue;
365
+ }
366
+
367
+ GEnv.DList[NDList++] = Att;
368
+
369
+ if ( Continuous(ClassAtt) )
370
+ {
371
+ ForEach(v, 0, MaxAttVal[Att])
372
+ {
373
+ GEnv.DValSum[Att][v] = GEnv.DValSumSq[Att][v] = 0;
374
+ GEnv.DFreq[Att][v][0] = 0; /* value frequency */
375
+ }
376
+ }
377
+ else
378
+ {
379
+ ForEach(v, 0, MaxAttVal[Att])
380
+ {
381
+ ForEach(c, 1, MaxAttVal[ClassAtt])
382
+ {
383
+ GEnv.DFreq[Att][v][c] = 0;
384
+ }
385
+ }
386
+ }
387
+ }
388
+
389
+ if ( ! NDList-- ) return;
390
+
391
+ /* Examine cases and update all counts etc */
392
+
393
+ ForEach(i, Fp, Lp)
394
+ {
395
+ ForEach(dl, 0, NDList)
396
+ {
397
+ Att = GEnv.DList[dl];
398
+
399
+ v = XDVal(Case[i], Att);
400
+
401
+ if ( Continuous(ClassAtt) )
402
+ {
403
+ Val = CClass(Case[i]);
404
+
405
+ GEnv.DFreq[Att][v][0]++;
406
+ GEnv.DValSum[Att][v] += Val;
407
+ GEnv.DValSumSq[Att][v] += Val * Val;
408
+ }
409
+ else
410
+ {
411
+ GEnv.DFreq[Att][v][ DClass(Case[i]) ]++;
412
+ }
413
+ }
414
+ }
415
+ }
416
+
417
+
418
+
419
+ /*************************************************************************/
420
+ /* */
421
+ /* Choose split using a sample. There are three phases: */
422
+ /* - process discrete atts using all data */
423
+ /* - for continuous atts, find gain etc from two samples and */
424
+ /* record the better value */
425
+ /* - re-examine high-value continuous attributes using all cases */
426
+ /* */
427
+ /*************************************************************************/
428
+
429
+
430
+ void ChooseSplitWithSampling(CaseNo Fp, CaseNo Lp, int CondAtts)
431
+ /* ----------------------- */
432
+ {
433
+ double Val, OldBestVal;
434
+ Attribute Att, BestAtt;
435
+
436
+ /* Process discrete attributes using all data */
437
+
438
+ ForEach(Att, 1, MaxAtt)
439
+ {
440
+ GEnv.Gain[Att] = None;
441
+
442
+ if ( Exclude(Att) || Att == ClassAtt || Continuous(Att) ||
443
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
444
+ {
445
+ continue;
446
+ }
447
+
448
+ CheckSplit(Att, Fp, Lp);
449
+ }
450
+
451
+ /* Process continuous attributes using two samples */
452
+
453
+ GEnv.FRAC = SampleSize / (double) No(Fp, Lp);
454
+
455
+ SampleScan(Fp, Lp, CondAtts, false);
456
+ SampleScan(Fp+SampleSize, Lp, CondAtts, true);
457
+
458
+ GEnv.FRAC = 1;
459
+
460
+ /* Re-examine continuous attributes that are possible best splits
461
+ (with value at least 70% of current best value) */
462
+
463
+ FindBestAtt(&BestAtt, &OldBestVal);
464
+
465
+ if ( BestAtt != None )
466
+ {
467
+ Verbosity(2,
468
+ fprintf(Of, " Revisit threshold %.3f (%s)\n",
469
+ 0.7 * OldBestVal, AttName[BestAtt]))
470
+
471
+ ForEach(Att, 1, MaxAtt)
472
+ {
473
+ if ( Discrete(Att) || GEnv.Gain[Att] <= Epsilon ) continue;
474
+
475
+ Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);
476
+
477
+ GEnv.Gain[Att] = None;
478
+
479
+ if ( Val > 0.7 * OldBestVal )
480
+ {
481
+ CheckSplit(Att, Fp, Lp);
482
+ }
483
+ }
484
+ }
485
+ }
486
+
487
+
488
+
489
+ /*************************************************************************/
490
+ /* */
491
+ /* Estimate Gain etc of continuous attributes using sample */
492
+ /* */
493
+ /*************************************************************************/
494
+
495
+
496
+ void SampleScan(CaseNo Fp, CaseNo Lp, int CondAtts, Boolean Second)
497
+ /* ---------- */
498
+ {
499
+ CaseNo i, SLp;
500
+ double Val, Sum=0, SumSq=0, SaveBaseInfo, SavePSD,
501
+ FBar, FInfo, FGain, FVal;
502
+ Attribute Att;
503
+
504
+ /* Save base information or SD */
505
+
506
+ SaveBaseInfo = GEnv.BaseInfo;
507
+ SavePSD = GEnv.PSD;
508
+
509
+ /* Generate sample in Fp ... Fp+SampleSize-1 */
510
+
511
+ Sample(Fp, Lp, SampleSize);
512
+ SLp = Fp + SampleSize - 1;
513
+
514
+ /* Determine sample PSD or base information */
515
+
516
+ if ( Continuous(ClassAtt) )
517
+ {
518
+ ForEach(i, Fp, SLp)
519
+ {
520
+ Val = CClass(Case[i]);
521
+ Sum += Val;
522
+ SumSq += Val * Val;
523
+ }
524
+ GEnv.PSD = SDEstimate(SampleSize, Sum, SumSq);
525
+ }
526
+ else
527
+ {
528
+ FindClassFrequencies(Fp, SLp);
529
+ GEnv.BaseInfo =
530
+ TotalInfo(GEnv.ClassFreq, 1, MaxAttVal[ClassAtt]) / SampleSize;
531
+ }
532
+
533
+ /* Check attributes using sample */
534
+
535
+ ForEach(Att, 1, MaxAtt)
536
+ {
537
+ if ( Exclude(Att) || Att == ClassAtt || ! Continuous(Att) ||
538
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
539
+ {
540
+ continue;
541
+ }
542
+
543
+ /* Save information from possible earlier sample */
544
+
545
+ FInfo = GEnv.Info[Att];
546
+ FGain = GEnv.Gain[Att];
547
+ FBar = GEnv.Bar[Att];
548
+
549
+ GEnv.Gain[Att] = None;
550
+
551
+ CheckSplit(Att, Fp, SLp);
552
+
553
+ /* If this is second sample, retain information from better */
554
+
555
+ if ( Second )
556
+ {
557
+ FVal = SplitVal(FGain, FInfo); /* first value */
558
+ Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]); /* second value */
559
+
560
+ if ( FVal > Val )
561
+ {
562
+ GEnv.Gain[Att] = FGain;
563
+ GEnv.Info[Att] = FInfo;
564
+ GEnv.Bar[Att] = FBar;
565
+ }
566
+ }
567
+ }
568
+
569
+ /* Restore base information or SD */
570
+
571
+ GEnv.BaseInfo = SaveBaseInfo;
572
+ GEnv.PSD = SavePSD;
573
+ }
574
+
575
+
576
+
577
+ /*************************************************************************/
578
+ /* */
579
+ /* Sample N cases from Fp through Lp using tabulated random nos */
580
+ /* */
581
+ /*************************************************************************/
582
+
583
+
584
+ void Sample(CaseNo Fp, CaseNo Lp, CaseCount N)
585
+ /* ------ */
586
+ {
587
+ CaseNo i, j, Cases;
588
+
589
+ Cases = No(Fp, Lp);
590
+
591
+ ForEach(i, 0, N-1)
592
+ {
593
+ j = Rand[i] * Cases--;
594
+ Swap(Fp+i, Fp+j);
595
+ }
596
+ }
597
+
598
+
599
+
600
+ /*************************************************************************/
601
+ /* */
602
+ /* Choose a split using all cases */
603
+ /* */
604
+ /*************************************************************************/
605
+
606
+
607
+ void ChooseSplit(CaseNo Fp, CaseNo Lp, int CondAtts)
608
+ /* ----------- */
609
+ {
610
+ Attribute Att;
611
+
612
+ GEnv.FRAC = 1;
613
+
614
+ ForEach(Att, 1, MaxAtt)
615
+ {
616
+ GEnv.Gain[Att] = None;
617
+
618
+ if ( Exclude(Att) || Att == ClassAtt ||
619
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
620
+ {
621
+ continue;
622
+ }
623
+
624
+ CheckSplit(Att, Fp, Lp);
625
+ }
626
+ }
627
+
628
+
629
+
630
+ void FindBestAtt(Attribute *BestAtt, double *BestVal)
631
+ /* ----------- */
632
+ {
633
+ Attribute Att;
634
+ double Val;
635
+
636
+ *BestVal = Epsilon;
637
+ *BestAtt = None;
638
+
639
+ ForEach(Att, 1, MaxAtt)
640
+ {
641
+ Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);
642
+
643
+ if ( Val > *BestVal )
644
+ {
645
+ *BestAtt = Att;
646
+ *BestVal = Val;
647
+ }
648
+ }
649
+ }
650
+
651
+
652
+
653
+ /*************************************************************************/
654
+ /* */
655
+ /* Evaluate a potential split */
656
+ /* */
657
+ /*************************************************************************/
658
+
659
+
660
+ void CheckSplit(Attribute Att, CaseNo Fp, CaseNo Lp)
661
+ /* ---------- */
662
+ {
663
+ CaseNo Xp;
664
+
665
+ GEnv.Tested[Att]++;
666
+
667
+ /* Remove missing values of Att. Note: this makes values
668
+ of BaseInfo and PSD approximate only */
669
+
670
+ Xp = ( SomeMiss[Att] ? SkipMissing(Att, Fp, Lp) : Fp );
671
+
672
+ /* Evaluate attribute for split -- different methods for
673
+ continuous and discrete class attributes */
674
+
675
+ if ( Continuous(Att) ) /* continuous att */
676
+ {
677
+ if ( Continuous(ClassAtt) )
678
+ {
679
+ CEvalContinAtt(Att, Xp, Lp);
680
+ }
681
+ else
682
+ {
683
+ DEvalContinAtt(Att, Xp, Lp);
684
+ }
685
+ }
686
+ else /* discrete att */
687
+ {
688
+ if ( Continuous(ClassAtt) )
689
+ {
690
+ if ( MaxAttVal[Att] > 3 || GEnv.Tested[Att] <= 1 )
691
+ {
692
+ CEvalDiscrAtt(Att, Xp, Lp);
693
+ }
694
+ }
695
+ else
696
+ if ( Ordered(Att) )
697
+ {
698
+ DEvalOrderedAtt(Att, Xp, Lp);
699
+ }
700
+ else
701
+ if ( GEnv.Tested[Att] <= 1 )
702
+ {
703
+ DEvalDiscrAtt(Att, Xp, Lp);
704
+ }
705
+ }
706
+
707
+ if ( GEnv.Gain[Att] > Epsilon )
708
+ {
709
+ /* Find value adjusted for missing values */
710
+
711
+ GEnv.Gain[Att] *= No(Xp, Lp) / (double) No(Fp, Lp);
712
+ GEnv.Info[Att] = Max(GEnv.Info[Att], 0.5);
713
+ }
714
+
715
+ GEnv.Tested[Att]--;
716
+ }
717
+
718
+
719
+
720
+ /*************************************************************************/
721
+ /* */
722
+ /* Split cases Fp to Lp on attribute Att */
723
+ /* */
724
+ /*************************************************************************/
725
+
726
+
727
+ void Divide(Tree Node, CaseNo Fp, CaseNo Lp, int CondAtts)
728
+ /* ------ */
729
+ {
730
+ CaseNo Ep;
731
+ DiscrValue v;
732
+
733
+ /* Remove unknown attribute values */
734
+
735
+ Ep = ( SomeMiss[Node->Tested] ? SkipMissing(Node->Tested, Fp, Lp) : Fp );
736
+ Progress(Ep - Fp);
737
+
738
+ /* Recursive divide and conquer */
739
+
740
+ ForEach(v, 1, Node->Forks)
741
+ {
742
+ Fp = Ep;
743
+ Ep = Group(Node->Tested, v, Fp, Lp, Node->Cut, Node->Left);
744
+
745
+ if ( Ep > Fp )
746
+ {
747
+ Split(Fp, Ep-1, CondAtts, Node, v, &Node->Branch[v]);
748
+ }
749
+ }
750
+ }
751
+
752
+
753
+
754
+ /*************************************************************************/
755
+ /* */
756
+ /* Group together missing values and return index of next case */
757
+ /* */
758
+ /*************************************************************************/
759
+
760
+
761
+ CaseNo SkipMissing(Attribute Att, CaseNo Fp, CaseNo Lp)
762
+ /* ----------- */
763
+ {
764
+ CaseNo i;
765
+
766
+ ForEach(i, Fp, Lp)
767
+ {
768
+ if ( Unknown(Case[i], Att) )
769
+ {
770
+ Swap(Fp, i);
771
+ Fp++;
772
+ }
773
+ }
774
+
775
+ return Fp;
776
+ }
777
+
778
+
779
+
780
+
781
+ /*************************************************************************/
782
+ /* */
783
+ /* Check groups formed by a potential test */
784
+ /* */
785
+ /*************************************************************************/
786
+
787
+
788
+ void CheckPotentialClusters(Attribute Att, DiscrValue Forks,
789
+ CaseNo Fp, CaseNo Lp, ContValue Cut, Set S,
790
+ CaseNo **FT)
791
+ /* ---------------------- */
792
+ {
793
+ CaseNo Ep;
794
+ DiscrValue v;
795
+
796
+ ForEach(v, 1, Forks)
797
+ {
798
+ Ep = Group(Att, v, Fp, Lp, Cut, S);
799
+
800
+ if ( Ep > Fp )
801
+ {
802
+ NoteTest(Att, v, Cut, S);
803
+
804
+ if ( Continuous(ClassAtt) )
805
+ {
806
+ FindContinOutliers(Fp, Ep-1, false);
807
+ }
808
+ else
809
+ {
810
+ FindDiscrOutliers(Fp, Ep-1, ( FT ? FT[v] : Nil ));
811
+ }
812
+
813
+ Fp = Ep;
814
+ }
815
+ }
816
+ }
817
+
818
+
819
+
820
+ /*************************************************************************/
821
+ /* */
822
+ /* Print context information for DAC */
823
+ /* */
824
+ /*************************************************************************/
825
+
826
+
827
+ void ShowContext(CaseNo i)
828
+ /* ----------- */
829
+ {
830
+ Attribute Att;
831
+ ClustRec CR;
832
+ Clust C=&CR;
833
+ int d;
834
+
835
+ C->Att = ClassAtt;
836
+ GEnv.Level--;
837
+ SaveClustConds(C);
838
+ GEnv.Level++;
839
+
840
+ ForEach(d, 0, C->NCond-1)
841
+ {
842
+ Att = C->Cond[d].Att;
843
+
844
+ if ( Continuous(Att) )
845
+ {
846
+ PrintContinCond(Att, C->Cond[d].Low, C->Cond[d].High, i);
847
+ }
848
+ else
849
+ if ( Ordered(Att) )
850
+ {
851
+ PrintOrderedCond(Att, (int) C->Cond[d].Low, (int) C->Cond[d].High,
852
+ i);
853
+ }
854
+ else
855
+ if ( Continuous(C->Att) && MaxAttVal[Att] > 3 )
856
+ {
857
+ PrintSubsetCond(Att, C->Cond[d].Values, i);
858
+ FreeUnlessNil(C->Cond[d].Values);
859
+ }
860
+ else
861
+ {
862
+ PrintValCond(Att, (int) C->Cond[d].Low);
863
+ }
864
+ }
865
+
866
+ Free(C->Cond);
867
+ }
868
+
869
+
870
+
871
+ /*************************************************************************/
872
+ /* */
873
+ /* Construct a leaf in a given node */
874
+ /* */
875
+ /*************************************************************************/
876
+
877
+
878
+ Tree Leaf(Tree Parent, DiscrValue Br)
879
+ /* ---- */
880
+ {
881
+ Tree Node;
882
+
883
+ Node = AllocZero(1, TreeRec);
884
+
885
+ Node->NodeType = 0;
886
+ Node->Parent = Parent;
887
+ Node->Br = Br;
888
+
889
+ return Node;
890
+ }
891
+
892
+
893
+
894
+ void ReleaseTree(Tree T, int Level)
895
+ /* ----------- */
896
+ {
897
+ DiscrValue v;
898
+
899
+ if ( ! T ) return;
900
+
901
+ if ( Level > 0 && LastLevel >= Level - 1 ) LastLevel = Level - 2;
902
+
903
+ /* Possible sift entry */
904
+
905
+ if ( T->SiftEntry )
906
+ {
907
+ if ( SIFT )
908
+ {
909
+ RecoverContext(T->Parent, T->Br);
910
+ OutputConditions();
911
+ fprintf(Sf, "%s", T->SiftEntry);
912
+ }
913
+ Free(T->SiftEntry);
914
+ }
915
+
916
+ if ( T->NodeType )
917
+ {
918
+ ForEach(v, 1, T->Forks)
919
+ {
920
+ ReleaseTree(T->Branch[v], Level+1);
921
+ }
922
+
923
+ if ( T->NodeType == BrSubset )
924
+ {
925
+ FreeUnlessNil(T->Left);
926
+ }
927
+
928
+ Free(T->Branch);
929
+ }
930
+
931
+ Free(T);
932
+ }
933
+
934
+
935
+
936
+ void OutputConditions()
937
+ /* ---------------- */
938
+ {
939
+ Attribute Att;
940
+ int i, CType, b, Bytes;
941
+ DiscrValue Br;
942
+
943
+ if ( ! TargetSaved )
944
+ {
945
+ fprintf(Sf, "1 %d\n", ClassAtt);
946
+ TargetSaved = true;
947
+ }
948
+
949
+ if ( GEnv.Level < 0 ) return;
950
+
951
+ /* Save all conditions since last saved */
952
+
953
+ ForEach(i, LastLevel+1, GEnv.Level-1)
954
+ {
955
+ Att = GEnv.Test[i].Att;
956
+ Br = GEnv.Test[i].Br;
957
+
958
+ /* Determine condition type */
959
+
960
+ CType = ( Br == 1 ? 11 :
961
+ Continuous(Att) || Ordered(Att) ? 12 :
962
+ Continuous(ClassAtt) && MaxAttVal[Att] > 3 ? 13 : 11 );
963
+
964
+ fprintf(Sf, "%d %d %d %d", CType, i, Att, Br);
965
+
966
+ /* Don't need to save anything else if this branch is 1 (N/A)
967
+ or if test is on two-valued discrete att */
968
+
969
+ if ( Br != 1 )
970
+ {
971
+ if ( Continuous(Att) || Ordered(Att) )
972
+ {
973
+ fprintf(Sf, " %.8g", GEnv.Test[i].Cut);
974
+ }
975
+ else
976
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
977
+ {
978
+ /* Print subset of values */
979
+
980
+ Bytes = (MaxAttVal[Att]>>3) + 1;
981
+
982
+ ForEach(b, 0, Bytes-1)
983
+ {
984
+ fprintf(Sf, " %x", GEnv.Test[i].Left[b]);
985
+ }
986
+ }
987
+ }
988
+
989
+ fprintf(Sf, "\n");
990
+ }
991
+
992
+ LastLevel = GEnv.Level-1;
993
+ }
994
+
995
+
996
+
997
+ /*************************************************************************/
998
+ /* */
999
+ /* Set up environment */
1000
+ /* */
1001
+ /*************************************************************************/
1002
+
1003
+
1004
+ void InitialiseEnvData()
1005
+ /* ----------------- */
1006
+ {
1007
+ DiscrValue v;
1008
+ Attribute Att;
1009
+
1010
+ GEnv.ValFreq = Alloc(MaxDiscrVal+1, CaseCount);
1011
+ GEnv.ClassFreq = Alloc(MaxDiscrVal+1, CaseCount);
1012
+ GEnv.ValSum = Alloc(MaxDiscrVal+1, double);
1013
+ GEnv.ValSumSq = Alloc(MaxDiscrVal+1, double);
1014
+ GEnv.Left = Alloc(MaxDiscrVal+1, Boolean);
1015
+ GEnv.Possible = Alloc(MaxDiscrVal+1, Boolean);
1016
+ GEnv.Tested = AllocZero(MaxAtt+1, int);
1017
+ GEnv.Gain = AllocZero(MaxAtt+1, double);
1018
+ GEnv.Info = AllocZero(MaxAtt+1, double);
1019
+ GEnv.Bar = AllocZero(MaxAtt+1, ContValue);
1020
+
1021
+ GEnv.Subset = AllocZero(MaxAtt+1, Set);
1022
+ GEnv.Subset[0] = Alloc((MaxDiscrVal>>3)+1, unsigned char); /* caveats */
1023
+ ForEach(Att, 1, MaxAtt)
1024
+ {
1025
+ if ( Discrete(Att) )
1026
+ {
1027
+ GEnv.Subset[Att] = Alloc((MaxAttVal[Att]>>3)+1, unsigned char);
1028
+ }
1029
+ }
1030
+
1031
+ /* Freq[] is one longer than apparently necessary to allow for
1032
+ the extra slot needed by EvalOrderedAtt() */
1033
+
1034
+ GEnv.Freq = AllocZero(MaxDiscrVal+2, CaseCount *);
1035
+ ForEach(v, 0, MaxDiscrVal+1)
1036
+ {
1037
+ GEnv.Freq[v] = AllocZero(MaxDiscrVal+1, CaseCount);
1038
+ }
1039
+
1040
+ GEnv.BestFreq = AllocZero(4, CaseCount *);
1041
+ ForEach(v, 0, 3)
1042
+ {
1043
+ GEnv.BestFreq[v] = AllocZero(MaxDiscrVal+1, CaseCount);
1044
+ }
1045
+
1046
+ GEnv.DList = Alloc(MaxAtt+1, Attribute);
1047
+ GEnv.DFreq = Alloc(MaxAtt+1, CaseCount **);
1048
+ GEnv.DValSum = Alloc(MaxAtt+1, double *);
1049
+ GEnv.DValSumSq = Alloc(MaxAtt+1, double *);
1050
+ ForEach(Att, 1, MaxAtt)
1051
+ {
1052
+ if ( Exclude(Att) || ! Discrete(Att) ) continue;
1053
+
1054
+ GEnv.DFreq[Att] = Alloc(MaxAttVal[Att]+1, CaseCount *);
1055
+ ForEach(v, 0, MaxAttVal[Att])
1056
+ {
1057
+ GEnv.DFreq[Att][v] = Alloc(MaxDiscrVal+1, CaseCount);
1058
+ }
1059
+ GEnv.DValSum[Att] = Alloc(MaxDiscrVal+1, double);
1060
+ GEnv.DValSumSq[Att] = Alloc(MaxDiscrVal+1, double);
1061
+ }
1062
+ }
1063
+
1064
+
1065
+
1066
+ /*************************************************************************/
1067
+ /* */
1068
+ /* Clean up environment */
1069
+ /* */
1070
+ /*************************************************************************/
1071
+
1072
+
1073
+ void FreeEnvData()
1074
+ /* ------------ */
1075
+ {
1076
+ Attribute Att;
1077
+ int i;
1078
+
1079
+ if ( ! GEnv.ValFreq ) return;
1080
+
1081
+ FreeUnlessNil(GEnv.ValFreq);
1082
+ FreeUnlessNil(GEnv.ClassFreq);
1083
+ FreeUnlessNil(GEnv.ValSum);
1084
+ FreeUnlessNil(GEnv.ValSumSq);
1085
+ FreeUnlessNil(GEnv.Left);
1086
+ FreeUnlessNil(GEnv.Possible);
1087
+ FreeUnlessNil(GEnv.Tested);
1088
+ FreeUnlessNil(GEnv.Gain);
1089
+ FreeUnlessNil(GEnv.Info);
1090
+ FreeUnlessNil(GEnv.Bar);
1091
+
1092
+ if ( GEnv.Test )
1093
+ {
1094
+ ForEach(i, 0, GEnv.MaxLevel-1)
1095
+ {
1096
+ FreeUnlessNil(GEnv.Test[i].Left);
1097
+ }
1098
+
1099
+ Free(GEnv.Test);
1100
+ }
1101
+
1102
+ FreeUnlessNil(GEnv.Subset[0]);
1103
+ ForEach(Att, 1, MaxAtt)
1104
+ {
1105
+ if ( Discrete(Att) )
1106
+ {
1107
+ FreeUnlessNil(GEnv.Subset[Att]);
1108
+ }
1109
+ }
1110
+ FreeUnlessNil(GEnv.Subset);
1111
+
1112
+ FreeVector((void **) GEnv.Freq, 0, MaxDiscrVal+1);
1113
+ FreeVector((void **) GEnv.BestFreq, 0, 3);
1114
+
1115
+ ForEach(Att, 1, MaxAtt)
1116
+ {
1117
+ if ( ! GEnv.DFreq[Att] ) continue;
1118
+
1119
+ FreeVector((void **)GEnv.DFreq[Att], 0, MaxAttVal[Att]);
1120
+ Free(GEnv.DValSum[Att]);
1121
+ Free(GEnv.DValSumSq[Att]);
1122
+ }
1123
+ Free(GEnv.DFreq);
1124
+ Free(GEnv.DValSum);
1125
+ Free(GEnv.DValSumSq);
1126
+ Free(GEnv.DList);
1127
+
1128
+ FreeUnlessNil(GEnv.SiftEntry);
1129
+ }
1130
+ #endif
1131
+
1132
+
1133
+
1134
+ /*************************************************************************/
1135
+ /* */
1136
+ /* Test[] contains a stack of current tests. Add a new test */
1137
+ /* for the current level */
1138
+ /* */
1139
+ /*************************************************************************/
1140
+
1141
+
1142
+ void NoteTest(Attribute Att, DiscrValue Br, ContValue Cut, Set Left)
1143
+ /* -------- */
1144
+ {
1145
+ int i;
1146
+
1147
+ /* Check space for tests */
1148
+
1149
+ if ( GEnv.Level >= GEnv.MaxLevel )
1150
+ {
1151
+ if ( ! GEnv.MaxLevel )
1152
+ {
1153
+ GEnv.Test = Alloc(100, TestRec);
1154
+ }
1155
+ else
1156
+ {
1157
+ Realloc(GEnv.Test, GEnv.MaxLevel+100, TestRec);
1158
+ }
1159
+
1160
+ ForEach(i, 0, 99)
1161
+ {
1162
+ GEnv.Test[GEnv.MaxLevel+i].Left =
1163
+ Alloc((MaxDiscrVal>>3)+1, unsigned char);
1164
+ }
1165
+
1166
+ GEnv.MaxLevel += 100;
1167
+ }
1168
+
1169
+ GEnv.Test[GEnv.Level].Att = Att;
1170
+ GEnv.Test[GEnv.Level].Br = Br;
1171
+ GEnv.Test[GEnv.Level].Cut = Cut;
1172
+ if ( Left )
1173
+ {
1174
+ memcpy(GEnv.Test[GEnv.Level].Left, Left, (MaxAttVal[Att]>>3)+1);
1175
+ }
1176
+ }
1177
+
1178
+
1179
+
1180
+ /*************************************************************************/
1181
+ /* */
1182
+ /* Group together the cases corresponding to branch V of a test */
1183
+ /* and return the index of the case following the last */
1184
+ /* */
1185
+ /*************************************************************************/
1186
+
1187
+
1188
+ CaseNo Group(Attribute Att, DiscrValue V, CaseNo Fp, CaseNo Lp,
1189
+ ContValue Cut, Set Left)
1190
+ /* ----- */
1191
+ {
1192
+ CaseNo i;
1193
+
1194
+ /* Group cases on the value of attribute Att, perhaps depending
1195
+ on the type of split */
1196
+
1197
+ if ( V == 1 )
1198
+ {
1199
+ /* Group all non-applicable values. Don't even try if
1200
+ this attribute doesn't have N/A values */
1201
+
1202
+ if ( SomeNA[Att] )
1203
+ {
1204
+ ForEach(i, Fp, Lp)
1205
+ {
1206
+ if ( NotApplic(Case[i], Att) )
1207
+ {
1208
+ Swap(Fp, i);
1209
+ Fp++;
1210
+ }
1211
+ }
1212
+ }
1213
+ }
1214
+ else
1215
+ if ( Continuous(Att) )
1216
+ {
1217
+ ForEach(i, Fp, Lp)
1218
+ {
1219
+ if ( ! Unknown(Case[i], Att) &&
1220
+ ! NotApplic(Case[i], Att) &&
1221
+ (CVal(Case[i], Att) <= Cut) == (V == 2) )
1222
+ {
1223
+ Swap(Fp, i);
1224
+ Fp++;
1225
+ }
1226
+ }
1227
+ }
1228
+ else
1229
+ if ( Ordered(Att) && Att != ClassAtt )
1230
+ {
1231
+ ForEach(i, Fp, Lp)
1232
+ {
1233
+ if ( ! Unknown(Case[i], Att) &&
1234
+ ! NotApplic(Case[i], Att) &&
1235
+ (XDVal(Case[i], Att) <= Cut + 0.1) == (V == 2) )
1236
+ {
1237
+ Swap(Fp, i);
1238
+ Fp++;
1239
+ }
1240
+ }
1241
+ }
1242
+ else
1243
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
1244
+ {
1245
+ ForEach(i, Fp, Lp)
1246
+ {
1247
+ if ( ! Unknown(Case[i], Att) &&
1248
+ ! NotApplic(Case[i], Att) &&
1249
+ (In(XDVal(Case[i], Att), Left) != 0) == (V == 2) )
1250
+ {
1251
+ Swap(Fp, i);
1252
+ Fp++;
1253
+ }
1254
+ }
1255
+ }
1256
+ else
1257
+ {
1258
+ ForEach(i, Fp, Lp)
1259
+ {
1260
+ if ( XDVal(Case[i], Att) == V )
1261
+ {
1262
+ Swap(Fp, i);
1263
+ Fp++;
1264
+ }
1265
+ }
1266
+ }
1267
+
1268
+ return Fp;
1269
+ }