see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,342 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Routines to manage clusters */
30
+ /* --------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ /*************************************************************************/
40
+ /* */
41
+ /* Register a new cluster */
42
+ /* */
43
+ /*************************************************************************/
44
+
45
+
46
+ Clust NewClust(ContValue Expect, ContValue SD, ContValue Limit,
47
+ CaseCount Anoms, CaseCount GpSize)
48
+ /* -------- */
49
+ {
50
+ Clust C;
51
+
52
+ /* Make sure we have room for another */
53
+
54
+ if ( NClust >= ClustSpace )
55
+ {
56
+ Realloc(Cluster, (ClustSpace += 1000), Clust);
57
+ }
58
+
59
+ C = Cluster[NClust++] = Alloc(1, ClustRec);
60
+
61
+ /* Save cluster information */
62
+
63
+ C->Att = ClassAtt;
64
+ C->Expect = Expect;
65
+ C->SD = SD;
66
+ C->Limit = Limit;
67
+ C->GpSize = GpSize;
68
+ C->Frac = 1 - Anoms / (float) GpSize;
69
+
70
+ SaveClustConds(C);
71
+
72
+ return C;
73
+ }
74
+
75
+
76
+
77
+ /*************************************************************************/
78
+ /* */
79
+ /* Process current tests to determine cluster conditions. */
80
+ /* Different functions are called depending on the condition */
81
+ /* type. */
82
+ /* */
83
+ /*************************************************************************/
84
+
85
+
86
+ void SaveClustConds(Clust C)
87
+ /* -------------- */
88
+ {
89
+ Attribute Att, NAtts=0;
90
+ int NC=0;
91
+
92
+ /* Count attributes tested */
93
+
94
+ ForEach(Att, 1, MaxAtt)
95
+ {
96
+ if ( GEnv.Tested[Att] ) NAtts++;
97
+ }
98
+
99
+ C->NCond = NAtts;
100
+ C->Cond = Alloc(C->NCond, ClustCond);
101
+
102
+ /* Format tests on each attribute */
103
+
104
+ ForEach(Att, 1, MaxAtt)
105
+ {
106
+ if ( GEnv.Tested[Att] )
107
+ {
108
+ if ( Continuous(Att) )
109
+ {
110
+ FormatContinCond(Att, &C->Cond[NC]);
111
+ }
112
+ else
113
+ if ( Ordered(Att) )
114
+ {
115
+ FormatOrderedCond(Att, &C->Cond[NC]);
116
+ }
117
+ else
118
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
119
+ {
120
+ FormatSubsetCond(Att, &C->Cond[NC]);
121
+ }
122
+ else
123
+ {
124
+ FormatValCond(Att, &C->Cond[NC]);
125
+ }
126
+
127
+ NC++;
128
+ }
129
+ }
130
+ }
131
+
132
+
133
+
134
+ /*************************************************************************/
135
+ /* */
136
+ /* Test on a continuous att. Check all threshold tests */
137
+ /* and assemble lowest and highest possible value */
138
+ /* */
139
+ /*************************************************************************/
140
+
141
+
142
+ void FormatContinCond(Attribute Att, ClustCond *CC)
143
+ /* ---------------- */
144
+ {
145
+ ContValue Lo=-MARKER, Hi=MARKER;
146
+ int i, Type=0;
147
+
148
+ ForEach(i, 0, GEnv.Level)
149
+ {
150
+ if ( GEnv.Test[i].Att == Att )
151
+ {
152
+ if ( GEnv.Test[i].Br == 1 )
153
+ {
154
+ Type = CONT_NA;
155
+ Lo = 1;
156
+ Hi = 0;
157
+ break;
158
+ }
159
+ else
160
+ if ( GEnv.Test[i].Br == 2 )
161
+ {
162
+ Type |= CONT_LT;
163
+ Hi = GEnv.Test[i].Cut;
164
+ }
165
+ else
166
+ {
167
+ Type |= CONT_GT;
168
+ Lo = GEnv.Test[i].Cut;
169
+ }
170
+ }
171
+ }
172
+
173
+ CC->Type = Type;
174
+ CC->Att = Att;
175
+ CC->Low = Lo;
176
+ CC->High = Hi;
177
+ CC->Values = Nil;
178
+ }
179
+
180
+
181
+
182
+ /*************************************************************************/
183
+ /* */
184
+ /* Test on an ordered discrete attribute (similar to above) */
185
+ /* */
186
+ /*************************************************************************/
187
+
188
+
189
+ void FormatOrderedCond(Attribute Att, ClustCond *CC)
190
+ /* ----------------- */
191
+ {
192
+ DiscrValue Lo, Hi;
193
+ int i, Type=0;
194
+
195
+ Lo = 2;
196
+ Hi = MaxAttVal[Att];
197
+
198
+ ForEach(i, 0, GEnv.Level)
199
+ {
200
+ if ( GEnv.Test[i].Att == Att )
201
+ {
202
+ if ( GEnv.Test[i].Br == 1 )
203
+ {
204
+ Type = DISCR_VAL;
205
+ Lo = Hi = 1;
206
+ break;
207
+ }
208
+ else
209
+ if ( GEnv.Test[i].Br == 2 )
210
+ {
211
+ Type |= DISCR_LT;
212
+ Hi = GEnv.Test[i].Cut;
213
+ }
214
+ else
215
+ {
216
+ Type |= DISCR_GT;
217
+ Lo = GEnv.Test[i].Cut + 1;
218
+ }
219
+ }
220
+ }
221
+
222
+ CC->Type = Type;
223
+ CC->Att = Att;
224
+ CC->Low = Lo;
225
+ CC->High = Hi;
226
+ CC->Values = Nil;
227
+ }
228
+
229
+
230
+
231
+ /*************************************************************************/
232
+ /* */
233
+ /* Subset test for a discrete attribute. All tests must be */
234
+ /* checked to determine the final subset values */
235
+ /* */
236
+ /*************************************************************************/
237
+
238
+
239
+ void FormatSubsetCond(Attribute Att, ClustCond *CC)
240
+ /* ---------------- */
241
+ {
242
+ DiscrValue v;
243
+ int i;
244
+
245
+ CC->Att = Att;
246
+
247
+ GEnv.Possible[1] = false;
248
+ ForEach(v, 2, MaxAttVal[Att])
249
+ {
250
+ GEnv.Possible[v] = true;
251
+ }
252
+
253
+ ForEach(i, 0, GEnv.Level)
254
+ {
255
+ if ( GEnv.Test[i].Att == Att )
256
+ {
257
+ if ( GEnv.Test[i].Br == 1 )
258
+ {
259
+ GEnv.Possible[1] = true;
260
+ ForEach(v, 2, MaxAttVal[Att])
261
+ {
262
+ GEnv.Possible[v] = false;
263
+ }
264
+ break;
265
+ }
266
+ else
267
+ ForEach(v, 2, MaxAttVal[Att])
268
+ {
269
+ if ( In(v, GEnv.Test[i].Left) )
270
+ {
271
+ GEnv.Possible[v] = GEnv.Possible[v] && ( GEnv.Test[i].Br == 2 );
272
+ }
273
+ else
274
+ {
275
+ GEnv.Possible[v] = GEnv.Possible[v] && ( GEnv.Test[i].Br == 3 );
276
+ }
277
+ }
278
+ }
279
+ }
280
+
281
+ CC->Type = DISCR_SET;
282
+ CC->Low = CC->High = 0;
283
+ CC->Values = AllocZero((MaxAttVal[Att]>>3)+1, unsigned char);
284
+
285
+ ForEach(v, 1, MaxAttVal[Att])
286
+ {
287
+ if ( GEnv.Possible[v] ) SetBit(v, CC->Values);
288
+ }
289
+ }
290
+
291
+
292
+
293
+ /*************************************************************************/
294
+ /* */
295
+ /* Simple test on attribute value. There is no need to check */
296
+ /* more than one test since the first determines the tested value */
297
+ /* */
298
+ /*************************************************************************/
299
+
300
+
301
+ void FormatValCond(Attribute Att, ClustCond *CC)
302
+ /* ------------- */
303
+ {
304
+ int i;
305
+
306
+ ForEach(i, 0, GEnv.Level)
307
+ {
308
+ if ( GEnv.Test[i].Att == Att )
309
+ {
310
+ CC->Type = DISCR_VAL;
311
+ CC->Att = Att;
312
+ CC->Low = CC->High = GEnv.Test[i].Br;
313
+ CC->Values = Nil;
314
+ return;
315
+ }
316
+ }
317
+ }
318
+
319
+
320
+
321
+ /*************************************************************************/
322
+ /* */
323
+ /* Free conditions stored in a cluster */
324
+ /* */
325
+ /*************************************************************************/
326
+
327
+
328
+ void FreeClust(Clust C)
329
+ /* --------- */
330
+ {
331
+ int d;
332
+
333
+ if ( C )
334
+ {
335
+ ForEach(d, 0, C->NCond-1)
336
+ {
337
+ FreeUnlessNil(C->Cond[d].Values);
338
+ }
339
+ FreeUnlessNil(C->Cond);
340
+ Free(C);
341
+ }
342
+ }
@@ -0,0 +1,1269 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ #ifndef INSPECT
28
+ /*************************************************************************/
29
+ /* */
30
+ /* Divide-and-Conquer generic routines */
31
+ /* ----------------------------------- */
32
+ /* */
33
+ /*************************************************************************/
34
+
35
+
36
+ #include "defns.i"
37
+ #include "extern.i"
38
+
39
+
40
+ /*************************************************************************/
41
+ /* */
42
+ /* Allocate space for all tables */
43
+ /* */
44
+ /*************************************************************************/
45
+
46
+
47
+ void InitialiseDAC()
48
+ /* ------------- */
49
+ {
50
+ DiscrValue v;
51
+ Attribute Att;
52
+ CaseNo i, MaxSampleSize;
53
+ extern SortPair *Pair;
54
+
55
+ UseLogs = AllocZero(MaxAtt+1, Boolean);
56
+ SomeMiss = AllocZero(MaxAtt+1, Boolean);
57
+ SomeNA = AllocZero(MaxAtt+1, Boolean);
58
+ LowTail = AllocZero(MaxAtt+1, ContValue);
59
+ HighTail = AllocZero(MaxAtt+1, ContValue);
60
+
61
+ LogCaseNo = Alloc(MaxCase+2, double);
62
+ LogCaseNo[0] = LogCaseNo[1] = 0;
63
+ ForEach(i, 2, MaxCase+1)
64
+ {
65
+ LogCaseNo[i] = log((double) i) / Log2;
66
+ }
67
+
68
+ /* Save random numbers for sampling */
69
+
70
+ MaxSampleSize = SAMPLEUNIT * Max(MaxDiscrVal, 5);
71
+ if ( MaxSampleSize > (MaxCase+1) / 2 + 1 )
72
+ {
73
+ MaxSampleSize = (MaxCase+1) / 2 + 1;
74
+ }
75
+
76
+ Rand = Alloc(MaxSampleSize, double);
77
+ ResetKR(1230);
78
+ ForEach(i, 0, MaxSampleSize-1)
79
+ {
80
+ Rand[i] = KRandom();
81
+ }
82
+
83
+ /* Compute prior probabilities for discrete values */
84
+
85
+ Prior = Alloc(MaxAtt+1, double *);
86
+ ForEach(Att, 1, MaxAtt)
87
+ {
88
+ if ( Discrete(Att) )
89
+ {
90
+ Prior[Att] = AllocZero(MaxAttVal[Att]+1, double);
91
+
92
+ ForEach(i, 0, MaxCase)
93
+ {
94
+ Prior[Att][XDVal(Case[i], Att)]++;
95
+ }
96
+
97
+ SomeMiss[Att] = ( Prior[Att][0] > 0 );
98
+ SomeNA[Att] = ( Prior[Att][1] > 0 );
99
+
100
+ ForEach(v, 0, MaxAttVal[Att])
101
+ {
102
+ Prior[Att][v] /= (double) (MaxCase+1);
103
+ }
104
+ }
105
+ }
106
+
107
+ /* Determine precision for continuous attributes */
108
+
109
+ Prec = AllocZero(MaxAtt+1, unsigned char);
110
+ ForEach(Att, 1, MaxAtt)
111
+ {
112
+ if ( ! Exclude(Att) && Continuous(Att) )
113
+ {
114
+ Prec[Att] = log(FracBase(Att)) / log(10.0) + 0.5;
115
+ }
116
+ }
117
+
118
+ Pair = Alloc(MaxCase+1, SortPair);
119
+
120
+ InitialiseEnvData();
121
+ }
122
+
123
+
124
+
125
+ void FreeDAC()
126
+ /* ------- */
127
+ {
128
+ extern SortPair *Pair;
129
+
130
+ FreeEnvData();
131
+
132
+ FreeUnlessNil(UseLogs); UseLogs = Nil;
133
+ FreeUnlessNil(SomeMiss); SomeMiss = Nil;
134
+ FreeUnlessNil(SomeNA); SomeNA = Nil;
135
+ FreeUnlessNil(LowTail); LowTail = Nil;
136
+ FreeUnlessNil(HighTail); HighTail = Nil;
137
+ FreeUnlessNil(LogCaseNo); LogCaseNo = Nil;
138
+ FreeUnlessNil(Rand); Rand = Nil;
139
+ FreeVector((void **) Prior, 1, MaxAtt); Prior = Nil;
140
+ FreeUnlessNil(Prec); Prec = Nil;
141
+
142
+ FreeUnlessNil(Pair); Pair = Nil;
143
+ }
144
+
145
+
146
+
147
+ /*************************************************************************/
148
+ /* */
149
+ /* Split cases Fp through Lp */
150
+ /* CondAtts is the current number of conditioning attributes */
151
+ /* */
152
+ /*************************************************************************/
153
+
154
+
155
+ void Split(CaseNo Fp, CaseNo Lp, int CondAtts, Tree Parent, DiscrValue Br,
156
+ Tree *Result)
157
+ /* ----- */
158
+ {
159
+ CaseNo i;
160
+ CaseCount Cases;
161
+ DiscrValue v;
162
+ double Val, Sum=0, SumSq=0;
163
+ Attribute Att, BestAtt;
164
+ Tree Node;
165
+
166
+
167
+ *Result = Nil;
168
+
169
+ /* Recover info about tests to this point */
170
+
171
+ ForEach(Att, 1, MaxAtt)
172
+ {
173
+ GEnv.Tested[Att] = 0;
174
+ }
175
+
176
+ RecoverContext(Parent, Br);
177
+
178
+ Cases = No(Fp, Lp);
179
+ Verbosity(1,
180
+ fprintf(Of, "\n<%d> %d cases %d-%d\n", GEnv.Level, Cases, Fp, Lp);
181
+ ShowContext(Fp))
182
+
183
+ GEnv.FRAC = 1;
184
+
185
+ /* Determine PSD and base information. This is only approximate, since
186
+ missing values of the tested attributes are excluded. */
187
+
188
+ if ( Continuous(ClassAtt) )
189
+ {
190
+ if ( Cases < 2 * CMINITEMS )
191
+ {
192
+ Progress(Cases);
193
+ return;
194
+ }
195
+
196
+ ForEach(i, Fp, Lp)
197
+ {
198
+ Val = CClass(Case[i]);
199
+ Sum += Val;
200
+ SumSq += Val * Val;
201
+ }
202
+ GEnv.PSD = SDEstimate(Cases, Sum, SumSq);
203
+ }
204
+ else
205
+ {
206
+ if ( Cases < 2 * DMINITEMS )
207
+ {
208
+ Progress(Cases);
209
+ return;
210
+ }
211
+
212
+ /* Check for pure leaf */
213
+
214
+ FindClassFrequencies(Fp, Lp);
215
+
216
+ ForEach(v, 1, MaxAttVal[ClassAtt])
217
+ {
218
+ if ( GEnv.ClassFreq[v] == Cases )
219
+ {
220
+ Verbosity(1, fprintf(Of, "\tpure subset\n"))
221
+ Progress(Cases);
222
+ return;
223
+ }
224
+ }
225
+
226
+ GEnv.BaseInfo =
227
+ TotalInfo(GEnv.ClassFreq, 1, MaxAttVal[ClassAtt]) / Cases;
228
+ }
229
+
230
+ *Result = Node = Leaf(Parent, Br);
231
+
232
+ /* Find the best attribute split, using sampling if the number
233
+ of cases is at least the minimum multiple of the sample size.
234
+ Start by collecting info on discrete attributes */
235
+
236
+ DiscreteAttInfo(Fp, Lp, CondAtts);
237
+
238
+ if ( Cases > SAMPLEFACTOR * SampleSize )
239
+ {
240
+ ChooseSplitWithSampling(Fp, Lp, CondAtts);
241
+ }
242
+ else
243
+ {
244
+ ChooseSplit(Fp, Lp, CondAtts);
245
+ }
246
+
247
+ /* Save any sift entry */
248
+
249
+ if ( SIFT && GEnv.SiftEntry && GEnv.SiftSize )
250
+ {
251
+ Node->SiftEntry = strdup(GEnv.SiftEntry);
252
+ GEnv.SiftSize = 0;
253
+ }
254
+
255
+ FindBestAtt(&BestAtt, &Val);
256
+
257
+ /* Decide whether to branch or not */
258
+
259
+ if ( BestAtt == None )
260
+ {
261
+ Verbosity(1, fprintf(Of, "\tno sensible splits\n"))
262
+
263
+ Progress(Cases);
264
+ }
265
+ else
266
+ {
267
+ Verbosity(1,
268
+ fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
269
+ if ( Continuous(BestAtt) )
270
+ {
271
+ fprintf(Of, " cut %.3f", GEnv.Bar[BestAtt]);
272
+ }
273
+ if ( ! Continuous(ClassAtt) )
274
+ {
275
+ fprintf(Of, " val %.3f inf %.3f",
276
+ SplitVal(GEnv.Gain[BestAtt], GEnv.Info[BestAtt]),
277
+ GEnv.Info[BestAtt]);
278
+ }
279
+ fprintf(Of, " gain %.3f\n", GEnv.Gain[BestAtt]);)
280
+
281
+ /* Carry out the recursive divide-and-conquer */
282
+
283
+ Node->Tested = BestAtt;
284
+
285
+ if ( Continuous(BestAtt) || Ordered(BestAtt) )
286
+ {
287
+ Node->NodeType = BrThresh;
288
+ Node->Forks = 3;
289
+ Node->Cut = GEnv.Bar[BestAtt];
290
+ }
291
+ else
292
+ if ( Continuous(ClassAtt) && MaxAttVal[BestAtt] > 3 )
293
+ {
294
+ Node->NodeType = BrSubset;
295
+ Node->Forks = 3;
296
+ Node->Left = Alloc((MaxAttVal[BestAtt]>>3)+1, unsigned char);
297
+ memcpy(Node->Left, GEnv.Subset[BestAtt], (MaxAttVal[BestAtt]>>3)+1);
298
+ }
299
+ else
300
+ {
301
+ Node->NodeType = BrDiscr;
302
+ Node->Forks = MaxAttVal[BestAtt];
303
+ }
304
+
305
+ Node->Branch = Alloc(Node->Forks+1, Tree);
306
+
307
+ if ( ! GEnv.Tested[BestAtt] ) CondAtts++;
308
+
309
+ Divide(Node, Fp, Lp, CondAtts);
310
+ }
311
+ }
312
+
313
+
314
+
315
+ /*************************************************************************/
316
+ /* */
317
+ /* Recover information on level and tests from tree and parent */
318
+ /* */
319
+ /*************************************************************************/
320
+
321
+
322
+ void RecoverContext(Tree T, DiscrValue Br)
323
+ /* -------------- */
324
+ {
325
+ if ( T )
326
+ {
327
+ RecoverContext(T->Parent, T->Br);
328
+
329
+ NoteTest(T->Tested, Br, T->Cut, T->Left);
330
+ GEnv.Tested[T->Tested]++;
331
+ GEnv.Level++;
332
+ }
333
+ else
334
+ {
335
+ GEnv.Level = 0;
336
+ }
337
+ }
338
+
339
+
340
+
341
+ /*************************************************************************/
342
+ /* */
343
+ /* Analyse all discrete attributes in one pass */
344
+ /* */
345
+ /*************************************************************************/
346
+
347
+
348
+ void DiscreteAttInfo(CaseNo Fp, CaseNo Lp, int CondAtts)
349
+ /* --------------- */
350
+ {
351
+ CaseNo i;
352
+ DiscrValue v, c;
353
+ Attribute Att;
354
+ double Val;
355
+ int NDList=0, dl;
356
+
357
+ /* Initialise counts etc and prepare list of attributes */
358
+
359
+ ForEach(Att, 1, MaxAtt)
360
+ {
361
+ if ( ! Discrete(Att) || Exclude(Att) || Att == ClassAtt ||
362
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
363
+ {
364
+ continue;
365
+ }
366
+
367
+ GEnv.DList[NDList++] = Att;
368
+
369
+ if ( Continuous(ClassAtt) )
370
+ {
371
+ ForEach(v, 0, MaxAttVal[Att])
372
+ {
373
+ GEnv.DValSum[Att][v] = GEnv.DValSumSq[Att][v] = 0;
374
+ GEnv.DFreq[Att][v][0] = 0; /* value frequency */
375
+ }
376
+ }
377
+ else
378
+ {
379
+ ForEach(v, 0, MaxAttVal[Att])
380
+ {
381
+ ForEach(c, 1, MaxAttVal[ClassAtt])
382
+ {
383
+ GEnv.DFreq[Att][v][c] = 0;
384
+ }
385
+ }
386
+ }
387
+ }
388
+
389
+ if ( ! NDList-- ) return;
390
+
391
+ /* Examine cases and update all counts etc */
392
+
393
+ ForEach(i, Fp, Lp)
394
+ {
395
+ ForEach(dl, 0, NDList)
396
+ {
397
+ Att = GEnv.DList[dl];
398
+
399
+ v = XDVal(Case[i], Att);
400
+
401
+ if ( Continuous(ClassAtt) )
402
+ {
403
+ Val = CClass(Case[i]);
404
+
405
+ GEnv.DFreq[Att][v][0]++;
406
+ GEnv.DValSum[Att][v] += Val;
407
+ GEnv.DValSumSq[Att][v] += Val * Val;
408
+ }
409
+ else
410
+ {
411
+ GEnv.DFreq[Att][v][ DClass(Case[i]) ]++;
412
+ }
413
+ }
414
+ }
415
+ }
416
+
417
+
418
+
419
+ /*************************************************************************/
420
+ /* */
421
+ /* Choose split using a sample. There are three phases: */
422
+ /* - process discrete atts using all data */
423
+ /* - for continuous atts, find gain etc from two samples and */
424
+ /* record the better value */
425
+ /* - re-examine high-value continuous attributes using all cases */
426
+ /* */
427
+ /*************************************************************************/
428
+
429
+
430
+ void ChooseSplitWithSampling(CaseNo Fp, CaseNo Lp, int CondAtts)
431
+ /* ----------------------- */
432
+ {
433
+ double Val, OldBestVal;
434
+ Attribute Att, BestAtt;
435
+
436
+ /* Process discrete attributes using all data */
437
+
438
+ ForEach(Att, 1, MaxAtt)
439
+ {
440
+ GEnv.Gain[Att] = None;
441
+
442
+ if ( Exclude(Att) || Att == ClassAtt || Continuous(Att) ||
443
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
444
+ {
445
+ continue;
446
+ }
447
+
448
+ CheckSplit(Att, Fp, Lp);
449
+ }
450
+
451
+ /* Process continuous attributes using two samples */
452
+
453
+ GEnv.FRAC = SampleSize / (double) No(Fp, Lp);
454
+
455
+ SampleScan(Fp, Lp, CondAtts, false);
456
+ SampleScan(Fp+SampleSize, Lp, CondAtts, true);
457
+
458
+ GEnv.FRAC = 1;
459
+
460
+ /* Re-examine continuous attributes that are possible best splits
461
+ (with value at least 70% of current best value) */
462
+
463
+ FindBestAtt(&BestAtt, &OldBestVal);
464
+
465
+ if ( BestAtt != None )
466
+ {
467
+ Verbosity(2,
468
+ fprintf(Of, " Revisit threshold %.3f (%s)\n",
469
+ 0.7 * OldBestVal, AttName[BestAtt]))
470
+
471
+ ForEach(Att, 1, MaxAtt)
472
+ {
473
+ if ( Discrete(Att) || GEnv.Gain[Att] <= Epsilon ) continue;
474
+
475
+ Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);
476
+
477
+ GEnv.Gain[Att] = None;
478
+
479
+ if ( Val > 0.7 * OldBestVal )
480
+ {
481
+ CheckSplit(Att, Fp, Lp);
482
+ }
483
+ }
484
+ }
485
+ }
486
+
487
+
488
+
489
+ /*************************************************************************/
490
+ /* */
491
+ /* Estimate Gain etc of continuous attributes using sample */
492
+ /* */
493
+ /*************************************************************************/
494
+
495
+
496
+ void SampleScan(CaseNo Fp, CaseNo Lp, int CondAtts, Boolean Second)
497
+ /* ---------- */
498
+ {
499
+ CaseNo i, SLp;
500
+ double Val, Sum=0, SumSq=0, SaveBaseInfo, SavePSD,
501
+ FBar, FInfo, FGain, FVal;
502
+ Attribute Att;
503
+
504
+ /* Save base information or SD */
505
+
506
+ SaveBaseInfo = GEnv.BaseInfo;
507
+ SavePSD = GEnv.PSD;
508
+
509
+ /* Generate sample in Fp ... Fp+SampleSize-1 */
510
+
511
+ Sample(Fp, Lp, SampleSize);
512
+ SLp = Fp + SampleSize - 1;
513
+
514
+ /* Determine sample PSD or base information */
515
+
516
+ if ( Continuous(ClassAtt) )
517
+ {
518
+ ForEach(i, Fp, SLp)
519
+ {
520
+ Val = CClass(Case[i]);
521
+ Sum += Val;
522
+ SumSq += Val * Val;
523
+ }
524
+ GEnv.PSD = SDEstimate(SampleSize, Sum, SumSq);
525
+ }
526
+ else
527
+ {
528
+ FindClassFrequencies(Fp, SLp);
529
+ GEnv.BaseInfo =
530
+ TotalInfo(GEnv.ClassFreq, 1, MaxAttVal[ClassAtt]) / SampleSize;
531
+ }
532
+
533
+ /* Check attributes using sample */
534
+
535
+ ForEach(Att, 1, MaxAtt)
536
+ {
537
+ if ( Exclude(Att) || Att == ClassAtt || ! Continuous(Att) ||
538
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
539
+ {
540
+ continue;
541
+ }
542
+
543
+ /* Save information from possible earlier sample */
544
+
545
+ FInfo = GEnv.Info[Att];
546
+ FGain = GEnv.Gain[Att];
547
+ FBar = GEnv.Bar[Att];
548
+
549
+ GEnv.Gain[Att] = None;
550
+
551
+ CheckSplit(Att, Fp, SLp);
552
+
553
+ /* If this is second sample, retain information from better */
554
+
555
+ if ( Second )
556
+ {
557
+ FVal = SplitVal(FGain, FInfo); /* first value */
558
+ Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]); /* second value */
559
+
560
+ if ( FVal > Val )
561
+ {
562
+ GEnv.Gain[Att] = FGain;
563
+ GEnv.Info[Att] = FInfo;
564
+ GEnv.Bar[Att] = FBar;
565
+ }
566
+ }
567
+ }
568
+
569
+ /* Restore base information or SD */
570
+
571
+ GEnv.BaseInfo = SaveBaseInfo;
572
+ GEnv.PSD = SavePSD;
573
+ }
574
+
575
+
576
+
577
+ /*************************************************************************/
578
+ /* */
579
+ /* Sample N cases from Fp through Lp using tabulated random nos */
580
+ /* */
581
+ /*************************************************************************/
582
+
583
+
584
+ void Sample(CaseNo Fp, CaseNo Lp, CaseCount N)
585
+ /* ------ */
586
+ {
587
+ CaseNo i, j, Cases;
588
+
589
+ Cases = No(Fp, Lp);
590
+
591
+ ForEach(i, 0, N-1)
592
+ {
593
+ j = Rand[i] * Cases--;
594
+ Swap(Fp+i, Fp+j);
595
+ }
596
+ }
597
+
598
+
599
+
600
+ /*************************************************************************/
601
+ /* */
602
+ /* Choose a split using all cases */
603
+ /* */
604
+ /*************************************************************************/
605
+
606
+
607
+ void ChooseSplit(CaseNo Fp, CaseNo Lp, int CondAtts)
608
+ /* ----------- */
609
+ {
610
+ Attribute Att;
611
+
612
+ GEnv.FRAC = 1;
613
+
614
+ ForEach(Att, 1, MaxAtt)
615
+ {
616
+ GEnv.Gain[Att] = None;
617
+
618
+ if ( Exclude(Att) || Att == ClassAtt ||
619
+ CondAtts >= MAXCONDATTS && ! GEnv.Tested[Att] )
620
+ {
621
+ continue;
622
+ }
623
+
624
+ CheckSplit(Att, Fp, Lp);
625
+ }
626
+ }
627
+
628
+
629
+
630
+ void FindBestAtt(Attribute *BestAtt, double *BestVal)
631
+ /* ----------- */
632
+ {
633
+ Attribute Att;
634
+ double Val;
635
+
636
+ *BestVal = Epsilon;
637
+ *BestAtt = None;
638
+
639
+ ForEach(Att, 1, MaxAtt)
640
+ {
641
+ Val = SplitVal(GEnv.Gain[Att], GEnv.Info[Att]);
642
+
643
+ if ( Val > *BestVal )
644
+ {
645
+ *BestAtt = Att;
646
+ *BestVal = Val;
647
+ }
648
+ }
649
+ }
650
+
651
+
652
+
653
+ /*************************************************************************/
654
+ /* */
655
+ /* Evaluate a potential split */
656
+ /* */
657
+ /*************************************************************************/
658
+
659
+
660
+ void CheckSplit(Attribute Att, CaseNo Fp, CaseNo Lp)
661
+ /* ---------- */
662
+ {
663
+ CaseNo Xp;
664
+
665
+ GEnv.Tested[Att]++;
666
+
667
+ /* Remove missing values of Att. Note: this makes values
668
+ of BaseInfo and PSD approximate only */
669
+
670
+ Xp = ( SomeMiss[Att] ? SkipMissing(Att, Fp, Lp) : Fp );
671
+
672
+ /* Evaluate attribute for split -- different methods for
673
+ continuous and discrete class attributes */
674
+
675
+ if ( Continuous(Att) ) /* continuous att */
676
+ {
677
+ if ( Continuous(ClassAtt) )
678
+ {
679
+ CEvalContinAtt(Att, Xp, Lp);
680
+ }
681
+ else
682
+ {
683
+ DEvalContinAtt(Att, Xp, Lp);
684
+ }
685
+ }
686
+ else /* discrete att */
687
+ {
688
+ if ( Continuous(ClassAtt) )
689
+ {
690
+ if ( MaxAttVal[Att] > 3 || GEnv.Tested[Att] <= 1 )
691
+ {
692
+ CEvalDiscrAtt(Att, Xp, Lp);
693
+ }
694
+ }
695
+ else
696
+ if ( Ordered(Att) )
697
+ {
698
+ DEvalOrderedAtt(Att, Xp, Lp);
699
+ }
700
+ else
701
+ if ( GEnv.Tested[Att] <= 1 )
702
+ {
703
+ DEvalDiscrAtt(Att, Xp, Lp);
704
+ }
705
+ }
706
+
707
+ if ( GEnv.Gain[Att] > Epsilon )
708
+ {
709
+ /* Find value adjusted for missing values */
710
+
711
+ GEnv.Gain[Att] *= No(Xp, Lp) / (double) No(Fp, Lp);
712
+ GEnv.Info[Att] = Max(GEnv.Info[Att], 0.5);
713
+ }
714
+
715
+ GEnv.Tested[Att]--;
716
+ }
717
+
718
+
719
+
720
+ /*************************************************************************/
721
+ /* */
722
+ /* Split cases Fp to Lp on attribute Att */
723
+ /* */
724
+ /*************************************************************************/
725
+
726
+
727
+ void Divide(Tree Node, CaseNo Fp, CaseNo Lp, int CondAtts)
728
+ /* ------ */
729
+ {
730
+ CaseNo Ep;
731
+ DiscrValue v;
732
+
733
+ /* Remove unknown attribute values */
734
+
735
+ Ep = ( SomeMiss[Node->Tested] ? SkipMissing(Node->Tested, Fp, Lp) : Fp );
736
+ Progress(Ep - Fp);
737
+
738
+ /* Recursive divide and conquer */
739
+
740
+ ForEach(v, 1, Node->Forks)
741
+ {
742
+ Fp = Ep;
743
+ Ep = Group(Node->Tested, v, Fp, Lp, Node->Cut, Node->Left);
744
+
745
+ if ( Ep > Fp )
746
+ {
747
+ Split(Fp, Ep-1, CondAtts, Node, v, &Node->Branch[v]);
748
+ }
749
+ }
750
+ }
751
+
752
+
753
+
754
+ /*************************************************************************/
755
+ /* */
756
+ /* Group together missing values and return index of next case */
757
+ /* */
758
+ /*************************************************************************/
759
+
760
+
761
+ CaseNo SkipMissing(Attribute Att, CaseNo Fp, CaseNo Lp)
762
+ /* ----------- */
763
+ {
764
+ CaseNo i;
765
+
766
+ ForEach(i, Fp, Lp)
767
+ {
768
+ if ( Unknown(Case[i], Att) )
769
+ {
770
+ Swap(Fp, i);
771
+ Fp++;
772
+ }
773
+ }
774
+
775
+ return Fp;
776
+ }
777
+
778
+
779
+
780
+
781
+ /*************************************************************************/
782
+ /* */
783
+ /* Check groups formed by a potential test */
784
+ /* */
785
+ /*************************************************************************/
786
+
787
+
788
+ void CheckPotentialClusters(Attribute Att, DiscrValue Forks,
789
+ CaseNo Fp, CaseNo Lp, ContValue Cut, Set S,
790
+ CaseNo **FT)
791
+ /* ---------------------- */
792
+ {
793
+ CaseNo Ep;
794
+ DiscrValue v;
795
+
796
+ ForEach(v, 1, Forks)
797
+ {
798
+ Ep = Group(Att, v, Fp, Lp, Cut, S);
799
+
800
+ if ( Ep > Fp )
801
+ {
802
+ NoteTest(Att, v, Cut, S);
803
+
804
+ if ( Continuous(ClassAtt) )
805
+ {
806
+ FindContinOutliers(Fp, Ep-1, false);
807
+ }
808
+ else
809
+ {
810
+ FindDiscrOutliers(Fp, Ep-1, ( FT ? FT[v] : Nil ));
811
+ }
812
+
813
+ Fp = Ep;
814
+ }
815
+ }
816
+ }
817
+
818
+
819
+
820
+ /*************************************************************************/
821
+ /* */
822
+ /* Print context information for DAC */
823
+ /* */
824
+ /*************************************************************************/
825
+
826
+
827
+ void ShowContext(CaseNo i)
828
+ /* ----------- */
829
+ {
830
+ Attribute Att;
831
+ ClustRec CR;
832
+ Clust C=&CR;
833
+ int d;
834
+
835
+ C->Att = ClassAtt;
836
+ GEnv.Level--;
837
+ SaveClustConds(C);
838
+ GEnv.Level++;
839
+
840
+ ForEach(d, 0, C->NCond-1)
841
+ {
842
+ Att = C->Cond[d].Att;
843
+
844
+ if ( Continuous(Att) )
845
+ {
846
+ PrintContinCond(Att, C->Cond[d].Low, C->Cond[d].High, i);
847
+ }
848
+ else
849
+ if ( Ordered(Att) )
850
+ {
851
+ PrintOrderedCond(Att, (int) C->Cond[d].Low, (int) C->Cond[d].High,
852
+ i);
853
+ }
854
+ else
855
+ if ( Continuous(C->Att) && MaxAttVal[Att] > 3 )
856
+ {
857
+ PrintSubsetCond(Att, C->Cond[d].Values, i);
858
+ FreeUnlessNil(C->Cond[d].Values);
859
+ }
860
+ else
861
+ {
862
+ PrintValCond(Att, (int) C->Cond[d].Low);
863
+ }
864
+ }
865
+
866
+ Free(C->Cond);
867
+ }
868
+
869
+
870
+
871
+ /*************************************************************************/
872
+ /* */
873
+ /* Construct a leaf in a given node */
874
+ /* */
875
+ /*************************************************************************/
876
+
877
+
878
+ Tree Leaf(Tree Parent, DiscrValue Br)
879
+ /* ---- */
880
+ {
881
+ Tree Node;
882
+
883
+ Node = AllocZero(1, TreeRec);
884
+
885
+ Node->NodeType = 0;
886
+ Node->Parent = Parent;
887
+ Node->Br = Br;
888
+
889
+ return Node;
890
+ }
891
+
892
+
893
+
894
+ void ReleaseTree(Tree T, int Level)
895
+ /* ----------- */
896
+ {
897
+ DiscrValue v;
898
+
899
+ if ( ! T ) return;
900
+
901
+ if ( Level > 0 && LastLevel >= Level - 1 ) LastLevel = Level - 2;
902
+
903
+ /* Possible sift entry */
904
+
905
+ if ( T->SiftEntry )
906
+ {
907
+ if ( SIFT )
908
+ {
909
+ RecoverContext(T->Parent, T->Br);
910
+ OutputConditions();
911
+ fprintf(Sf, "%s", T->SiftEntry);
912
+ }
913
+ Free(T->SiftEntry);
914
+ }
915
+
916
+ if ( T->NodeType )
917
+ {
918
+ ForEach(v, 1, T->Forks)
919
+ {
920
+ ReleaseTree(T->Branch[v], Level+1);
921
+ }
922
+
923
+ if ( T->NodeType == BrSubset )
924
+ {
925
+ FreeUnlessNil(T->Left);
926
+ }
927
+
928
+ Free(T->Branch);
929
+ }
930
+
931
+ Free(T);
932
+ }
933
+
934
+
935
+
936
+ void OutputConditions()
937
+ /* ---------------- */
938
+ {
939
+ Attribute Att;
940
+ int i, CType, b, Bytes;
941
+ DiscrValue Br;
942
+
943
+ if ( ! TargetSaved )
944
+ {
945
+ fprintf(Sf, "1 %d\n", ClassAtt);
946
+ TargetSaved = true;
947
+ }
948
+
949
+ if ( GEnv.Level < 0 ) return;
950
+
951
+ /* Save all conditions since last saved */
952
+
953
+ ForEach(i, LastLevel+1, GEnv.Level-1)
954
+ {
955
+ Att = GEnv.Test[i].Att;
956
+ Br = GEnv.Test[i].Br;
957
+
958
+ /* Determine condition type */
959
+
960
+ CType = ( Br == 1 ? 11 :
961
+ Continuous(Att) || Ordered(Att) ? 12 :
962
+ Continuous(ClassAtt) && MaxAttVal[Att] > 3 ? 13 : 11 );
963
+
964
+ fprintf(Sf, "%d %d %d %d", CType, i, Att, Br);
965
+
966
+ /* Don't need to save anything else if this branch is 1 (N/A)
967
+ or if test is on two-valued discrete att */
968
+
969
+ if ( Br != 1 )
970
+ {
971
+ if ( Continuous(Att) || Ordered(Att) )
972
+ {
973
+ fprintf(Sf, " %.8g", GEnv.Test[i].Cut);
974
+ }
975
+ else
976
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
977
+ {
978
+ /* Print subset of values */
979
+
980
+ Bytes = (MaxAttVal[Att]>>3) + 1;
981
+
982
+ ForEach(b, 0, Bytes-1)
983
+ {
984
+ fprintf(Sf, " %x", GEnv.Test[i].Left[b]);
985
+ }
986
+ }
987
+ }
988
+
989
+ fprintf(Sf, "\n");
990
+ }
991
+
992
+ LastLevel = GEnv.Level-1;
993
+ }
994
+
995
+
996
+
997
+ /*************************************************************************/
998
+ /* */
999
+ /* Set up environment */
1000
+ /* */
1001
+ /*************************************************************************/
1002
+
1003
+
1004
+ void InitialiseEnvData()
1005
+ /* ----------------- */
1006
+ {
1007
+ DiscrValue v;
1008
+ Attribute Att;
1009
+
1010
+ GEnv.ValFreq = Alloc(MaxDiscrVal+1, CaseCount);
1011
+ GEnv.ClassFreq = Alloc(MaxDiscrVal+1, CaseCount);
1012
+ GEnv.ValSum = Alloc(MaxDiscrVal+1, double);
1013
+ GEnv.ValSumSq = Alloc(MaxDiscrVal+1, double);
1014
+ GEnv.Left = Alloc(MaxDiscrVal+1, Boolean);
1015
+ GEnv.Possible = Alloc(MaxDiscrVal+1, Boolean);
1016
+ GEnv.Tested = AllocZero(MaxAtt+1, int);
1017
+ GEnv.Gain = AllocZero(MaxAtt+1, double);
1018
+ GEnv.Info = AllocZero(MaxAtt+1, double);
1019
+ GEnv.Bar = AllocZero(MaxAtt+1, ContValue);
1020
+
1021
+ GEnv.Subset = AllocZero(MaxAtt+1, Set);
1022
+ GEnv.Subset[0] = Alloc((MaxDiscrVal>>3)+1, unsigned char); /* caveats */
1023
+ ForEach(Att, 1, MaxAtt)
1024
+ {
1025
+ if ( Discrete(Att) )
1026
+ {
1027
+ GEnv.Subset[Att] = Alloc((MaxAttVal[Att]>>3)+1, unsigned char);
1028
+ }
1029
+ }
1030
+
1031
+ /* Freq[] is one longer than apparently necessary to allow for
1032
+ the extra slot needed by EvalOrderedAtt() */
1033
+
1034
+ GEnv.Freq = AllocZero(MaxDiscrVal+2, CaseCount *);
1035
+ ForEach(v, 0, MaxDiscrVal+1)
1036
+ {
1037
+ GEnv.Freq[v] = AllocZero(MaxDiscrVal+1, CaseCount);
1038
+ }
1039
+
1040
+ GEnv.BestFreq = AllocZero(4, CaseCount *);
1041
+ ForEach(v, 0, 3)
1042
+ {
1043
+ GEnv.BestFreq[v] = AllocZero(MaxDiscrVal+1, CaseCount);
1044
+ }
1045
+
1046
+ GEnv.DList = Alloc(MaxAtt+1, Attribute);
1047
+ GEnv.DFreq = Alloc(MaxAtt+1, CaseCount **);
1048
+ GEnv.DValSum = Alloc(MaxAtt+1, double *);
1049
+ GEnv.DValSumSq = Alloc(MaxAtt+1, double *);
1050
+ ForEach(Att, 1, MaxAtt)
1051
+ {
1052
+ if ( Exclude(Att) || ! Discrete(Att) ) continue;
1053
+
1054
+ GEnv.DFreq[Att] = Alloc(MaxAttVal[Att]+1, CaseCount *);
1055
+ ForEach(v, 0, MaxAttVal[Att])
1056
+ {
1057
+ GEnv.DFreq[Att][v] = Alloc(MaxDiscrVal+1, CaseCount);
1058
+ }
1059
+ GEnv.DValSum[Att] = Alloc(MaxDiscrVal+1, double);
1060
+ GEnv.DValSumSq[Att] = Alloc(MaxDiscrVal+1, double);
1061
+ }
1062
+ }
1063
+
1064
+
1065
+
1066
+ /*************************************************************************/
1067
+ /* */
1068
+ /* Clean up environment */
1069
+ /* */
1070
+ /*************************************************************************/
1071
+
1072
+
1073
+ void FreeEnvData()
1074
+ /* ------------ */
1075
+ {
1076
+ Attribute Att;
1077
+ int i;
1078
+
1079
+ if ( ! GEnv.ValFreq ) return;
1080
+
1081
+ FreeUnlessNil(GEnv.ValFreq);
1082
+ FreeUnlessNil(GEnv.ClassFreq);
1083
+ FreeUnlessNil(GEnv.ValSum);
1084
+ FreeUnlessNil(GEnv.ValSumSq);
1085
+ FreeUnlessNil(GEnv.Left);
1086
+ FreeUnlessNil(GEnv.Possible);
1087
+ FreeUnlessNil(GEnv.Tested);
1088
+ FreeUnlessNil(GEnv.Gain);
1089
+ FreeUnlessNil(GEnv.Info);
1090
+ FreeUnlessNil(GEnv.Bar);
1091
+
1092
+ if ( GEnv.Test )
1093
+ {
1094
+ ForEach(i, 0, GEnv.MaxLevel-1)
1095
+ {
1096
+ FreeUnlessNil(GEnv.Test[i].Left);
1097
+ }
1098
+
1099
+ Free(GEnv.Test);
1100
+ }
1101
+
1102
+ FreeUnlessNil(GEnv.Subset[0]);
1103
+ ForEach(Att, 1, MaxAtt)
1104
+ {
1105
+ if ( Discrete(Att) )
1106
+ {
1107
+ FreeUnlessNil(GEnv.Subset[Att]);
1108
+ }
1109
+ }
1110
+ FreeUnlessNil(GEnv.Subset);
1111
+
1112
+ FreeVector((void **) GEnv.Freq, 0, MaxDiscrVal+1);
1113
+ FreeVector((void **) GEnv.BestFreq, 0, 3);
1114
+
1115
+ ForEach(Att, 1, MaxAtt)
1116
+ {
1117
+ if ( ! GEnv.DFreq[Att] ) continue;
1118
+
1119
+ FreeVector((void **)GEnv.DFreq[Att], 0, MaxAttVal[Att]);
1120
+ Free(GEnv.DValSum[Att]);
1121
+ Free(GEnv.DValSumSq[Att]);
1122
+ }
1123
+ Free(GEnv.DFreq);
1124
+ Free(GEnv.DValSum);
1125
+ Free(GEnv.DValSumSq);
1126
+ Free(GEnv.DList);
1127
+
1128
+ FreeUnlessNil(GEnv.SiftEntry);
1129
+ }
1130
+ #endif
1131
+
1132
+
1133
+
1134
+ /*************************************************************************/
1135
+ /* */
1136
+ /* Test[] contains a stack of current tests. Add a new test */
1137
+ /* for the current level */
1138
+ /* */
1139
+ /*************************************************************************/
1140
+
1141
+
1142
+ void NoteTest(Attribute Att, DiscrValue Br, ContValue Cut, Set Left)
1143
+ /* -------- */
1144
+ {
1145
+ int i;
1146
+
1147
+ /* Check space for tests */
1148
+
1149
+ if ( GEnv.Level >= GEnv.MaxLevel )
1150
+ {
1151
+ if ( ! GEnv.MaxLevel )
1152
+ {
1153
+ GEnv.Test = Alloc(100, TestRec);
1154
+ }
1155
+ else
1156
+ {
1157
+ Realloc(GEnv.Test, GEnv.MaxLevel+100, TestRec);
1158
+ }
1159
+
1160
+ ForEach(i, 0, 99)
1161
+ {
1162
+ GEnv.Test[GEnv.MaxLevel+i].Left =
1163
+ Alloc((MaxDiscrVal>>3)+1, unsigned char);
1164
+ }
1165
+
1166
+ GEnv.MaxLevel += 100;
1167
+ }
1168
+
1169
+ GEnv.Test[GEnv.Level].Att = Att;
1170
+ GEnv.Test[GEnv.Level].Br = Br;
1171
+ GEnv.Test[GEnv.Level].Cut = Cut;
1172
+ if ( Left )
1173
+ {
1174
+ memcpy(GEnv.Test[GEnv.Level].Left, Left, (MaxAttVal[Att]>>3)+1);
1175
+ }
1176
+ }
1177
+
1178
+
1179
+
1180
+ /*************************************************************************/
1181
+ /* */
1182
+ /* Group together the cases corresponding to branch V of a test */
1183
+ /* and return the index of the case following the last */
1184
+ /* */
1185
+ /*************************************************************************/
1186
+
1187
+
1188
+ CaseNo Group(Attribute Att, DiscrValue V, CaseNo Fp, CaseNo Lp,
1189
+ ContValue Cut, Set Left)
1190
+ /* ----- */
1191
+ {
1192
+ CaseNo i;
1193
+
1194
+ /* Group cases on the value of attribute Att, perhaps depending
1195
+ on the type of split */
1196
+
1197
+ if ( V == 1 )
1198
+ {
1199
+ /* Group all non-applicable values. Don't even try if
1200
+ this attribute doesn't have N/A values */
1201
+
1202
+ if ( SomeNA[Att] )
1203
+ {
1204
+ ForEach(i, Fp, Lp)
1205
+ {
1206
+ if ( NotApplic(Case[i], Att) )
1207
+ {
1208
+ Swap(Fp, i);
1209
+ Fp++;
1210
+ }
1211
+ }
1212
+ }
1213
+ }
1214
+ else
1215
+ if ( Continuous(Att) )
1216
+ {
1217
+ ForEach(i, Fp, Lp)
1218
+ {
1219
+ if ( ! Unknown(Case[i], Att) &&
1220
+ ! NotApplic(Case[i], Att) &&
1221
+ (CVal(Case[i], Att) <= Cut) == (V == 2) )
1222
+ {
1223
+ Swap(Fp, i);
1224
+ Fp++;
1225
+ }
1226
+ }
1227
+ }
1228
+ else
1229
+ if ( Ordered(Att) && Att != ClassAtt )
1230
+ {
1231
+ ForEach(i, Fp, Lp)
1232
+ {
1233
+ if ( ! Unknown(Case[i], Att) &&
1234
+ ! NotApplic(Case[i], Att) &&
1235
+ (XDVal(Case[i], Att) <= Cut + 0.1) == (V == 2) )
1236
+ {
1237
+ Swap(Fp, i);
1238
+ Fp++;
1239
+ }
1240
+ }
1241
+ }
1242
+ else
1243
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
1244
+ {
1245
+ ForEach(i, Fp, Lp)
1246
+ {
1247
+ if ( ! Unknown(Case[i], Att) &&
1248
+ ! NotApplic(Case[i], Att) &&
1249
+ (In(XDVal(Case[i], Att), Left) != 0) == (V == 2) )
1250
+ {
1251
+ Swap(Fp, i);
1252
+ Fp++;
1253
+ }
1254
+ }
1255
+ }
1256
+ else
1257
+ {
1258
+ ForEach(i, Fp, Lp)
1259
+ {
1260
+ if ( XDVal(Case[i], Att) == V )
1261
+ {
1262
+ Swap(Fp, i);
1263
+ Fp++;
1264
+ }
1265
+ }
1266
+ }
1267
+
1268
+ return Fp;
1269
+ }