see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/contin.c ADDED
@@ -0,0 +1,613 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Evaluation of a test on a continuous valued attribute */
30
+ /* ----------------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+ #include "defns.i"
35
+ #include "extern.i"
36
+
37
+ #define PartInfo(n) (-(n)*Log((n)/GEnv.Cases))
38
+
39
+
40
+ /*************************************************************************/
41
+ /* */
42
+ /* Continuous attributes are treated as if they have possible */
43
+ /* values 0 (unknown), 1 (not applicable), 2 (less than cut) and */
44
+ /* 3 (greater than cut). */
45
+ /* This routine finds the best cut for cases Fp through Lp and */
46
+ /* sets Info[], Gain[] and Bar[] */
47
+ /* */
48
+ /*************************************************************************/
49
+
50
+
51
+ void EvalContinuousAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
52
+ /* ----------------- */
53
+ {
54
+ CaseNo i, j, BestI, Tries=0;
55
+ double LowInfo, LHInfo, LeastInfo=1E38,
56
+ w, BestGain, BestInfo, ThreshCost=1;
57
+ ClassNo c;
58
+ ContValue Interval;
59
+
60
+ Verbosity(3, fprintf(Of, "\tAtt %s\n", AttName[Att]))
61
+
62
+ Gain[Att] = None;
63
+ PrepareForContin(Att, Fp, Lp);
64
+
65
+ /* Special case when very few known values */
66
+
67
+ if ( GEnv.ApplicCases < 2 * MINITEMS )
68
+ {
69
+ Verbosity(2,
70
+ fprintf(Of, "\tAtt %s\tinsufficient cases with known values\n",
71
+ AttName[Att]))
72
+ return;
73
+ }
74
+
75
+ /* Try possible cuts between cases i and i+1, and determine the
76
+ information and gain of the split in each case */
77
+
78
+ /* We have to be wary of splitting a small number of cases off one end,
79
+ as this has little predictive power. The minimum split GEnv.MinSplit is
80
+ the maximum of MINITEMS or (the minimum of 25 and 10% of the cases
81
+ per class) */
82
+
83
+ GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;
84
+ if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;
85
+ if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;
86
+
87
+ /* Find first possible cut point and initialise scan parameters */
88
+
89
+ i = PrepareForScan(Lp);
90
+
91
+ /* Repeatedly check next possible cut */
92
+
93
+ for ( ; i <= GEnv.Ep ; i++ )
94
+ {
95
+ c = GEnv.SRec[i].C;
96
+ w = GEnv.SRec[i].W;
97
+ assert(c >= 1 && c <= MaxClass);
98
+
99
+ GEnv.LowCases += w;
100
+ GEnv.Freq[2][c] += w;
101
+ GEnv.Freq[3][c] -= w;
102
+
103
+ GEnv.HighVal = GEnv.SRec[i+1].V;
104
+ if ( GEnv.HighVal > GEnv.LowVal )
105
+ {
106
+ Tries++;
107
+
108
+ GEnv.LowClass = GEnv.HighClass;
109
+ GEnv.HighClass = GEnv.SRec[i+1].C;
110
+ for ( j = i+2 ;
111
+ GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;
112
+ j++ )
113
+ {
114
+ if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
115
+ }
116
+
117
+ if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )
118
+ {
119
+ LowInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass);
120
+
121
+ /* If cannot improve on best so far, count remaining
122
+ possible cuts and break */
123
+
124
+ if ( LowInfo >= LeastInfo )
125
+ {
126
+ for ( i++ ; i <= GEnv.Ep ; i++ )
127
+ {
128
+ if ( GEnv.SRec[i+1].V > GEnv.SRec[i].V )
129
+ {
130
+ Tries++;
131
+ }
132
+ }
133
+ break;
134
+ }
135
+
136
+ LHInfo = LowInfo + TotalInfo(GEnv.Freq[3], 1, MaxClass);
137
+ if ( LHInfo < LeastInfo )
138
+ {
139
+ LeastInfo = LHInfo;
140
+ BestI = i;
141
+
142
+ BestInfo = (GEnv.FixedSplitInfo
143
+ + PartInfo(GEnv.LowCases)
144
+ + PartInfo(GEnv.ApplicCases - GEnv.LowCases))
145
+ / GEnv.Cases;
146
+ }
147
+
148
+ Verbosity(3,
149
+ {
150
+ fprintf(Of, "\t\tCut at %.3f (gain %.3f):",
151
+ (GEnv.LowVal + GEnv.HighVal) / 2,
152
+ (1 - GEnv.UnknownRate) *
153
+ (GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases));
154
+ PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);
155
+ })
156
+ }
157
+
158
+ GEnv.LowVal = GEnv.HighVal;
159
+ }
160
+ }
161
+
162
+ BestGain = (1 - GEnv.UnknownRate) *
163
+ (GEnv.BaseInfo - (GEnv.NAInfo + LeastInfo) / GEnv.KnownCases);
164
+
165
+ /* The threshold cost is the lesser of the cost of indicating the
166
+ cases to split between or the interval containing the split */
167
+
168
+ if ( BestGain > 0 )
169
+ {
170
+ Interval = (GEnv.SRec[Lp].V - GEnv.SRec[GEnv.Xp].V) /
171
+ (GEnv.SRec[BestI+1].V - GEnv.SRec[BestI].V);
172
+ ThreshCost = ( Interval < Tries ? Log(Interval) : Log(Tries) )
173
+ / GEnv.Cases;
174
+ }
175
+
176
+ BestGain -= ThreshCost;
177
+
178
+ /* If a test on the attribute is able to make a gain,
179
+ set the best break point, gain and information */
180
+
181
+ if ( BestGain <= 0 )
182
+ {
183
+ Verbosity(2, fprintf(Of, "\tAtt %s\tno gain\n", AttName[Att]))
184
+ }
185
+ else
186
+ {
187
+ Gain[Att] = BestGain;
188
+ Info[Att] = BestInfo;
189
+
190
+ GEnv.LowVal = GEnv.SRec[BestI].V;
191
+ GEnv.HighVal = GEnv.SRec[BestI+1].V;
192
+
193
+ /* Set threshold, making sure that rounding problems do not
194
+ cause it to reach upper value */
195
+
196
+ if ( (Bar[Att] = (ContValue) (0.5 * (GEnv.LowVal + GEnv.HighVal)))
197
+ >= GEnv.HighVal )
198
+ {
199
+ Bar[Att] = GEnv.LowVal;
200
+ }
201
+
202
+ Verbosity(2,
203
+ fprintf(Of, "\tAtt %s\tcut=%.3f, inf %.3f, gain %.3f\n",
204
+ AttName[Att], Bar[Att], Info[Att], Gain[Att]))
205
+ }
206
+ }
207
+
208
+
209
+
210
+ /*************************************************************************/
211
+ /* */
212
+ /* Estimate max gain ratio available from any cut, using sample */
213
+ /* of SampleFrac of all cases */
214
+ /* */
215
+ /*************************************************************************/
216
+
217
+
218
+ void EstimateMaxGR(Attribute Att, CaseNo Fp, CaseNo Lp)
219
+ /* ------------- */
220
+ {
221
+ CaseNo i, j;
222
+ double LHInfo, w, SplitInfo, ThisGain, GR;
223
+ ClassNo c;
224
+
225
+ EstMaxGR[Att] = 0;
226
+
227
+ if ( Skip(Att) || Att == ClassAtt ) return;
228
+
229
+ PrepareForContin(Att, Fp, Lp);
230
+
231
+ /* Special case when very few known values */
232
+
233
+ if ( GEnv.ApplicCases < 2 * MINITEMS * SampleFrac )
234
+ {
235
+ return;
236
+ }
237
+
238
+ /* Try possible cuts between cases i and i+1. Use conservative
239
+ value of GEnv.MinSplit to allow for sampling */
240
+
241
+ GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;
242
+ if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;
243
+ if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;
244
+
245
+ GEnv.MinSplit *= SampleFrac * 0.33;
246
+
247
+ i = PrepareForScan(Lp);
248
+
249
+ /* Repeatedly check next possible cut */
250
+
251
+ for ( ; i <= GEnv.Ep ; i++ )
252
+ {
253
+ c = GEnv.SRec[i].C;
254
+ w = GEnv.SRec[i].W;
255
+ assert(c >= 1 && c <= MaxClass);
256
+
257
+ GEnv.LowCases += w;
258
+ GEnv.Freq[2][c] += w;
259
+ GEnv.Freq[3][c] -= w;
260
+
261
+ GEnv.HighVal = GEnv.SRec[i+1].V;
262
+ if ( GEnv.HighVal > GEnv.LowVal )
263
+ {
264
+ GEnv.LowClass = GEnv.HighClass;
265
+ GEnv.HighClass = GEnv.SRec[i+1].C;
266
+ for ( j = i+2 ;
267
+ GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;
268
+ j++ )
269
+ {
270
+ if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
271
+ }
272
+
273
+ if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )
274
+ {
275
+ LHInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass)
276
+ + TotalInfo(GEnv.Freq[3], 1, MaxClass);
277
+
278
+ SplitInfo = (GEnv.FixedSplitInfo
279
+ + PartInfo(GEnv.LowCases)
280
+ + PartInfo(GEnv.ApplicCases - GEnv.LowCases)) / GEnv.Cases;
281
+
282
+ ThisGain = (1 - GEnv.UnknownRate) *
283
+ (GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases);
284
+ if ( ThisGain > Gain[Att] ) Gain[Att] = ThisGain;
285
+
286
+ /* Adjust GR to make it more conservative upper bound */
287
+
288
+ GR = (ThisGain + 1E-5) / SplitInfo;
289
+ if ( GR > EstMaxGR[Att] )
290
+ {
291
+ EstMaxGR[Att] = GR;
292
+ }
293
+
294
+ Verbosity(3,
295
+ {
296
+ fprintf(Of, "\t\tCut at %.3f (gain %.3f):",
297
+ (GEnv.LowVal + GEnv.HighVal) / 2, ThisGain);
298
+ PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);
299
+ })
300
+ }
301
+
302
+ GEnv.LowVal = GEnv.HighVal;
303
+ }
304
+ }
305
+
306
+ Verbosity(2,
307
+ fprintf(Of, "\tAtt %s: max GR estimate %.3f\n",
308
+ AttName[Att], EstMaxGR[Att]))
309
+ }
310
+
311
+
312
+
313
+ /*************************************************************************/
314
+ /* */
315
+ /* Routine to set some preparatory values used by both */
316
+ /* EvalContinuousAtt and EstimateMaxGR */
317
+ /* */
318
+ /*************************************************************************/
319
+
320
+
321
+ void PrepareForContin(Attribute Att, CaseNo Fp, CaseNo Lp)
322
+ /* ---------------- */
323
+ {
324
+ CaseNo i;
325
+ ClassNo c;
326
+ DiscrValue v;
327
+
328
+ /* Reset frequency tables */
329
+
330
+ ForEach(v, 0, 3)
331
+ {
332
+ ForEach(c, 1, MaxClass)
333
+ {
334
+ GEnv.Freq[v][c] = 0;
335
+ }
336
+ GEnv.ValFreq[v] = 0;
337
+ }
338
+
339
+ /* Omit and count unknown and N/A values */
340
+
341
+ GEnv.Cases = 0;
342
+
343
+ if ( SomeMiss[Att] || SomeNA[Att] )
344
+ {
345
+ GEnv.Xp = Lp+1;
346
+
347
+ ForEach(i, Fp, Lp)
348
+ {
349
+ assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
350
+
351
+ GEnv.Cases += Weight(Case[i]);
352
+
353
+ if ( Unknown(Case[i], Att) )
354
+ {
355
+ GEnv.Freq[ 0 ][ Class(Case[i]) ] += Weight(Case[i]);
356
+ }
357
+ else
358
+ if ( NotApplic(Case[i], Att) )
359
+ {
360
+ GEnv.Freq[ 1 ][ Class(Case[i]) ] += Weight(Case[i]);
361
+ }
362
+ else
363
+ {
364
+ GEnv.Freq[ 3 ][ Class(Case[i]) ] += Weight(Case[i]);
365
+ GEnv.Xp--;
366
+ GEnv.SRec[GEnv.Xp].V = CVal(Case[i], Att);
367
+ GEnv.SRec[GEnv.Xp].W = Weight(Case[i]);
368
+ GEnv.SRec[GEnv.Xp].C = Class(Case[i]);
369
+ }
370
+ }
371
+
372
+ ForEach(c, 1, MaxClass)
373
+ {
374
+ GEnv.ValFreq[0] += GEnv.Freq[0][c];
375
+ GEnv.ValFreq[1] += GEnv.Freq[1][c];
376
+ }
377
+
378
+ GEnv.NAInfo = TotalInfo(GEnv.Freq[1], 1, MaxClass);
379
+ GEnv.FixedSplitInfo = PartInfo(GEnv.ValFreq[0]) + PartInfo(GEnv.ValFreq[1]);
380
+
381
+ Verbosity(3, PrintDistribution(Att, 0, 1, GEnv.Freq, GEnv.ValFreq, true))
382
+ }
383
+ else
384
+ {
385
+ GEnv.Xp = Fp;
386
+
387
+ ForEach(i, Fp, Lp)
388
+ {
389
+ GEnv.SRec[i].V = CVal(Case[i], Att);
390
+ GEnv.SRec[i].W = Weight(Case[i]);
391
+ GEnv.SRec[i].C = Class(Case[i]);
392
+
393
+ GEnv.Freq[3][Class(Case[i])] += Weight(Case[i]);
394
+ }
395
+
396
+ ForEach(c, 1, MaxClass)
397
+ {
398
+ GEnv.Cases += GEnv.Freq[3][c];
399
+ }
400
+
401
+ GEnv.NAInfo = GEnv.FixedSplitInfo = 0;
402
+ }
403
+
404
+ GEnv.KnownCases = GEnv.Cases - GEnv.ValFreq[0];
405
+ GEnv.ApplicCases = GEnv.KnownCases - GEnv.ValFreq[1];
406
+
407
+ GEnv.UnknownRate = 1.0 - GEnv.KnownCases / GEnv.Cases;
408
+
409
+ Cachesort(GEnv.Xp, Lp, GEnv.SRec);
410
+
411
+ /* If unknowns or using sampling, must recompute base information */
412
+
413
+ if ( GEnv.ValFreq[0] > 0 || SampleFrac < 1 )
414
+ {
415
+ /* Determine base information using GEnv.Freq[0] as temp buffer */
416
+
417
+ ForEach(c, 1, MaxClass)
418
+ {
419
+ GEnv.Freq[0][c] = GEnv.Freq[1][c] + GEnv.Freq[3][c];
420
+ }
421
+
422
+ GEnv.BaseInfo = TotalInfo(GEnv.Freq[0], 1, MaxClass) / GEnv.KnownCases;
423
+ }
424
+ else
425
+ {
426
+ GEnv.BaseInfo = GlobalBaseInfo;
427
+ }
428
+ }
429
+
430
+
431
+
432
+ /*************************************************************************/
433
+ /* */
434
+ /* Set low and high bounds for scan and initial class */
435
+ /* (used by EvalContinuousAtt and EstimateMaxGR) */
436
+ /* */
437
+ /*************************************************************************/
438
+
439
+
440
+ CaseNo PrepareForScan(CaseNo Lp)
441
+ /* -------------- */
442
+ {
443
+ CaseNo i, j;
444
+ ClassNo c;
445
+ double w;
446
+
447
+ /* Find last possible split */
448
+
449
+ GEnv.HighCases = GEnv.LowCases = 0;
450
+
451
+ for ( GEnv.Ep = Lp ; GEnv.Ep >= GEnv.Xp && GEnv.HighCases < GEnv.MinSplit ; GEnv.Ep-- )
452
+ {
453
+ GEnv.HighCases += GEnv.SRec[GEnv.Ep].W;
454
+ }
455
+
456
+ /* Skip cases before first possible cut */
457
+
458
+ for ( i = GEnv.Xp ;
459
+ i <= GEnv.Ep &&
460
+ ( GEnv.LowCases + GEnv.SRec[i].W < GEnv.MinSplit - 1E-5 ||
461
+ GEnv.SRec[i].V == GEnv.SRec[i+1].V ) ;
462
+ i++ )
463
+ {
464
+ c = GEnv.SRec[i].C;
465
+ w = GEnv.SRec[i].W;
466
+ assert(c >= 1 && c <= MaxClass);
467
+
468
+ GEnv.LowCases += w;
469
+ GEnv.Freq[2][c] += w;
470
+ GEnv.Freq[3][c] -= w;
471
+ }
472
+
473
+ /* Find the class key for the first interval */
474
+
475
+ GEnv.HighClass = GEnv.SRec[i].C;
476
+ for ( j = i-1; GEnv.HighClass && j >= GEnv.Xp ; j-- )
477
+ {
478
+ if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
479
+ }
480
+ assert(GEnv.HighClass <= MaxClass);
481
+ assert(j+1 >= GEnv.Xp);
482
+
483
+ GEnv.LowVal = GEnv.SRec[i].V;
484
+
485
+ return i;
486
+ }
487
+
488
+
489
+
490
+ /*************************************************************************/
491
+ /* */
492
+ /* Change a leaf into a test on a continuous attribute */
493
+ /* */
494
+ /*************************************************************************/
495
+
496
+
497
+ void ContinTest(Tree Node, Attribute Att)
498
+ /* ---------- */
499
+ {
500
+ Sprout(Node, 3);
501
+
502
+ Node->NodeType = BrThresh;
503
+ Node->Tested = Att;
504
+ Node->Cut =
505
+ Node->Lower =
506
+ Node->Upper = Bar[Att];
507
+ }
508
+
509
+
510
+
511
+ /*************************************************************************/
512
+ /* */
513
+ /* Adjust thresholds of all continuous attributes so that cuts */
514
+ /* are values that appear in the data */
515
+ /* */
516
+ /*************************************************************************/
517
+
518
+
519
+ void AdjustAllThresholds(Tree T)
520
+ /* ------------------- */
521
+ {
522
+ Attribute Att;
523
+ CaseNo Ep;
524
+
525
+ ForEach(Att, 1, MaxAtt)
526
+ {
527
+ if ( Continuous(Att) )
528
+ {
529
+ Ep = -1;
530
+ AdjustThresholds(T, Att, &Ep);
531
+ }
532
+ }
533
+ }
534
+
535
+
536
+
537
+ void AdjustThresholds(Tree T, Attribute Att, CaseNo *Ep)
538
+ /* ---------------- */
539
+ {
540
+ DiscrValue v;
541
+ CaseNo i;
542
+
543
+ if ( T->NodeType == BrThresh && T->Tested == Att )
544
+ {
545
+ if ( *Ep == -1 )
546
+ {
547
+ ForEach(i, 0, MaxCase)
548
+ {
549
+ if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )
550
+ {
551
+ (&GEnv)->SRec[++(*Ep)].V = CVal(Case[i], Att);
552
+ }
553
+ }
554
+ Cachesort(0, *Ep, (&GEnv)->SRec);
555
+
556
+ if ( PossibleCuts && Trial == 0 )
557
+ {
558
+ int Cuts=0;
559
+
560
+ ForEach(i, 1, *Ep)
561
+ {
562
+ if ( (&GEnv)->SRec[i].V != (&GEnv)->SRec[i-1].V ) Cuts++;
563
+ }
564
+ PossibleCuts[Att] = Cuts;
565
+ }
566
+ }
567
+
568
+ T->Cut = T->Lower = T->Upper = GreatestValueBelow(T->Cut, Ep);
569
+ }
570
+
571
+ if ( T->NodeType )
572
+ {
573
+ ForEach(v, 1, T->Forks)
574
+ {
575
+ AdjustThresholds(T->Branch[v], Att, Ep);
576
+ }
577
+ }
578
+ }
579
+
580
+
581
+
582
+ /*************************************************************************/
583
+ /* */
584
+ /* Return the greatest value of attribute Att below threshold Th */
585
+ /* (Assumes values of Att have been sorted.) */
586
+ /* */
587
+ /*************************************************************************/
588
+
589
+
590
+ ContValue GreatestValueBelow(ContValue Th, CaseNo *Ep)
591
+ /* ------------------ */
592
+ {
593
+ CaseNo Low, Mid, High;
594
+
595
+ Low = 0;
596
+ High = *Ep;
597
+
598
+ while ( Low < High )
599
+ {
600
+ Mid = (Low + High + 1) / 2;
601
+
602
+ if ( (&GEnv)->SRec[Mid].V > Th )
603
+ {
604
+ High = Mid - 1;
605
+ }
606
+ else
607
+ {
608
+ Low = Mid;
609
+ }
610
+ }
611
+
612
+ return (&GEnv)->SRec[Low].V;
613
+ }