see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/formtree.c
@@ -0,0 +1,1158 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Central tree-forming algorithm */
30
+ /* ------------------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ Boolean MultiVal, /* all atts have many values */
40
+ Subsample; /* use subsampling */
41
+ float AvGainWt, /* weight of average gain in gain threshold */
42
+ MDLWt; /* weight of MDL threshold ditto */
43
+
44
+ Attribute *DList=Nil; /* list of discrete atts */
45
+ int NDList; /* number in list */
46
+
47
+ DiscrValue MaxLeaves; /* target maximum tree size */
48
+
49
+ #define SAMPLEUNIT 2000
50
+
51
+ float ValThresh; /* minimum GR when evaluating sampled atts */
52
+ Boolean Sampled; /* true if sampling used */
53
+
54
+ Attribute *Waiting=Nil, /* attribute wait list */
55
+ NWaiting=0;
56
+
57
+
58
+
59
+
60
+ /*************************************************************************/
61
+ /* */
62
+ /* Allocate space for tree tables */
63
+ /* */
64
+ /*************************************************************************/
65
+
66
+
67
+ void InitialiseTreeData()
68
+ /* ------------------ */
69
+ {
70
+ DiscrValue v;
71
+ Attribute Att;
72
+ DiscrValue vMax;
73
+
74
+ Raw = AllocZero(TRIALS+1, Tree);
75
+ Pruned = AllocZero(TRIALS+1, Tree);
76
+
77
+ Tested = AllocZero(MaxAtt+1, Byte);
78
+
79
+ Gain = AllocZero(MaxAtt+1, float);
80
+ Info = AllocZero(MaxAtt+1, float);
81
+ Bar = AllocZero(MaxAtt+1, ContValue);
82
+
83
+ EstMaxGR = AllocZero(MaxAtt+1, float);
84
+
85
+ /* Data for subsets */
86
+
87
+ if ( SUBSET )
88
+ {
89
+ InitialiseBellNumbers();
90
+ Subset = Alloc(MaxAtt+1, Set *);
91
+
92
+ ForEach(Att, 1, MaxAtt)
93
+ {
94
+ if ( Discrete(Att) && Att != ClassAtt && ! Skip(Att) )
95
+ {
96
+ Subset[Att] = AllocZero(MaxAttVal[Att]+1, Set);
97
+ ForEach(v, 0, MaxAttVal[Att])
98
+ {
99
+ Subset[Att][v] = Alloc((MaxAttVal[Att]>>3)+1, Byte);
100
+ }
101
+ }
102
+ }
103
+ Subsets = AllocZero(MaxAtt+1, int);
104
+ }
105
+
106
+ DList = Alloc(MaxAtt, Attribute);
107
+ NDList = 0;
108
+
109
+ DFreq = AllocZero(MaxAtt+1, double *);
110
+ ForEach(Att, 1, MaxAtt)
111
+ {
112
+ if ( Att == ClassAtt || Skip(Att) || ! Discrete(Att) ) continue;
113
+
114
+ DList[NDList++] = Att;
115
+
116
+ DFreq[Att] = Alloc(MaxClass * (MaxAttVal[Att]+1), double);
117
+ }
118
+
119
+ ClassFreq = AllocZero(MaxClass+1, double);
120
+ ClassSum = Alloc(MaxClass+1, float);
121
+
122
+ if ( BOOST )
123
+ {
124
+ Vote = Alloc(MaxClass+1, float);
125
+ TrialPred = Alloc(TRIALS, ClassNo);
126
+ }
127
+
128
+ if ( RULES )
129
+ {
130
+ MostSpec = Alloc(MaxClass+1, CRule);
131
+ PossibleCuts = Alloc(MaxAtt+1, int);
132
+ }
133
+
134
+ /* Check whether all attributes have many discrete values */
135
+
136
+ MultiVal = true;
137
+ if ( ! SUBSET )
138
+ {
139
+ for ( Att = 1 ; MultiVal && Att <= MaxAtt ; Att++ )
140
+ {
141
+ if ( ! Skip(Att) && Att != ClassAtt )
142
+ {
143
+ MultiVal = MaxAttVal[Att] >= 0.3 * (MaxCase + 1);
144
+ }
145
+ }
146
+ }
147
+
148
+ /* See whether there are continuous attributes for subsampling */
149
+
150
+ Subsample = false;
151
+
152
+ /* Set parameters for RawExtraErrs() */
153
+
154
+ InitialiseExtraErrs();
155
+
156
+ /* Set up environment */
157
+
158
+ Waiting = Alloc(MaxAtt+1, Attribute);
159
+
160
+ vMax = Max(3, MaxDiscrVal+1);
161
+
162
+ GEnv.Freq = Alloc(vMax+1, double *);
163
+ ForEach(v, 0, vMax)
164
+ {
165
+ GEnv.Freq[v] = Alloc(MaxClass+1, double);
166
+ }
167
+
168
+ GEnv.ValFreq = Alloc(vMax, double);
169
+
170
+ GEnv.ClassFreq = Alloc(MaxClass+1, double);
171
+
172
+ GEnv.SRec = Alloc(MaxCase+1, SortRec);
173
+
174
+ if ( SUBSET )
175
+ {
176
+ GEnv.SubsetInfo = Alloc(MaxDiscrVal+1, double);
177
+ GEnv.SubsetEntr = Alloc(MaxDiscrVal+1, double);
178
+
179
+ GEnv.MergeInfo = Alloc(MaxDiscrVal+1, double *);
180
+ GEnv.MergeEntr = Alloc(MaxDiscrVal+1, double *);
181
+ GEnv.WSubset = Alloc(MaxDiscrVal+1, Set);
182
+ ForEach(v, 1, MaxDiscrVal)
183
+ {
184
+ GEnv.MergeInfo[v] = Alloc(MaxDiscrVal+1, double);
185
+ GEnv.MergeEntr[v] = Alloc(MaxDiscrVal+1, double);
186
+ GEnv.WSubset[v] = Alloc((MaxDiscrVal>>3)+1, Byte);
187
+ }
188
+ }
189
+ }
190
+
191
+
192
+ void FreeTreeData()
193
+ /* ------------ */
194
+ {
195
+ Attribute Att;
196
+ DiscrValue vMax;
197
+
198
+ FreeUnlessNil(Raw); Raw = Nil;
199
+ FreeUnlessNil(Pruned); Pruned = Nil;
200
+
201
+ FreeUnlessNil(Tested); Tested = Nil;
202
+
203
+ FreeUnlessNil(Gain); Gain = Nil;
204
+ FreeUnlessNil(Info); Info = Nil;
205
+ FreeUnlessNil(Bar); Bar = Nil;
206
+
207
+ FreeUnlessNil(EstMaxGR); EstMaxGR = Nil;
208
+
209
+ if ( SUBSET )
210
+ {
211
+ FreeVector((void **) Bell, 1, MaxDiscrVal); Bell = Nil;
212
+
213
+ if ( Subset )
214
+ {
215
+ ForEach(Att, 1, MaxAtt)
216
+ {
217
+ if ( Subset[Att] )
218
+ {
219
+ FreeVector((void **) Subset[Att], 0, MaxAttVal[Att]);
220
+ }
221
+ }
222
+ Free(Subset); Subset = Nil;
223
+ Free(Subsets); Subsets = Nil;
224
+ }
225
+ }
226
+
227
+ FreeUnlessNil(DList); DList = Nil;
228
+
229
+ if ( DFreq )
230
+ {
231
+ ForEach(Att, 1, MaxAtt)
232
+ {
233
+ FreeUnlessNil(DFreq[Att]);
234
+ }
235
+
236
+ Free(DFreq); DFreq = Nil;
237
+ }
238
+
239
+ FreeUnlessNil(ClassFreq); ClassFreq = Nil;
240
+ FreeUnlessNil(ClassSum); ClassSum = Nil;
241
+
242
+ FreeUnlessNil(Vote); Vote = Nil;
243
+ FreeUnlessNil(TrialPred); TrialPred = Nil;
244
+
245
+ FreeUnlessNil(MostSpec); MostSpec = Nil;
246
+ FreeUnlessNil(PossibleCuts); PossibleCuts = Nil;
247
+
248
+ vMax = Max(3, MaxDiscrVal+1);
249
+ FreeVector((void **) GEnv.Freq, 0, vMax);
250
+ Free(GEnv.ValFreq);
251
+ Free(GEnv.ClassFreq);
252
+ FreeUnlessNil(GEnv.SRec);
253
+
254
+ if ( GEnv.SubsetInfo )
255
+ {
256
+ Free(GEnv.SubsetInfo);
257
+ Free(GEnv.SubsetEntr);
258
+ FreeVector((void **) GEnv.MergeInfo, 1, MaxDiscrVal);
259
+ FreeVector((void **) GEnv.MergeEntr, 1, MaxDiscrVal);
260
+ FreeVector((void **) GEnv.WSubset, 1, MaxDiscrVal);
261
+ }
262
+
263
+ FreeUnlessNil(Waiting); Waiting = Nil;
264
+ }
265
+
266
+
267
+
268
+ /*************************************************************************/
269
+ /* */
270
+ /* Set threshold on minimum gain as follows: */
271
+ /* * when forming winnowing tree: no minimum */
272
+ /* * for small problems, AvGain (usual Gain Ratio) */
273
+ /* * for large problems, discounted MDL */
274
+ /* * for intermediate problems, interpolated */
275
+ /* */
276
+ /*************************************************************************/
277
+
278
+
279
+ void SetMinGainThresh()
280
+ /* ---------------- */
281
+ {
282
+ float Frac;
283
+
284
+ /* Set AvGainWt and MDLWt */
285
+
286
+ if ( Now == WINNOWATTS )
287
+ {
288
+ AvGainWt = MDLWt = 0.0;
289
+ }
290
+ else
291
+ if ( (MaxCase+1) / MaxClass <= 500 )
292
+ {
293
+ AvGainWt = 1.0;
294
+ MDLWt = 0.0;
295
+ }
296
+ else
297
+ if ( (MaxCase+1) / MaxClass >= 1000 )
298
+ {
299
+ AvGainWt = 0.0;
300
+ MDLWt = 0.9;
301
+ }
302
+ else
303
+ {
304
+ Frac = ((MaxCase+1) / MaxClass - 500) / 500.0;
305
+
306
+ AvGainWt = 1 - Frac;
307
+ MDLWt = 0.9 * Frac;
308
+ }
309
+ }
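
Editor's note: the branch above interpolates linearly between the average-gain weight (small problems) and the discounted-MDL weight (large problems) for runs with 500 to 1,000 cases per class. The following is an illustrative standalone sketch of that interpolation, not part of formtree.c or the gem; the function name and the counts (3000 cases, 4 classes) are hypothetical.

#include <stdio.h>

static void min_gain_weights(int cases, int classes,
                             double *av_gain_wt, double *mdl_wt)
{
    double per_class = (double) cases / classes;

    if ( per_class <= 500 )
    {
        *av_gain_wt = 1.0;  *mdl_wt = 0.0;
    }
    else if ( per_class >= 1000 )
    {
        *av_gain_wt = 0.0;  *mdl_wt = 0.9;
    }
    else
    {
        double frac = (per_class - 500) / 500.0;

        *av_gain_wt = 1 - frac;      /* 0.50 for 750 cases per class */
        *mdl_wt     = 0.9 * frac;    /* 0.45 for 750 cases per class */
    }
}

int main(void)
{
    double a, m;

    min_gain_weights(3000, 4, &a, &m);
    printf("AvGainWt=%.2f MDLWt=%.2f\n", a, m);    /* prints 0.50 0.45 */
    return 0;
}
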
310
+
311
+
312
+
313
+ /*************************************************************************/
314
+ /* */
315
+ /* Build a decision tree for the cases Fp through Lp */
316
+ /* */
317
+ /* - if all cases are of the same class, the tree is a leaf labelled */
318
+ /* with this class */
319
+ /* */
320
+ /* - for each attribute, calculate the potential information provided */
321
+ /* by a test on the attribute (based on the probabilities of each */
322
+ /* case having a particular value for the attribute), and the gain */
323
+ /* in information that would result from a test on the attribute */
324
+ /* (based on the probabilities of each case with a particular */
325
+ /* value for the attribute being of a particular class) */
326
+ /* */
327
+ /* - on the basis of these figures, and depending on the current */
328
+ /* selection criterion, find the best attribute to branch on. */
329
+ /* Note: this version will not allow a split on an attribute */
330
+ /* unless two or more subsets have at least MINITEMS cases */
331
+ /* */
332
+ /* - try branching and test whether the resulting tree is better than */
333
+ /* forming a leaf */
334
+ /* */
335
+ /*************************************************************************/
336
+
337
+
338
+ void FormTree(CaseNo Fp, CaseNo Lp, int Level, Tree *Result)
339
+ /* -------- */
340
+ {
341
+ CaseCount Cases=0, TreeErrs=0;
342
+ Attribute BestAtt;
343
+ ClassNo c, BestLeaf=1, Least=1;
344
+ Tree Node;
345
+ DiscrValue v;
346
+
347
+
348
+ assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);
349
+
350
+ /* Make a single pass through the cases to determine class frequencies
351
+ and value/class frequencies for all discrete attributes */
352
+
353
+ FindAllFreq(Fp, Lp);
354
+
355
+ /* Choose the best leaf and the least prevalent class */
356
+
357
+ ForEach(c, 2, MaxClass)
358
+ {
359
+ if ( ClassFreq[c] > ClassFreq[BestLeaf] )
360
+ {
361
+ BestLeaf = c;
362
+ }
363
+ else
364
+ if ( ClassFreq[c] > 0.1 && ClassFreq[c] < ClassFreq[Least] )
365
+ {
366
+ Least = c;
367
+ }
368
+ }
369
+
370
+ ForEach(c, 1, MaxClass)
371
+ {
372
+ Cases += ClassFreq[c];
373
+ }
374
+
375
+ MaxLeaves = ( LEAFRATIO > 0 ? rint(LEAFRATIO * Cases) : 1E6 );
376
+
377
+ *Result = Node =
378
+ Leaf(ClassFreq, BestLeaf, Cases, Cases - ClassFreq[BestLeaf]);
379
+
380
+ Verbosity(1,
381
+ fprintf(Of, "\n<%d> %d cases", Level, No(Fp,Lp));
382
+ if ( fabs(No(Fp,Lp) - Cases) >= 0.1 )
383
+ {
384
+ fprintf(Of, ", total weight %.1f", Cases);
385
+ }
386
+ fprintf(Of, "\n"))
387
+
388
+ /* Do not try to split if:
389
+ - all cases are of the same class
390
+ - there are not enough cases to split */
391
+
392
+ if ( ClassFreq[BestLeaf] >= 0.999 * Cases ||
393
+ Cases < 2 * MINITEMS ||
394
+ MaxLeaves < 2 )
395
+ {
396
+ if ( Now == FORMTREE ) Progress(Cases);
397
+ return;
398
+ }
399
+
400
+ /* Calculate base information */
401
+
402
+ GlobalBaseInfo = TotalInfo(ClassFreq, 1, MaxClass) / Cases;
403
+
404
+ /* Perform preliminary evaluation if using subsampling.
405
+ Must expect at least 10 of least prevalent class */
406
+
407
+ ValThresh = 0;
408
+ if ( Subsample && No(Fp, Lp) > 5 * MaxClass * SAMPLEUNIT &&
409
+ (ClassFreq[Least] * MaxClass * SAMPLEUNIT) / No(Fp, Lp) >= 10 )
410
+ {
411
+ SampleEstimate(Fp, Lp, Cases);
412
+ Sampled = true;
413
+ }
414
+ else
415
+ {
416
+ Sampled = false;
417
+ }
418
+
419
+ BestAtt = ChooseSplit(Fp, Lp, Cases, Sampled);
420
+
421
+ /* Decide whether to branch or not */
422
+
423
+ if ( BestAtt == None )
424
+ {
425
+ Verbosity(1, fprintf(Of, "\tno sensible splits\n"))
426
+ if ( Now == FORMTREE ) Progress(Cases);
427
+ }
428
+ else
429
+ {
430
+ Verbosity(1,
431
+ fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
432
+ if ( Continuous(BestAtt) )
433
+ {
434
+ fprintf(Of, " cut %.3f", Bar[BestAtt]);
435
+ }
436
+ fprintf(Of, " inf %.3f gain %.3f val %.3f\n",
437
+ Info[BestAtt], Gain[BestAtt], Gain[BestAtt] / Info[BestAtt]))
438
+
439
+ /* Build a node of the selected test */
440
+
441
+ if ( Discrete(BestAtt) )
442
+ {
443
+ if ( SUBSET && MaxAttVal[BestAtt] > 3 && ! Ordered(BestAtt) )
444
+ {
445
+ SubsetTest(Node, BestAtt);
446
+ }
447
+ else
448
+ {
449
+ DiscreteTest(Node, BestAtt);
450
+ }
451
+ }
452
+ else
453
+ {
454
+ ContinTest(Node, BestAtt);
455
+ }
456
+
457
+ /* Carry out the recursive divide-and-conquer */
458
+
459
+ ++Tested[BestAtt];
460
+
461
+ Divide(Node, Fp, Lp, Level);
462
+
463
+ --Tested[BestAtt];
464
+
465
+ /* See whether we would have been no worse off with a leaf */
466
+
467
+ ForEach(v, 1, Node->Forks)
468
+ {
469
+ TreeErrs += Node->Branch[v]->Errors;
470
+ }
471
+
472
+ if ( TreeErrs >= 0.999 * Node->Errors )
473
+ {
474
+ Verbosity(1,
475
+ fprintf(Of, "<%d> Collapse tree for %d cases to leaf %s\n",
476
+ Level, No(Fp,Lp), ClassName[BestLeaf]))
477
+
478
+ UnSprout(Node);
479
+ }
480
+ else
481
+ {
482
+ Node->Errors = TreeErrs;
483
+ }
484
+ }
485
+ }
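
Editor's note: at the end of FormTree() the summed errors of the branches are compared with the errors the node would make as a leaf; if branching saves less than 0.1% the subtree is collapsed (UnSprout). A tiny numeric sketch of that test, not part of the original file; the error counts are made up.

#include <stdio.h>

int main(void)
{
    double leaf_errors    = 20.0;   /* errors if the node stays a leaf */
    double subtree_errors = 20.0;   /* summed errors over all branches */

    if ( subtree_errors >= 0.999 * leaf_errors )
        printf("collapse to leaf (splitting saved too little)\n");
    else
        printf("keep the subtree\n");
    return 0;                       /* prints the collapse message here */
}
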
486
+
487
+
488
+
489
+ /*************************************************************************/
490
+ /* */
491
+ /* Estimate Gain[] and Info[] using sample */
492
+ /* */
493
+ /*************************************************************************/
494
+
495
+
496
+ void SampleEstimate(CaseNo Fp, CaseNo Lp, CaseCount Cases)
497
+ /* -------------- */
498
+ {
499
+ CaseNo SLp, SampleSize;
500
+ CaseCount NewCases;
501
+ Attribute Att;
502
+ float GR;
503
+
504
+ /* Phase 1: evaluate all discrete attributes and record best GR */
505
+
506
+ ForEach(Att, 1, MaxAtt)
507
+ {
508
+ Gain[Att] = None;
509
+
510
+ if ( Discrete(Att) )
511
+ {
512
+ EvalDiscrSplit(Att, Cases);
513
+
514
+ if ( Info[Att] > Epsilon &&
515
+ (GR = Gain[Att] / Info[Att]) > ValThresh )
516
+ {
517
+ ValThresh = GR;
518
+ }
519
+ }
520
+ }
521
+
522
+ /* Phase 2: generate sample */
523
+
524
+ SampleSize = MaxClass * SAMPLEUNIT;
525
+ Sample(Fp, Lp, SampleSize);
526
+ SLp = Fp + SampleSize - 1;
527
+
528
+ /* Phase 3: evaluate continuous attributes using sample */
529
+
530
+ NewCases = CountCases(Fp, SLp);
531
+ SampleFrac = NewCases / Cases;
532
+ NWaiting = 0;
533
+
534
+ ForEach(Att, 1, MaxAtt)
535
+ {
536
+ if ( Continuous(Att) )
537
+ {
538
+ Waiting[NWaiting++] = Att;
539
+ }
540
+ }
541
+
542
+ ProcessQueue(Fp, SLp, NewCases);
543
+
544
+ SampleFrac = 1.0;
545
+ }
546
+
547
+
548
+
549
+ /*************************************************************************/
550
+ /* */
551
+ /* Sample N cases from cases Fp through Lp */
552
+ /* */
553
+ /*************************************************************************/
554
+
555
+
556
+ void Sample(CaseNo Fp, CaseNo Lp, CaseNo N)
557
+ /* ------ */
558
+ {
559
+ CaseNo i, j;
560
+ double Interval;
561
+
562
+ Interval = No(Fp, Lp) / (double) N;
563
+
564
+ ForEach(i, 0, N-1)
565
+ {
566
+ j = (i + 0.5) * Interval;
567
+
568
+ assert(j >= 0 && Fp + j <= Lp);
569
+
570
+ Swap(Fp + i, Fp + j);
571
+ }
572
+ }
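
Editor's note: Sample() draws an evenly spaced (systematic) sample rather than a random one, swapping the case at the midpoint of each stride of width Interval to the front of the range. A toy illustration of the index arithmetic, not part of the original file; the counts (10 cases, N=4) are hypothetical.

#include <stdio.h>

static void sample_indices(int total, int n)
{
    double interval = total / (double) n;
    int i, j;

    for ( i = 0 ; i < n ; i++ )
    {
        j = (int) ((i + 0.5) * interval);   /* midpoint of the i-th stride */
        printf("slot %d <- case %d\n", i, j);
    }
}

int main(void)
{
    sample_indices(10, 4);   /* Interval = 2.5, selects cases 1, 3, 6, 8 */
    return 0;
}
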
573
+
574
+
575
+
576
+ /*************************************************************************/
577
+ /* */
578
+ /* Evaluate splits and choose best attribute to split on. */
579
+ /* If Sampled, Gain[] and Info[] have been estimated on */
580
+ /* sample and unlikely candidates are not evaluated on all cases */
581
+ /* */
582
+ /*************************************************************************/
583
+
584
+
585
+ Attribute ChooseSplit(CaseNo Fp, CaseNo Lp, CaseCount Cases, Boolean Sampled)
586
+ /* ----------- */
587
+ {
588
+ Attribute Att;
589
+ int i, j;
590
+
591
+
592
+ /* For each available attribute, find the information and gain */
593
+
594
+ NWaiting = 0;
595
+
596
+ if ( Sampled )
597
+ {
598
+ /* If samples have been used, do not re-evaluate discrete atts
599
+ or atts that have low GR */
600
+
601
+ for ( Att = MaxAtt ; Att > 0 ; Att-- )
602
+ {
603
+ if ( ! Continuous(Att) ) continue;
604
+
605
+ if ( EstMaxGR[Att] >= ValThresh )
606
+ {
607
+ /* Add attributes in reverse order of estimated max GR */
608
+
609
+ for ( i = 0 ;
610
+ i < NWaiting && EstMaxGR[Waiting[i]] < EstMaxGR[Att] ;
611
+ i++ )
612
+ ;
613
+
614
+ for ( j = NWaiting-1 ; j >= i ; j-- )
615
+ {
616
+ Waiting[j+1] = Waiting[j];
617
+ }
618
+ NWaiting++;
619
+
620
+ Waiting[i] = Att;
621
+ }
622
+ else
623
+ {
624
+ /* Don't use -- attribute hasn't been fully evaluated.
625
+ Leave Gain unchanged to get correct count for Possible */
626
+
627
+ Info[Att] = -1E6; /* negative so GR also negative */
628
+ }
629
+ }
630
+ }
631
+ else
632
+ {
633
+ for ( Att = MaxAtt ; Att > 0 ; Att-- )
634
+ {
635
+ Gain[Att] = None;
636
+
637
+ if ( Skip(Att) || Att == ClassAtt )
638
+ {
639
+ continue;
640
+ }
641
+
642
+ Waiting[NWaiting++] = Att;
643
+ }
644
+ }
645
+
646
+ ProcessQueue(Fp, Lp, Cases);
647
+
648
+ return FindBestAtt(Cases);
649
+ }
650
+
651
+
652
+
653
+ void ProcessQueue(CaseNo WFp, CaseNo WLp, CaseCount WCases)
654
+ /* ------------ */
655
+ {
656
+ Attribute Att;
657
+ float GR;
658
+
659
+ for ( ; NWaiting > 0 ; )
660
+ {
661
+ Att = Waiting[--NWaiting];
662
+
663
+ if ( Discrete(Att) )
664
+ {
665
+ EvalDiscrSplit(Att, WCases);
666
+ }
667
+ else
668
+ if ( SampleFrac < 1 )
669
+ {
670
+ EstimateMaxGR(Att, WFp, WLp);
671
+ }
672
+ else
673
+ if ( Sampled )
674
+ {
675
+ Info[Att] = -1E16;
676
+
677
+ if ( EstMaxGR[Att] > ValThresh )
678
+ {
679
+ EvalContinuousAtt(Att, WFp, WLp);
680
+
681
+ if ( Info[Att] > Epsilon &&
682
+ (GR = Gain[Att] / Info[Att]) > ValThresh )
683
+ {
684
+ if ( GR > ValThresh ) ValThresh = GR;
685
+ }
686
+ }
687
+ }
688
+ else
689
+ {
690
+ EvalContinuousAtt(Att, WFp, WLp);
691
+ }
692
+ }
693
+ }
694
+
695
+
696
+
697
+ /*************************************************************************/
698
+ /* */
699
+ /* Adjust each attribute's gain to reflect choice and */
700
+ /* select att with maximum GR */
701
+ /* */
702
+ /*************************************************************************/
703
+
704
+
705
+ Attribute FindBestAtt(CaseCount Cases)
706
+ /* ----------- */
707
+ {
708
+ double BestVal, Val, MinGain=1E6, AvGain=0, MDL;
709
+ Attribute Att, BestAtt, Possible=0;
710
+ DiscrValue NBr, BestNBr=MaxDiscrVal+1;
711
+
712
+ ForEach(Att, 1, MaxAtt)
713
+ {
714
+ /* Update the number of possible attributes for splitting and
715
+ average gain (unless very many values) */
716
+
717
+ if ( Gain[Att] >= Epsilon &&
718
+ ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxCase + 1) ) )
719
+ {
720
+ Possible++;
721
+ AvGain += Gain[Att];
722
+ }
723
+ else
724
+ {
725
+ Gain[Att] = None;
726
+ }
727
+ }
728
+
729
+ /* Set threshold on minimum gain */
730
+
731
+ if ( ! Possible ) return None;
732
+
733
+ AvGain /= Possible;
734
+ MDL = Log(Possible) / Cases;
735
+ MinGain = AvGain * AvGainWt + MDL * MDLWt;
736
+
737
+ Verbosity(2,
738
+ fprintf(Of, "\tav gain=%.3f, MDL (%d) = %.3f, min=%.3f\n",
739
+ AvGain, Possible, MDL, MinGain))
740
+
741
+ /* Find best attribute according to Gain Ratio criterion subject
742
+ to threshold on minimum gain */
743
+
744
+ BestVal = -Epsilon;
745
+ BestAtt = None;
746
+
747
+ ForEach(Att, 1, MaxAtt)
748
+ {
749
+ if ( Gain[Att] >= 0.999 * MinGain && Info[Att] > 0 )
750
+ {
751
+ Val = Gain[Att] / Info[Att];
752
+ NBr = ( MaxAttVal[Att] <= 3 || Ordered(Att) ? 3 :
753
+ SUBSET ? Subsets[Att] : MaxAttVal[Att] );
754
+
755
+ if ( Val > BestVal ||
756
+ Val > 0.999 * BestVal &&
757
+ ( NBr < BestNBr ||
758
+ NBr == BestNBr && Gain[Att] > Gain[BestAtt] ) )
759
+ {
760
+ BestAtt = Att;
761
+ BestVal = Val;
762
+ BestNBr = NBr;
763
+ }
764
+ }
765
+ }
766
+
767
+ return BestAtt;
768
+ }
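
Editor's note: FindBestAtt() applies the gain-ratio criterion only to attributes whose gain clears MinGain, so an attribute with a high ratio but a tiny absolute gain can still lose. A self-contained sketch of that effect, not part of the original file; the two attributes and the threshold value are hypothetical.

#include <stdio.h>

int main(void)
{
    /* attribute 0: large gain over a wide split; attribute 1: smaller gain
       in a narrow split, hence the higher raw gain ratio */
    double gain[2] = { 0.30, 0.12 }, info[2] = { 1.00, 0.30 };
    double min_gain = 0.15;     /* stands in for AvGain*AvGainWt + MDL*MDLWt */
    double best_val = -1E-6, val;
    int a, best = -1;

    for ( a = 0 ; a < 2 ; a++ )
    {
        if ( gain[a] < 0.999 * min_gain || info[a] <= 0 ) continue;

        val = gain[a] / info[a];
        if ( val > best_val )
        {
            best_val = val;  best = a;
        }
    }

    /* attribute 1 has the higher ratio (0.40 vs 0.30) but fails the
       minimum-gain test, so attribute 0 is chosen */
    printf("best attribute: %d (gain ratio %.2f)\n", best, best_val);
    return 0;
}
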
769
+
770
+
771
+
772
+ /*************************************************************************/
773
+ /* */
774
+ /* Evaluate split on Att */
775
+ /* */
776
+ /*************************************************************************/
777
+
778
+
779
+ void EvalDiscrSplit(Attribute Att, CaseCount Cases)
780
+ /* -------------- */
781
+ {
782
+ DiscrValue v, NBr;
783
+
784
+ Gain[Att] = None;
785
+
786
+ if ( Skip(Att) || Att == ClassAtt ) return;
787
+
788
+ if ( Ordered(Att) )
789
+ {
790
+ EvalOrderedAtt(Att, Cases);
791
+ NBr = ( GEnv.ValFreq[1] > 0.5 ? 3 : 2 );
792
+ }
793
+ else
794
+ if ( SUBSET && MaxAttVal[Att] > 3 )
795
+ {
796
+ EvalSubset(Att, Cases);
797
+ NBr = Subsets[Att];
798
+ }
799
+ else
800
+ if ( ! Tested[Att] )
801
+ {
802
+ EvalDiscreteAtt(Att, Cases);
803
+
804
+ NBr = 0;
805
+ ForEach(v, 1, MaxAttVal[Att])
806
+ {
807
+ if ( GEnv.ValFreq[v] > 0.5 ) NBr++;
808
+ }
809
+ }
810
+ else
811
+ {
812
+ NBr = 0;
813
+ }
814
+
815
+ /* Check that this test will not give too many leaves */
816
+
817
+ if ( NBr > MaxLeaves + 1 )
818
+ {
819
+ Verbosity(2,
820
+ fprintf(Of, "\t(cancelled -- %d leaves, max %d)\n", NBr, MaxLeaves))
821
+
822
+ Gain[Att] = None;
823
+ }
824
+ }
825
+
826
+
827
+
828
+ /*************************************************************************/
829
+ /* */
830
+ /* Form the subtrees for the given node */
831
+ /* */
832
+ /*************************************************************************/
833
+
834
+
835
+ void Divide(Tree T, CaseNo Fp, CaseNo Lp, int Level)
836
+ /* ------ */
837
+ {
838
+ CaseNo Bp, Ep, Missing, Cases, i;
839
+ CaseCount KnownCases, MissingCases, BranchCases;
840
+ Attribute Att;
841
+ double Factor;
842
+ DiscrValue v;
843
+ Boolean PrevUnitWeights;
844
+
845
+ PrevUnitWeights = UnitWeights;
846
+
847
+ Att = T->Tested;
848
+ Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
849
+
850
+ KnownCases = T->Cases - (MissingCases = CountCases(Fp, Ep));
851
+
852
+ if ( Missing )
853
+ {
854
+ UnitWeights = false;
855
+
856
+ /* If using costs, must adjust branch factors to undo effects of
857
+ reweighting cases */
858
+
859
+ if ( CostWeights )
860
+ {
861
+ KnownCases = SumNocostWeights(Ep+1, Lp);
862
+ }
863
+
864
+ /* If there are many cases with missing values and many branches,
865
+ skip cases whose weight < 0.1 */
866
+
867
+ if ( (Cases = No(Fp,Lp)) > 1000 &&
868
+ Missing > 0.5 * Cases &&
869
+ T->Forks >= 10 )
870
+ {
871
+ ForEach(i, Fp, Ep)
872
+ {
873
+ if ( Weight(Case[i]) < 0.1 )
874
+ {
875
+ Missing--;
876
+ MissingCases -= Weight(Case[i]);
877
+ Swap(Fp, i);
878
+ Fp++;
879
+ }
880
+ }
881
+
882
+ assert(Missing >= 0);
883
+ }
884
+ }
885
+
886
+ Bp = Fp;
887
+ ForEach(v, 1, T->Forks)
888
+ {
889
+ Ep = Group(v, Bp + Missing, Lp, T);
890
+
891
+ assert(Bp + Missing <= Lp+1 && Ep <= Lp);
892
+
893
+ /* Bp -> first value in missing + remaining values
894
+ Ep -> last value in missing + current group */
895
+
896
+ BranchCases = CountCases(Bp + Missing, Ep);
897
+
898
+ Factor = ( ! Missing ? 0 :
899
+ ! CostWeights ? BranchCases / KnownCases :
900
+ SumNocostWeights(Bp + Missing, Ep) / KnownCases );
901
+
902
+ if ( BranchCases + Factor * MissingCases >= MinLeaf )
903
+ {
904
+ if ( Missing )
905
+ {
906
+ /* Adjust weights of cases with missing values */
907
+
908
+ ForEach(i, Bp, Bp + Missing - 1)
909
+ {
910
+ Weight(Case[i]) *= Factor;
911
+ }
912
+ }
913
+
914
+ FormTree(Bp, Ep, Level+1, &T->Branch[v]);
915
+
916
+ /* Restore weights if changed */
917
+
918
+ if ( Missing )
919
+ {
920
+ for ( i = Ep ; i >= Bp ; i-- )
921
+ {
922
+ if ( Unknown(Case[i], Att) )
923
+ {
924
+ Weight(Case[i]) /= Factor;
925
+ Swap(i, Ep);
926
+ Ep--;
927
+ }
928
+ }
929
+ }
930
+
931
+ Bp = Ep+1;
932
+ }
933
+ else
934
+ {
935
+ T->Branch[v] = Leaf(Nil, T->Leaf, 0.0, 0.0);
936
+ }
937
+ }
938
+
939
+ UnitWeights = PrevUnitWeights;
940
+ }
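
Editor's note: Divide() sends cases whose value of the tested attribute is unknown down every branch, scaling their weights by Factor, the proportion of known cases that took that branch, and undoes the scaling after the recursive call. A toy calculation of that redistribution, not part of the original file; the counts are hypothetical.

#include <stdio.h>

int main(void)
{
    double known = 90.0;          /* cases with a known attribute value */
    double missing = 10.0;        /* cases with an unknown value */
    double branch_known = 36.0;   /* known cases taking this branch */

    double factor = branch_known / known;              /* 0.40 */
    double branch_total = branch_known + factor * missing;

    /* each unknown case follows this branch with 0.4 of its weight,
       so the branch is grown on 36 + 4 = 40 fractional cases */
    printf("Factor %.2f, effective cases %.1f\n", factor, branch_total);
    return 0;
}
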
941
+
942
+
943
+
944
+ /*************************************************************************/
945
+ /* */
946
+ /* Group together the cases corresponding to branch V of a test */
947
+ /* and return the index of the last such */
948
+ /* */
949
+ /* Note: if V equals zero, group the unknown values */
950
+ /* */
951
+ /*************************************************************************/
952
+
953
+
954
+ CaseNo Group(DiscrValue V, CaseNo Bp, CaseNo Ep, Tree TestNode)
955
+ /* ----- */
956
+ {
957
+ CaseNo i;
958
+ Attribute Att;
959
+ ContValue Thresh;
960
+ Set SS;
961
+
962
+ Att = TestNode->Tested;
963
+
964
+ if ( ! V )
965
+ {
966
+ /* Group together unknown values (if any) */
967
+
968
+ if ( SomeMiss[Att] )
969
+ {
970
+ ForEach(i, Bp, Ep)
971
+ {
972
+ if ( Unknown(Case[i], Att) )
973
+ {
974
+ Swap(Bp, i);
975
+ Bp++;
976
+ }
977
+ }
978
+ }
979
+ }
980
+ else /* skip non-existent N/A values */
981
+ if ( V != 1 || TestNode->NodeType == BrSubset || SomeNA[Att] )
982
+ {
983
+ /* Group cases on the value of attribute Att, and depending
984
+ on the type of branch */
985
+
986
+ switch ( TestNode->NodeType )
987
+ {
988
+ case BrDiscr:
989
+
990
+ ForEach(i, Bp, Ep)
991
+ {
992
+ if ( DVal(Case[i], Att) == V )
993
+ {
994
+ Swap(Bp, i);
995
+ Bp++;
996
+ }
997
+ }
998
+ break;
999
+
1000
+ case BrThresh:
1001
+
1002
+ Thresh = TestNode->Cut;
1003
+ ForEach(i, Bp, Ep)
1004
+ {
1005
+ if ( V == 1 ? NotApplic(Case[i], Att) :
1006
+ (CVal(Case[i], Att) <= Thresh) == (V == 2) )
1007
+ {
1008
+ Swap(Bp, i);
1009
+ Bp++;
1010
+ }
1011
+ }
1012
+ break;
1013
+
1014
+ case BrSubset:
1015
+
1016
+ SS = TestNode->Subset[V];
1017
+ ForEach(i, Bp, Ep)
1018
+ {
1019
+ if ( In(XDVal(Case[i], Att), SS) )
1020
+ {
1021
+ Swap(Bp, i);
1022
+ Bp++;
1023
+ }
1024
+ }
1025
+ break;
1026
+ }
1027
+ }
1028
+
1029
+ return Bp - 1;
1030
+ }
1031
+
1032
+
1033
+
1034
+ /*************************************************************************/
1035
+ /* */
1036
+ /* Return the total weight of cases from Fp to Lp */
1037
+ /* */
1038
+ /*************************************************************************/
1039
+
1040
+
1041
+ CaseCount SumWeights(CaseNo Fp, CaseNo Lp)
1042
+ /* ---------- */
1043
+ {
1044
+ double Sum=0.0;
1045
+ CaseNo i;
1046
+
1047
+ assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);
1048
+
1049
+ ForEach(i, Fp, Lp)
1050
+ {
1051
+ Sum += Weight(Case[i]);
1052
+ }
1053
+
1054
+ return Sum;
1055
+ }
1056
+
1057
+
1058
+
1059
+ /*************************************************************************/
1060
+ /* */
1061
+ /* Special version to undo the weightings associated with costs */
1062
+ /* */
1063
+ /*************************************************************************/
1064
+
1065
+
1066
+ CaseCount SumNocostWeights(CaseNo Fp, CaseNo Lp)
1067
+ /* ---------------- */
1068
+ {
1069
+ double Sum=0.0;
1070
+ CaseNo i;
1071
+
1072
+ assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);
1073
+
1074
+ ForEach(i, Fp, Lp)
1075
+ {
1076
+ Sum += Weight(Case[i]) / WeightMul[Class(Case[i])];
1077
+ }
1078
+
1079
+ return Sum;
1080
+ }
1081
+
1082
+
1083
+
1084
+ /*************************************************************************/
1085
+ /* */
1086
+ /* Generate class frequency distribution */
1087
+ /* */
1088
+ /*************************************************************************/
1089
+
1090
+
1091
+ void FindClassFreq(double *CF, CaseNo Fp, CaseNo Lp)
1092
+ /* ------------- */
1093
+ {
1094
+ ClassNo c;
1095
+ CaseNo i;
1096
+
1097
+ assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);
1098
+
1099
+ ForEach(c, 0, MaxClass)
1100
+ {
1101
+ CF[c] = 0;
1102
+ }
1103
+
1104
+ ForEach(i, Fp, Lp)
1105
+ {
1106
+ assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
1107
+
1108
+ CF[ Class(Case[i]) ] += Weight(Case[i]);
1109
+ }
1110
+ }
1111
+
1112
+
1113
+
1114
+ /*************************************************************************/
1115
+ /* */
1116
+ /* Find all discrete frequencies */
1117
+ /* */
1118
+ /*************************************************************************/
1119
+
1120
+
1121
+ void FindAllFreq(CaseNo Fp, CaseNo Lp)
1122
+ /* ----------- */
1123
+ {
1124
+ ClassNo c;
1125
+ CaseNo i;
1126
+ Attribute Att, a;
1127
+ CaseCount w;
1128
+ int x;
1129
+
1130
+ /* Zero all values */
1131
+
1132
+ ForEach(c, 0, MaxClass)
1133
+ {
1134
+ ClassFreq[c] = 0;
1135
+ }
1136
+
1137
+ for ( a = 0 ; a < NDList ; a++ )
1138
+ {
1139
+ Att = DList[a];
1140
+ for ( x = MaxClass * (MaxAttVal[Att]+1) - 1 ; x >= 0 ; x-- )
1141
+ {
1142
+ DFreq[Att][x] = 0;
1143
+ }
1144
+ }
1145
+
1146
+ /* Scan cases */
1147
+
1148
+ ForEach(i, Fp, Lp)
1149
+ {
1150
+ ClassFreq[ (c=Class(Case[i])) ] += (w=Weight(Case[i]));
1151
+
1152
+ for ( a = 0 ; a < NDList ; a++ )
1153
+ {
1154
+ Att = DList[a];
1155
+ DFreq[Att][ MaxClass * XDVal(Case[i], Att) + (c-1) ] += w;
1156
+ }
1157
+ }
1158
+ }