see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,1158 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Central tree-forming algorithm */
30
+ /* ------------------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
#include "defns.i"
#include "extern.i"

/* File-scope state shared by the tree-growing routines in this file */

Boolean		MultiVal,	/* all atts have many values */
		Subsample;	/* use subsampling */
float		AvGainWt,	/* weight of average gain in gain threshold */
		MDLWt;		/* weight of MDL threshold ditto */

Attribute	*DList=Nil;	/* list of discrete atts */
int		NDList;		/* number in list */

DiscrValue	MaxLeaves;	/* target maximum tree size */

#define		SAMPLEUNIT 2000	/* sample size per class used by
				   SampleEstimate() for preliminary
				   evaluation of splits */

float		ValThresh;	/* minimum GR when evaluating sampled atts */
Boolean		Sampled;	/* true if sampling used */

Attribute	*Waiting=Nil,	/* attribute wait list */
		NWaiting=0;	/* number of atts queued in Waiting[] */
56
+
57
+
58
+
59
+
60
+ /*************************************************************************/
61
+ /* */
62
+ /* Allocate space for tree tables */
63
+ /* */
64
+ /*************************************************************************/
65
+
66
+
67
void InitialiseTreeData()
/*   ------------------  */
{
    DiscrValue v;
    Attribute Att;
    DiscrValue vMax;

    /* Per-trial tree tables (one slot per boosting trial) */

    Raw = AllocZero(TRIALS+1, Tree);
    Pruned = AllocZero(TRIALS+1, Tree);

    /* Tested[Att] counts tests on Att along the current path */

    Tested = AllocZero(MaxAtt+1, Byte);

    /* Per-attribute split evaluation results */

    Gain = AllocZero(MaxAtt+1, float);
    Info = AllocZero(MaxAtt+1, float);
    Bar = AllocZero(MaxAtt+1, ContValue);

    EstMaxGR = AllocZero(MaxAtt+1, float);

    /* Data for subsets */

    if ( SUBSET )
    {
	InitialiseBellNumbers();
	Subset = Alloc(MaxAtt+1, Set *);

	/* One bitset per value of each usable discrete attribute */

	ForEach(Att, 1, MaxAtt)
	{
	    if ( Discrete(Att) && Att != ClassAtt && ! Skip(Att) )
	    {
		Subset[Att] = AllocZero(MaxAttVal[Att]+1, Set);
		ForEach(v, 0, MaxAttVal[Att])
		{
		    Subset[Att][v] = Alloc((MaxAttVal[Att]>>3)+1, Byte);
		}
	    }
	}
	Subsets = AllocZero(MaxAtt+1, int);
    }

    /* DList holds the usable discrete attributes; DFreq[Att] is a
       flattened MaxClass x (MaxAttVal[Att]+1) value/class table
       filled by FindAllFreq() */

    DList = Alloc(MaxAtt, Attribute);
    NDList = 0;

    DFreq = AllocZero(MaxAtt+1, double *);
    ForEach(Att, 1, MaxAtt)
    {
	if ( Att == ClassAtt || Skip(Att) || ! Discrete(Att) ) continue;

	DList[NDList++] = Att;

	DFreq[Att] = Alloc(MaxClass * (MaxAttVal[Att]+1), double);
    }

    ClassFreq = AllocZero(MaxClass+1, double);
    ClassSum = Alloc(MaxClass+1, float);

    if ( BOOST )
    {
	Vote = Alloc(MaxClass+1, float);
	TrialPred = Alloc(TRIALS, ClassNo);
    }

    if ( RULES )
    {
	MostSpec = Alloc(MaxClass+1, CRule);
	PossibleCuts = Alloc(MaxAtt+1, int);
    }

    /* Check whether all attributes have many discrete values */

    MultiVal = true;
    if ( ! SUBSET )
    {
	for ( Att = 1 ; MultiVal && Att <= MaxAtt ; Att++ )
	{
	    if ( ! Skip(Att) && Att != ClassAtt )
	    {
		MultiVal = MaxAttVal[Att] >= 0.3 * (MaxCase + 1);
	    }
	}
    }

    /* See whether there are continuous attributes for subsampling */

    Subsample = false;

    /* Set parameters for RawExtraErrs() */

    InitialiseExtraErrs();

    /* Set up environment */

    Waiting = Alloc(MaxAtt+1, Attribute);

    /* vMax is the largest branch count any split can have; the same
       expression must be used when freeing GEnv.Freq (FreeTreeData) */

    vMax = Max(3, MaxDiscrVal+1);

    GEnv.Freq = Alloc(vMax+1, double *);
    ForEach(v, 0, vMax)
    {
	GEnv.Freq[v] = Alloc(MaxClass+1, double);
    }

    GEnv.ValFreq = Alloc(vMax, double);

    GEnv.ClassFreq = Alloc(MaxClass+1, double);

    GEnv.SRec = Alloc(MaxCase+1, SortRec);

    /* Workspace for merging subsets of discrete attribute values */

    if ( SUBSET )
    {
	GEnv.SubsetInfo = Alloc(MaxDiscrVal+1, double);
	GEnv.SubsetEntr = Alloc(MaxDiscrVal+1, double);

	GEnv.MergeInfo = Alloc(MaxDiscrVal+1, double *);
	GEnv.MergeEntr = Alloc(MaxDiscrVal+1, double *);
	GEnv.WSubset = Alloc(MaxDiscrVal+1, Set);
	ForEach(v, 1, MaxDiscrVal)
	{
	    GEnv.MergeInfo[v] = Alloc(MaxDiscrVal+1, double);
	    GEnv.MergeEntr[v] = Alloc(MaxDiscrVal+1, double);
	    GEnv.WSubset[v] = Alloc((MaxDiscrVal>>3)+1, Byte);
	}
    }
}
190
+
191
+
192
void FreeTreeData()
/*   ------------  */
{
    Attribute Att;
    DiscrValue vMax;

    /* Release everything allocated by InitialiseTreeData(), resetting
       pointers to Nil so a later re-initialisation starts clean */

    FreeUnlessNil(Raw);		Raw = Nil;
    FreeUnlessNil(Pruned);	Pruned = Nil;

    FreeUnlessNil(Tested);	Tested = Nil;

    FreeUnlessNil(Gain);	Gain = Nil;
    FreeUnlessNil(Info);	Info = Nil;
    FreeUnlessNil(Bar);		Bar = Nil;

    FreeUnlessNil(EstMaxGR);	EstMaxGR = Nil;

    if ( SUBSET )
    {
	FreeVector((void **) Bell, 1, MaxDiscrVal);	Bell = Nil;

	if ( Subset )
	{
	    ForEach(Att, 1, MaxAtt)
	    {
		if ( Subset[Att] )
		{
		    FreeVector((void **) Subset[Att], 0, MaxAttVal[Att]);
		}
	    }
	    Free(Subset);	Subset = Nil;
	    Free(Subsets);	Subsets = Nil;
	}
    }

    FreeUnlessNil(DList);	DList = Nil;

    if ( DFreq )
    {
	ForEach(Att, 1, MaxAtt)
	{
	    FreeUnlessNil(DFreq[Att]);
	}

	Free(DFreq);	DFreq = Nil;
    }

    FreeUnlessNil(ClassFreq);	ClassFreq = Nil;
    FreeUnlessNil(ClassSum);	ClassSum = Nil;

    FreeUnlessNil(Vote);	Vote = Nil;
    FreeUnlessNil(TrialPred);	TrialPred = Nil;

    FreeUnlessNil(MostSpec);	MostSpec = Nil;
    FreeUnlessNil(PossibleCuts); PossibleCuts = Nil;

    /* vMax must match the value used when GEnv.Freq was allocated */

    vMax = Max(3, MaxDiscrVal+1);
    FreeVector((void **) GEnv.Freq, 0, vMax);
    Free(GEnv.ValFreq);
    Free(GEnv.ClassFreq);
    FreeUnlessNil(GEnv.SRec);

    /* Subset-merging workspace exists only when SUBSET was set */

    if ( GEnv.SubsetInfo )
    {
	Free(GEnv.SubsetInfo);
	Free(GEnv.SubsetEntr);
	FreeVector((void **) GEnv.MergeInfo, 1, MaxDiscrVal);
	FreeVector((void **) GEnv.MergeEntr, 1, MaxDiscrVal);
	FreeVector((void **) GEnv.WSubset, 1, MaxDiscrVal);
    }

    FreeUnlessNil(Waiting);	Waiting = Nil;
}
265
+
266
+
267
+
268
+ /*************************************************************************/
269
+ /* */
270
+ /* Set threshold on minimum gain as follows: */
271
+ /* * when forming winnowing tree: no minimum */
272
+ /* * for small problems, AvGain (usual Gain Ratio) */
273
+ /* * for large problems, discounted MDL */
274
+ /* * for intermediate problems, interpolated */
275
+ /* */
276
+ /*************************************************************************/
277
+
278
+
279
+ void SetMinGainThresh()
280
+ /* ---------------- */
281
+ {
282
+ float Frac;
283
+
284
+ /* Set AvGainWt and MDLWt */
285
+
286
+ if ( Now == WINNOWATTS )
287
+ {
288
+ AvGainWt = MDLWt = 0.0;
289
+ }
290
+ else
291
+ if ( (MaxCase+1) / MaxClass <= 500 )
292
+ {
293
+ AvGainWt = 1.0;
294
+ MDLWt = 0.0;
295
+ }
296
+ else
297
+ if ( (MaxCase+1) / MaxClass >= 1000 )
298
+ {
299
+ AvGainWt = 0.0;
300
+ MDLWt = 0.9;
301
+ }
302
+ else
303
+ {
304
+ Frac = ((MaxCase+1) / MaxClass - 500) / 500.0;
305
+
306
+ AvGainWt = 1 - Frac;
307
+ MDLWt = 0.9 * Frac;
308
+ }
309
+ }
310
+
311
+
312
+
313
+ /*************************************************************************/
314
+ /* */
315
+ /* Build a decision tree for the cases Fp through Lp */
316
+ /* */
317
+ /* - if all cases are of the same class, the tree is a leaf labelled */
318
+ /* with this class */
319
+ /* */
320
+ /* - for each attribute, calculate the potential information provided */
321
+ /* by a test on the attribute (based on the probabilities of each */
322
+ /* case having a particular value for the attribute), and the gain */
323
+ /* in information that would result from a test on the attribute */
324
+ /* (based on the probabilities of each case with a particular */
325
+ /* value for the attribute being of a particular class) */
326
+ /* */
327
+ /* - on the basis of these figures, and depending on the current */
328
+ /* selection criterion, find the best attribute to branch on. */
329
+ /* Note: this version will not allow a split on an attribute */
330
+ /* unless two or more subsets have at least MINITEMS cases */
331
+ /* */
332
+ /* - try branching and test whether the resulting tree is better than */
333
+ /* forming a leaf */
334
+ /* */
335
+ /*************************************************************************/
336
+
337
+
338
void FormTree(CaseNo Fp, CaseNo Lp, int Level, Tree *Result)
/*   --------  */
{
    CaseCount Cases=0, TreeErrs=0;
    Attribute BestAtt;
    ClassNo c, BestLeaf=1, Least=1;
    Tree Node;
    DiscrValue v;


    assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);

    /* Make a single pass through the cases to determine class frequencies
       and value/class frequencies for all discrete attributes */

    FindAllFreq(Fp, Lp);

    /* Choose the best leaf (most frequent class) and the least
       prevalent class that actually occurs (weight > 0.1) */

    ForEach(c, 2, MaxClass)
    {
	if ( ClassFreq[c] > ClassFreq[BestLeaf] )
	{
	    BestLeaf = c;
	}
	else
	if ( ClassFreq[c] > 0.1 && ClassFreq[c] < ClassFreq[Least] )
	{
	    Least = c;
	}
    }

    ForEach(c, 1, MaxClass)
    {
	Cases += ClassFreq[c];
    }

    /* Cap on the number of leaves for this subtree, derived from the
       leaf ratio option (effectively unlimited when LEAFRATIO <= 0) */

    MaxLeaves = ( LEAFRATIO > 0 ? rint(LEAFRATIO * Cases) : 1E6 );

    /* Provisionally make this node a leaf; it is overwritten with a
       test below if a worthwhile split is found */

    *Result = Node =
	Leaf(ClassFreq, BestLeaf, Cases, Cases - ClassFreq[BestLeaf]);

    Verbosity(1,
	fprintf(Of, "\n<%d> %d cases", Level, No(Fp,Lp));
	if ( fabs(No(Fp,Lp) - Cases) >= 0.1 )
	{
	    fprintf(Of, ", total weight %.1f", Cases);
	}
	fprintf(Of, "\n"))

    /* Do not try to split if:
	- all cases are of the same class
	- there are not enough cases to split */

    if ( ClassFreq[BestLeaf] >= 0.999 * Cases ||
	 Cases < 2 * MINITEMS ||
	 MaxLeaves < 2 )
    {
	if ( Now == FORMTREE ) Progress(Cases);
	return;
    }

    /* Calculate base information */

    GlobalBaseInfo = TotalInfo(ClassFreq, 1, MaxClass) / Cases;

    /* Perform preliminary evaluation if using subsampling.
       Must expect at least 10 of least prevalent class */

    ValThresh = 0;
    if ( Subsample && No(Fp, Lp) > 5 * MaxClass * SAMPLEUNIT &&
	 (ClassFreq[Least] * MaxClass * SAMPLEUNIT) / No(Fp, Lp) >= 10 )
    {
	SampleEstimate(Fp, Lp, Cases);
	Sampled = true;
    }
    else
    {
	Sampled = false;
    }

    BestAtt = ChooseSplit(Fp, Lp, Cases, Sampled);

    /* Decide whether to branch or not */

    if ( BestAtt == None )
    {
	Verbosity(1, fprintf(Of, "\tno sensible splits\n"))
	if ( Now == FORMTREE ) Progress(Cases);
    }
    else
    {
	Verbosity(1,
	    fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
	    if ( Continuous(BestAtt) )
	    {
		fprintf(Of, " cut %.3f", Bar[BestAtt]);
	    }
	    fprintf(Of, " inf %.3f gain %.3f val %.3f\n",
		   Info[BestAtt], Gain[BestAtt], Gain[BestAtt] / Info[BestAtt]))

	/* Build a node of the selected test */

	if ( Discrete(BestAtt) )
	{
	    if ( SUBSET && MaxAttVal[BestAtt] > 3 && ! Ordered(BestAtt) )
	    {
		SubsetTest(Node, BestAtt);
	    }
	    else
	    {
		DiscreteTest(Node, BestAtt);
	    }
	}
	else
	{
	    ContinTest(Node, BestAtt);
	}

	/* Carry out the recursive divide-and-conquer.  Tested[] is
	   bumped so deeper levels know this attribute is in use */

	++Tested[BestAtt];

	Divide(Node, Fp, Lp, Level);

	--Tested[BestAtt];

	/* See whether we would have been no worse off with a leaf */

	ForEach(v, 1, Node->Forks)
	{
	    TreeErrs += Node->Branch[v]->Errors;
	}

	if ( TreeErrs >= 0.999 * Node->Errors )
	{
	    Verbosity(1,
		fprintf(Of, "<%d> Collapse tree for %d cases to leaf %s\n",
			    Level, No(Fp,Lp), ClassName[BestLeaf]))

	    UnSprout(Node);
	}
	else
	{
	    Node->Errors = TreeErrs;
	}
    }
}
486
+
487
+
488
+
489
+ /*************************************************************************/
490
+ /* */
491
+ /* Estimate Gain[] and Info[] using sample */
492
+ /* */
493
+ /*************************************************************************/
494
+
495
+
496
void SampleEstimate(CaseNo Fp, CaseNo Lp, CaseCount Cases)
/*   --------------  */
{
    CaseNo SLp, SampleSize;
    CaseCount NewCases;
    Attribute Att;
    float GR;

    /* Phase 1: evaluate all discrete attributes and record best GR
       (discrete atts are cheap enough to evaluate on all cases) */

    ForEach(Att, 1, MaxAtt)
    {
	Gain[Att] = None;

	if ( Discrete(Att) )
	{
	    EvalDiscrSplit(Att, Cases);

	    if ( Info[Att] > Epsilon &&
		 (GR = Gain[Att] / Info[Att]) > ValThresh )
	    {
		ValThresh = GR;
	    }
	}
    }

    /* Phase 2: generate sample -- Sample() moves the chosen cases to
       the front of the range, so the sample is Fp..SLp */

    SampleSize = MaxClass * SAMPLEUNIT;
    Sample(Fp, Lp, SampleSize);
    SLp = Fp + SampleSize - 1;

    /* Phase 3: evaluate continuous attributes using sample.
       SampleFrac scales estimates back up; reset to 1 when done */

    NewCases = CountCases(Fp, SLp);
    SampleFrac = NewCases / Cases;
    NWaiting = 0;

    ForEach(Att, 1, MaxAtt)
    {
	if ( Continuous(Att) )
	{
	    Waiting[NWaiting++] = Att;
	}
    }

    ProcessQueue(Fp, SLp, NewCases);

    SampleFrac = 1.0;
}
546
+
547
+
548
+
549
+ /*************************************************************************/
550
+ /* */
551
+ /* Sample N cases from cases Fp through Lp */
552
+ /* */
553
+ /*************************************************************************/
554
+
555
+
556
+ void Sample(CaseNo Fp, CaseNo Lp, CaseNo N)
557
+ /* ------ */
558
+ {
559
+ CaseNo i, j;
560
+ double Interval;
561
+
562
+ Interval = No(Fp, Lp) / (double) N;
563
+
564
+ ForEach(i, 0, N-1)
565
+ {
566
+ j = (i + 0.5) * Interval;
567
+
568
+ assert(j >= 0 && Fp + j <= Lp);
569
+
570
+ Swap(Fp + i, Fp + j);
571
+ }
572
+ }
573
+
574
+
575
+
576
+ /*************************************************************************/
577
+ /* */
578
+ /* Evaluate splits and choose best attribute to split on. */
579
+ /* If Sampled, Gain[] and Info[] have been estimated on */
580
+ /* sample and unlikely candidates are not evaluated on all cases */
581
+ /* */
582
+ /*************************************************************************/
583
+
584
+
585
Attribute ChooseSplit(CaseNo Fp, CaseNo Lp, CaseCount Cases, Boolean Sampled)
/*        -----------  */
{
    Attribute Att;
    int i, j;


    /* For each available attribute, find the information and gain */

    NWaiting = 0;

    if ( Sampled )
    {
	/* If samples have been used, do not re-evaluate discrete atts
	   or atts that have low GR */

	for ( Att = MaxAtt ; Att > 0 ; Att-- )
	{
	    if ( ! Continuous(Att) ) continue;

	    if ( EstMaxGR[Att] >= ValThresh )
	    {
		/* Add attributes in reverse order of estimated max GR
		   (insertion sort into Waiting[]; the queue is popped
		   from the end, so the best candidate is tried first) */

		for ( i = 0 ;
		      i < NWaiting && EstMaxGR[Waiting[i]] < EstMaxGR[Att] ;
		      i++ )
		    ;

		for ( j = NWaiting-1 ; j >= i ; j-- )
		{
		    Waiting[j+1] = Waiting[j];
		}
		NWaiting++;

		Waiting[i] = Att;
	    }
	    else
	    {
		/* Don't use -- attribute hasn't been fully evaluated.
		   Leave Gain unchanged to get correct count for Possible */

		Info[Att] = -1E6;	/* negative so GR also negative */
	    }
	}
    }
    else
    {
	/* No sampling: queue every usable attribute */

	for ( Att = MaxAtt ; Att > 0 ; Att-- )
	{
	    Gain[Att] = None;

	    if ( Skip(Att) || Att == ClassAtt )
	    {
		continue;
	    }

	    Waiting[NWaiting++] = Att;
	}
    }

    ProcessQueue(Fp, Lp, Cases);

    return FindBestAtt(Cases);
}
650
+
651
+
652
+
653
+ void ProcessQueue(CaseNo WFp, CaseNo WLp, CaseCount WCases)
654
+ /* ------------ */
655
+ {
656
+ Attribute Att;
657
+ float GR;
658
+
659
+ for ( ; NWaiting > 0 ; )
660
+ {
661
+ Att = Waiting[--NWaiting];
662
+
663
+ if ( Discrete(Att) )
664
+ {
665
+ EvalDiscrSplit(Att, WCases);
666
+ }
667
+ else
668
+ if ( SampleFrac < 1 )
669
+ {
670
+ EstimateMaxGR(Att, WFp, WLp);
671
+ }
672
+ else
673
+ if ( Sampled )
674
+ {
675
+ Info[Att] = -1E16;
676
+
677
+ if ( EstMaxGR[Att] > ValThresh )
678
+ {
679
+ EvalContinuousAtt(Att, WFp, WLp);
680
+
681
+ if ( Info[Att] > Epsilon &&
682
+ (GR = Gain[Att] / Info[Att]) > ValThresh )
683
+ {
684
+ if ( GR > ValThresh ) ValThresh = GR;
685
+ }
686
+ }
687
+ }
688
+ else
689
+ {
690
+ EvalContinuousAtt(Att, WFp, WLp);
691
+ }
692
+ }
693
+ }
694
+
695
+
696
+
697
+ /*************************************************************************/
698
+ /* */
699
+ /* Adjust each attribute's gain to reflect choice and */
700
+ /* select att with maximum GR */
701
+ /* */
702
+ /*************************************************************************/
703
+
704
+
705
+ Attribute FindBestAtt(CaseCount Cases)
706
+ /* ----------- */
707
+ {
708
+ double BestVal, Val, MinGain=1E6, AvGain=0, MDL;
709
+ Attribute Att, BestAtt, Possible=0;
710
+ DiscrValue NBr, BestNBr=MaxDiscrVal+1;
711
+
712
+ ForEach(Att, 1, MaxAtt)
713
+ {
714
+ /* Update the number of possible attributes for splitting and
715
+ average gain (unless very many values) */
716
+
717
+ if ( Gain[Att] >= Epsilon &&
718
+ ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxCase + 1) ) )
719
+ {
720
+ Possible++;
721
+ AvGain += Gain[Att];
722
+ }
723
+ else
724
+ {
725
+ Gain[Att] = None;
726
+ }
727
+ }
728
+
729
+ /* Set threshold on minimum gain */
730
+
731
+ if ( ! Possible ) return None;
732
+
733
+ AvGain /= Possible;
734
+ MDL = Log(Possible) / Cases;
735
+ MinGain = AvGain * AvGainWt + MDL * MDLWt;
736
+
737
+ Verbosity(2,
738
+ fprintf(Of, "\tav gain=%.3f, MDL (%d) = %.3f, min=%.3f\n",
739
+ AvGain, Possible, MDL, MinGain))
740
+
741
+ /* Find best attribute according to Gain Ratio criterion subject
742
+ to threshold on minimum gain */
743
+
744
+ BestVal = -Epsilon;
745
+ BestAtt = None;
746
+
747
+ ForEach(Att, 1, MaxAtt)
748
+ {
749
+ if ( Gain[Att] >= 0.999 * MinGain && Info[Att] > 0 )
750
+ {
751
+ Val = Gain[Att] / Info[Att];
752
+ NBr = ( MaxAttVal[Att] <= 3 || Ordered(Att) ? 3 :
753
+ SUBSET ? Subsets[Att] : MaxAttVal[Att] );
754
+
755
+ if ( Val > BestVal ||
756
+ Val > 0.999 * BestVal &&
757
+ ( NBr < BestNBr ||
758
+ NBr == BestNBr && Gain[Att] > Gain[BestAtt] ) )
759
+ {
760
+ BestAtt = Att;
761
+ BestVal = Val;
762
+ BestNBr = NBr;
763
+ }
764
+ }
765
+ }
766
+
767
+ return BestAtt;
768
+ }
769
+
770
+
771
+
772
+ /*************************************************************************/
773
+ /* */
774
+ /* Evaluate split on Att */
775
+ /* */
776
+ /*************************************************************************/
777
+
778
+
779
void EvalDiscrSplit(Attribute Att, CaseCount Cases)
/*   --------------  */
{
    DiscrValue v, NBr;

    /* Evaluate a split on discrete attribute Att, leaving the result
       in Gain[Att]/Info[Att], and count the branches (NBr) that the
       test would actually produce */

    Gain[Att] = None;

    if ( Skip(Att) || Att == ClassAtt ) return;

    if ( Ordered(Att) )
    {
	/* Ordered att: two-way split, plus an N/A branch if occupied */

	EvalOrderedAtt(Att, Cases);
	NBr = ( GEnv.ValFreq[1] > 0.5 ? 3 : 2 );
    }
    else
    if ( SUBSET && MaxAttVal[Att] > 3 )
    {
	/* Subset test: branch count is the number of value subsets */

	EvalSubset(Att, Cases);
	NBr = Subsets[Att];
    }
    else
    if ( ! Tested[Att] )
    {
	/* Plain discrete test: one branch per occupied value */

	EvalDiscreteAtt(Att, Cases);

	NBr = 0;
	ForEach(v, 1, MaxAttVal[Att])
	{
	    if ( GEnv.ValFreq[v] > 0.5 ) NBr++;
	}
    }
    else
    {
	/* Already tested on this path -- nothing to gain */

	NBr = 0;
    }

    /* Check that this test will not give too many leaves */

    if ( NBr > MaxLeaves + 1 )
    {
	Verbosity(2,
	    fprintf(Of, "\t(cancelled -- %d leaves, max %d)\n", NBr, MaxLeaves))

	Gain[Att] = None;
    }
}
825
+
826
+
827
+
828
+ /*************************************************************************/
829
+ /* */
830
+ /* Form the subtrees for the given node */
831
+ /* */
832
+ /*************************************************************************/
833
+
834
+
835
void Divide(Tree T, CaseNo Fp, CaseNo Lp, int Level)
/*   ------  */
{
    CaseNo Bp, Ep, Missing, Cases, i;
    CaseCount KnownCases, MissingCases, BranchCases;
    Attribute Att;
    double Factor;
    DiscrValue v;
    Boolean PrevUnitWeights;

    /* Build the subtrees of test node T.  Cases with unknown values of
       the tested attribute are distributed fractionally across all
       branches; their weights are scaled before each recursive call
       and restored afterwards */

    PrevUnitWeights = UnitWeights;

    Att = T->Tested;

    /* Move cases with unknown Att to the front: Fp..Ep */

    Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;

    KnownCases = T->Cases - (MissingCases = CountCases(Fp, Ep));

    if ( Missing )
    {
	UnitWeights = false;

	/* If using costs, must adjust branch factors to undo effects of
	   reweighting cases */

	if ( CostWeights )
	{
	    KnownCases = SumNocostWeights(Ep+1, Lp);
	}

	/* If there are many cases with missing values and many branches,
	   skip cases whose weight < 0.1 */

	if ( (Cases = No(Fp,Lp)) > 1000 &&
	     Missing > 0.5 * Cases &&
	     T->Forks >= 10 )
	{
	    ForEach(i, Fp, Ep)
	    {
		if ( Weight(Case[i]) < 0.1 )
		{
		    Missing--;
		    MissingCases -= Weight(Case[i]);
		    Swap(Fp, i);
		    Fp++;
		}
	    }

	    assert(Missing >= 0);
	}
    }

    Bp = Fp;
    ForEach(v, 1, T->Forks)
    {
	/* Group the cases for branch v after the missing-value block */

	Ep = Group(v, Bp + Missing, Lp, T);

	assert(Bp + Missing <= Lp+1 && Ep <= Lp);

	/* Bp -> first value in missing + remaining values
	   Ep -> last value in missing + current group */

	BranchCases = CountCases(Bp + Missing, Ep);

	/* Factor is this branch's share of the missing-value weight */

	Factor = ( ! Missing ? 0 :
		   ! CostWeights ? BranchCases / KnownCases :
		   SumNocostWeights(Bp + Missing, Ep) / KnownCases );

	if ( BranchCases + Factor * MissingCases >= MinLeaf )
	{
	    if ( Missing )
	    {
		/* Adjust weights of cases with missing values */

		ForEach(i, Bp, Bp + Missing - 1)
		{
		    Weight(Case[i]) *= Factor;
		}
	    }

	    FormTree(Bp, Ep, Level+1, &T->Branch[v]);

	    /* Restore weights if changed, and move the missing-value
	       cases back to the end of the processed range so they
	       precede the next branch's group */

	    if ( Missing )
	    {
		for ( i = Ep ; i >= Bp ; i-- )
		{
		    if ( Unknown(Case[i], Att) )
		    {
			Weight(Case[i]) /= Factor;
			Swap(i, Ep);
			Ep--;
		    }
		}
	    }

	    Bp = Ep+1;
	}
	else
	{
	    /* Too little weight for a subtree -- make a null leaf */

	    T->Branch[v] = Leaf(Nil, T->Leaf, 0.0, 0.0);
	}
    }

    UnitWeights = PrevUnitWeights;
}
941
+
942
+
943
+
944
+ /*************************************************************************/
945
+ /* */
946
+ /* Group together the cases corresponding to branch V of a test */
947
+ /* and return the index of the last such */
948
+ /* */
949
+ /* Note: if V equals zero, group the unknown values */
950
+ /* */
951
+ /*************************************************************************/
952
+
953
+
954
CaseNo Group(DiscrValue V, CaseNo Bp, CaseNo Ep, Tree TestNode)
/*     -----  */
{
    CaseNo i;
    Attribute Att;
    ContValue Thresh;
    Set SS;

    Att = TestNode->Tested;

    if ( ! V )
    {
	/* Group together unknown values (if any) */

	if ( SomeMiss[Att] )
	{
	    ForEach(i, Bp, Ep)
	    {
		if ( Unknown(Case[i], Att) )
		{
		    Swap(Bp, i);
		    Bp++;
		}
	    }
	}
    }
    else				/* skip non-existent N/A values */
    if ( V != 1 || TestNode->NodeType == BrSubset || SomeNA[Att] )
    {
	/* Group cases on the value of attribute Att, and depending
	   on the type of branch */

	switch ( TestNode->NodeType )
	{
	    case BrDiscr:

		/* One branch per discrete value */

		ForEach(i, Bp, Ep)
		{
		    if ( DVal(Case[i], Att) == V )
		    {
			Swap(Bp, i);
			Bp++;
		    }
		}
		break;

	    case BrThresh:

		/* Branch 1 is N/A; branch 2 is <= Cut, branch 3 is > Cut */

		Thresh = TestNode->Cut;
		ForEach(i, Bp, Ep)
		{
		    if ( V == 1 ? NotApplic(Case[i], Att) :
			 (CVal(Case[i], Att) <= Thresh) == (V == 2) )
		    {
			Swap(Bp, i);
			Bp++;
		    }
		}
		break;

	    case BrSubset:

		/* One branch per subset of discrete values */

		SS = TestNode->Subset[V];
		ForEach(i, Bp, Ep)
		{
		    if ( In(XDVal(Case[i], Att), SS) )
		    {
			Swap(Bp, i);
			Bp++;
		    }
		}
		break;
	}
    }

    return Bp - 1;
}
1031
+
1032
+
1033
+
1034
+ /*************************************************************************/
1035
+ /* */
1036
+ /* Return the total weight of cases from Fp to Lp */
1037
+ /* */
1038
+ /*************************************************************************/
1039
+
1040
+
1041
+ CaseCount SumWeights(CaseNo Fp, CaseNo Lp)
1042
+ /* ---------- */
1043
+ {
1044
+ double Sum=0.0;
1045
+ CaseNo i;
1046
+
1047
+ assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);
1048
+
1049
+ ForEach(i, Fp, Lp)
1050
+ {
1051
+ Sum += Weight(Case[i]);
1052
+ }
1053
+
1054
+ return Sum;
1055
+ }
1056
+
1057
+
1058
+
1059
+ /*************************************************************************/
1060
+ /* */
1061
+ /* Special version to undo the weightings associated with costs */
1062
+ /* */
1063
+ /*************************************************************************/
1064
+
1065
+
1066
+ CaseCount SumNocostWeights(CaseNo Fp, CaseNo Lp)
1067
+ /* ---------------- */
1068
+ {
1069
+ double Sum=0.0;
1070
+ CaseNo i;
1071
+
1072
+ assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);
1073
+
1074
+ ForEach(i, Fp, Lp)
1075
+ {
1076
+ Sum += Weight(Case[i]) / WeightMul[Class(Case[i])];
1077
+ }
1078
+
1079
+ return Sum;
1080
+ }
1081
+
1082
+
1083
+
1084
+ /*************************************************************************/
1085
+ /* */
1086
+ /* Generate class frequency distribution */
1087
+ /* */
1088
+ /*************************************************************************/
1089
+
1090
+
1091
+ void FindClassFreq(double *CF, CaseNo Fp, CaseNo Lp)
1092
+ /* ------------- */
1093
+ {
1094
+ ClassNo c;
1095
+ CaseNo i;
1096
+
1097
+ assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);
1098
+
1099
+ ForEach(c, 0, MaxClass)
1100
+ {
1101
+ CF[c] = 0;
1102
+ }
1103
+
1104
+ ForEach(i, Fp, Lp)
1105
+ {
1106
+ assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
1107
+
1108
+ CF[ Class(Case[i]) ] += Weight(Case[i]);
1109
+ }
1110
+ }
1111
+
1112
+
1113
+
1114
+ /*************************************************************************/
1115
+ /* */
1116
+ /* Find all discrete frequencies */
1117
+ /* */
1118
+ /*************************************************************************/
1119
+
1120
+
1121
void FindAllFreq(CaseNo Fp, CaseNo Lp)
/*   -----------  */
{
    ClassNo c;
    CaseNo i;
    Attribute Att, a;
    CaseCount w;
    int x;

    /* One pass over cases Fp..Lp filling ClassFreq[] and, for every
       discrete attribute in DList, the value/class table DFreq[Att]
       (indexed MaxClass * value + class-1) */

    /* Zero all values */

    ForEach(c, 0, MaxClass)
    {
	ClassFreq[c] = 0;
    }

    for ( a = 0 ; a < NDList ; a++ )
    {
	Att = DList[a];
	for ( x = MaxClass * (MaxAttVal[Att]+1) - 1 ; x >= 0 ; x-- )
	{
	    DFreq[Att][x] = 0;
	}
    }

    /* Scan cases */

    ForEach(i, Fp, Lp)
    {
	ClassFreq[ (c=Class(Case[i])) ] += (w=Weight(Case[i]));

	for ( a = 0 ; a < NDList ; a++ )
	{
	    Att = DList[a];
	    DFreq[Att][ MaxClass * XDVal(Case[i], Att) + (c-1) ] += w;
	}
    }
}