see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,459 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Divide-and-Conquer for discrete attributes */
30
+ /* ------------------------------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+ #include "defns.i"
35
+ #include "extern.i"
36
+
37
+
38
+ /*************************************************************************/
39
+ /* */
40
+ /* Known values of continuous attributes are divided into */
41
+ /* three groups: */
42
+ /* (1) N/A */
43
+ /* (2) values less than a threshold */
44
+ /* (3) values greater than a threshold */
45
+ /* This routine finds the best threshold for items Fp through Lp */
46
+ /* and sets Gain[], Info[] and Bar[] */
47
+ /* */
48
+ /*************************************************************************/
49
+
50
+
51
+ void DEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
52
+ /* -------------- */
53
+ {
54
+ CaseNo i, BestI, Xp;
55
+ CaseCount Cases;
56
+ DiscrValue v, c;
57
+ double BestGain=-1E-6, BestInfo, ThisGain;
58
+
59
+ /* Reset frequencies */
60
+
61
+ ForEach(v, 0, 3)
62
+ {
63
+ ForEach(c, 1, MaxAttVal[ClassAtt])
64
+ {
65
+ GEnv.Freq[v][c] = 0;
66
+ }
67
+ GEnv.ValFreq[v] = 0;
68
+ }
69
+
70
+ /* Omit and count N/A values */
71
+
72
+ Xp = Fp;
73
+ ForEach(i, Fp, Lp)
74
+ {
75
+ if ( NotApplic(Case[i],Att) )
76
+ {
77
+ GEnv.Freq[1][DClass(Case[i])]++;
78
+ Swap(Xp, i);
79
+ Xp++;
80
+ }
81
+ else
82
+ {
83
+ GEnv.Freq[3][DClass(Case[i])]++;
84
+ }
85
+ }
86
+
87
+ /* Special case when very few known values */
88
+
89
+ if ( No(Xp, Lp) < 2 * (DMINITEMS*GEnv.FRAC) )
90
+ {
91
+ Verbosity(2,
92
+ fprintf(Of, "\tAtt %s: insufficient cases with known values\n",
93
+ AttName[Att]))
94
+ return;
95
+ }
96
+
97
+ Cases = No(Fp, Lp);
98
+
99
+ GEnv.ValFreq[0] = GEnv.ValFreq[1] = 0;
100
+ ForEach(c, 1, MaxAttVal[ClassAtt])
101
+ {
102
+ GEnv.ValFreq[1] += GEnv.Freq[1][c];
103
+ }
104
+
105
+ /* Sort all applicable values */
106
+
107
+ Quicksort(Xp, Lp, Att);
108
+
109
+ /* Try possible cuts between items i and i+1, and determine the
110
+ information and gain of the split in each case */
111
+
112
+ if ( Xp + (DMINITEMS*GEnv.FRAC) - 2 >= Lp ) return;
113
+ ForEach(i, Xp, Xp + (DMINITEMS*GEnv.FRAC) - 2)
114
+ {
115
+ c = DClass(Case[i]);
116
+
117
+ GEnv.Freq[2][c]++;
118
+ GEnv.Freq[3][c]--;
119
+ }
120
+
121
+ ForEach(i, Xp + (DMINITEMS*GEnv.FRAC) - 1, Lp - (DMINITEMS*GEnv.FRAC))
122
+ {
123
+ c = DClass(Case[i]);
124
+
125
+ GEnv.Freq[2][c]++;
126
+ GEnv.Freq[3][c]--;
127
+
128
+ if ( CVal(Case[i+1], Att) > CVal(Case[i], Att) )
129
+ {
130
+ GEnv.ValFreq[2] = i - Xp + 1;
131
+ GEnv.ValFreq[3] = Lp - i;
132
+
133
+ ThisGain = DiscrGain(3, Cases);
134
+
135
+ if ( ThisGain > BestGain + Epsilon )
136
+ {
137
+ BestGain = ThisGain;
138
+ BestInfo = TotalInfo(GEnv.ValFreq, 1, 3) / Cases;
139
+ BestI = i;
140
+
141
+ ForEach(v, 1, 3)
142
+ {
143
+ ForEach(c, 1, MaxAttVal[ClassAtt])
144
+ {
145
+ GEnv.BestFreq[v][c] = GEnv.Freq[v][c];
146
+ }
147
+ }
148
+ }
149
+ }
150
+ }
151
+
152
+ /* If a test on the attribute is able to make a gain,
153
+ set the best break point, gain and information */
154
+
155
+ if ( BestGain > Epsilon )
156
+ {
157
+ GEnv.Gain[Att] = BestGain;
158
+ GEnv.Info[Att] = BestInfo;
159
+ GEnv.Bar[Att] = Between(CVal(Case[BestI],Att), CVal(Case[BestI+1],Att));
160
+
161
+ Verbosity(2,
162
+ fprintf(Of, "\tAtt %s: cut=%.3f, inf %.3f, gain %.3f\n",
163
+ AttName[Att], GEnv.Bar[Att], GEnv.Info[Att], GEnv.Gain[Att]))
164
+
165
+ /* If not sampling, check subsets */
166
+
167
+ if ( GEnv.FRAC >= 1 )
168
+ {
169
+ if ( Xp > Fp )
170
+ {
171
+ NoteTest(Att, 1, 0.0, Nil);
172
+ FindDiscrOutliers(Fp, Xp-1, GEnv.BestFreq[1]);
173
+ }
174
+
175
+ NoteTest(Att, 2, GEnv.Bar[Att], Nil);
176
+ FindDiscrOutliers(Xp, BestI, GEnv.BestFreq[2]);
177
+
178
+ NoteTest(Att, 3, GEnv.Bar[Att], Nil);
179
+ FindDiscrOutliers(BestI+1, Lp, GEnv.BestFreq[3]);
180
+ }
181
+ }
182
+ else
183
+ {
184
+ Verbosity(2, fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]))
185
+ }
186
+ }
187
+
188
+
189
+
190
+ /*************************************************************************/
191
+ /* */
192
+ /* Set Info[] and Gain[] for discrete partition of items Fp to Lp */
193
+ /* */
194
+ /*************************************************************************/
195
+
196
+
197
+ void DEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
198
+ /* ------------- */
199
+ {
200
+ CaseCount KnownCases;
201
+ int ReasonableSubsets=0;
202
+ DiscrValue v;
203
+
204
+ ComputeFrequencies(Att, Fp, Lp);
205
+ KnownCases = No(Fp, Lp);
206
+
207
+ /* Check reasonable subsets */
208
+
209
+ ForEach(v, 1, MaxAttVal[Att])
210
+ {
211
+ if ( GEnv.ValFreq[v] >= (DMINITEMS*GEnv.FRAC) ) ReasonableSubsets++;
212
+ }
213
+
214
+ if ( ReasonableSubsets < 2 )
215
+ {
216
+ Verbosity(2, fprintf(Of, "\tAtt %s: poor split\n", AttName[Att]))
217
+ return;
218
+ }
219
+
220
+ GEnv.Gain[Att] = DiscrGain(MaxAttVal[Att], KnownCases);
221
+ GEnv.Info[Att] = TotalInfo(GEnv.ValFreq, 1, MaxAttVal[Att]) / KnownCases;
222
+
223
+ if ( GEnv.Gain[Att] > Epsilon )
224
+ {
225
+ Verbosity(2,
226
+ fprintf(Of, "\tAtt %s: inf %.3f, gain %.3f\n",
227
+ AttName[Att], GEnv.Info[Att], GEnv.Gain[Att]))
228
+ }
229
+ else
230
+ {
231
+ GEnv.Gain[Att] = None;
232
+ Verbosity(2,
233
+ fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]))
234
+ }
235
+
236
+ if ( GEnv.FRAC >= 1 && GEnv.Gain[Att] > Epsilon )
237
+ {
238
+ CheckPotentialClusters(Att, MaxAttVal[Att], Fp, Lp, 0.0, Nil, GEnv.Freq);
239
+ }
240
+ }
241
+
242
+
243
+
244
+ /*************************************************************************/
245
+ /* */
246
+ /* Set Info[] and Gain[] for ordered split on items Fp to Lp */
247
+ /* */
248
+ /*************************************************************************/
249
+
250
+
251
+ void DEvalOrderedAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
252
+ /* --------------- */
253
+ {
254
+ CaseCount KnownCases, *HoldFreqRow, SplitFreq[4];
255
+ DiscrValue v, BestV, vv, c;
256
+ double ThisGain, BestInfo, BestGain=-1E-6;
257
+
258
+ ComputeFrequencies(Att, Fp, Lp);
259
+
260
+ KnownCases = No(Fp, Lp);
261
+
262
+ /* Move elts of Freq[] starting with the third up one place
263
+ and aggregate class frequencies */
264
+
265
+ HoldFreqRow = GEnv.Freq[MaxAttVal[Att]+1];
266
+ ForEach(c, 1, MaxAttVal[ClassAtt])
267
+ {
268
+ HoldFreqRow[c] = 0;
269
+ }
270
+ SplitFreq[0] = GEnv.ValFreq[0];
271
+ SplitFreq[1] = GEnv.ValFreq[1];
272
+ SplitFreq[2] = GEnv.ValFreq[2];
273
+ SplitFreq[3] = 0;
274
+
275
+ for ( v = MaxAttVal[Att] ; v > 2 ; v-- )
276
+ {
277
+ GEnv.Freq[v+1] = GEnv.Freq[v];
278
+ ForEach(c, 1, MaxAttVal[ClassAtt])
279
+ {
280
+ HoldFreqRow[c] += GEnv.Freq[v][c];
281
+ }
282
+ SplitFreq[3] += GEnv.ValFreq[v];
283
+ }
284
+
285
+ GEnv.Freq[3] = HoldFreqRow;
286
+
287
+ /* Try various cuts, saving the one with maximum gain */
288
+
289
+ ForEach(v, 3, MaxAttVal[Att])
290
+ {
291
+ if ( SplitFreq[2] >= (DMINITEMS*GEnv.FRAC) &&
292
+ SplitFreq[3] >= (DMINITEMS*GEnv.FRAC) )
293
+ {
294
+ ThisGain = DiscrGain(3, KnownCases);
295
+
296
+ if ( ThisGain > BestGain + Epsilon )
297
+ {
298
+ BestGain = ThisGain;
299
+ BestInfo = TotalInfo(SplitFreq, 0, 3) / KnownCases;
300
+ BestV = v-1;
301
+
302
+ ForEach(vv, 1, 3)
303
+ {
304
+ ForEach(c, 1, MaxAttVal[ClassAtt])
305
+ {
306
+ GEnv.BestFreq[vv][c] = GEnv.Freq[vv][c];
307
+ }
308
+ }
309
+ }
310
+ }
311
+
312
+ /* Move val v from right branch to left branch */
313
+
314
+ ForEach(c, 1, MaxAttVal[ClassAtt])
315
+ {
316
+ GEnv.Freq[2][c] += GEnv.Freq[v+1][c];
317
+ GEnv.Freq[3][c] -= GEnv.Freq[v+1][c];
318
+ }
319
+ SplitFreq[2] += GEnv.ValFreq[v];
320
+ SplitFreq[3] -= GEnv.ValFreq[v];
321
+ }
322
+
323
+ /* If a test on the attribute is able to make a gain,
324
+ set the best break point, gain and information */
325
+
326
+ if ( BestGain > Epsilon )
327
+ {
328
+ GEnv.Gain[Att] = BestGain;
329
+ GEnv.Info[Att] = BestInfo;
330
+ GEnv.Bar[Att] = BestV + 0.1;
331
+
332
+ ClearBits((MaxAttVal[Att]>>3)+1, GEnv.Subset[Att]);
333
+ ForEach(v, 2, BestV)
334
+ {
335
+ SetBit(v, GEnv.Subset[Att]);
336
+ }
337
+
338
+ Verbosity(2,
339
+ fprintf(Of, "\tAtt %s: cut after %s, inf %.3f, gain %.3f\n",
340
+ AttName[Att], AttValName[Att][(int) GEnv.Bar[Att]],
341
+ GEnv.Info[Att], GEnv.Gain[Att]))
342
+
343
+ if ( GEnv.FRAC >= 1 )
344
+ {
345
+ CheckPotentialClusters(Att, 3, Fp, Lp, GEnv.Bar[Att], GEnv.Subset[Att],
346
+ GEnv.BestFreq);
347
+ }
348
+ }
349
+ else
350
+ {
351
+ Verbosity(2, fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]))
352
+ }
353
+ }
354
+
355
+
356
+
357
+ /*************************************************************************/
358
+ /* */
359
+ /* Compute frequency tables Freq[][] and ValFreq[] for attribute */
360
+ /* Att from items Fp to Lp */
361
+ /* */
362
+ /*************************************************************************/
363
+
364
+
365
+ void ComputeFrequencies(Attribute Att, CaseNo Fp, CaseNo Lp)
366
+ /* ------------------ */
367
+ {
368
+ DiscrValue v, c;
369
+ CaseNo Sum;
370
+
371
+ ForEach(v, 0, MaxAttVal[Att])
372
+ {
373
+ Sum = 0;
374
+ ForEach(c, 1, MaxAttVal[ClassAtt])
375
+ {
376
+ Sum += (GEnv.Freq[v][c] = GEnv.DFreq[Att][v][c]);
377
+ }
378
+ GEnv.ValFreq[v] = Sum;
379
+ }
380
+ }
381
+
382
+
383
+
384
+ void FindClassFrequencies(CaseNo Fp, CaseNo Lp)
385
+ /* -------------------- */
386
+ {
387
+ DiscrValue v;
388
+ CaseNo i;
389
+
390
+ ForEach(v, 1, MaxAttVal[ClassAtt])
391
+ {
392
+ GEnv.ClassFreq[v] = 0;
393
+ }
394
+
395
+ ForEach(i, Fp, Lp)
396
+ {
397
+ GEnv.ClassFreq[DClass(Case[i])]++;
398
+ }
399
+ }
400
+
401
+
402
+
403
+ /*************************************************************************/
404
+ /* */
405
+ /* Given Freq[][] and ValFreq[], compute the information gain */
406
+ /* */
407
+ /*************************************************************************/
408
+
409
+
410
+ double DiscrGain(DiscrValue MaxVal, CaseCount KnownCases)
411
+ /* --------- */
412
+ {
413
+ DiscrValue v;
414
+ double ThisInfo=0.0;
415
+
416
+ /* Check whether all values are unknown or the same */
417
+
418
+ if ( ! KnownCases ) return None;
419
+
420
+ /* Compute total info after split, by summing the
421
+ info of each of the subsets formed by the test */
422
+
423
+ ForEach(v, 1, MaxVal)
424
+ {
425
+ ThisInfo += TotalInfo(GEnv.Freq[v], 1, MaxAttVal[ClassAtt]);
426
+ }
427
+
428
+ /* Set the gain in information for all items */
429
+
430
+ return GEnv.BaseInfo - ThisInfo / KnownCases;
431
+ }
432
+
433
+
434
+
435
+ /*************************************************************************/
436
+ /* */
437
+ /* Compute the total information in V[ MinVal..MaxVal ]. */
438
+ /* Use tabulate logs of numbers of cases */
439
+ /* */
440
+ /*************************************************************************/
441
+
442
+
443
+ double TotalInfo(CaseCount V[], DiscrValue MinVal, DiscrValue MaxVal)
444
+ /* --------- */
445
+ {
446
+ DiscrValue v;
447
+ double Sum=0.0;
448
+ CaseCount N, TotalCases=0;
449
+
450
+ ForEach(v, MinVal, MaxVal)
451
+ {
452
+ N = V[v];
453
+
454
+ Sum += N * LogCaseNo[N];
455
+ TotalCases += N;
456
+ }
457
+
458
+ return TotalCases * LogCaseNo[TotalCases] - Sum;
459
+ }