see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/xval ADDED
@@ -0,0 +1,150 @@
1
#! /bin/csh
#---------------------------------------------------------------------
# Multi F-fold cross-validation script
#---------------------------------------------------------------------
#
# Invocation:
#   xval [C5.0 options] [F=folds] [R=repeats] [+label] [+d]
#
# Carries out R F-fold cross-validations
#
# If +d is used, individual results from each block are left in
#     <filestem>.o<cross-validation no>[+label]
# Averages over cross-validations are written to
#     <filestem>.res[+label]
#
# Exit status:
#     0  after -h (help requested)
#     1  if an unrecognised or inappropriate option was given
#---------------------------------------------------------------------


# Sort the options into those applying to C5.0 and the rest
#
#   opts      options passed through unchanged to c5.0
#   folds     number of folds (F=, -X)
#   repeats   number of cross-validations to run (R=)
#   label     optional +suffix appended to output file names
#   filestem  application filestem (-f)
#   rules     1 if -r was given; handed to the report program
#   usage_rc  exit status used after printing the option summary

set opts =
set folds = 10
set repeats = 1
set label =
set filestem = undefined
set rules = 0
set usage_rc = 0

set i = 1
while ( $i <= $#argv )
    set opt = $argv[$i]

    switch ( $opt )
    case "F=*":
        set folds = `echo $opt | sed s/F=//`
        breaksw
    case "R=*":
        set repeats = `echo $opt | sed s/R=//`
        breaksw
    case "+d":
        # Only the existence of $details is tested later
        set details
        breaksw
    case "+*":
        set label = $opt
        breaksw
    case "-f":
        @ i++
        set filestem = $argv[$i]
        breaksw
    case "-f*":
        set filestem = `echo $opt | sed s/-f//`
        breaksw

    # Options whose value is the following argument: glue the two
    # together so that they travel as a single word in $opts.
    # NOTE: a user-supplied -I is still forwarded here even though
    # c5.0 is also given "-I <repeat no>" below.
    case "-t":
    case "-m":
    case "-c":
    case "-u":
    case "-S":
    case "-I":
        @ i++
        set opts = ( $opts ${opt}$argv[$i] )
        breaksw

    # Flags, and options that already carry their value
    case "-b":
    case "-p":
    case "-e":
    case "-t*":
    case "-g":
    case "-s":
    case "-w":
    case "-u*":
    case "-m*":
    case "-c*":
    case "-S*":
    case "-I*":
        set opts = ( $opts $opt )
        breaksw
    case "-r":
        set opts = ( $opts $opt )
        set rules = 1
        breaksw
    case "-X":
        @ i++
        set folds = $argv[$i]
        breaksw
    case "-X*":
        set folds = `echo $opt | sed s/-X//`
        breaksw
    default:
        echo "unrecognised or inappropriate option" $opt
        # A bad option is an error: remember that, then fall through
        # to print the option summary.
        set usage_rc = 1
    case "-h":
        echo ""
        echo "Summary of options for xval:"
        echo ""
        echo " F=<f> set f folds"
        echo " R=<r> repeat r times"
        echo " +d retain detailed files"
        echo " +s label all output files with suffix +s"
        echo ""
        echo " -f <filestem> application filestem"
        echo " -r use rule-based classifiers"
        echo " -u <bands> order rules by utility"
        echo " -w invoke attribute winnowing"
        echo " -b invoke 10-trial boosting"
        echo " -t <trials> number of boosting trials"
        echo " -p use soft thresholds"
        echo " -e focus on errors (ignore costs file)"
        echo " -s find subset tests for discrete atts"
        echo " -m <objs> restrict allowable splits"
        echo " -c <CF> confidence level for pruning"
        echo " -S <percent> training sample percentage"
        echo " -X <folds> cross-validate"
        echo " -I <integer> random seed [ignored]"
        echo " -h print this message"
        exit $usage_rc
    endsw

    @ i++
end


# Clear the summary file

cp /dev/null $filestem.xsum


# Repeat cross-validations, incrementing the random seed.
# Lines of c5.0 output containing "<<" are collected in the summary file.

set r = 0
while ( $r < $repeats )

    set outf = $filestem.o$r$label
    c5.0 -f $filestem $opts -X $folds -I $r >$outf
    grep "<<" $outf >> $filestem.xsum

    @ r++
end


# Find the number of cases in the training and test files.
# The counts are taken from the "Read ..." lines of the last output
# file: word 2 is used as the training count and, when a .test file
# exists, word 9 is added to it.

set junk = `grep ^Read $outf`
@ examples = $junk[2]
if ( -e $filestem.test ) then
    @ examples += $junk[9]
endif


# Summarize results, then remove the temporary file

report $examples $folds $repeats $rules <$filestem.xsum >$filestem.res$label

rm $filestem.xsum
if ( ! $?details ) rm -f $filestem.o[0-9]*$label
data/ext/c5.0/xval.c ADDED
@@ -0,0 +1,402 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Carry out crossvalidation trials */
30
+ /* -------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+ #include "defns.i"
35
+ #include "extern.i"
36
+
37
+
38
+ DataRec *Blocked=Nil;
39
+ float **Result=Nil; /* Result[f][0] = tree/ruleset size
40
+ [1] = tree/ruleset errors
41
+ [2] = tree/ruleset cost */
42
+
43
+
44
+
45
+ /*************************************************************************/
46
+ /* */
47
+ /* Outer function (differs from xval script) */
48
+ /* */
49
+ /*************************************************************************/
50
+
51
+
52
+ void CrossVal()
53
+ /* -------- */
54
+ {
55
+ CaseNo i, Size, Start=0, Next, SaveMaxCase;
56
+ int f, SmallTestBlocks, t, SaveTRIALS;
57
+ ClassNo c;
58
+ static CaseNo *ConfusionMat=Nil;
59
+ static int SaveFOLDS=0;
60
+
61
+ /* Check for left-overs after interrupt */
62
+
63
+ if ( Result )
64
+ {
65
+ FreeVector((void **) Result, 0, SaveFOLDS-1);
66
+ Free(ConfusionMat);
67
+ }
68
+
69
+ if ( FOLDS > MaxCase+1 )
70
+ {
71
+ fprintf(Of, T_FoldsReduced);
72
+ FOLDS = MaxCase+1;
73
+ }
74
+
75
+ Result = AllocZero((SaveFOLDS = FOLDS), float *);
76
+ Blocked = Alloc(MaxCase+1, DataRec);
77
+ ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);
78
+
79
+ Prepare();
80
+
81
+ SaveMaxCase = MaxCase;
82
+ SaveTRIALS = TRIALS;
83
+
84
+ /* First test blocks may be smaller than the others */
85
+
86
+ SmallTestBlocks = FOLDS - ((MaxCase+1) % FOLDS);
87
+ Size = (MaxCase + 1) / FOLDS;
88
+
89
+ ForEach(f, 0, FOLDS-1)
90
+ {
91
+ fprintf(Of, "\n\n[ " T_Fold " %d ]\n", f+1);
92
+ Result[f] = AllocZero(3, float);
93
+
94
+ if ( f == SmallTestBlocks ) Size++;
95
+ MaxCase = SaveMaxCase - Size;
96
+
97
+ ForEach(i, 0, MaxCase)
98
+ {
99
+ Case[i] = Blocked[Start];
100
+ Start = (Start + 1) % (SaveMaxCase + 1);
101
+ }
102
+
103
+ ConstructClassifiers();
104
+
105
+ /* Check size (if appropriate) and errors */
106
+
107
+ if ( TRIALS == 1 )
108
+ {
109
+ Result[f][0] = ( RULES ? RuleSet[0]->SNRules :
110
+ TreeSize(Pruned[0]) );
111
+ Next = Start;
112
+ ForEach(i, 0, Size-1)
113
+ {
114
+ Case[i] = Blocked[Next];
115
+ c = ( RULES ? RuleClassify(Blocked[Next], RuleSet[0]) :
116
+ TreeClassify(Blocked[Next], Pruned[0]) );
117
+ if ( c != Class(Blocked[Next]) )
118
+ {
119
+ Result[f][1] += 1.0;
120
+ if ( MCost )
121
+ {
122
+ Result[f][2] += MCost[c][Class(Blocked[Next])];
123
+ }
124
+ }
125
+
126
+ /* Add to confusion matrix for target classifier */
127
+
128
+ ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;
129
+
130
+ Next = (Next + 1) % (SaveMaxCase + 1);
131
+ }
132
+ }
133
+ else
134
+ {
135
+ Result[f][0] = -1;
136
+ Next = Start;
137
+ Default = ( RULES ? RuleSet[0]->SDefault : Pruned[0]->Leaf );
138
+ ForEach(i, 0, Size-1)
139
+ {
140
+ Case[i] = Blocked[Next];
141
+ c = BoostClassify(Blocked[Next], TRIALS-1);
142
+ if ( c != Class(Blocked[Next]) )
143
+ {
144
+ Result[f][1] += 1.0;
145
+ if ( MCost )
146
+ {
147
+ Result[f][2] += MCost[c][Class(Blocked[Next])];
148
+ }
149
+ }
150
+
151
+ /* Add to confusion matrix for target classifier */
152
+
153
+ ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;
154
+
155
+ Next = (Next + 1) % (SaveMaxCase + 1);
156
+ }
157
+ }
158
+
159
+ Result[f][1] = (100.0 * Result[f][1]) / Size;
160
+ Result[f][2] /= Size;
161
+
162
+ fprintf(Of, T_EvalHoldOut, Size);
163
+ MaxCase = Size-1;
164
+ Evaluate(0);
165
+
166
+ /* Free space used by classifiers */
167
+
168
+ ForEach(t, 0, MaxTree)
169
+ {
170
+ FreeClassifier(t);
171
+ }
172
+ MaxTree = -1;
173
+
174
+ TRIALS = SaveTRIALS;
175
+ }
176
+
177
+ /* Print summary of crossvalidation */
178
+
179
+ MaxCase = SaveMaxCase;
180
+
181
+ Summary();
182
+ PrintConfusionMatrix(ConfusionMat);
183
+
184
+ /* Free local storage */
185
+
186
+ ForEach(i, 0, MaxCase)
187
+ {
188
+ Case[i] = Blocked[i];
189
+ }
190
+
191
+ FreeVector((void **) Result, 0, FOLDS-1); Result = Nil;
192
+ Free(Blocked); Blocked = Nil;
193
+ Free(ConfusionMat); ConfusionMat = Nil;
194
+ }
195
+
196
+
197
+
198
+ /*************************************************************************/
199
+ /* */
200
+ /* Prepare data for crossvalidation (similar to xval-prep.c) */
201
+ /* */
202
+ /*************************************************************************/
203
+
204
+
205
+ void Prepare()
206
+ /* ------- */
207
+ {
208
+ CaseNo i, First=0, Last, *Temp, Hold, Next=0;
209
+ ClassNo Group;
210
+
211
+ Temp = Alloc(MaxCase+1, CaseNo);
212
+ ForEach(i, 0, MaxCase)
213
+ {
214
+ Temp[i] = i;
215
+ }
216
+
217
+ Shuffle(Temp);
218
+
219
+ /* Sort into class groups */
220
+
221
+ while ( First <= MaxCase )
222
+ {
223
+ Last = First;
224
+ Group = Class(Case[Temp[First]]);
225
+
226
+ ForEach(i, First+1, MaxCase)
227
+ {
228
+ if ( Class(Case[Temp[i]]) == Group )
229
+ {
230
+ Last++;
231
+ Hold = Temp[Last];
232
+ Temp[Last] = Temp[i];
233
+ Temp[i] = Hold;
234
+ }
235
+ }
236
+
237
+ First = Last+1;
238
+ }
239
+
240
+ /* Organize into stratified blocks */
241
+
242
+ ForEach(First, 0, FOLDS-1)
243
+ {
244
+ for ( i = First ; i <= MaxCase ; i += FOLDS )
245
+ {
246
+ Blocked[Next++] = Case[Temp[i]];
247
+ }
248
+ }
249
+
250
+ Free(Temp);
251
+ }
252
+
253
+
254
+
255
+ /*************************************************************************/
256
+ /* */
257
+ /* Shuffle the data cases */
258
+ /* */
259
+ /*************************************************************************/
260
+
261
+
262
+ void Shuffle(int *Vec)
263
+ /* ------- */
264
+ {
265
+ int This=0, Alt, Left=MaxCase+1, Hold;
266
+
267
+ ResetKR(KRInit);
268
+
269
+ while ( Left )
270
+ {
271
+ Alt = This + (Left--) * KRandom();
272
+
273
+ Hold = Vec[This];
274
+ Vec[This++] = Vec[Alt];
275
+ Vec[Alt] = Hold;
276
+ }
277
+ }
278
+
279
+
280
+
281
+ /*************************************************************************/
282
+ /* */
283
+ /* Summarise a crossvalidation */
284
+ /* */
285
+ /*************************************************************************/
286
+
287
+
288
+ char
289
+ *FoldHead[] = { F_Fold, F_UFold, "" };
290
+
291
+ void Summary()
292
+ /* ------- */
293
+ {
294
+ int i, f, t;
295
+ Boolean PrintSize=true;
296
+ float Sum[3], SumSq[3];
297
+ extern char *StdP[], *StdPC[], *Extra[], *ExtraC[];
298
+
299
+ for ( i = 0 ; i < 3 ; i++ )
300
+ {
301
+ Sum[i] = SumSq[i] = 0;
302
+ }
303
+
304
+ ForEach(f, 0, FOLDS-1)
305
+ {
306
+ if ( Result[f][0] < 1 ) PrintSize = false;
307
+ }
308
+
309
+ fprintf(Of, "\n\n[ " T_Summary " ]\n\n");
310
+
311
+ ForEach(t, 0, 2)
312
+ {
313
+ fprintf(Of, "%s", FoldHead[t]);
314
+ putc('\t', Of);
315
+ if ( RULES )
316
+ {
317
+ fprintf(Of, "%s", ( MCost ? ExtraC[t] : Extra[t] ));
318
+ }
319
+ else
320
+ {
321
+ fprintf(Of, "%s", ( MCost ? StdPC[t] : StdP[t] ));
322
+ }
323
+ putc('\n', Of);
324
+ }
325
+ putc('\n', Of);
326
+
327
+ ForEach(f, 0, FOLDS-1)
328
+ {
329
+ fprintf(Of, "%4d\t", f+1);
330
+
331
+ if ( PrintSize )
332
+ {
333
+ fprintf(Of, " %5g", Result[f][0]);
334
+ }
335
+ else
336
+ {
337
+ fprintf(Of, " *");
338
+ }
339
+ fprintf(Of, " %10.1f%%", Result[f][1]);
340
+
341
+ if ( MCost )
342
+ {
343
+ fprintf(Of, "%7.2f", Result[f][2]);
344
+ }
345
+ fprintf(Of, "\n");
346
+
347
+ for ( i = 0 ; i < 3 ; i++ )
348
+ {
349
+ Sum[i] += Result[f][i];
350
+ SumSq[i] += Result[f][i] * Result[f][i];
351
+ }
352
+ }
353
+
354
+ fprintf(Of, "\n " T_Mean "\t");
355
+
356
+ if ( ! PrintSize )
357
+ {
358
+ fprintf(Of, " ");
359
+ }
360
+ else
361
+ {
362
+ fprintf(Of, "%6.1f", Sum[0] / FOLDS);
363
+ }
364
+
365
+ fprintf(Of, " %10.1f%%", Sum[1] / FOLDS);
366
+
367
+ if ( MCost )
368
+ {
369
+ fprintf(Of, "%7.2f", Sum[2] / FOLDS);
370
+ }
371
+
372
+ fprintf(Of, "\n " T_SE "\t");
373
+
374
+ if ( ! PrintSize )
375
+ {
376
+ fprintf(Of, " ");
377
+ }
378
+ else
379
+ {
380
+ fprintf(Of, "%6.1f", SE(Sum[0], SumSq[0], FOLDS));
381
+ }
382
+
383
+ fprintf(Of, " %10.1f%%", SE(Sum[1], SumSq[1], FOLDS));
384
+
385
+ if ( MCost )
386
+ {
387
+ fprintf(Of, "%7.2f", SE(Sum[2], SumSq[2], FOLDS));
388
+ }
389
+ fprintf(Of, "\n");
390
+ }
391
+
392
+
393
+
394
+ float SE(float sum, float sumsq, int no)
395
+ /* -- */
396
+ {
397
+ float mean;
398
+
399
+ mean = sum / no;
400
+
401
+ return sqrt( ((sumsq - no * mean * mean) / (no - 1)) / no );
402
+ }