see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/xval ADDED
@@ -0,0 +1,150 @@
1
#! /bin/csh
#---------------------------------------------------------------------
# Multi F-fold cross-validation script
#---------------------------------------------------------------------
#
# Invocation:
#   xval [C5.0 options] [F=folds] [R=repeats] [+label] [+d]
#
# Carries out R F-fold cross-validations
#
# If +d is used, individual results from each block are left in
#   <filestem>.o<cross-validation no>[+label]
# Averages over cross-validations are written to
#   <filestem>.res[+label]
#
# Exit status:
#   0  normal completion, or help requested with -h
#   1  unrecognised option, or fewer than one repeat requested
#---------------------------------------------------------------------


# Sort the options into those applying to C5.0 and the rest

set opts =
set folds = 10
set repeats = 1
set label =
set filestem = undefined
set rules = 0

set i = 1
while ( $i <= $#argv )
    set opt = $argv[$i]

    switch ( $opt )
    case "F=*":
        set folds = `echo $opt | sed s/F=//`
        breaksw
    case "R=*":
        set repeats = `echo $opt | sed s/R=//`
        breaksw
    case "+d":
        set details
        breaksw
    case "+*":
        set label = $opt
        breaksw
    case "-f":
        @ i++
        set filestem = $argv[$i]
        breaksw
    case "-f*":
        set filestem = `echo $opt | sed s/-f//`
        breaksw

    # C5.0 options whose value is the next word: pass on as one word
    case "-t":
    case "-m":
    case "-c":
    case "-u":
    case "-S":
    case "-I":
        @ i++
        set opts = ( $opts ${opt}$argv[$i] )
        breaksw

    # C5.0 flags, and options with the value already attached
    case "-b":
    case "-p":
    case "-e":
    case "-t*":
    case "-g":
    case "-s":
    case "-w":
    case "-u*":
    case "-m*":
    case "-c*":
    case "-S*":
    case "-I*":
        set opts = ( $opts $opt )
        breaksw
    case "-r":
        set opts = ( $opts $opt )
        set rules = 1
        breaksw
    case "-X":
        @ i++
        set folds = $argv[$i]
        breaksw
    case "-X*":
        set folds = `echo $opt | sed s/-X//`
        breaksw
    default:
        echo "unrecognised or inappropriate option" $opt
        # remember the error; control falls through to the help text
        set badopt
    case "-h":
        echo ""
        echo "Summary of options for xval:"
        echo ""
        echo " F=<f> set f folds"
        echo " R=<r> repeat r times"
        echo " +d retain detailed files"
        echo " +s label all output files with suffix +s"
        echo ""
        echo " -f <filestem> application filestem"
        echo " -r use rule-based classifiers"
        echo " -u <bands> order rules by utility"
        echo " -w invoke attribute winnowing"
        echo " -b invoke 10-trial boosting"
        echo " -t <trials> number of boosting trials"
        echo " -p use soft thresholds"
        echo " -e focus on errors (ignore costs file)"
        echo " -s find subset tests for discrete atts"
        echo " -m <objs> restrict allowable splits"
        echo " -c <CF> confidence level for pruning"
        echo " -S <percent> training sample percentage"
        echo " -X <folds> cross-validate"
        echo " -I <integer> random seed [ignored]"
        echo " -h print this message"
        # a bad option must not look like success to the caller
        if ( $?badopt ) exit 1
        exit 0
    endsw

    @ i++
end


# At least one cross-validation must run: the case counts used below
# are read from the output file of the last run

if ( $repeats < 1 ) then
    echo "number of repeats (R=) must be at least 1"
    exit 1
endif


# Clear the summary file

cp /dev/null $filestem.xsum


# Repeat cross-validations, incrementing the random seed

set r = 0
while ( $r < $repeats )

    set outf = $filestem.o$r$label
    c5.0 -f $filestem $opts -X $folds -I $r >$outf
    grep "<<" $outf >> $filestem.xsum

    @ r++
end


# Find the number of cases in the training and test files.
# The words of all lines starting with "Read" are joined into one list;
# word 2 is taken as the training count and word 9 as the test count
# (this relies on the layout of c5.0's "Read ..." lines)

set junk = `grep ^Read $outf`
@ examples = $junk[2]
if ( -e $filestem.test ) then
    @ examples += $junk[9]
endif


# Summarize results, then remove the temporary summary file and
# (unless +d was given) the per-run output files

report $examples $folds $repeats $rules <$filestem.xsum >$filestem.res$label

rm $filestem.xsum
if ( ! $?details ) rm -f $filestem.o[0-9]*$label
data/ext/c5.0/xval.c ADDED
@@ -0,0 +1,402 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Carry out crossvalidation trials */
30
+ /* -------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+ #include "defns.i"
35
+ #include "extern.i"
36
+
37
+
38
+ DataRec *Blocked=Nil;
39
+ float **Result=Nil; /* Result[f][0] = tree/ruleset size
40
+ [1] = tree/ruleset errors
41
+ [2] = tree/ruleset cost */
42
+
43
+
44
+
45
+ /*************************************************************************/
46
+ /* */
47
+ /* Outer function (differs from xval script) */
48
+ /* */
49
+ /*************************************************************************/
50
+
51
+
52
+ void CrossVal()
53
+ /* -------- */
54
+ {
55
+ CaseNo i, Size, Start=0, Next, SaveMaxCase;
56
+ int f, SmallTestBlocks, t, SaveTRIALS;
57
+ ClassNo c;
58
+ static CaseNo *ConfusionMat=Nil;
59
+ static int SaveFOLDS=0;
60
+
61
+ /* Check for left-overs after interrupt */
62
+
63
+ if ( Result )
64
+ {
65
+ FreeVector((void **) Result, 0, SaveFOLDS-1);
66
+ Free(ConfusionMat);
67
+ }
68
+
69
+ if ( FOLDS > MaxCase+1 )
70
+ {
71
+ fprintf(Of, T_FoldsReduced);
72
+ FOLDS = MaxCase+1;
73
+ }
74
+
75
+ Result = AllocZero((SaveFOLDS = FOLDS), float *);
76
+ Blocked = Alloc(MaxCase+1, DataRec);
77
+ ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);
78
+
79
+ Prepare();
80
+
81
+ SaveMaxCase = MaxCase;
82
+ SaveTRIALS = TRIALS;
83
+
84
+ /* First test blocks may be smaller than the others */
85
+
86
+ SmallTestBlocks = FOLDS - ((MaxCase+1) % FOLDS);
87
+ Size = (MaxCase + 1) / FOLDS;
88
+
89
+ ForEach(f, 0, FOLDS-1)
90
+ {
91
+ fprintf(Of, "\n\n[ " T_Fold " %d ]\n", f+1);
92
+ Result[f] = AllocZero(3, float);
93
+
94
+ if ( f == SmallTestBlocks ) Size++;
95
+ MaxCase = SaveMaxCase - Size;
96
+
97
+ ForEach(i, 0, MaxCase)
98
+ {
99
+ Case[i] = Blocked[Start];
100
+ Start = (Start + 1) % (SaveMaxCase + 1);
101
+ }
102
+
103
+ ConstructClassifiers();
104
+
105
+ /* Check size (if appropriate) and errors */
106
+
107
+ if ( TRIALS == 1 )
108
+ {
109
+ Result[f][0] = ( RULES ? RuleSet[0]->SNRules :
110
+ TreeSize(Pruned[0]) );
111
+ Next = Start;
112
+ ForEach(i, 0, Size-1)
113
+ {
114
+ Case[i] = Blocked[Next];
115
+ c = ( RULES ? RuleClassify(Blocked[Next], RuleSet[0]) :
116
+ TreeClassify(Blocked[Next], Pruned[0]) );
117
+ if ( c != Class(Blocked[Next]) )
118
+ {
119
+ Result[f][1] += 1.0;
120
+ if ( MCost )
121
+ {
122
+ Result[f][2] += MCost[c][Class(Blocked[Next])];
123
+ }
124
+ }
125
+
126
+ /* Add to confusion matrix for target classifier */
127
+
128
+ ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;
129
+
130
+ Next = (Next + 1) % (SaveMaxCase + 1);
131
+ }
132
+ }
133
+ else
134
+ {
135
+ Result[f][0] = -1;
136
+ Next = Start;
137
+ Default = ( RULES ? RuleSet[0]->SDefault : Pruned[0]->Leaf );
138
+ ForEach(i, 0, Size-1)
139
+ {
140
+ Case[i] = Blocked[Next];
141
+ c = BoostClassify(Blocked[Next], TRIALS-1);
142
+ if ( c != Class(Blocked[Next]) )
143
+ {
144
+ Result[f][1] += 1.0;
145
+ if ( MCost )
146
+ {
147
+ Result[f][2] += MCost[c][Class(Blocked[Next])];
148
+ }
149
+ }
150
+
151
+ /* Add to confusion matrix for target classifier */
152
+
153
+ ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;
154
+
155
+ Next = (Next + 1) % (SaveMaxCase + 1);
156
+ }
157
+ }
158
+
159
+ Result[f][1] = (100.0 * Result[f][1]) / Size;
160
+ Result[f][2] /= Size;
161
+
162
+ fprintf(Of, T_EvalHoldOut, Size);
163
+ MaxCase = Size-1;
164
+ Evaluate(0);
165
+
166
+ /* Free space used by classifiers */
167
+
168
+ ForEach(t, 0, MaxTree)
169
+ {
170
+ FreeClassifier(t);
171
+ }
172
+ MaxTree = -1;
173
+
174
+ TRIALS = SaveTRIALS;
175
+ }
176
+
177
+ /* Print summary of crossvalidation */
178
+
179
+ MaxCase = SaveMaxCase;
180
+
181
+ Summary();
182
+ PrintConfusionMatrix(ConfusionMat);
183
+
184
+ /* Free local storage */
185
+
186
+ ForEach(i, 0, MaxCase)
187
+ {
188
+ Case[i] = Blocked[i];
189
+ }
190
+
191
+ FreeVector((void **) Result, 0, FOLDS-1); Result = Nil;
192
+ Free(Blocked); Blocked = Nil;
193
+ Free(ConfusionMat); ConfusionMat = Nil;
194
+ }
195
+
196
+
197
+
198
+ /*************************************************************************/
199
+ /* */
200
+ /* Prepare data for crossvalidation (similar to xval-prep.c) */
201
+ /* */
202
+ /*************************************************************************/
203
+
204
+
205
+ void Prepare()
206
+ /* ------- */
207
+ {
208
+ CaseNo i, First=0, Last, *Temp, Hold, Next=0;
209
+ ClassNo Group;
210
+
211
+ Temp = Alloc(MaxCase+1, CaseNo);
212
+ ForEach(i, 0, MaxCase)
213
+ {
214
+ Temp[i] = i;
215
+ }
216
+
217
+ Shuffle(Temp);
218
+
219
+ /* Sort into class groups */
220
+
221
+ while ( First <= MaxCase )
222
+ {
223
+ Last = First;
224
+ Group = Class(Case[Temp[First]]);
225
+
226
+ ForEach(i, First+1, MaxCase)
227
+ {
228
+ if ( Class(Case[Temp[i]]) == Group )
229
+ {
230
+ Last++;
231
+ Hold = Temp[Last];
232
+ Temp[Last] = Temp[i];
233
+ Temp[i] = Hold;
234
+ }
235
+ }
236
+
237
+ First = Last+1;
238
+ }
239
+
240
+ /* Organize into stratified blocks */
241
+
242
+ ForEach(First, 0, FOLDS-1)
243
+ {
244
+ for ( i = First ; i <= MaxCase ; i += FOLDS )
245
+ {
246
+ Blocked[Next++] = Case[Temp[i]];
247
+ }
248
+ }
249
+
250
+ Free(Temp);
251
+ }
252
+
253
+
254
+
255
+ /*************************************************************************/
256
+ /* */
257
+ /* Shuffle the data cases */
258
+ /* */
259
+ /*************************************************************************/
260
+
261
+
262
+ void Shuffle(int *Vec)
263
+ /* ------- */
264
+ {
265
+ int This=0, Alt, Left=MaxCase+1, Hold;
266
+
267
+ ResetKR(KRInit);
268
+
269
+ while ( Left )
270
+ {
271
+ Alt = This + (Left--) * KRandom();
272
+
273
+ Hold = Vec[This];
274
+ Vec[This++] = Vec[Alt];
275
+ Vec[Alt] = Hold;
276
+ }
277
+ }
278
+
279
+
280
+
281
+ /*************************************************************************/
282
+ /* */
283
+ /* Summarise a crossvalidation */
284
+ /* */
285
+ /*************************************************************************/
286
+
287
+
288
+ char
289
+ *FoldHead[] = { F_Fold, F_UFold, "" };
290
+
291
+ void Summary()
292
+ /* ------- */
293
+ {
294
+ int i, f, t;
295
+ Boolean PrintSize=true;
296
+ float Sum[3], SumSq[3];
297
+ extern char *StdP[], *StdPC[], *Extra[], *ExtraC[];
298
+
299
+ for ( i = 0 ; i < 3 ; i++ )
300
+ {
301
+ Sum[i] = SumSq[i] = 0;
302
+ }
303
+
304
+ ForEach(f, 0, FOLDS-1)
305
+ {
306
+ if ( Result[f][0] < 1 ) PrintSize = false;
307
+ }
308
+
309
+ fprintf(Of, "\n\n[ " T_Summary " ]\n\n");
310
+
311
+ ForEach(t, 0, 2)
312
+ {
313
+ fprintf(Of, "%s", FoldHead[t]);
314
+ putc('\t', Of);
315
+ if ( RULES )
316
+ {
317
+ fprintf(Of, "%s", ( MCost ? ExtraC[t] : Extra[t] ));
318
+ }
319
+ else
320
+ {
321
+ fprintf(Of, "%s", ( MCost ? StdPC[t] : StdP[t] ));
322
+ }
323
+ putc('\n', Of);
324
+ }
325
+ putc('\n', Of);
326
+
327
+ ForEach(f, 0, FOLDS-1)
328
+ {
329
+ fprintf(Of, "%4d\t", f+1);
330
+
331
+ if ( PrintSize )
332
+ {
333
+ fprintf(Of, " %5g", Result[f][0]);
334
+ }
335
+ else
336
+ {
337
+ fprintf(Of, " *");
338
+ }
339
+ fprintf(Of, " %10.1f%%", Result[f][1]);
340
+
341
+ if ( MCost )
342
+ {
343
+ fprintf(Of, "%7.2f", Result[f][2]);
344
+ }
345
+ fprintf(Of, "\n");
346
+
347
+ for ( i = 0 ; i < 3 ; i++ )
348
+ {
349
+ Sum[i] += Result[f][i];
350
+ SumSq[i] += Result[f][i] * Result[f][i];
351
+ }
352
+ }
353
+
354
+ fprintf(Of, "\n " T_Mean "\t");
355
+
356
+ if ( ! PrintSize )
357
+ {
358
+ fprintf(Of, " ");
359
+ }
360
+ else
361
+ {
362
+ fprintf(Of, "%6.1f", Sum[0] / FOLDS);
363
+ }
364
+
365
+ fprintf(Of, " %10.1f%%", Sum[1] / FOLDS);
366
+
367
+ if ( MCost )
368
+ {
369
+ fprintf(Of, "%7.2f", Sum[2] / FOLDS);
370
+ }
371
+
372
+ fprintf(Of, "\n " T_SE "\t");
373
+
374
+ if ( ! PrintSize )
375
+ {
376
+ fprintf(Of, " ");
377
+ }
378
+ else
379
+ {
380
+ fprintf(Of, "%6.1f", SE(Sum[0], SumSq[0], FOLDS));
381
+ }
382
+
383
+ fprintf(Of, " %10.1f%%", SE(Sum[1], SumSq[1], FOLDS));
384
+
385
+ if ( MCost )
386
+ {
387
+ fprintf(Of, "%7.2f", SE(Sum[2], SumSq[2], FOLDS));
388
+ }
389
+ fprintf(Of, "\n");
390
+ }
391
+
392
+
393
+
394
+ float SE(float sum, float sumsq, int no)
395
+ /* -- */
396
+ {
397
+ float mean;
398
+
399
+ mean = sum / no;
400
+
401
+ return sqrt( ((sumsq - no * mean * mean) / (no - 1)) / no );
402
+ }