see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,521 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Get cases from data file */
30
+ /* ------------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
#include <math.h>

#include "defns.i"
#include "extern.i"
37
+
38
+
39
+ #define Inc 2048
40
+
41
+ Boolean SuppressErrorMessages=false;
42
+ #define XError(a,b,c) if (! SuppressErrorMessages) Error(a,b,c)
43
+
44
+ CaseNo SampleFrom; /* file count for sampling */
45
+
46
+
47
+ /*************************************************************************/
48
+ /* */
49
+ /* Read raw cases from file with given extension. */
50
+ /* */
51
+ /* On completion, cases are stored in array Case in the form */
52
+ /* of vectors of attribute values, and MaxCase is set to the */
53
+ /* number of data cases. */
54
+ /* */
55
+ /*************************************************************************/
56
+
57
+
58
+ void GetData(FILE *Df, Boolean Train, Boolean AllowUnknownClass)
59
+ /* ------- */
60
+ {
61
+ DataRec DVec;
62
+ CaseNo CaseSpace, WantTrain, LeftTrain, WantTest, LeftTest;
63
+ Boolean FirstIgnore=true, SelectTrain;
64
+
65
+ LineNo = 0;
66
+ SuppressErrorMessages = SAMPLE && ! Train;
67
+
68
+ /* Don't reset case count if appending data for xval */
69
+
70
+ if ( Train || ! Case )
71
+ {
72
+ MaxCase = MaxLabel = CaseSpace = 0;
73
+ Case = Alloc(1, DataRec); /* for error reporting */
74
+ }
75
+ else
76
+ {
77
+ CaseSpace = MaxCase + 1;
78
+ MaxCase++;
79
+ }
80
+
81
+ if ( SAMPLE )
82
+ {
83
+ if ( Train )
84
+ {
85
+ SampleFrom = CountData(Df);
86
+ ResetKR(KRInit); /* initialise KRandom() */
87
+ }
88
+ else
89
+ {
90
+ ResetKR(KRInit); /* restore KRandom() */
91
+ }
92
+
93
+ WantTrain = SampleFrom * SAMPLE + 0.5;
94
+ LeftTrain = SampleFrom;
95
+
96
+ WantTest = ( SAMPLE < 0.5 ? WantTrain : SampleFrom - WantTrain );
97
+ LeftTest = SampleFrom - WantTrain;
98
+ }
99
+
100
+ while ( (DVec = GetDataRec(Df, Train)) )
101
+ {
102
+ /* Check whether to include if we are sampling */
103
+
104
+ if ( SAMPLE )
105
+ {
106
+ SelectTrain = KRandom() < WantTrain / (float) LeftTrain--;
107
+
108
+ /* Include if
109
+ * Select and this is the training set
110
+ * ! Select and this is the test set and sub-select
111
+ NB: Must use different random number generator for
112
+ sub-selection since cannot disturb random number sequence */
113
+
114
+ if ( SelectTrain )
115
+ {
116
+ WantTrain--;
117
+ }
118
+
119
+ if ( SelectTrain != Train ||
120
+ ( ! Train && AltRandom >= WantTest / (float) LeftTest-- ) )
121
+ {
122
+ FreeLastCase(DVec);
123
+ continue;
124
+ }
125
+
126
+ if ( ! Train )
127
+ {
128
+ WantTest--;
129
+ }
130
+ }
131
+
132
+ /* Make sure there is room for another case */
133
+
134
+ if ( MaxCase >= CaseSpace )
135
+ {
136
+ CaseSpace += Inc;
137
+ Realloc(Case, CaseSpace+1, DataRec);
138
+ }
139
+
140
+ /* Ignore cases with unknown class */
141
+
142
+ if ( AllowUnknownClass || (Class(DVec) & 077777777) > 0 )
143
+ {
144
+ Case[MaxCase] = DVec;
145
+ MaxCase++;
146
+ }
147
+ else
148
+ {
149
+ if ( FirstIgnore && Of )
150
+ {
151
+ fprintf(Of, T_IgnoreBadClass);
152
+ FirstIgnore = false;
153
+ }
154
+
155
+ FreeLastCase(DVec);
156
+ }
157
+ }
158
+
159
+ fclose(Df);
160
+ MaxCase--;
161
+
162
+ }
163
+
164
+
165
+
166
+ /*************************************************************************/
167
+ /* */
168
+ /* Read a raw case from file Df. */
169
+ /* */
170
+ /* For each attribute, read the attribute value from the file. */
171
+ /* If it is a discrete valued attribute, find the associated no. */
172
+ /* of this attribute value (if the value is unknown this is 0). */
173
+ /* */
174
+ /* Returns the DataRec of the case (i.e. the array of attribute */
175
+ /* values). */
176
+ /* */
177
+ /*************************************************************************/
178
+
179
+
180
+ DataRec GetDataRec(FILE *Df, Boolean Train)
181
+ /* ---------- */
182
+ {
183
+ Attribute Att;
184
+ char Name[1000], *EndName;
185
+ int Dv, Chars;
186
+ DataRec DVec;
187
+ ContValue Cv;
188
+ Boolean FirstValue=true;
189
+
190
+
191
+ if ( ReadName(Df, Name, 1000, '\00') )
192
+ {
193
+ Case[MaxCase] = DVec = NewCase();
194
+ ForEach(Att, 1, MaxAtt)
195
+ {
196
+ if ( AttDef[Att] )
197
+ {
198
+ DVec[Att] = EvaluateDef(AttDef[Att], DVec);
199
+
200
+ if ( Continuous(Att) )
201
+ {
202
+ CheckValue(DVec, Att);
203
+ }
204
+
205
+ if ( SomeMiss )
206
+ {
207
+ SomeMiss[Att] |= Unknown(DVec, Att);
208
+ SomeNA[Att] |= NotApplic(DVec, Att);
209
+ }
210
+
211
+ continue;
212
+ }
213
+
214
+ /* Get the attribute value if don't already have it */
215
+
216
+ if ( ! FirstValue && ! ReadName(Df, Name, 1000, '\00') )
217
+ {
218
+ XError(HITEOF, AttName[Att], "");
219
+ FreeLastCase(DVec);
220
+ return Nil;
221
+ }
222
+ FirstValue = false;
223
+
224
+ if ( Exclude(Att) )
225
+ {
226
+ if ( Att == LabelAtt )
227
+ {
228
+ /* Record the value as a string */
229
+
230
+ SVal(DVec,Att) = StoreIVal(Name);
231
+ }
232
+ }
233
+ else
234
+ if ( ! strcmp(Name, "?") )
235
+ {
236
+ /* Set marker to indicate missing value */
237
+
238
+ DVal(DVec, Att) = UNKNOWN;
239
+ if ( SomeMiss ) SomeMiss[Att] = true;
240
+ }
241
+ else
242
+ if ( Att != ClassAtt && ! strcmp(Name, "N/A") )
243
+ {
244
+ /* Set marker to indicate not applicable */
245
+
246
+ DVal(DVec, Att) = NA;
247
+ if ( SomeNA ) SomeNA[Att] = true;
248
+ }
249
+ else
250
+ if ( Discrete(Att) )
251
+ {
252
+ /* Discrete attribute */
253
+
254
+ Dv = Which(Name, AttValName[Att], 1, MaxAttVal[Att]);
255
+ if ( ! Dv )
256
+ {
257
+ if ( StatBit(Att, DISCRETE) )
258
+ {
259
+ if ( Train || XVAL )
260
+ {
261
+ /* Add value to list */
262
+
263
+ if ( MaxAttVal[Att] >= (long) AttValName[Att][0] )
264
+ {
265
+ XError(TOOMANYVALS, AttName[Att],
266
+ (char *) AttValName[Att][0] - 1);
267
+ Dv = MaxAttVal[Att];
268
+ }
269
+ else
270
+ {
271
+ Dv = ++MaxAttVal[Att];
272
+ AttValName[Att][Dv] = strdup(Name);
273
+ AttValName[Att][Dv+1] = "<other>"; /* no free */
274
+ }
275
+ if ( Dv > MaxDiscrVal )
276
+ {
277
+ MaxDiscrVal = Dv;
278
+ }
279
+ }
280
+ else
281
+ {
282
+ /* Set value to "<other>" */
283
+
284
+ Dv = MaxAttVal[Att] + 1;
285
+ }
286
+ }
287
+ else
288
+ {
289
+ XError(BADATTVAL, AttName[Att], Name);
290
+ Dv = UNKNOWN;
291
+ }
292
+ }
293
+ DVal(DVec, Att) = Dv;
294
+ }
295
+ else
296
+ {
297
+ /* Continuous value */
298
+
299
+ if ( TStampVal(Att) )
300
+ {
301
+ CVal(DVec, Att) = Cv = TStampToMins(Name);
302
+ if ( Cv >= 1E9 ) /* long time in future */
303
+ {
304
+ XError(BADTSTMP, AttName[Att], Name);
305
+ DVal(DVec, Att) = UNKNOWN;
306
+ }
307
+ }
308
+ else
309
+ if ( DateVal(Att) )
310
+ {
311
+ CVal(DVec, Att) = Cv = DateToDay(Name);
312
+ if ( Cv < 1 )
313
+ {
314
+ XError(BADDATE, AttName[Att], Name);
315
+ DVal(DVec, Att) = UNKNOWN;
316
+ }
317
+ }
318
+ else
319
+ if ( TimeVal(Att) )
320
+ {
321
+ CVal(DVec, Att) = Cv = TimeToSecs(Name);
322
+ if ( Cv < 0 )
323
+ {
324
+ XError(BADTIME, AttName[Att], Name);
325
+ DVal(DVec, Att) = UNKNOWN;
326
+ }
327
+ }
328
+ else
329
+ {
330
+ CVal(DVec, Att) = strtod(Name, &EndName);
331
+ if ( EndName == Name || *EndName != '\0' )
332
+ {
333
+ XError(BADATTVAL, AttName[Att], Name);
334
+ DVal(DVec, Att) = UNKNOWN;
335
+ }
336
+ }
337
+
338
+ CheckValue(DVec, Att);
339
+ }
340
+ }
341
+
342
+ if ( ClassAtt )
343
+ {
344
+ if ( Discrete(ClassAtt) )
345
+ {
346
+ Class(DVec) = XDVal(DVec, ClassAtt);
347
+ }
348
+ else
349
+ if ( Unknown(DVec, ClassAtt) || NotApplic(DVec, ClassAtt) )
350
+ {
351
+ Class(DVec) = 0;
352
+ }
353
+ else
354
+ {
355
+ /* Find appropriate segment using class thresholds */
356
+
357
+ Cv = CVal(DVec, ClassAtt);
358
+
359
+ for ( Dv = 1 ; Dv < MaxClass && Cv > ClassThresh[Dv] ; Dv++ )
360
+ ;
361
+
362
+ Class(DVec) = Dv;
363
+ }
364
+ }
365
+ else
366
+ {
367
+ if ( ! ReadName(Df, Name, 1000, '\00') )
368
+ {
369
+ XError(HITEOF, Fn, "");
370
+ FreeLastCase(DVec);
371
+ return Nil;
372
+ }
373
+
374
+ if ( (Class(DVec) = Dv = Which(Name, ClassName, 1, MaxClass)) == 0 )
375
+ {
376
+ if ( strcmp(Name, "?") ) XError(BADCLASS, "", Name);
377
+ }
378
+ }
379
+
380
+ if ( LabelAtt &&
381
+ (Chars = strlen(IgnoredVals + SVal(DVec, LabelAtt))) > MaxLabel )
382
+ {
383
+ MaxLabel = Chars;
384
+ }
385
+ return DVec;
386
+ }
387
+ else
388
+ {
389
+ return Nil;
390
+ }
391
+ }
392
+
393
+
394
+
395
+ /*************************************************************************/
396
+ /* */
397
+ /* Count cases in data file */
398
+ /* */
399
+ /*************************************************************************/
400
+
401
+
402
+ CaseNo CountData(FILE *Df)
403
+ /* --------- */
404
+ {
405
+ char Last=',';
406
+ int Count=0, Next;
407
+
408
+ while ( true )
409
+ {
410
+ if ( (Next = getc(Df)) == EOF )
411
+ {
412
+ if ( Last != ',' ) Count++;
413
+ rewind(Df);
414
+ return Count;
415
+ }
416
+
417
+ if ( Next == '|' )
418
+ {
419
+ while ( (Next = getc(Df)) != '\n' )
420
+ ;
421
+ }
422
+
423
+ if ( Next == '\n' )
424
+ {
425
+ if ( Last != ',' ) Count++;
426
+ Last = ',';
427
+ }
428
+ else
429
+ if ( Next == '\\' )
430
+ {
431
+ /* Skip escaped character */
432
+
433
+ getc(Df);
434
+ }
435
+ else
436
+ if ( Next != '\t' && Next != ' ' )
437
+ {
438
+ Last = Next;
439
+ }
440
+ }
441
+ }
442
+
443
+
444
+
445
+ /*************************************************************************/
446
+ /* */
447
+ /* Store a label or ignored value in IValStore */
448
+ /* */
449
+ /*************************************************************************/
450
+
451
+
452
+ int StoreIVal(String S)
453
+ /* --------- */
454
+ {
455
+ int StartIx, Length;
456
+
457
+ if ( (Length=strlen(S) + 1) + IValsOffset > IValsSize )
458
+ {
459
+ if ( IgnoredVals )
460
+ {
461
+ Realloc(IgnoredVals, IValsSize += 32768, char);
462
+ }
463
+ else
464
+ {
465
+ IValsSize = 32768;
466
+ IValsOffset = 0;
467
+ IgnoredVals = Alloc(IValsSize, char);
468
+ }
469
+ }
470
+
471
+ StartIx = IValsOffset;
472
+ strcpy(IgnoredVals + StartIx, S);
473
+ IValsOffset += Length;
474
+
475
+ return StartIx;
476
+ }
477
+
478
+
479
+
480
+ /*************************************************************************/
481
+ /* */
482
+ /* Free case space */
483
+ /* */
484
+ /*************************************************************************/
485
+
486
+
487
+ void FreeData()
488
+ /* -------- */
489
+ {
490
+ FreeCases();
491
+
492
+ FreeUnlessNil(IgnoredVals); IgnoredVals = Nil;
493
+ IValsSize = 0;
494
+
495
+ Free(Case); Case = Nil;
496
+
497
+ MaxCase = -1;
498
+ }
499
+
500
+
501
+
502
+ /*************************************************************************/
503
+ /* */
504
+ /* Check for bad continuous value */
505
+ /* */
506
+ /*************************************************************************/
507
+
508
+
509
+ void CheckValue(DataRec DVec, Attribute Att)
510
+ /* ---------- */
511
+ {
512
+ ContValue Cv;
513
+
514
+ Cv = CVal(DVec, Att);
515
+ if ( ! finite(Cv) )
516
+ {
517
+ Error(BADNUMBER, AttName[Att], "");
518
+
519
+ CVal(DVec, Att) = UNKNOWN;
520
+ }
521
+ }