see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,521 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Get cases from data file */
30
+ /* ------------------------ */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
#include <math.h>

#include "defns.i"
#include "extern.i"
37
+
38
+
39
+ #define Inc 2048
40
+
41
+ Boolean SuppressErrorMessages=false;
42
+ #define XError(a,b,c) if (! SuppressErrorMessages) Error(a,b,c)
43
+
44
+ CaseNo SampleFrom; /* file count for sampling */
45
+
46
+
47
+ /*************************************************************************/
48
+ /* */
49
+ /* Read raw cases from file with given extension. */
50
+ /* */
51
+ /* On completion, cases are stored in array Case in the form */
52
+ /* of vectors of attribute values, and MaxCase is set to the */
53
+ /* number of data cases. */
54
+ /* */
55
+ /*************************************************************************/
56
+
57
+
58
+ void GetData(FILE *Df, Boolean Train, Boolean AllowUnknownClass)
59
+ /* ------- */
60
+ {
61
+ DataRec DVec;
62
+ CaseNo CaseSpace, WantTrain, LeftTrain, WantTest, LeftTest;
63
+ Boolean FirstIgnore=true, SelectTrain;
64
+
65
+ LineNo = 0;
66
+ SuppressErrorMessages = SAMPLE && ! Train;
67
+
68
+ /* Don't reset case count if appending data for xval */
69
+
70
+ if ( Train || ! Case )
71
+ {
72
+ MaxCase = MaxLabel = CaseSpace = 0;
73
+ Case = Alloc(1, DataRec); /* for error reporting */
74
+ }
75
+ else
76
+ {
77
+ CaseSpace = MaxCase + 1;
78
+ MaxCase++;
79
+ }
80
+
81
+ if ( SAMPLE )
82
+ {
83
+ if ( Train )
84
+ {
85
+ SampleFrom = CountData(Df);
86
+ ResetKR(KRInit); /* initialise KRandom() */
87
+ }
88
+ else
89
+ {
90
+ ResetKR(KRInit); /* restore KRandom() */
91
+ }
92
+
93
+ WantTrain = SampleFrom * SAMPLE + 0.5;
94
+ LeftTrain = SampleFrom;
95
+
96
+ WantTest = ( SAMPLE < 0.5 ? WantTrain : SampleFrom - WantTrain );
97
+ LeftTest = SampleFrom - WantTrain;
98
+ }
99
+
100
+ while ( (DVec = GetDataRec(Df, Train)) )
101
+ {
102
+ /* Check whether to include if we are sampling */
103
+
104
+ if ( SAMPLE )
105
+ {
106
+ SelectTrain = KRandom() < WantTrain / (float) LeftTrain--;
107
+
108
+ /* Include if
109
+ * Select and this is the training set
110
+ * ! Select and this is the test set and sub-select
111
+ NB: Must use different random number generator for
112
+ sub-selection since cannot disturb random number sequence */
113
+
114
+ if ( SelectTrain )
115
+ {
116
+ WantTrain--;
117
+ }
118
+
119
+ if ( SelectTrain != Train ||
120
+ ( ! Train && AltRandom >= WantTest / (float) LeftTest-- ) )
121
+ {
122
+ FreeLastCase(DVec);
123
+ continue;
124
+ }
125
+
126
+ if ( ! Train )
127
+ {
128
+ WantTest--;
129
+ }
130
+ }
131
+
132
+ /* Make sure there is room for another case */
133
+
134
+ if ( MaxCase >= CaseSpace )
135
+ {
136
+ CaseSpace += Inc;
137
+ Realloc(Case, CaseSpace+1, DataRec);
138
+ }
139
+
140
+ /* Ignore cases with unknown class */
141
+
142
+ if ( AllowUnknownClass || (Class(DVec) & 077777777) > 0 )
143
+ {
144
+ Case[MaxCase] = DVec;
145
+ MaxCase++;
146
+ }
147
+ else
148
+ {
149
+ if ( FirstIgnore && Of )
150
+ {
151
+ fprintf(Of, T_IgnoreBadClass);
152
+ FirstIgnore = false;
153
+ }
154
+
155
+ FreeLastCase(DVec);
156
+ }
157
+ }
158
+
159
+ fclose(Df);
160
+ MaxCase--;
161
+
162
+ }
163
+
164
+
165
+
166
+ /*************************************************************************/
167
+ /* */
168
+ /* Read a raw case from file Df. */
169
+ /* */
170
+ /* For each attribute, read the attribute value from the file. */
171
+ /* If it is a discrete valued attribute, find the associated no. */
172
+ /* of this attribute value (if the value is unknown this is 0). */
173
+ /* */
174
+ /* Returns the DataRec of the case (i.e. the array of attribute */
175
+ /* values). */
176
+ /* */
177
+ /*************************************************************************/
178
+
179
+
180
+ DataRec GetDataRec(FILE *Df, Boolean Train)
181
+ /* ---------- */
182
+ {
183
+ Attribute Att;
184
+ char Name[1000], *EndName;
185
+ int Dv, Chars;
186
+ DataRec DVec;
187
+ ContValue Cv;
188
+ Boolean FirstValue=true;
189
+
190
+
191
+ if ( ReadName(Df, Name, 1000, '\00') )
192
+ {
193
+ Case[MaxCase] = DVec = NewCase();
194
+ ForEach(Att, 1, MaxAtt)
195
+ {
196
+ if ( AttDef[Att] )
197
+ {
198
+ DVec[Att] = EvaluateDef(AttDef[Att], DVec);
199
+
200
+ if ( Continuous(Att) )
201
+ {
202
+ CheckValue(DVec, Att);
203
+ }
204
+
205
+ if ( SomeMiss )
206
+ {
207
+ SomeMiss[Att] |= Unknown(DVec, Att);
208
+ SomeNA[Att] |= NotApplic(DVec, Att);
209
+ }
210
+
211
+ continue;
212
+ }
213
+
214
+ /* Get the attribute value if don't already have it */
215
+
216
+ if ( ! FirstValue && ! ReadName(Df, Name, 1000, '\00') )
217
+ {
218
+ XError(HITEOF, AttName[Att], "");
219
+ FreeLastCase(DVec);
220
+ return Nil;
221
+ }
222
+ FirstValue = false;
223
+
224
+ if ( Exclude(Att) )
225
+ {
226
+ if ( Att == LabelAtt )
227
+ {
228
+ /* Record the value as a string */
229
+
230
+ SVal(DVec,Att) = StoreIVal(Name);
231
+ }
232
+ }
233
+ else
234
+ if ( ! strcmp(Name, "?") )
235
+ {
236
+ /* Set marker to indicate missing value */
237
+
238
+ DVal(DVec, Att) = UNKNOWN;
239
+ if ( SomeMiss ) SomeMiss[Att] = true;
240
+ }
241
+ else
242
+ if ( Att != ClassAtt && ! strcmp(Name, "N/A") )
243
+ {
244
+ /* Set marker to indicate not applicable */
245
+
246
+ DVal(DVec, Att) = NA;
247
+ if ( SomeNA ) SomeNA[Att] = true;
248
+ }
249
+ else
250
+ if ( Discrete(Att) )
251
+ {
252
+ /* Discrete attribute */
253
+
254
+ Dv = Which(Name, AttValName[Att], 1, MaxAttVal[Att]);
255
+ if ( ! Dv )
256
+ {
257
+ if ( StatBit(Att, DISCRETE) )
258
+ {
259
+ if ( Train || XVAL )
260
+ {
261
+ /* Add value to list */
262
+
263
+ if ( MaxAttVal[Att] >= (long) AttValName[Att][0] )
264
+ {
265
+ XError(TOOMANYVALS, AttName[Att],
266
+ (char *) AttValName[Att][0] - 1);
267
+ Dv = MaxAttVal[Att];
268
+ }
269
+ else
270
+ {
271
+ Dv = ++MaxAttVal[Att];
272
+ AttValName[Att][Dv] = strdup(Name);
273
+ AttValName[Att][Dv+1] = "<other>"; /* no free */
274
+ }
275
+ if ( Dv > MaxDiscrVal )
276
+ {
277
+ MaxDiscrVal = Dv;
278
+ }
279
+ }
280
+ else
281
+ {
282
+ /* Set value to "<other>" */
283
+
284
+ Dv = MaxAttVal[Att] + 1;
285
+ }
286
+ }
287
+ else
288
+ {
289
+ XError(BADATTVAL, AttName[Att], Name);
290
+ Dv = UNKNOWN;
291
+ }
292
+ }
293
+ DVal(DVec, Att) = Dv;
294
+ }
295
+ else
296
+ {
297
+ /* Continuous value */
298
+
299
+ if ( TStampVal(Att) )
300
+ {
301
+ CVal(DVec, Att) = Cv = TStampToMins(Name);
302
+ if ( Cv >= 1E9 ) /* long time in future */
303
+ {
304
+ XError(BADTSTMP, AttName[Att], Name);
305
+ DVal(DVec, Att) = UNKNOWN;
306
+ }
307
+ }
308
+ else
309
+ if ( DateVal(Att) )
310
+ {
311
+ CVal(DVec, Att) = Cv = DateToDay(Name);
312
+ if ( Cv < 1 )
313
+ {
314
+ XError(BADDATE, AttName[Att], Name);
315
+ DVal(DVec, Att) = UNKNOWN;
316
+ }
317
+ }
318
+ else
319
+ if ( TimeVal(Att) )
320
+ {
321
+ CVal(DVec, Att) = Cv = TimeToSecs(Name);
322
+ if ( Cv < 0 )
323
+ {
324
+ XError(BADTIME, AttName[Att], Name);
325
+ DVal(DVec, Att) = UNKNOWN;
326
+ }
327
+ }
328
+ else
329
+ {
330
+ CVal(DVec, Att) = strtod(Name, &EndName);
331
+ if ( EndName == Name || *EndName != '\0' )
332
+ {
333
+ XError(BADATTVAL, AttName[Att], Name);
334
+ DVal(DVec, Att) = UNKNOWN;
335
+ }
336
+ }
337
+
338
+ CheckValue(DVec, Att);
339
+ }
340
+ }
341
+
342
+ if ( ClassAtt )
343
+ {
344
+ if ( Discrete(ClassAtt) )
345
+ {
346
+ Class(DVec) = XDVal(DVec, ClassAtt);
347
+ }
348
+ else
349
+ if ( Unknown(DVec, ClassAtt) || NotApplic(DVec, ClassAtt) )
350
+ {
351
+ Class(DVec) = 0;
352
+ }
353
+ else
354
+ {
355
+ /* Find appropriate segment using class thresholds */
356
+
357
+ Cv = CVal(DVec, ClassAtt);
358
+
359
+ for ( Dv = 1 ; Dv < MaxClass && Cv > ClassThresh[Dv] ; Dv++ )
360
+ ;
361
+
362
+ Class(DVec) = Dv;
363
+ }
364
+ }
365
+ else
366
+ {
367
+ if ( ! ReadName(Df, Name, 1000, '\00') )
368
+ {
369
+ XError(HITEOF, Fn, "");
370
+ FreeLastCase(DVec);
371
+ return Nil;
372
+ }
373
+
374
+ if ( (Class(DVec) = Dv = Which(Name, ClassName, 1, MaxClass)) == 0 )
375
+ {
376
+ if ( strcmp(Name, "?") ) XError(BADCLASS, "", Name);
377
+ }
378
+ }
379
+
380
+ if ( LabelAtt &&
381
+ (Chars = strlen(IgnoredVals + SVal(DVec, LabelAtt))) > MaxLabel )
382
+ {
383
+ MaxLabel = Chars;
384
+ }
385
+ return DVec;
386
+ }
387
+ else
388
+ {
389
+ return Nil;
390
+ }
391
+ }
392
+
393
+
394
+
395
+ /*************************************************************************/
396
+ /* */
397
+ /* Count cases in data file */
398
+ /* */
399
+ /*************************************************************************/
400
+
401
+
402
+ CaseNo CountData(FILE *Df)
403
+ /* --------- */
404
+ {
405
+ char Last=',';
406
+ int Count=0, Next;
407
+
408
+ while ( true )
409
+ {
410
+ if ( (Next = getc(Df)) == EOF )
411
+ {
412
+ if ( Last != ',' ) Count++;
413
+ rewind(Df);
414
+ return Count;
415
+ }
416
+
417
+ if ( Next == '|' )
418
+ {
419
+ while ( (Next = getc(Df)) != '\n' )
420
+ ;
421
+ }
422
+
423
+ if ( Next == '\n' )
424
+ {
425
+ if ( Last != ',' ) Count++;
426
+ Last = ',';
427
+ }
428
+ else
429
+ if ( Next == '\\' )
430
+ {
431
+ /* Skip escaped character */
432
+
433
+ getc(Df);
434
+ }
435
+ else
436
+ if ( Next != '\t' && Next != ' ' )
437
+ {
438
+ Last = Next;
439
+ }
440
+ }
441
+ }
442
+
443
+
444
+
445
+ /*************************************************************************/
446
+ /* */
447
+ /* Store a label or ignored value in IValStore */
448
+ /* */
449
+ /*************************************************************************/
450
+
451
+
452
+ int StoreIVal(String S)
453
+ /* --------- */
454
+ {
455
+ int StartIx, Length;
456
+
457
+ if ( (Length=strlen(S) + 1) + IValsOffset > IValsSize )
458
+ {
459
+ if ( IgnoredVals )
460
+ {
461
+ Realloc(IgnoredVals, IValsSize += 32768, char);
462
+ }
463
+ else
464
+ {
465
+ IValsSize = 32768;
466
+ IValsOffset = 0;
467
+ IgnoredVals = Alloc(IValsSize, char);
468
+ }
469
+ }
470
+
471
+ StartIx = IValsOffset;
472
+ strcpy(IgnoredVals + StartIx, S);
473
+ IValsOffset += Length;
474
+
475
+ return StartIx;
476
+ }
477
+
478
+
479
+
480
+ /*************************************************************************/
481
+ /* */
482
+ /* Free case space */
483
+ /* */
484
+ /*************************************************************************/
485
+
486
+
487
+ void FreeData()
488
+ /* -------- */
489
+ {
490
+ FreeCases();
491
+
492
+ FreeUnlessNil(IgnoredVals); IgnoredVals = Nil;
493
+ IValsSize = 0;
494
+
495
+ Free(Case); Case = Nil;
496
+
497
+ MaxCase = -1;
498
+ }
499
+
500
+
501
+
502
+ /*************************************************************************/
503
+ /* */
504
+ /* Check for bad continuous value */
505
+ /* */
506
+ /*************************************************************************/
507
+
508
+
509
+ void CheckValue(DataRec DVec, Attribute Att)
510
+ /* ---------- */
511
+ {
512
+ ContValue Cv;
513
+
514
+ Cv = CVal(DVec, Att);
515
+ if ( ! finite(Cv) )
516
+ {
517
+ Error(BADNUMBER, AttName[Att], "");
518
+
519
+ CVal(DVec, Att) = UNKNOWN;
520
+ }
521
+ }