see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,412 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Divide-and-Conquer for continuous attributes */
30
+ /* -------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ /*************************************************************************/
40
+ /* */
41
+ /* Known values of continuous attributes are divided into */
42
+ /* three groups: */
43
+ /* (1) N/A */
44
+ /* (2) values less than a threshold */
45
+ /* (3) values greater than a threshold */
46
+ /* This routine finds the best threshold for items Fp through Lp */
47
+ /* and sets Gain[] and Bar[] */
48
+ /* */
49
+ /*************************************************************************/
50
+
51
+
52
+ void CEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
53
+ /* -------------- */
54
+ {
55
+ CaseNo i, BestI, Xp;
56
+ double Val, ThisGain, BestGain=-1E-6;
57
+
58
+ /* Special case when very few values */
59
+
60
+ if ( No(Fp, Lp) < 2 * (CMINITEMS*GEnv.FRAC) )
61
+ {
62
+ Verbosity(2,
63
+ fprintf(Of, "\tAtt %s: insufficient cases with known values\n",
64
+ AttName[Att]))
65
+ return;
66
+ }
67
+
68
+ GEnv.BrFreq[1] = GEnv.BrFreq[2] = GEnv.BrFreq[3] = 0;
69
+
70
+ GEnv.BrSum[1] = GEnv.BrSumSq[1] =
71
+ GEnv.BrSum[2] = GEnv.BrSumSq[2] =
72
+ GEnv.BrSum[3] = GEnv.BrSumSq[3] = 0;
73
+
74
+ /* Omit and count N/A values and count base values */
75
+
76
+ Xp = Fp;
77
+ ForEach(i, Fp, Lp)
78
+ {
79
+ Val = CClass(Case[i]);
80
+
81
+ if ( NotApplic(Case[i],Att) )
82
+ {
83
+ GEnv.BrFreq[1]++;
84
+ GEnv.BrSum[1] += Val;
85
+ GEnv.BrSumSq[1] += Val * Val;
86
+
87
+ Swap(i, Xp);
88
+ Xp++;
89
+ }
90
+ else
91
+ {
92
+ GEnv.BrFreq[3]++;
93
+ GEnv.BrSum[3] += Val;
94
+ GEnv.BrSumSq[3] += Val * Val;
95
+ }
96
+ }
97
+
98
+ /* Sort all applicable values */
99
+
100
+ Quicksort(Xp, Lp, Att);
101
+
102
+ /* Try possible cuts between items i and i+1, and determine the
103
+ information and gain of the split in each case */
104
+
105
+ ForEach(i, Xp, Lp - (CMINITEMS*GEnv.FRAC))
106
+ {
107
+ Val = CClass(Case[i]);
108
+
109
+ GEnv.BrFreq[2]++;
110
+ GEnv.BrFreq[3]--;
111
+
112
+ GEnv.BrSum[2] += Val;
113
+ GEnv.BrSum[3] -= Val;
114
+ GEnv.BrSumSq[2] += Val * Val;
115
+ GEnv.BrSumSq[3] -= Val * Val;
116
+
117
+ if ( CVal(Case[i+1], Att) > CVal(Case[i], Att) &&
118
+ i >= Xp+(CMINITEMS*GEnv.FRAC)-1 )
119
+ {
120
+ ThisGain = ContinGain();
121
+ if ( ThisGain > BestGain + Epsilon )
122
+ {
123
+ BestGain = ThisGain;
124
+ BestI = i;
125
+ }
126
+ }
127
+ }
128
+
129
+ /* Set the best break point and gain */
130
+
131
+ if ( BestGain > Epsilon )
132
+ {
133
+ GEnv.Gain[Att] = BestGain;
134
+ GEnv.Bar[Att] = Between(CVal(Case[BestI],Att),
135
+ CVal(Case[BestI+1],Att));
136
+
137
+ Verbosity(2,
138
+ fprintf(Of, "\tAtt %s: cut=%.3f, gain %.3f\n",
139
+ AttName[Att], GEnv.Bar[Att], GEnv.Gain[Att]))
140
+
141
+ /* If not sampling, check subsets now */
142
+
143
+ if ( GEnv.FRAC >= 1.0 )
144
+ {
145
+ if ( Xp > Fp )
146
+ {
147
+ NoteTest(Att, 1, GEnv.Bar[Att], Nil);
148
+ FindContinOutliers(Fp, Xp-1, false);
149
+ }
150
+
151
+ NoteTest(Att, 2, GEnv.Bar[Att], Nil);
152
+ FindContinOutliers(Fp, BestI, false);
153
+
154
+ NoteTest(Att, 3, GEnv.Bar[Att], Nil);
155
+ FindContinOutliers(BestI+1, Lp, false);
156
+ }
157
+ }
158
+ else
159
+ {
160
+ Verbosity(2, fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]))
161
+ }
162
+ }
163
+
164
+
165
+
166
+ /*************************************************************************/
167
+ /* */
168
+ /* Find the lowest-precision value in the range Low to High */
169
+ /* */
170
+ /*************************************************************************/
171
+
172
+
173
+ ContValue Between(ContValue Low, ContValue High)
174
+ /* ------- */
175
+ {
176
+ ContValue Base, Unit, Cut, Try, Margin;
177
+
178
+ if ( Low <= 0 && High > 0 ) return 0.0;
179
+
180
+ Margin = 0.005L * (High - Low);
181
+ Cut = (Low + High) / 2;
182
+
183
+ /* Try successively smaller units until a threshold lies between
184
+ Low and High */
185
+
186
+ for ( Base = 6 ; Base > -6 ; Base-- )
187
+ {
188
+ Unit = pow(10.0L, Base);
189
+ Try = rint(Cut / Unit) * Unit;
190
+
191
+ if ( Try >= Low && Try < High - Margin ) return Try;
192
+ if ( fmod(Low, Unit) < 1E-6 && fmod(High, Unit) < 1E-6 ) break;
193
+ }
194
+
195
+ /* If all else fails, return the low value */
196
+
197
+ return Low;
198
+ }
199
+
200
+
201
+
202
+ /*************************************************************************/
203
+ /* */
204
+ /* Set Gain[] for discrete partition of items Fp to Lp */
205
+ /* */
206
+ /*************************************************************************/
207
+
208
+
209
+ void CEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
210
+ /* ------------- */
211
+ {
212
+ if ( MaxAttVal[Att] == 3 )
213
+ {
214
+ EvalBinarySplit(Att, Fp, Lp);
215
+ }
216
+ else
217
+ {
218
+ EvalSubsetSplit(Att, Fp, Lp);
219
+ }
220
+
221
+ Verbosity(2,
222
+ if ( GEnv.Gain[Att] > Epsilon )
223
+ {
224
+ fprintf(Of, "\tAtt %s: gain %.3f\n", AttName[Att], GEnv.Gain[Att]);
225
+ }
226
+ else
227
+ {
228
+ fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]);
229
+ })
230
+ }
231
+
232
+
233
+
234
+ /*************************************************************************/
235
+ /* */
236
+ /* Special case of binary split */
237
+ /* */
238
+ /*************************************************************************/
239
+
240
+
241
+ void EvalBinarySplit(Attribute Att, CaseNo Fp, CaseNo Lp)
242
+ /* --------------- */
243
+ {
244
+ DiscrValue v;
245
+
246
+ ForEach(v, 1, 3)
247
+ {
248
+ GEnv.BrFreq[v] = GEnv.DFreq[Att][v][0];
249
+ GEnv.BrSum[v] = GEnv.DValSum[Att][v];
250
+ GEnv.BrSumSq[v] = GEnv.DValSumSq[Att][v];
251
+ }
252
+
253
+ GEnv.Gain[Att] = ContinGain();
254
+ if ( GEnv.Gain[Att] < Epsilon ) GEnv.Gain[Att] = None;
255
+
256
+ if ( GEnv.FRAC >= 1 && GEnv.Gain[Att] > Epsilon )
257
+ {
258
+ CheckPotentialClusters(Att, 3, Fp, Lp, 0.0, Nil, Nil);
259
+ }
260
+ }
261
+
262
+
263
+
264
+ /*************************************************************************/
265
+ /* */
266
+ /* Divide attribute values into three subsets (one being N/A) */
267
+ /* */
268
+ /*************************************************************************/
269
+
270
+
271
+ void EvalSubsetSplit(Attribute Att, CaseNo Fp, CaseNo Lp)
272
+ /* --------------- */
273
+ {
274
+ DiscrValue v, sv, Cycle;
275
+ double ThisGain, BestGain=-1E-6;
276
+ int Bytes;
277
+
278
+ ForEach(v, 1, MaxAttVal[Att])
279
+ {
280
+ GEnv.ValFreq[v] = GEnv.DFreq[Att][v][0];
281
+ GEnv.ValSum[v] = GEnv.DValSum[Att][v];
282
+ GEnv.ValSumSq[v] = GEnv.DValSumSq[Att][v];
283
+ }
284
+
285
+ GEnv.BrFreq[1] = GEnv.ValFreq[1];
286
+ GEnv.BrSum[1] = GEnv.ValSum[1];
287
+ GEnv.BrSumSq[1] = GEnv.ValSumSq[1];
288
+
289
+ ForEach(v, 2, 3)
290
+ {
291
+ GEnv.BrFreq[v] = GEnv.BrSum[v] = GEnv.BrSumSq[v] = 0;
292
+ }
293
+
294
+ ForEach(v, 2, MaxAttVal[Att])
295
+ {
296
+ GEnv.BrFreq[2] += GEnv.ValFreq[v];
297
+ GEnv.BrSum[2] += GEnv.ValSum[v];
298
+ GEnv.BrSumSq[2] += GEnv.ValSumSq[v];
299
+ }
300
+
301
+ /* Examine subsets, starting with all values in the left branch.
302
+ At each iteration, move the value with the highest mean from
303
+ the left branch to the right branch and check the gain.
304
+ (In the case of ordered attributes, the value moved is the
305
+ rightmost value in the left branch.)
306
+ Save the best gain so far in Subset[Att]. */
307
+
308
+ ForEach(v, 2, MaxAttVal[Att])
309
+ {
310
+ GEnv.Left[v] = ( GEnv.ValFreq[v] > 0 );
311
+ }
312
+
313
+ Bytes = (MaxAttVal[Att]>>3) + 1;
314
+
315
+ ForEach(Cycle, 2, MaxAttVal[Att])
316
+ {
317
+ if ( Ordered(Att) )
318
+ {
319
+ for ( sv = MaxAttVal[Att] ; sv > 1 && ! GEnv.Left[sv] ; sv-- )
320
+ ;
321
+ }
322
+ else
323
+ {
324
+ sv = 0;
325
+
326
+ ForEach(v, 2, MaxAttVal[Att])
327
+ {
328
+ if ( GEnv.Left[v] &&
329
+ ( ! sv ||
330
+ GEnv.ValSum[v] / GEnv.ValFreq[v] >
331
+ GEnv.ValSum[sv] / GEnv.ValFreq[sv] ) )
332
+ {
333
+ sv = v ;
334
+ }
335
+ }
336
+ }
337
+
338
+ if ( sv < 2 ) break;
339
+
340
+ GEnv.Left[sv] = false;
341
+
342
+ GEnv.BrFreq[2] -= GEnv.ValFreq[sv];
343
+ GEnv.BrSum[2] -= GEnv.ValSum[sv];
344
+ GEnv.BrSumSq[2] -= GEnv.ValSumSq[sv];
345
+ GEnv.BrFreq[3] += GEnv.ValFreq[sv];
346
+ GEnv.BrSum[3] += GEnv.ValSum[sv];
347
+ GEnv.BrSumSq[3] += GEnv.ValSumSq[sv];
348
+
349
+ if ( GEnv.BrFreq[2] >= (CMINITEMS*GEnv.FRAC) &&
350
+ GEnv.BrFreq[3] >= (CMINITEMS*GEnv.FRAC) &&
351
+ (ThisGain = ContinGain()) > BestGain + Epsilon )
352
+ {
353
+ GEnv.Gain[Att] = BestGain = ThisGain;
354
+ GEnv.Bar[Att] = sv-1;
355
+
356
+ /* Record in Subset[Att] */
357
+
358
+ ClearBits(Bytes, GEnv.Subset[Att]);
359
+
360
+ ForEach(v, 2, MaxAttVal[Att])
361
+ {
362
+ if ( GEnv.Left[v] )
363
+ {
364
+ SetBit(v, GEnv.Subset[Att]);
365
+ }
366
+ }
367
+ }
368
+ }
369
+
370
+ if ( GEnv.FRAC >= 1 && GEnv.Gain[Att] > Epsilon )
371
+ {
372
+ CheckPotentialClusters(Att, 3, Fp, Lp, GEnv.Bar[Att], GEnv.Subset[Att],
373
+ Nil);
374
+ }
375
+ }
376
+
377
+
378
+
379
+ double SDEstimate(CaseCount N, double Sum, double SumSq)
380
+ /* ---------- */
381
+ {
382
+ return sqrt( (SumSq - Sum * Sum / N + 1E-3) / (N - 1) );
383
+ }
384
+
385
+
386
+
387
+ /*************************************************************************/
388
+ /* */
389
+ /* Compute continuous gain for three branches */
390
+ /* */
391
+ /*************************************************************************/
392
+
393
+
394
+ double ContinGain()
395
+ /* ---------- */
396
+ {
397
+ double Resid=0;
398
+ DiscrValue v;
399
+ CaseCount Cases=0;
400
+
401
+ ForEach(v, 1, 3)
402
+ {
403
+ if ( GEnv.BrFreq[v] > 1 )
404
+ {
405
+ Cases += GEnv.BrFreq[v];
406
+ Resid += GEnv.BrFreq[v] *
407
+ SDEstimate(GEnv.BrFreq[v], GEnv.BrSum[v], GEnv.BrSumSq[v]);
408
+ }
409
+ }
410
+
411
+ return GEnv.PSD - Resid / Cases;
412
+ }
@@ -0,0 +1,623 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Definitions */
30
+ /* ----------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #define RELEASE "2.01 GPL Edition"
36
+
37
+ #include <stdio.h>
38
+ #include <math.h>
39
+ #include <string.h>
40
+ #include <stdlib.h>
41
+ #include <time.h>
42
+ #include <ctype.h>
43
+ #include <limits.h>
44
+ #include <float.h>
45
+
46
+ #include "text.i"
47
+
48
+
49
+
50
+ /*************************************************************************/
51
+ /* */
52
+ /* Definitions dependent on cc options */
53
+ /* */
54
+ /*************************************************************************/
55
+
56
+
57
#define	 Goodbye(x)		exit(x)
#define	 Of			stdout

#include <values.h>

/*  With VerbOpt defined, Verbosity(d,s) executes statement(s) s when
    VERBOSITY is at least d, assertions are live and Free() also clears
    the pointer.  Without it, Verbosity() and assert() expand to nothing
    and Free() is a plain free().

    Verbosity() is written as if/else so that it can be used with or
    without a trailing semicolon and can never capture a following
    `else'.  Free() is wrapped in do/while so that both variants are a
    single statement that takes a trailing semicolon.  */

#ifdef VerbOpt
#include <assert.h>
#define	 Verbosity(d,s)		if(VERBOSITY >= (d)) {s;} else {}
#define	 Free(x)		do {free(x); (x) = 0;} while (0)
#else
#define	 assert(x)
#define	 Verbosity(d,s)
#define	 Free(x)		free(x)
#endif
71
+
72
+
73
+ /*************************************************************************/
74
+ /* */
75
+ /* Constants, macros etc. */
76
+ /* */
77
+ /*************************************************************************/
78
+
79
+
80
/*  Tuning constants  */

#define	 MAXFRAC	0.01		/* largest proportion of outliers in a group */
#define	 MAXNORM	2.67		/* SD limit for a non-outlier */
#define	 MAXTAIL	5.34		/* SD limit for an 'ordinary' tail */
#define	 SAMPLEUNIT	2000		/* smallest sample per split */
#define	 SAMPLEFACTOR	5		/* threshold for using sampling */
#define	 MINCONTEXT	25		/* fewest cases to detect other difference */

/*  General-purpose values  */

#define	 Nil		0		/* null pointer */
#define	 false		0
#define	 true		1
#define	 None		-1
#define	 Epsilon	1E-6

/*  Special attribute status bits (combined with | and tested with &)  */

#define	 EXCLUDE	1		/* do not use */
#define	 SKIP		2		/* do not check */
#define	 DISCRETE	4		/* collect values as data read */
#define	 ORDERED	8		/* ordered discrete values */
#define	 DATEVAL	16		/* YYYY/MM/DD or YYYY-MM-DD */
#define	 STIMEVAL	32		/* HH:MM:SS */
#define	 TSTMPVAL	64		/* date time */

/*  Unknown and N/A values are stored as integer bit patterns which,
    read as floating-point numbers, are unlikely to occur in data
    (octal 01600000000 and 01)  */

#define	 UNKNOWN	01600000000	/* 1.5777218104420236e-30 */
#define	 NA		01		/* 1.4012984643248171e-45 */

/*  Kinds of branch  */

#define	 BrDiscr	1
#define	 BrThresh	2
#define	 BrSubset	3
110
+
111
/*  Allocation helpers built on Pcalloc()/Prealloc(); Alloc() is the
    same as AllocZero()  */

#define	 AllocZero(N,T)		(T *) Pcalloc(N, sizeof(T))
#define	 Alloc(N,T)		AllocZero(N,T) /*for safety */
#define	 Realloc(V,N,T)		V = (T *) Prealloc(V, (N)*sizeof(T))

/*  NB: Max() and Min() evaluate their arguments more than once  */

#define	 Max(a,b)		((a)>(b) ? (a) : (b))
#define	 Min(a,b)		((a)<(b) ? (a) : (b))

#define	 Log2			0.69314718055994530942	/* natural log of 2 */

/*  Bit sets held in arrays of bytes: bit b lives in byte b>>3 at
    position b&7.  ResetBit() clears the bit whether or not it was set
    (an exclusive-or would set a bit that was clear).  */

#define	 Bit(b)			(1 << (b))
#define	 In(b,s)		(((s)[(b) >> 3]) & Bit((b) & 07))
#define	 ClearBits(n,s)		memset(s,0,n)
#define	 CopyBits(n,f,t)	memcpy(t,f,n)
#define	 SetBit(b,s)		((s)[(b) >> 3] |= Bit((b) & 07))
#define	 ResetBit(b,s)		((s)[(b) >> 3] &= ~Bit((b) & 07))

/*  Inclusive loop: v runs from f up to and including l  */

#define	 ForEach(v,f,l)		for((v)=(f) ; (v)<=(l) ; ++(v))
128
+
129
/*  Exchange the entries of Case[] at positions a and b  */

#define	 Swap(a,b)		{Description _xab; _xab=Case[a]; Case[a]=Case[b]; Case[b]=_xab;}

/*  Tests on the special status bits and type of attribute a  */

#define	 StatBit(a,b)		(SpecialStatus[a]&(b))
#define	 Exclude(a)		StatBit(a,EXCLUDE)
#define	 Skip(a)		StatBit(a,EXCLUDE|SKIP)
#define	 Discrete(a)		(MaxAttVal[a] || StatBit(a,DISCRETE))
#define	 Continuous(a)		(! MaxAttVal[a] && ! StatBit(a,DISCRETE))
#define	 Ordered(a)		StatBit(a,ORDERED)
#define	 DateVal(a)		StatBit(a,DATEVAL)
#define	 TimeVal(a)		StatBit(a,STIMEVAL)
#define	 TStampVal(a)		StatBit(a,TSTMPVAL)

/*  free() accepts a null pointer, so no test is required; a plain
    call also cannot capture an `else' that follows the macro  */

#define	 FreeUnlessNil(p)	free(p)

/*  Close f if it is open and mark it closed.  The empty else keeps a
    following `else' from attaching to this test  */

#define	 CheckClose(f)		if(f) {fclose(f); f=Nil;} else {}

/*  NB: Space() evaluates its argument more than once  */

#define	 Space(s)	((s) == ' ' || (s) == '\n' || (s) == '\r' || (s) == '\t')

/*  Loop header that reads up to end of line or EOF; relies on
    variables c and f in the enclosing scope  */

#define	 SkipComment	while ( (c = InChar(f)) != '\n' && c != EOF )

#define	 rint(x)	floor((x)+0.5)	/* for consistency across platforms */
#define	 P1(x)		(rint((x)*10) / 10)

/*  Number of items in the inclusive range f..l  */

#define	 No(f,l)	((l)-(f)+1)
#define	 SplitVal(g,i)	( Continuous(ClassAtt) ? (g) : (g) / ((i) + 1E-6) )
153
+
154
+
155
/*  Error numbers, listed in numeric order (presumably the ErrNo
    argument of Error() -- confirm against utility.c)  */

#define	 NOFILE		 0
#define	 BADATTNAME	 1
#define	 EOFINATT	 2
#define	 SINGLEATTVAL	 3
#define	 BADATTVAL	 4
#define	 BADCLASS	 5
#define	 DUPATTNAME	 6
#define	 NOMEM		 8
#define	 TOOMANYVALS	 9
#define	 BADDISCRETE	10
#define	 UNKNOWNATT	11
#define	 LONGNAME	13
#define	 HITEOF		14
#define	 MISSNAME	15
#define	 BADDATE	16
#define	 BADTIME	17
#define	 BADDEF1	20
#define	 BADDEF2	21
#define	 BADDEF3	22
#define	 SAMEATT	23
#define	 BADTSTMP	24
#define	 BADNUMBER	25
#define	 BADSIFT	30

/*  Processing stages (presumably the argument of NotifyStage() --
    confirm against update.c)  */

#define	 READDATA	 1
#define	 READTEST	 2
#define	 READCASES	 3
#define	 PRELIM		 4
#define	 CHECKING	 5
#define	 REPORTING	 6
#define	 CLEANUP	 7

/*  Kinds of condition.  Within each family the combined form is the
    bitwise OR of the two one-sided forms:
	CONT_GLT  = CONT_GT  | CONT_LT
	DISCR_GLT = DISCR_GT | DISCR_LT  */

#define	 CONT_GT	 1
#define	 CONT_LT	 2
#define	 CONT_GLT	 3
#define	 CONT_NA	 4
#define	 DISCR_GT	 5
#define	 DISCR_LT	 6
#define	 DISCR_GLT	 7
#define	 DISCR_VAL	 8
#define	 DISCR_SET	 9
196
+
197
+
198
+ /*************************************************************************/
199
+ /* */
200
+ /* Type definitions */
201
+ /* */
202
+ /*************************************************************************/
203
+
204
+
205
/*  Byte-sized types; a Set is a pointer to bytes used as a bit set  */

typedef  unsigned char	Boolean;
typedef  unsigned char	BranchType;
typedef  unsigned char	*Set;

typedef  char		*String;

typedef  int		CaseNo;		/* data item number */
typedef  int		CaseCount;	/* count of items */

typedef  int		DiscrValue;	/* discrete attribute value (0 = ?) */
typedef  int		Attribute;	/* attribute number, 1..MaxAtt */


/*  Continuous values are single precision unless USEDOUBLE is
    defined.  Defining USEDOUBLE allows for DP attribute values, but
    will not affect use of saved analyses  */

#ifndef USEDOUBLE
typedef  float		ContValue;	/* continuous attribute value */
#define	 PREC		 7		/* precision */
#define	 MARKER		MAXFLOAT
#else
typedef  double		ContValue;	/* continuous attribute value */
#define	 PREC		14		/* precision */
#define	 MARKER		MAXDOUBLE
#endif
227
+
228
+
229
+ typedef union _def_val
230
+ {
231
+ String _s_val; /* att val for comparison */
232
+ ContValue _n_val; /* number for arith */
233
+ }
234
+ DefVal;
235
+
236
+ typedef struct _def_elt
237
+ {
238
+ short _op_code; /* type of element */
239
+ DefVal _operand; /* string or numeric value */
240
+ }
241
+ DefElt, *Definition;
242
+
243
+ typedef struct _elt_rec
244
+ {
245
+ int Fi, /* index of first char of element */
246
+ Li; /* last ditto */
247
+ char Type; /* 'B', 'S', or 'N' */
248
+ }
249
+ EltRec;
250
+
251
+
252
+ #define DefOp(DE) DE._op_code
253
+ #define DefSVal(DE) DE._operand._s_val
254
+ #define DefNVal(DE) DE._operand._n_val
255
+
256
/*  Opcodes, grouped by tens  */

/*  0-3: operands  */
#define	 OP_ATT		 0
#define	 OP_NUM		 1
#define	 OP_STR		 2
#define	 OP_MISS	 3

/*  10-11: logical connectives  */
#define	 OP_AND		10
#define	 OP_OR		11

/*  20-27: comparisons  */
#define	 OP_EQ		20
#define	 OP_NE		21
#define	 OP_GT		22
#define	 OP_GE		23
#define	 OP_LT		24
#define	 OP_LE		25
#define	 OP_SEQ		26
#define	 OP_SNE		27

/*  30-36: arithmetic  */
#define	 OP_PLUS	30
#define	 OP_MINUS	31
#define	 OP_UMINUS	32
#define	 OP_MULT	33
#define	 OP_DIV		34
#define	 OP_MOD		35
#define	 OP_POW		36

/*  40-45: functions  */
#define	 OP_SIN		40
#define	 OP_COS		41
#define	 OP_TAN		42
#define	 OP_LOG		43
#define	 OP_EXP		44
#define	 OP_INT		45

/*  99: end marker  */
#define	 OP_END		99
284
+
285
+
286
+ typedef struct _testrec
287
+ {
288
+ Attribute Att; /* attribute tested */
289
+ DiscrValue Br; /* branch of test */
290
+ ContValue Cut; /* threshold (if relevant) */
291
+ Set Left; /* values for left br (if relevant) */
292
+ }
293
+ TestRec;
294
+
295
+ typedef struct _clustcondrec
296
+ {
297
+ int Type; /* type of test */
298
+ Attribute Att; /* attribute tested */
299
+ ContValue Low, /* low thresh or start of range */
300
+ High; /* high threshold or end of range */
301
+ Set Values; /* value subset if required */
302
+ }
303
+ ClustCond;
304
+
305
+ typedef struct _clustrec
306
+ {
307
+ Attribute Att; /* focus attribute */
308
+ ClustCond *Cond; /* group conditions */
309
+ int NCond; /* number of group conditions */
310
+ ContValue Expect, /* mean | (int) modal value */
311
+ SD, /* sd (trimmed) */
312
+ Limit; /* low / high value for normal cases */
313
+ float Frac; /* proportion of "normal" cases */
314
+ CaseCount GpSize; /* size of group */
315
+ }
316
+ ClustRec, *Clust;
317
+
318
+ typedef union _attribute_value
319
+ {
320
+ DiscrValue _discr_val;
321
+ ContValue _cont_val;
322
+ String _string_val;
323
+ Clust _clust;
324
+ }
325
+ AttValue, *Description;
326
+
327
+ typedef struct _sort_pair
328
+ {
329
+ ContValue C;
330
+ Description D;
331
+ }
332
+ SortPair;
333
+
334
+ typedef struct _caveat_rec
335
+ {
336
+ Attribute Att;
337
+ Set Subset;
338
+ float Low, High;
339
+ }
340
+ CaveatRec;
341
+
342
+ typedef struct _treerec *Tree;
343
+ typedef struct _treerec
344
+ {
345
+ BranchType NodeType; /* 0 | BrDiscr | BrThresh | BrSubset */
346
+ Attribute Tested; /* attribute referenced in test */
347
+ int Forks; /* number of branches at this node */
348
+ ContValue Cut; /* threshold for continuous attribute */
349
+ Set Left; /* subset of values for first branch */
350
+ Tree *Branch, /* Branch[x] = subtree for outcome x */
351
+ Parent; /* parent node */
352
+ DiscrValue Br; /* branch from parent */
353
+ String SiftEntry; /* text for sift file */
354
+ }
355
+ TreeRec;
356
+
357
+ typedef struct _env_rec
358
+ {
359
+ CaseCount **Freq,
360
+ **BestFreq,
361
+ *ValFreq,
362
+ *ClassFreq,
363
+ BrFreq[4];
364
+ Boolean *Left,
365
+ *Possible;
366
+ double *Gain,
367
+ *Info,
368
+ *ValSum,
369
+ *ValSumSq,
370
+ BrSum[4],
371
+ BrSumSq[4],
372
+ BaseInfo,
373
+ FRAC,
374
+ PSD;
375
+ Set *Subset;
376
+ ContValue *Bar;
377
+ int Level,
378
+ MaxLevel,
379
+ *Tested;
380
+ TestRec *Test;
381
+ String SiftEntry;
382
+ int SiftSize,
383
+ SiftSpace;
384
+ Attribute *DList; /* current discrete atts */
385
+ CaseCount ***DFreq; /* DFreq[a][][] = Freq[] for att a */
386
+ double **DValSum, /* ValSum[] for att a */
387
+ **DValSumSq; /* ValSumSq[] for att a */
388
+ }
389
+ EnvRec;
390
+
391
+
392
/*  Access to the value of an attribute of a case.  Arguments are
    parenthesised so that pointer expressions such as `p+1' can be
    passed; the results remain usable as assignment targets.  */

#define	 CVal(Case,Attribute)	(Case)[Attribute]._cont_val
#define	 DVal(Case,Attribute)	(Case)[Attribute]._discr_val
#define	 XDVal(Case,Att)	((Case)[Att]._discr_val & 077777777)
#define	 SVal(Case,Attribute)	(Case)[Attribute]._string_val

/*  The class value is held in element 0 of the case  */

#define	 CClass(Case)		(*(Case))._cont_val
#define	 DClass(Case)		((*(Case))._discr_val & 077777777)

/*  Tests for the special unknown and N/A values  */

#define	 Unknown(Case,Att)	(DVal(Case,Att)==UNKNOWN)
#define	 UnknownVal(AV)		((AV)._discr_val==UNKNOWN)
#define	 NotApplic(Case,Att)	(DVal(Case,Att)==NA)
#define	 NotApplicVal(AV)	((AV)._discr_val==NA)

/*  Two extra elements stored after the last attribute  */

#define	 OutXVal(Case)		(Case)[MaxAtt+1]._cont_val
#define	 OutClust(Case)		(Case)[MaxAtt+2]._clust

/*  ZScore() relies on Case, Mean and SD in the enclosing scope  */

#define	 ZScore(i)		(fabs(CClass(Case[i])-Mean) / SD)
#define	 MaxAnoms(N)		(MAXFRAC*(N)+2*sqrt((N)*MAXFRAC*(1-MAXFRAC))+1)

#define	 DScore(n,a,p)		((a) / ((double) (n)*(p)))

/*  XDScore is a specialised version for possibly non-occurring vals  */
#define	 XDScore(f,n,a,p)	((f) ? (a) / ((double) (n)*(p)) :\
				 (p) ? (1 / ((n)+2.0)) / (p) : (1 / ((n)+2.0)))
416
+
417
+
418
+ /*************************************************************************/
419
+ /* */
420
+ /* Function prototypes */
421
+ /* */
422
+ /*************************************************************************/
423
+
424
+ /* getnames.c */
425
+
426
+ Boolean ReadName(FILE *f, String s, int n, char ColonOpt);
427
+ void GetNames(FILE *Nf);
428
+ void ExplicitAtt(FILE *Nf);
429
+ int Which(String Val, String *List, int First, int Last);
430
+ String CopyString(String S);
431
+ void FreeNames(void);
432
+ int InChar(FILE *f);
433
+
434
+ /* implicitatt.c */
435
+
436
+ void ImplicitAtt(FILE *Nf);
437
+ void ReadDefinition(FILE *f);
438
+ void Append(char c);
439
+ Boolean Expression(void);
440
+ Boolean Conjunct(void);
441
+ Boolean SExpression(void);
442
+ Boolean AExpression(void);
443
+ Boolean Term(void);
444
+ Boolean Factor(void);
445
+ Boolean Primary(void);
446
+ Boolean Atom(void);
447
+ Boolean Find(String S);
448
+ int FindOne(String *Alt);
449
+ Attribute FindAttName(void);
450
+ void DefSyntaxError(String Msg);
451
+ void DefSemanticsError(int Fi, String Msg, int OpCode);
452
+ void Dump(char OpCode, ContValue F, String S, int Fi);
453
+ void DumpOp(char OpCode, int Fi);
454
+ Boolean UpdateTStack(char OpCode, ContValue F, String S, int Fi);
455
+ AttValue EvaluateDef(Definition D, Description Case);
456
+
457
+ /* getdata.c */
458
+
459
+ void GetData(FILE *Df, Boolean Train);
460
+ Description GetDescription(FILE *Df, Boolean Train);
461
+ void FreeData(void);
462
+ void FreeCase(Description DVec);
463
+ void CheckValue(Description DVec, Attribute Att);
464
+
465
+ /* check.c */
466
+
467
+ void CheckData(void);
468
+ void CheckContin(CaseNo Fp);
469
+ void FindContinOutliers(CaseNo Fp, CaseNo Lp, Boolean Sorted);
470
+ void LabelContinOutliers(Clust CL, Clust CH, CaseNo Fp, CaseNo GFp,
471
+ CaseNo GLp);
472
+ void TrimmedSDEstimate(CaseNo Fp, CaseNo Lp, double *Mean, double *SD);
473
+ CaseNo FindTail(CaseNo Fp, CaseNo Lp, int Inc, double Mean, double SD);
474
+ Boolean OmittedCases(int HiLo);
475
+ Boolean SatisfiesTests(Description Case);
476
+ void FindDiscrOutliers(CaseNo Fp, CaseNo Lp, CaseCount *Table);
477
+ CaseNo NoOtherDifference(CaseNo Fp, CaseNo Lp, CaseNo GFp, CaseNo GLp);
478
+ void InitialiseEnvData(void);
479
+ void FreeEnvData(void);
480
+
481
+ /* cluster.c */
482
+
483
+ Clust NewClust(ContValue Expect, ContValue SD, ContValue Limit,
484
+ CaseCount Anoms, CaseCount GpSize);
485
+ void SaveClustConds(Clust C);
486
+ void FormatContinCond(Attribute Att, ClustCond *CC);
487
+ void FormatOrderedCond(Attribute Att, ClustCond *CC);
488
+ void FormatSubsetCond(Attribute Att, ClustCond *CC);
489
+ void FormatValCond(Attribute Att, ClustCond *CC);
490
+ void FreeClust(Clust C);
491
+
492
+ /* outlier.c */
493
+
494
+ void RecordOutlier(CaseNo i, Clust C, float XVal);
495
+ void ReportOutliers(void);
496
+ void PrintAttVal(Description Case, Attribute Att);
497
+ void PrintOutlier(CaseNo i, Clust C, ContValue SVal);
498
+ void PrintContinCond(Attribute Att, ContValue Lo, ContValue Hi, CaseNo N);
499
+ void PrintOrderedCond(Attribute Att, DiscrValue Lo, DiscrValue Hi, CaseNo N);
500
+ void PrintSubsetCond(Attribute Att, Set Values, CaseNo N);
501
+ void PrintValCond(Attribute Att, DiscrValue v);
502
+
503
+
504
+ /* common.c */
505
+
506
+ void InitialiseDAC(void);
507
+ void FreeDAC(void);
508
+ void Split(CaseNo Fp, CaseNo Lp, int CondAtts, Tree Parent,
509
+ DiscrValue Br, Tree *Result);
510
+ void RecoverContext(Tree T, DiscrValue Br);
511
+ void DiscreteAttInfo(CaseNo Fp, CaseNo Lp, int CondAtts);
512
+ void ChooseSplitWithSampling(CaseNo Fp, CaseNo Lp, int CondAtts);
513
+ void Sample(CaseNo Fp, CaseNo Lp, CaseCount N);
514
+ void SampleScan(CaseNo Fp, CaseNo Lp, int CondAtts, Boolean Second);
515
+ void ChooseSplit(CaseNo Fp, CaseNo Lp, int CondAtts);
516
+ void FindBestAtt(Attribute *BestAtt, double *BestVal);
517
+ void CheckSplit(Attribute Att, CaseNo Fp, CaseNo Lp);
518
+ void Divide(Tree Node, CaseNo Fp, CaseNo Lp, int CondAtts);
519
+ void NoteTest(Attribute Att, DiscrValue Br, ContValue Cut, Set Left);
520
+ CaseNo SkipMissing(Attribute Att, CaseNo Fp, CaseNo Lp);
521
+ CaseNo Group(Attribute Att, DiscrValue V, CaseNo Fp, CaseNo Lp,
522
+ ContValue Cut, Set Left);
523
+ void CheckPotentialClusters(Attribute Att, DiscrValue Forks,
524
+ CaseNo Fp, CaseNo Lp, ContValue B, Set S,
525
+ CaseCount **FT);
526
+ void ShowContext(CaseNo i);
527
+ Tree Leaf(Tree Parent, DiscrValue Br);
528
+ void ReleaseTree(Tree T, int Level);
529
+ void OutputConditions(void);
530
+
531
+ /* continatt.c */
532
+
533
+ void CEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
534
+ ContValue Between(ContValue Low, ContValue High);
535
+ void CEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
536
+ void EvalBinarySplit(Attribute Att, CaseNo Fp, CaseNo Lp);
537
+ void EvalSubsetSplit(Attribute Att, CaseNo Fp, CaseNo Lp);
538
+ double SDEstimate(CaseCount N, double Sum, double SumSq);
539
+ double ContinGain(void);
540
+
541
+ /* discratt.c */
542
+
543
+ void DEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
544
+ void DEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
545
+ void DEvalOrderedAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
546
+ void ComputeFrequencies(Attribute Att, CaseNo Fp, CaseNo Lp);
547
+ void FindClassFrequencies(CaseNo Fp, CaseNo Lp);
548
+ double DiscrGain(DiscrValue MaxVal, CaseCount TotalCases);
549
+ double TotalInfo(CaseCount V[], DiscrValue MinVal, DiscrValue MaxVal);
550
+
551
+ /* sort.c */
552
+
553
+ void Quicksort(CaseNo Fp, CaseNo Lp, Attribute Att);
554
+ void Cachesort(CaseNo Fp, CaseNo Lp);
555
+
556
+ /* modelfiles.c */
557
+
558
+ void CheckFile(String Extension, Boolean Write);
559
+ void WriteFilePrefix(String Extension);
560
+ void SaveCondition(void);
561
+ void SaveDiscrCluster(DiscrValue Expect, CaseCount Anoms,
562
+ CaseCount Cases, CaseCount *Freq);
563
+ void SaveContinCluster(float Mean, float SD, CaseCount Cases,
564
+ float LFrac, float LLim, float HFrac, float HLim);
565
+ void SaveNames(void);
566
+ void AsciiOut(String Pre, String S);
567
+ void ReadFilePrefix(String Extension);
568
+ int ReadProp(char *Delim);
569
+ String RemoveQuotes(String S);
570
+ void ExtendSiftEntry(String S);
571
+ void ProcessSift(void);
572
+ void Case1(void);
573
+ void Case2(void);
574
+ void Case3(void);
575
+ void Case11(void);
576
+ void Case12(void);
577
+ void Case13(void);
578
+ void Case21(void);
579
+ void Case22(void);
580
+ void ReadCaveats(void);
581
+ Boolean CheckCaveats(Description Case);
582
+ void FoundPossibleAnom(CaseNo i, Clust C, float Xv);
583
+ void Filter(Attribute Att, DiscrValue Br, ContValue Cut, Set Left);
584
+ void SetTestedAtts(void);
585
+
586
+
587
+ /* utility.c */
588
+
589
+ void PrintHeader(String Title);
590
+ char ProcessOption(int Argc, char **Argv, char *Str);
591
+ void *Pmalloc(size_t Bytes);
592
+ void *Prealloc(void *Present, size_t Bytes);
593
+ void *Pcalloc(size_t Number, unsigned int Size);
594
+ void FreeVector(void **V, int First, int Last);
595
+ Description NewCase(void);
596
+ void MemTrim(void);
597
+ void FreeCases(void);
598
+ void FreeLastCase(Description Case);
599
+ double KRandom(void);
600
+ void ResetKR(int KRInit);
601
+ void Error(int ErrNo, String S1, String S2);
602
+ String CaseLabel(CaseNo N);
603
+ FILE * GetFile(String Extension, String RW);
604
+ double ExecTime(void);
605
+ int Denominator(ContValue Val);
606
+ int FracBase(Attribute Att);
607
+ int GetInt(String S, int N);
608
+ int DateToDay(String DS);
609
+ void DayToDate(int DI, String Date);
610
+ int TimeToSecs(String TS);
611
+ void SecsToTime(int Secs, String Time);
612
+ void SetTSBase(int y);
613
+ int TStampToMins(String TS);
614
+ void CValToStr(ContValue CV, Attribute Att, String DS);
615
+ void CleanupSift(void);
616
+ void Cleanup(void);
617
+ void Check(float Val, float Low, float High);
618
+
619
+
620
+ /* update.c */
621
+
622
+ void NotifyStage(int);
623
+ void Progress(int);