see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,98 @@
1
#*************************************************************************#
#*									 *#
#*		Makefile for GritBot					 *#
#*		--------------------					 *#
#*									 *#
#*************************************************************************#
#
#  Targets
#    all         production builds of both programs (gritbot and inspect)
#    gritbot     production build of GritBot
#    inspect     production build of the inspector (compiled with -DINSPECT)
#    gritbotdbg  debug build of GritBot linked from separate object files
#    inspectdbg  debug build of the inspector
#
#  The production and inspectdbg targets concatenate defns.i with the
#  sources, drop every line that mentions defns.i or extern.i, and
#  compile the result as a single file.


CC	= gcc -ffloat-store
CFLAGS	= -DVerbOpt -g -Wall -O0
LFLAGS	= $(S)

#  The recipes below use only plain Bourne-shell syntax, so run them
#  with /bin/sh; csh is not installed on many systems.
SHELL	= /bin/sh


#	Definitions of file sets

src =\
	global.c\
	cluster.c\
	continatt.c\
	outlier.c\
	getdata.c\
	gritbot.c\
	sort.c\
	discratt.c\
	check.c\
	common.c\
	getnames.c\
	implicitatt.c\
	modelfiles.c\
	update.c\
	utility.c


isrc =\
	inspect.c\
	cluster.c\
	outlier.c\
	getdata.c\
	getnames.c\
	implicitatt.c\
	modelfiles.c\
	update.c\
	common.c\
	utility.c


obj =\
	 global.o\
	 gritbot.o\
	 getdata.o getnames.o implicitatt.o\
	 check.o cluster.o outlier.o\
	 common.o continatt.o discratt.o\
	 modelfiles.o\
	 sort.o utility.o update.o


#  `all' names no file, so declare it phony; use $(MAKE) so that options
#  such as -n, -k and -j reach the sub-makes.

.PHONY: all

all:
	$(MAKE) gritbot
	$(MAKE) inspect


#	debug version (including verbosity option)

gritbotdbg:\
	$(obj) defns.i text.i extern.i Makefile
	$(CC) -DVerbOpt -g -o gritbotdbg $(obj) -lm

inspectdbg:\
	$(isrc) defns.i text.i Makefile
	cat defns.i $(isrc)\
		| grep -E -v 'defns.i|extern.i' >insgt.c
	$(CC) $(CFLAGS) -DVerbOpt -DINSPECT -o inspectdbg insgt.c -lm

#	production versions

gritbot:\
	$(src) defns.i text.i Makefile
	cat defns.i $(src)\
		| grep -E -v 'defns.i|extern.i' >gbotgt.c
	$(CC) $(LFLAGS) -O3 -o gritbot gbotgt.c -lm
	strip gritbot
	rm gbotgt.c

inspect:\
	$(isrc) defns.i text.i Makefile
	cat defns.i $(isrc)\
		| grep -E -v 'defns.i|extern.i' >insgt.c
	$(CC) $(LFLAGS) -DINSPECT -O3 -o inspect insgt.c -lm
	strip inspect
	rm insgt.c


$(obj):		Makefile defns.i extern.i


.c.o:
	$(CC) $(CFLAGS) -c $<
@@ -0,0 +1,1110 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Principal routines to check data */
30
+ /* -------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+ Boolean CheckMsg; /* set once the "while checking" header line has been printed for the current attribute, so it is printed at most once */
39
+ CaseNo LowFp, LowLp, HighFp; /* omitted tails: low tail is cases LowFp..LowLp, high tail is cases HighFp..MaxCase (see OmittedCases) */
40
+
41
+
42
+
43
+ /*************************************************************************/
44
+ /* */
45
+ /* Check each attribute in turn */
46
+ /* */
47
+ /*************************************************************************/
48
+
49
+
50
+ void CheckData()
51
+ /* --------- */
52
+ {
53
+ CaseNo i, Fp, Lp;
54
+
55
+ NotifyStage(PRELIM);
56
+
57
+ /* Set up tables, determine discrete value priors, etc. */
58
+
59
+ InitialiseDAC();
60
+
61
+ /* Initialise random numbers for sampling */
62
+
63
+ ResetKR(0);
64
+
65
+ DMINITEMS = CMINITEMS = Max(35, 0.005 * (MaxCase+1));
66
+
67
+ LowFp = 0;
68
+ LowLp = -1;
69
+ HighFp = MaxCase+1;
70
+
71
+ if ( SIFT )
72
+ {
73
+ CheckFile(".sift", true);
74
+ }
75
+
76
+ /* Set ClassAtt to each attribute in turn.
77
+ Check distribution type, remove tails, and perform global check */
78
+
79
+ ForEach(ClassAtt, 1, MaxAtt)
80
+ {
81
+ if ( Exclude(ClassAtt) )
82
+ {
83
+ continue;
84
+ }
85
+
86
+ Verbosity(1, fprintf(Of, "\n==========\n%s\n", AttName[ClassAtt]))
87
+
88
+ Progress(-ClassAtt);
89
+ CheckMsg = false;
90
+
91
+ /* Delete missing values and set SomeMiss[].
92
+ SomeNA[] is set in CheckContin (for continuous atts)
93
+ and in InitialiseDAC (for discrete atts) */
94
+
95
+ Fp = SkipMissing(ClassAtt, 0, MaxCase);
96
+ if ( (SomeMiss[ClassAtt] = ( Fp > 0 )) && ! Skip(ClassAtt) )
97
+ {
98
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
99
+ fprintf(Of, F_ExcludeMissing(Fp));
100
+ CheckMsg = true;
101
+ }
102
+
103
+ if ( Fp < MaxCase )
104
+ {
105
+ TargetSaved = false;
106
+ GEnv.Level = -1;
107
+
108
+ if ( Continuous(ClassAtt) )
109
+ {
110
+ CheckContin(Fp);
111
+ }
112
+ else
113
+ if ( SIFT )
114
+ {
115
+ /* Check for possible entries for sift file */
116
+
117
+ ForEach(i, Fp, MaxCase)
118
+ {
119
+ Case[i][0] = Case[i][ClassAtt];
120
+ }
121
+
122
+ FindDiscrOutliers(Fp, MaxCase, Nil);
123
+ }
124
+
125
+ /* Dump any sift entries */
126
+
127
+ if ( SIFT && GEnv.SiftSize )
128
+ {
129
+ fprintf(Sf, "1 %d\n%s", ClassAtt, GEnv.SiftEntry);
130
+ GEnv.SiftSize = 0;
131
+ }
132
+ }
133
+ }
134
+
135
+
136
+ /* Search for subsets and test */
137
+
138
+ NotifyStage(CHECKING);
139
+
140
+ ForEach(ClassAtt, 1, MaxAtt)
141
+ {
142
+ if ( Skip(ClassAtt) ) continue;
143
+
144
+ Verbosity(1, fprintf(Of, "\n==========\n%s\n", AttName[ClassAtt]))
145
+
146
+ Progress(-ClassAtt);
147
+
148
+ /* Restore original order */
149
+
150
+ memcpy(Case, SaveCase, (MaxCase+1) * sizeof(Description));
151
+
152
+ /* Remove missing values and tails of continuous attributes */
153
+
154
+ Fp = ( SomeMiss[ClassAtt] ? SkipMissing(ClassAtt, 0, MaxCase) : 0 );
155
+ Lp = MaxCase;
156
+
157
+ if ( Continuous(ClassAtt) )
158
+ {
159
+ /* Remove N/A values */
160
+
161
+ if ( SomeNA[ClassAtt] ) Fp = Group(ClassAtt, 1, Fp, Lp, 0, Nil);
162
+
163
+ /* Put low tails low and high tails high */
164
+
165
+ LowFp = Fp;
166
+
167
+ ForEach(i, Fp, Lp)
168
+ {
169
+ if ( CVal(Case[i], ClassAtt) < LowTail[ClassAtt] )
170
+ {
171
+ Swap(i, Fp);
172
+ Fp++;
173
+ }
174
+ else
175
+ if ( CVal(Case[i], ClassAtt) > HighTail[ClassAtt] )
176
+ {
177
+ Swap(i, Lp);
178
+ Lp--;
179
+ i--;
180
+ }
181
+ }
182
+
183
+ LowLp = Fp-1;
184
+ HighFp = Lp+1;
185
+ }
186
+
187
+ if ( Fp > 0 ) Progress(Fp);
188
+
189
+ /* Copy class values */
190
+
191
+ if ( Continuous(ClassAtt) && UseLogs[ClassAtt] )
192
+ {
193
+ ForEach(i, Fp, Lp)
194
+ {
195
+ CClass(Case[i]) = log(CVal(Case[i], ClassAtt));
196
+ }
197
+ }
198
+ else
199
+ {
200
+ ForEach(i, Fp, Lp)
201
+ {
202
+ Case[i][0] = Case[i][ClassAtt];
203
+ }
204
+ }
205
+
206
+ SampleSize = SAMPLEUNIT *
207
+ ( Continuous(ClassAtt) ? 5 :
208
+ SomeNA[ClassAtt] ? MaxAttVal[ClassAtt] :
209
+ MaxAttVal[ClassAtt] - 1 );
210
+ Split(Fp, Lp, 0, Nil, 0, &T);
211
+
212
+ TargetSaved = false;
213
+ LastLevel = -1;
214
+
215
+ ReleaseTree(T, 0);
216
+ T = Nil;
217
+ }
218
+
219
+ if ( SIFT )
220
+ {
221
+ fprintf(Sf, "0\n");
222
+ fclose(Sf);
223
+ Sf = 0;
224
+ }
225
+ }
226
+
227
+
228
+
229
+ /*************************************************************************/
230
+ /* */
231
+ /* Check a continuous attribute */
232
+ /* - decide whether to apply the log transformation */
233
+ /* - exclude possibly multimodal tails */
234
+ /* - check for global outliers */
235
+ /* */
236
+ /*************************************************************************/
237
+
238
+
239
+ void CheckContin(CaseNo Fp)
240
+ /* ----------- */
241
+ {
242
+ CaseNo i, Mid, Quart, Tail, Tp, Lp;
243
+ CaseCount Cases, Middle;
244
+ double R1, R2, Mean, SD, Sum=0, SumSq=0, Cv;
245
+ char CVS[20];
246
+ Boolean LowT=false, HighT=false;
247
+
248
+ /* First discard any non-applicable values */
249
+
250
+ Tp = Fp;
251
+ ForEach(i, Fp, MaxCase)
252
+ {
253
+ if ( NotApplic(Case[i], ClassAtt) )
254
+ {
255
+ Swap(Fp, i);
256
+ Fp++;
257
+ }
258
+ }
259
+
260
+ /* Remember whether there were any */
261
+
262
+ if ( (SomeNA[ClassAtt] = ( Fp > Tp )) && ! Skip(ClassAtt) )
263
+ {
264
+ if ( ! CheckMsg )
265
+ {
266
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
267
+ CheckMsg = true;
268
+ }
269
+ fprintf(Of, F_ExcludeNA(Fp - Tp));
270
+ }
271
+
272
+ if ( Fp > MaxCase - CMINITEMS ) return;
273
+
274
+ Quicksort(Fp, MaxCase, ClassAtt);
275
+
276
+ Mid = (MaxCase + Fp) / 2;
277
+
278
+ /* Check for asymmetry and potential log distribution */
279
+
280
+ Quart = No(Fp, MaxCase) / 4;
281
+
282
+ if ( CVal(Case[Fp], ClassAtt) > Epsilon &&
283
+ CVal(Case[MaxCase-Quart], ClassAtt) > CVal(Case[Mid], ClassAtt) )
284
+ {
285
+ /* R1 is (log(q2)-log(q1)) / (log(q3)-log(q2))
286
+ R2 is (q2-q1) / (q3-q2)
287
+ Choose the log distribution if R2 < 1 and R1 is closer
288
+ to 1 than R2 */
289
+
290
+ R1 = ( log(CVal(Case[Mid], ClassAtt)) -
291
+ log(CVal(Case[Fp+Quart], ClassAtt)) ) /
292
+ ( log(CVal(Case[MaxCase-Quart], ClassAtt)) -
293
+ log(CVal(Case[Mid], ClassAtt)) );
294
+ R2 = (CVal(Case[Mid], ClassAtt) - CVal(Case[Fp+Quart], ClassAtt)) /
295
+ (CVal(Case[MaxCase-Quart], ClassAtt) - CVal(Case[Mid], ClassAtt));
296
+
297
+ UseLogs[ClassAtt] = R2 < 1 && fabs(R1-1) < fabs(R2-1);
298
+ if ( UseLogs[ClassAtt] )
299
+ {
300
+ Verbosity(1, fprintf(Of, " Using log distribution\n"))
301
+
302
+ if ( SIFT )
303
+ {
304
+ fprintf(Sf, "2 %d\n", ClassAtt);
305
+ }
306
+ }
307
+ }
308
+ else
309
+ {
310
+ UseLogs[ClassAtt] = false;
311
+ }
312
+
313
+ /* That's all that needs to be done for non-included attributes */
314
+
315
+ if ( Skip(ClassAtt) ) return;
316
+
317
+ /* Load the appropriate values into the class */
318
+
319
+ if ( UseLogs[ClassAtt] )
320
+ {
321
+ ForEach(i, Fp, MaxCase)
322
+ {
323
+ CClass(Case[i]) = log(CVal(Case[i], ClassAtt));
324
+ }
325
+ }
326
+ else
327
+ {
328
+ ForEach(i, Fp, MaxCase)
329
+ {
330
+ CClass(Case[i]) = CVal(Case[i], ClassAtt);
331
+ }
332
+ }
333
+
334
+ /* Check for multimodal tails and exclude */
335
+
336
+ Lp = MaxCase;
337
+ Cases = No(Fp, Lp);
338
+ Tail = MaxAnoms(Cases);
339
+
340
+ /* Estimate SD from the central half of the data and adjust; if
341
+ this is impossible (too many repeated values), mark the
342
+ attribute as skipped */
343
+
344
+ if ( CClass(Case[Fp + Quart + Tail]) < CClass(Case[Lp - Quart - Tail]) )
345
+ {
346
+ ForEach(i, Fp + Quart, Lp - Quart)
347
+ {
348
+ Sum += (Cv = CClass(Case[i]));
349
+ SumSq += Cv * Cv;
350
+ }
351
+ Mean = Sum / (Middle = No(Fp, Lp) - 2 * Quart);
352
+ SD = 2.5 * SDEstimate(Middle, Sum, SumSq);
353
+ }
354
+ else
355
+ {
356
+ /* This is not really a continuous distribution -- at least
357
+ half of the cases have identical values. Skip it */
358
+
359
+ if ( ! CheckMsg )
360
+ {
361
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
362
+ CheckMsg = true;
363
+ }
364
+
365
+ fprintf(Of, F_TooManyIdentical);
366
+ SpecialStatus[ClassAtt] |= SKIP;
367
+ return;
368
+ }
369
+
370
+ /* Look for multimodal low tail */
371
+
372
+ for ( Tp = Fp ; Tp < Mid && ZScore(Tp) >= MAXTAIL ; Tp++ )
373
+ ;
374
+
375
+ if ( Tp - Fp > Tail )
376
+ {
377
+ if ( ! CheckMsg )
378
+ {
379
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
380
+ CheckMsg = true;
381
+ }
382
+
383
+ CValToStr(CVal(Case[Tp], ClassAtt), ClassAtt, CVS);
384
+ fprintf(Of, F_LowTail(Tp - Fp, CVS));
385
+ Fp = Tp;
386
+ LowT = true;
387
+ }
388
+
389
+ /* Ditto multimodal high tail */
390
+
391
+ for ( Tp = Lp ; Tp > Mid && ZScore(Tp) >= MAXTAIL ; Tp-- )
392
+ ;
393
+
394
+ if ( Lp - Tp > Tail )
395
+ {
396
+ if ( ! CheckMsg )
397
+ {
398
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
399
+ CheckMsg = true;
400
+ }
401
+
402
+ CValToStr(CVal(Case[Tp], ClassAtt), ClassAtt, CVS);
403
+ fprintf(Of, F_HighTail(Lp - Tp, CVS));
404
+ Lp = Tp;
405
+ HighT = true;
406
+ }
407
+
408
+ /* Record tail information */
409
+
410
+ LowTail[ClassAtt] = CVal(Case[Fp], ClassAtt);
411
+ HighTail[ClassAtt] = CVal(Case[Lp], ClassAtt);
412
+
413
+ if ( SIFT && ( LowT || HighT ) )
414
+ {
415
+ fprintf(Sf, "3 %d %.8g %.8g\n", ClassAtt,
416
+ ( LowT ? CVal(Case[Fp], ClassAtt) : -MAXFLOAT ),
417
+ ( HighT ? CVal(Case[Lp], ClassAtt) : MAXFLOAT ) );
418
+ }
419
+
420
+ /* Carry out global check on remaining cases */
421
+
422
+ FindContinOutliers(Fp, Lp, true);
423
+ }
424
+
425
+
426
+
427
+ /*************************************************************************/
428
+ /* */
429
+ /* Check continuous values of ClassAtt for cases Fp to Lp */
430
+ /* */
431
+ /*************************************************************************/
432
+
433
+
434
+ void FindContinOutliers(CaseNo Fp, CaseNo Lp, Boolean Sorted)
435
+ /* ------------------ */
436
+ {
437
+ CaseNo Tail, Cases, LowTp=-1, HighTp=-1, GFp, i;
438
+ double Mean, SD, LowFrac, HighFrac, LowLim, HighLim;
439
+ Clust CLow=Nil, CHigh=Nil;
440
+ Boolean SavedCluster=false;
441
+
442
+ Cases = No(Fp, Lp);
443
+ if ( Cases < CMINITEMS ) return;
444
+
445
+ if ( ! Sorted )
446
+ {
447
+ Quicksort(Fp, Lp, ClassAtt);
448
+ }
449
+
450
+ TrimmedSDEstimate(Fp, Lp, &Mean, &SD);
451
+
452
+ /* Check low and high tails. A tail is anomalous if
453
+ * it does not contain too many cases
454
+ * the case before the tail has a Z-score <= MAXNORM
455
+ * there is a gap of at least MINABNORM - MAXNORM */
456
+
457
+ Tail = MaxAnoms(Cases);
458
+
459
+ LowTp = FindTail(Fp, Fp + Tail, 1, Mean, SD);
460
+ HighTp = FindTail(Lp, Lp - Tail, -1, Mean, SD);
461
+
462
+ if ( SIFT )
463
+ {
464
+ /* See whether we need to save this cluster for low or high test */
465
+
466
+ if ( LowTp >= 0 )
467
+ {
468
+ LowFrac = No(LowTp+1, Lp) / (double) Cases;
469
+ LowLim = CVal(Case[LowTp+1], ClassAtt);
470
+ }
471
+ else
472
+ {
473
+ LowFrac = LowLim = 0;
474
+ }
475
+
476
+ if ( HighTp > 0 )
477
+ {
478
+ HighFrac = No(Fp, HighTp-1) / (double) Cases;
479
+ HighLim = CVal(Case[HighTp-1], ClassAtt);
480
+ }
481
+ else
482
+ {
483
+ HighFrac = HighLim = 0;
484
+ }
485
+
486
+ if ( LowFrac > 0 || HighFrac > 0 )
487
+ {
488
+ SaveContinCluster(Mean, SD, Cases,
489
+ LowFrac, LowLim, HighFrac, HighLim);
490
+
491
+ SavedCluster = true;
492
+ }
493
+ }
494
+
495
+
496
+ if ( LowTp >= 0 || HighTp > 0 )
497
+ {
498
+ GFp = ( LowTp >= 0 ? LowTp+1 : Fp );
499
+
500
+ if ( LowTp >= 0 )
501
+ {
502
+ CLow = NewClust(Mean, SD,
503
+ CVal(Case[LowTp+1], ClassAtt),
504
+ No(Fp, LowTp), Cases);
505
+ }
506
+
507
+ if ( HighTp > 0 )
508
+ {
509
+ CHigh = NewClust(Mean, SD,
510
+ CVal(Case[HighTp-1], ClassAtt),
511
+ No(HighTp, Lp), Cases);
512
+
513
+ /* Move all anomalies to the front */
514
+
515
+ ForEach(i, HighTp, Lp)
516
+ {
517
+ Swap(i, GFp);
518
+ GFp++;
519
+ }
520
+ }
521
+
522
+ LabelContinOutliers(CLow, CHigh, Fp, GFp, Lp);
523
+ }
524
+
525
+ /* Clusters may have caveats discovered during LabelContinOutliers */
526
+
527
+ if ( SavedCluster )
528
+ {
529
+ ExtendSiftEntry("\n");
530
+ }
531
+ }
532
+
533
+
534
+
535
+ /*************************************************************************/
536
+ /* */
537
+ /* Note cases Fp to GFp-1 as outliers wrt values GFp to GLp of */
538
+ /* ClassAtt whose mean and SD are given */
539
+ /* */
540
+ /*************************************************************************/
541
+
542
+
543
+ void LabelContinOutliers(Clust CL, Clust CH, CaseNo Fp, CaseNo GFp, CaseNo GLp)
544
+ /* ------------------- */
545
+ {
546
+ CaseNo i;
547
+ double Z, Mean, SD, X;
548
+ Clust C, OldC;
549
+
550
+ C = ( CL ? CL : CH ); /* either will do since mean is the same */
551
+
552
+ Mean = C->Expect;
553
+ SD = C->SD;
554
+
555
+ /* Remove cases that already have a more interesting recorded
556
+ anomalous value */
557
+
558
+ ForEach(i, Fp, GFp-1)
559
+ {
560
+ /* Use Chebychev bounds to approximate certainty that this
561
+ case is an outlier */
562
+
563
+ Z = ZScore(i);
564
+ X = 1 / (Z * Z);
565
+
566
+ if ( (OldC = OutClust(Case[i])) &&
567
+ ( C->NCond > OldC->NCond ||
568
+ C->NCond == OldC->NCond && X >= OutXVal(Case[i]) ) )
569
+ {
570
+ Swap(i, Fp);
571
+ Fp++;
572
+ }
573
+ }
574
+
575
+ /* Remove possible anomalies that are non consistent with the
576
+ ordinary cases */
577
+
578
+ Fp = NoOtherDifference(Fp, GFp-1, GFp, GLp);
579
+
580
+ /* Finally, record remaining cases */
581
+
582
+ ForEach(i, Fp, GFp-1)
583
+ {
584
+ Z = ZScore(i);
585
+ Verbosity(1,
586
+ fprintf(Of, "****\tpotential outlier %g (%.1f sd) %s\n",
587
+ CVal(Case[i], ClassAtt), Z,
588
+ ( LabelAtt ? SVal(Case[i], LabelAtt) : "" )))
589
+
590
+ RecordOutlier(i, ( CClass(Case[i]) < Mean ? CL : CH ), 1 / (Z * Z));
591
+ }
592
+ }
593
+
594
+
595
+
596
+ /*************************************************************************/
597
+ /* */
598
+ /* Robust estimator of mean and SD. */
599
+ /* Idea: exclude high/low tails of data, and adjust computed */
600
+ /* mean and SD heuristically. */
601
+ /* Note: unknown and N/A values must be removed and cases must be */
602
+ /* sorted by ClassAtt before calling TrimmedSDEstimate. */
603
+ /* */
604
+ /*************************************************************************/
605
+
606
+
607
+ void TrimmedSDEstimate(CaseNo Fp, CaseNo Lp, double *Mean, double *SD)
608
+ /* ----------------- */
609
+ {
610
+ CaseNo i, Tail, Cases, Quart;
611
+ double Val, Sum=0, SumSq=0;
612
+
613
+ /* Set defaults */
614
+
615
+ *Mean = 0;
616
+ *SD = 1E38;
617
+
618
+ Tail = MaxAnoms(No(Fp, Lp));
619
+ Cases = No(Fp, Lp) - 2 * Tail;
620
+ Quart = No(Fp, Lp) / 4;
621
+
622
+ /* Don't try to estimate if too many values are the same */
623
+
624
+ if ( CClass(Case[Fp+Quart]) == CClass(Case[Lp-Quart]) )
625
+ {
626
+ return;
627
+ }
628
+
629
+ ForEach(i, Fp+Tail, Lp-Tail)
630
+ {
631
+ if ( NotApplic(Case[i], ClassAtt) )
632
+ {
633
+ Cases--;
634
+ }
635
+ else
636
+ {
637
+ Val = CClass(Case[i]);
638
+ Sum += Val;
639
+ SumSq += Val * Val;
640
+ }
641
+ }
642
+
643
+ if ( Cases < Tail )
644
+ {
645
+ return;
646
+ }
647
+
648
+ /* If there are N cases (excluding non-applicables) then adjust
649
+ SD by factor (N + Tail) / (N - Tail) */
650
+
651
+ *Mean = Sum / Cases;
652
+ *SD = SDEstimate(Cases, Sum, SumSq) *
653
+ (Cases + 3.0 * Tail ) / (Cases + Tail);
654
+ }
655
+
656
+
657
+
658
+ /*************************************************************************/
659
+ /* */
660
+ /* Find a tail containing potential anomalies between Fp and Lp-1. */
661
+ /* * case Lp must have a Z-score <= MAXNORM */
662
+ /* * there must be a gap >= MINABNORM-MAXNORM between the anomalous */
663
+ /* and non-anomalous values */
664
+ /* * the cluster cannot contain cases from omitted tails */
665
+ /* */
666
+ /*************************************************************************/
667
+
668
+
669
+ CaseNo FindTail(CaseNo Fp, CaseNo Lp, int I, double Mean, double SD)
670
+ /* -------- */
671
+ {
672
+ CaseNo i;
673
+ double Z;
674
+
675
+ if ( ZScore(Lp) > MAXNORM ) return -1;
676
+
677
+ /* Find the first gap */
678
+
679
+ for ( i = Lp ; i * I > Fp * I && (Z = ZScore(i)) <= MINABNORM ; i -= I )
680
+ {
681
+ if ( ZScore(i - I) - Z >= MINABNORM - MAXNORM )
682
+ {
683
+ break;
684
+ }
685
+ }
686
+
687
+ return ( Z > MINABNORM ? -1 : OmittedCases(I) ? -1 : i - I );
688
+ }
689
+
690
+
691
+
692
+ /*************************************************************************/
693
+ /* */
694
+ /* Check whether the current cluster includes cases from the */
695
+ /* excluded high/low tails */
696
+ /* */
697
+ /*************************************************************************/
698
+
699
+
700
+ Boolean OmittedCases(int HiLo)
701
+ /* ------------ */
702
+ {
703
+ CaseNo Fp, Lp;
704
+ CaseNo i;
705
+
706
+ if ( HiLo > 0 )
707
+ {
708
+ Fp = LowFp;
709
+ Lp = LowLp;
710
+ }
711
+ else
712
+ {
713
+ Fp = HighFp;
714
+ Lp = MaxCase;
715
+ }
716
+
717
+ ForEach(i, Fp, Lp)
718
+ {
719
+ if ( SatisfiesTests(Case[i]) )
720
+ {
721
+ return true;
722
+ }
723
+ }
724
+
725
+ return false;
726
+ }
727
+
728
+
729
+
730
+ /*************************************************************************/
731
+ /* */
732
+ /* See whether a case satisfies all current tests */
733
+ /* */
734
+ /*************************************************************************/
735
+
736
+
737
+ Boolean SatisfiesTests(Description Case)
738
+ /* -------------- */
739
+ {
740
+ Attribute Att;
741
+ DiscrValue Br;
742
+ int i;
743
+
744
+ ForEach(i, 0, GEnv.Level)
745
+ {
746
+ Att = GEnv.Test[i].Att;
747
+ Br = GEnv.Test[i].Br;
748
+
749
+ if ( Unknown(Case, Att) )
750
+ {
751
+ return false;
752
+ }
753
+ else
754
+ if ( Br == 1 )
755
+ {
756
+ if ( ! NotApplic(Case, Att) ) return false;
757
+ }
758
+ else
759
+ if ( NotApplic(Case, Att) )
760
+ {
761
+ return false;
762
+ }
763
+ else
764
+ if ( Continuous(Att) )
765
+ {
766
+ if ( ( Br == 2 ) != ( CVal(Case, Att) <= GEnv.Test[i].Cut ) )
767
+ {
768
+ return false;
769
+ }
770
+ }
771
+ else
772
+ if ( Ordered(Att) )
773
+ {
774
+ if ( ( Br == 2 ) != ( DVal(Case, Att) <= GEnv.Test[i].Cut ) )
775
+ {
776
+ return false;
777
+ }
778
+ }
779
+ else
780
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
781
+ {
782
+ if ( ( Br == 2 ) != ( In(DVal(Case, Att), GEnv.Test[i].Left) != 0 ) )
783
+ {
784
+ return false;
785
+ }
786
+ }
787
+ else
788
+ if ( Br != DVal(Case, Att) )
789
+ {
790
+ return false;
791
+ }
792
+ }
793
+
794
+ return true;
795
+ }
796
+
797
+
798
+
799
+ /*************************************************************************/
800
+ /* */
801
+ /* Check discrete values of ClassAtt for cases Fp to Lp */
802
+ /* Idea: anomaly will appear as an odd case in nearly-pure subset */
803
+ /* */
804
+ /*************************************************************************/
805
+
806
+
807
+ void FindDiscrOutliers(CaseNo Fp, CaseNo Lp, CaseCount *Table)
808
+ /* ----------------- */
809
+ {
810
+ DiscrValue v, Majority=1;
811
+ CaseNo i, GFp, GLp;
812
+ CaseCount Cases, Anoms;
813
+ double X;
814
+ Clust C, OldC;
815
+ Boolean SomeSurprise=false, NeedCluster;
816
+
817
+ Cases = No(Fp, Lp);
818
+ if ( Cases < DMINITEMS ) return;
819
+
820
+ if ( ! Table )
821
+ {
822
+ FindClassFrequencies(Fp, Lp);
823
+ Table = GEnv.ClassFreq;
824
+ }
825
+
826
+ ForEach(v, 2, MaxAttVal[ClassAtt])
827
+ {
828
+ if ( Table[v] > 0 )
829
+ {
830
+ if ( ! Majority || Table[v] > Table[Majority] )
831
+ {
832
+ Majority = v;
833
+ }
834
+ }
835
+ }
836
+
837
+ /* Skip if too many anomalies */
838
+
839
+ Anoms = Cases - Table[Majority];
840
+
841
+ if ( Anoms > MaxAnoms(Cases) )
842
+ {
843
+ return;
844
+ }
845
+
846
+ /* Check whether any non-majority class is surprising */
847
+
848
+ for ( v = 1 ; ! SomeSurprise && v <= MaxAttVal[ClassAtt] ; v++ )
849
+ {
850
+ if ( v == Majority ) continue;
851
+
852
+ X = XDScore(Table[v], Cases, Anoms, Prior[ClassAtt][v]);
853
+ SomeSurprise = ( X <= 1.0 / (MINABNORM * MINABNORM) );
854
+ }
855
+ if ( ! SomeSurprise ) return;
856
+
857
+ if ( SIFT )
858
+ {
859
+ SaveDiscrCluster(Majority, Anoms, Cases, Table);
860
+ }
861
+
862
+ if ( ! Anoms )
863
+ {
864
+ if ( SIFT )
865
+ {
866
+ ExtendSiftEntry("\n");
867
+ }
868
+
869
+ return;
870
+ }
871
+
872
+ /* Need a new cluster if surprising non-zero frequencies */
873
+
874
+ NeedCluster = ( Table[--v] > 0 );
875
+ while ( ! NeedCluster && ++v <= MaxAttVal[ClassAtt] )
876
+ {
877
+ if ( v == Majority || ! Table[v] ) continue;
878
+
879
+ X = XDScore(Table[v], Cases, Anoms, Prior[ClassAtt][v]);
880
+ NeedCluster = ( X <= 1.0 / (MINABNORM * MINABNORM) );
881
+ }
882
+
883
+ if ( NeedCluster )
884
+ {
885
+ /* Move all majority-class cases to the front */
886
+
887
+ GFp = Fp;
888
+ GLp = (Fp = Group(ClassAtt, Majority, Fp, Lp, 0.0, Nil)) - 1;
889
+
890
+ C = NewClust(Majority, 0.0, 0.0, Anoms, Cases);
891
+
892
+ /* Remove cases whose surprise value is insufficient or
893
+ that already have a more interesting recorded anomalous value */
894
+
895
+ ForEach(i, Fp, Lp)
896
+ {
897
+ v = DClass(Case[i]);
898
+ X = DScore(Cases, Anoms, Prior[ClassAtt][v]);
899
+
900
+ if ( X > 1.0 / (MINABNORM * MINABNORM) ||
901
+ (OldC = OutClust(Case[i])) &&
902
+ ( C->NCond > OldC->NCond ||
903
+ C->NCond == OldC->NCond && X > OutXVal(Case[i]) ) )
904
+ {
905
+ Swap(i, Fp);
906
+ Fp++;
907
+ }
908
+ }
909
+
910
+ /* Remove possible anomalies that are non consistent with the
911
+ ordinary cases */
912
+
913
+ Fp = NoOtherDifference(Fp, Lp, GFp, GLp);
914
+
915
+ /* Finally, record remaining cases */
916
+
917
+ if ( Fp <= Lp )
918
+ {
919
+ ForEach(i, Fp, Lp)
920
+ {
921
+ v = DClass(Case[i]);
922
+ X = DScore(Cases, Anoms, Prior[ClassAtt][v]);
923
+
924
+ Verbosity(1,
925
+ fprintf(Of, "****\tpotential outlier %s (p=%.3f) %s\n",
926
+ AttValName[ClassAtt][DClass(Case[i])], X,
927
+ ( LabelAtt ? SVal(Case[i], LabelAtt) : "" )))
928
+
929
+ RecordOutlier(i, C, X);
930
+ }
931
+ }
932
+ }
933
+
934
+ if ( SIFT && SomeSurprise )
935
+ {
936
+ ExtendSiftEntry("\n");
937
+ }
938
+ }
939
+
940
+
941
+
942
+ /*************************************************************************/
943
+ /* */
944
+ /* Cases Fp through Lp have been identified as potential anoms */
945
+ /* in the cluster whose "normal" cases are GFp through GLp. */
946
+ /* Discard potential anomalies that appear to be inconsistent */
947
+ /* with the normals on some other attribute. */
948
+ /* If SIFT is set, record any caveats for the current cluster */
949
+ /* */
950
+ /*************************************************************************/
951
+
952
+
953
+ CaseNo NoOtherDifference(CaseNo Fp, CaseNo Lp, CaseNo GFp, CaseNo GLp)
954
+ /* ----------------- */
955
+ {
956
+ Attribute Att;
957
+ double Sum, SumSq, Mean, SD, CV;
958
+ CaseNo i, Cases, GCases;
959
+ DiscrValue v;
960
+ Boolean Caveat;
961
+ int Bytes;
962
+ char SE[100];
963
+
964
+ if ( GEnv.Level < 0 ||
965
+ Fp > Lp || (GCases = No(GFp, GLp)) < MINCONTEXT ) return Fp;
966
+
967
+ /* Use a sample if there are many normal cases */
968
+
969
+ if ( GCases > MaxDiscrVal * SAMPLEUNIT )
970
+ {
971
+ GCases = 0.5 * MaxDiscrVal * SAMPLEUNIT;
972
+ Sample(GFp, GLp, GCases);
973
+ GLp = GFp + GCases - 1;
974
+ }
975
+
976
+ ForEach(Att, 1, MaxAtt)
977
+ {
978
+ if ( Att == ClassAtt || Exclude(Att) ) continue;
979
+
980
+ if ( Fp > Lp ) return Fp;
981
+
982
+ Caveat = false;
983
+
984
+ if ( Continuous(Att) )
985
+ {
986
+ /* Find mean and variance of ordinary cases */
987
+
988
+ Sum = SumSq = Cases = 0;
989
+ ForEach(i, GFp, GLp)
990
+ {
991
+ if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )
992
+ {
993
+ CV = ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
994
+ CVal(Case[i], Att) );
995
+ Sum += CV;
996
+ SumSq += CV * CV;
997
+ Cases++;
998
+ }
999
+ }
1000
+
1001
+ /* Check that sufficient cases to give reliable SD */
1002
+
1003
+ if ( Cases >= MINCONTEXT )
1004
+ {
1005
+ Mean = Sum / Cases;
1006
+ SD = SDEstimate(Cases, Sum, SumSq);
1007
+
1008
+ /* Move filtered cases to the front */
1009
+
1010
+ ForEach (i, Fp, Lp)
1011
+ {
1012
+ if ( ! Unknown(Case[i], Att) &&
1013
+ ! NotApplic(Case[i], Att) &&
1014
+ fabs(Mean -
1015
+ ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
1016
+ CVal(Case[i], Att) ))
1017
+ / SD > MAXNORM )
1018
+ {
1019
+ Verbosity(2,
1020
+ fprintf(Of, "\t %d: difference %s %.2f SD\n",
1021
+ i, AttName[Att],
1022
+ (Mean -
1023
+ ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
1024
+ CVal(Case[i], Att) )) / SD))
1025
+ Swap(i, Fp);
1026
+ Fp++;
1027
+
1028
+ /* Record possible caveat */
1029
+
1030
+ if ( SIFT && ! Caveat )
1031
+ {
1032
+ Caveat = true;
1033
+
1034
+ sprintf(SE, " %d", Att);
1035
+ ExtendSiftEntry(SE);
1036
+ if ( UseLogs[Att] )
1037
+ {
1038
+ sprintf(SE, " %.8g %.8g",
1039
+ exp(Mean - MAXNORM * SD),
1040
+ exp(Mean + MAXNORM * SD));
1041
+ }
1042
+ else
1043
+ {
1044
+ sprintf(SE, " %.8g %.8g",
1045
+ Mean - MAXNORM * SD,
1046
+ Mean + MAXNORM * SD);
1047
+ }
1048
+
1049
+ ExtendSiftEntry(SE);
1050
+ }
1051
+ }
1052
+ }
1053
+ }
1054
+ }
1055
+ else
1056
+ {
1057
+ /* Discrete attribute
1058
+ NB: This doesn't differentiate between ordered and unordered
1059
+ discrete attributes -- perhaps it should */
1060
+
1061
+ ForEach(v, 0, MaxAttVal[Att])
1062
+ {
1063
+ GEnv.ValFreq[v] = 0;
1064
+ }
1065
+
1066
+ ForEach(i, GFp, GLp)
1067
+ {
1068
+ GEnv.ValFreq[XDVal(Case[i], Att)]++;
1069
+ }
1070
+
1071
+ /* A discrete attribute value is judged to inconsistent with
1072
+ the normals if its Laplace probability in the normals is
1073
+ less than 0.025 and its prior greater than 0.25 */
1074
+
1075
+ Bytes = (MaxAttVal[Att]>>3) + 1;
1076
+ ClearBits(Bytes, GEnv.Subset[0]);
1077
+
1078
+ ForEach(i, Fp, Lp)
1079
+ {
1080
+ v = XDVal(Case[i], Att);
1081
+ if ( Prior[Att][v] >= 0.25 &&
1082
+ (GEnv.ValFreq[v] + 1) / (double) (GCases + 2) < 0.025L )
1083
+ {
1084
+ Verbosity(2,
1085
+ fprintf(Of, "\t %d: difference %s=%s (%d/%d)\n",
1086
+ i, AttName[Att], AttValName[Att][v],
1087
+ GEnv.ValFreq[v], GCases))
1088
+ Swap(i, Fp);
1089
+ Fp++;
1090
+
1091
+ SetBit(v, GEnv.Subset[0]);
1092
+ Caveat = true;
1093
+ }
1094
+ }
1095
+
1096
+ if ( SIFT && Caveat )
1097
+ {
1098
+ sprintf(SE, " %d", Att);
1099
+ ExtendSiftEntry(SE);
1100
+ ForEach(v, 0, Bytes-1)
1101
+ {
1102
+ sprintf(SE, " %x", GEnv.Subset[0][v]);
1103
+ ExtendSiftEntry(SE);
1104
+ }
1105
+ }
1106
+ }
1107
+ }
1108
+
1109
+ return Fp;
1110
+ }