see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
@@ -0,0 +1,98 @@
1
#*************************************************************************#
#*									 *#
#*		Makefile for GritBot					 *#
#*		--------------------					 *#
#*									 *#
#*  Targets:								 *#
#*    all	  build the production gritbot and inspect programs	 *#
#*    gritbot	  production analyser (single concatenated source)	 *#
#*    inspect	  production inspector (compiled with -DINSPECT)	 *#
#*    gritbotdbg  debug analyser built from separate object files	 *#
#*    inspectdbg  debug inspector					 *#
#*									 *#
#*************************************************************************#


CC	= gcc -ffloat-store
CFLAGS	= -DVerbOpt -g -Wall -O0
LFLAGS	= $(S)

# The recipes below use only POSIX shell syntax, so do not require csh
# (which is absent from many systems)
SHELL	= /bin/sh


#	Definitions of file sets

src =\
	global.c\
	cluster.c\
	continatt.c\
	outlier.c\
	getdata.c\
	gritbot.c\
	sort.c\
	discratt.c\
	check.c\
	common.c\
	getnames.c\
	implicitatt.c\
	modelfiles.c\
	update.c\
	utility.c


isrc =\
	inspect.c\
	cluster.c\
	outlier.c\
	getdata.c\
	getnames.c\
	implicitatt.c\
	modelfiles.c\
	update.c\
	common.c\
	utility.c


obj =\
	 global.o\
	 gritbot.o\
	 getdata.o getnames.o implicitatt.o\
	 check.o cluster.o outlier.o\
	 common.o continatt.o discratt.o\
	 modelfiles.o\
	 sort.o utility.o update.o


# "all" names no file; express it through prerequisites rather than
# by invoking a literal "make" recursively

.PHONY: all

all:	gritbot inspect


# debug version (including verbosity option)

gritbotdbg:\
	$(obj) defns.i text.i extern.i Makefile
	$(CC) -DVerbOpt -g -o gritbotdbg $(obj) -lm

inspectdbg:\
	$(isrc) defns.i text.i Makefile
	cat defns.i $(isrc)\
		| grep -E -v 'defns\.i|extern\.i' >insgt.c
	$(CC) $(CFLAGS) -DVerbOpt -DINSPECT -o inspectdbg insgt.c -lm


# production versions
#
# All sources are concatenated behind defns.i into one translation unit;
# lines mentioning defns.i or extern.i (the per-file includes) are
# filtered out of the concatenation.

gritbot:\
	$(src) defns.i text.i Makefile
	cat defns.i $(src)\
		| grep -E -v 'defns\.i|extern\.i' >gbotgt.c
	$(CC) $(LFLAGS) -O3 -o gritbot gbotgt.c -lm
	strip gritbot
	rm gbotgt.c

inspect:\
	$(isrc) defns.i text.i Makefile
	cat defns.i $(isrc)\
		| grep -E -v 'defns\.i|extern\.i' >insgt.c
	$(CC) $(LFLAGS) -DINSPECT -O3 -o inspect insgt.c -lm
	strip inspect
	rm insgt.c


$(obj):		Makefile defns.i extern.i


.c.o:
	$(CC) $(CFLAGS) -c $<
@@ -0,0 +1,1110 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of GritBot GPL Edition, a single-threaded version */
6
+ /* of GritBot release 2.01. */
7
+ /* */
8
+ /* GritBot GPL Edition is free software: you can redistribute it */
9
+ /* and/or modify it under the terms of the GNU General Public License */
10
+ /* as published by the Free Software Foundation, either version 3 of */
11
+ /* the License, or (at your option) any later version. */
12
+ /* */
13
+ /* GritBot GPL Edition is distributed in the hope that it will be */
14
+ /* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
15
+ /* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16
+ /* GNU General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with GritBot GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Principal routines to check data */
30
+ /* -------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+ Boolean CheckMsg; /* message printed for current attribute */
39
+ CaseNo LowFp, LowLp, HighFp; /* pointers to omitted low/high tails */
40
+
41
+
42
+
43
+ /*************************************************************************/
44
+ /* */
45
+ /* Check each attribute in turn */
46
+ /* */
47
+ /*************************************************************************/
48
+
49
+
50
+ void CheckData()
51
+ /* --------- */
52
+ {
53
+ CaseNo i, Fp, Lp;
54
+
55
+ NotifyStage(PRELIM);
56
+
57
+ /* Set up tables, determine discrete value priors, etc. */
58
+
59
+ InitialiseDAC();
60
+
61
+ /* Initialise random numbers for sampling */
62
+
63
+ ResetKR(0);
64
+
65
+ DMINITEMS = CMINITEMS = Max(35, 0.005 * (MaxCase+1));
66
+
67
+ LowFp = 0;
68
+ LowLp = -1;
69
+ HighFp = MaxCase+1;
70
+
71
+ if ( SIFT )
72
+ {
73
+ CheckFile(".sift", true);
74
+ }
75
+
76
+ /* Set ClassAtt to each attribute in turn.
77
+ Check distribution type, remove tails, and perform global check */
78
+
79
+ ForEach(ClassAtt, 1, MaxAtt)
80
+ {
81
+ if ( Exclude(ClassAtt) )
82
+ {
83
+ continue;
84
+ }
85
+
86
+ Verbosity(1, fprintf(Of, "\n==========\n%s\n", AttName[ClassAtt]))
87
+
88
+ Progress(-ClassAtt);
89
+ CheckMsg = false;
90
+
91
+ /* Delete missing values and set SomeMiss[].
92
+ SomeNA[] is set in CheckContin (for continuous atts)
93
+ and in InitialiseDAC (for discrete atts) */
94
+
95
+ Fp = SkipMissing(ClassAtt, 0, MaxCase);
96
+ if ( (SomeMiss[ClassAtt] = ( Fp > 0 )) && ! Skip(ClassAtt) )
97
+ {
98
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
99
+ fprintf(Of, F_ExcludeMissing(Fp));
100
+ CheckMsg = true;
101
+ }
102
+
103
+ if ( Fp < MaxCase )
104
+ {
105
+ TargetSaved = false;
106
+ GEnv.Level = -1;
107
+
108
+ if ( Continuous(ClassAtt) )
109
+ {
110
+ CheckContin(Fp);
111
+ }
112
+ else
113
+ if ( SIFT )
114
+ {
115
+ /* Check for possible entries for sift file */
116
+
117
+ ForEach(i, Fp, MaxCase)
118
+ {
119
+ Case[i][0] = Case[i][ClassAtt];
120
+ }
121
+
122
+ FindDiscrOutliers(Fp, MaxCase, Nil);
123
+ }
124
+
125
+ /* Dump any sift entries */
126
+
127
+ if ( SIFT && GEnv.SiftSize )
128
+ {
129
+ fprintf(Sf, "1 %d\n%s", ClassAtt, GEnv.SiftEntry);
130
+ GEnv.SiftSize = 0;
131
+ }
132
+ }
133
+ }
134
+
135
+
136
+ /* Search for subsets and test */
137
+
138
+ NotifyStage(CHECKING);
139
+
140
+ ForEach(ClassAtt, 1, MaxAtt)
141
+ {
142
+ if ( Skip(ClassAtt) ) continue;
143
+
144
+ Verbosity(1, fprintf(Of, "\n==========\n%s\n", AttName[ClassAtt]))
145
+
146
+ Progress(-ClassAtt);
147
+
148
+ /* Restore original order */
149
+
150
+ memcpy(Case, SaveCase, (MaxCase+1) * sizeof(Description));
151
+
152
+ /* Remove missing values and tails of continuous attributes */
153
+
154
+ Fp = ( SomeMiss[ClassAtt] ? SkipMissing(ClassAtt, 0, MaxCase) : 0 );
155
+ Lp = MaxCase;
156
+
157
+ if ( Continuous(ClassAtt) )
158
+ {
159
+ /* Remove N/A values */
160
+
161
+ if ( SomeNA[ClassAtt] ) Fp = Group(ClassAtt, 1, Fp, Lp, 0, Nil);
162
+
163
+ /* Put low tails low and high tails high */
164
+
165
+ LowFp = Fp;
166
+
167
+ ForEach(i, Fp, Lp)
168
+ {
169
+ if ( CVal(Case[i], ClassAtt) < LowTail[ClassAtt] )
170
+ {
171
+ Swap(i, Fp);
172
+ Fp++;
173
+ }
174
+ else
175
+ if ( CVal(Case[i], ClassAtt) > HighTail[ClassAtt] )
176
+ {
177
+ Swap(i, Lp);
178
+ Lp--;
179
+ i--;
180
+ }
181
+ }
182
+
183
+ LowLp = Fp-1;
184
+ HighFp = Lp+1;
185
+ }
186
+
187
+ if ( Fp > 0 ) Progress(Fp);
188
+
189
+ /* Copy class values */
190
+
191
+ if ( Continuous(ClassAtt) && UseLogs[ClassAtt] )
192
+ {
193
+ ForEach(i, Fp, Lp)
194
+ {
195
+ CClass(Case[i]) = log(CVal(Case[i], ClassAtt));
196
+ }
197
+ }
198
+ else
199
+ {
200
+ ForEach(i, Fp, Lp)
201
+ {
202
+ Case[i][0] = Case[i][ClassAtt];
203
+ }
204
+ }
205
+
206
+ SampleSize = SAMPLEUNIT *
207
+ ( Continuous(ClassAtt) ? 5 :
208
+ SomeNA[ClassAtt] ? MaxAttVal[ClassAtt] :
209
+ MaxAttVal[ClassAtt] - 1 );
210
+ Split(Fp, Lp, 0, Nil, 0, &T);
211
+
212
+ TargetSaved = false;
213
+ LastLevel = -1;
214
+
215
+ ReleaseTree(T, 0);
216
+ T = Nil;
217
+ }
218
+
219
+ if ( SIFT )
220
+ {
221
+ fprintf(Sf, "0\n");
222
+ fclose(Sf);
223
+ Sf = 0;
224
+ }
225
+ }
226
+
227
+
228
+
229
+ /*************************************************************************/
230
+ /* */
231
+ /* Check a continuous attribute */
232
+ /* - decide whether to apply the log transformation */
233
+ /* - exclude possibly multimodal tails */
234
+ /* - check for global outliers */
235
+ /* */
236
+ /*************************************************************************/
237
+
238
+
239
+ void CheckContin(CaseNo Fp)
240
+ /* ----------- */
241
+ {
242
+ CaseNo i, Mid, Quart, Tail, Tp, Lp;
243
+ CaseCount Cases, Middle;
244
+ double R1, R2, Mean, SD, Sum=0, SumSq=0, Cv;
245
+ char CVS[20];
246
+ Boolean LowT=false, HighT=false;
247
+
248
+ /* First discard any non-applicable values */
249
+
250
+ Tp = Fp;
251
+ ForEach(i, Fp, MaxCase)
252
+ {
253
+ if ( NotApplic(Case[i], ClassAtt) )
254
+ {
255
+ Swap(Fp, i);
256
+ Fp++;
257
+ }
258
+ }
259
+
260
+ /* Remember whether there were any */
261
+
262
+ if ( (SomeNA[ClassAtt] = ( Fp > Tp )) && ! Skip(ClassAtt) )
263
+ {
264
+ if ( ! CheckMsg )
265
+ {
266
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
267
+ CheckMsg = true;
268
+ }
269
+ fprintf(Of, F_ExcludeNA(Fp - Tp));
270
+ }
271
+
272
+ if ( Fp > MaxCase - CMINITEMS ) return;
273
+
274
+ Quicksort(Fp, MaxCase, ClassAtt);
275
+
276
+ Mid = (MaxCase + Fp) / 2;
277
+
278
+ /* Check for asymmetry and potential log distribution */
279
+
280
+ Quart = No(Fp, MaxCase) / 4;
281
+
282
+ if ( CVal(Case[Fp], ClassAtt) > Epsilon &&
283
+ CVal(Case[MaxCase-Quart], ClassAtt) > CVal(Case[Mid], ClassAtt) )
284
+ {
285
+ /* R1 is (log(q2)-log(q1)) / (log(q3)-log(q2))
286
+ R2 is (q2-q1) / (q3-q2)
287
+ Choose the log distribution if R2 < 1 and R1 is closer
288
+ to 1 than R2 */
289
+
290
+ R1 = ( log(CVal(Case[Mid], ClassAtt)) -
291
+ log(CVal(Case[Fp+Quart], ClassAtt)) ) /
292
+ ( log(CVal(Case[MaxCase-Quart], ClassAtt)) -
293
+ log(CVal(Case[Mid], ClassAtt)) );
294
+ R2 = (CVal(Case[Mid], ClassAtt) - CVal(Case[Fp+Quart], ClassAtt)) /
295
+ (CVal(Case[MaxCase-Quart], ClassAtt) - CVal(Case[Mid], ClassAtt));
296
+
297
+ UseLogs[ClassAtt] = R2 < 1 && fabs(R1-1) < fabs(R2-1);
298
+ if ( UseLogs[ClassAtt] )
299
+ {
300
+ Verbosity(1, fprintf(Of, " Using log distribution\n"))
301
+
302
+ if ( SIFT )
303
+ {
304
+ fprintf(Sf, "2 %d\n", ClassAtt);
305
+ }
306
+ }
307
+ }
308
+ else
309
+ {
310
+ UseLogs[ClassAtt] = false;
311
+ }
312
+
313
+ /* That's all that needs to be done for non-included attributes */
314
+
315
+ if ( Skip(ClassAtt) ) return;
316
+
317
+ /* Load the appropriate values into the class */
318
+
319
+ if ( UseLogs[ClassAtt] )
320
+ {
321
+ ForEach(i, Fp, MaxCase)
322
+ {
323
+ CClass(Case[i]) = log(CVal(Case[i], ClassAtt));
324
+ }
325
+ }
326
+ else
327
+ {
328
+ ForEach(i, Fp, MaxCase)
329
+ {
330
+ CClass(Case[i]) = CVal(Case[i], ClassAtt);
331
+ }
332
+ }
333
+
334
+ /* Check for multimodal tails and exclude */
335
+
336
+ Lp = MaxCase;
337
+ Cases = No(Fp, Lp);
338
+ Tail = MaxAnoms(Cases);
339
+
340
+ /* Estimate SD from the central half of the data and adjust; if
341
+ this is impossible (too many repeated values), mark the
342
+ attribute as skipped */
343
+
344
+ if ( CClass(Case[Fp + Quart + Tail]) < CClass(Case[Lp - Quart - Tail]) )
345
+ {
346
+ ForEach(i, Fp + Quart, Lp - Quart)
347
+ {
348
+ Sum += (Cv = CClass(Case[i]));
349
+ SumSq += Cv * Cv;
350
+ }
351
+ Mean = Sum / (Middle = No(Fp, Lp) - 2 * Quart);
352
+ SD = 2.5 * SDEstimate(Middle, Sum, SumSq);
353
+ }
354
+ else
355
+ {
356
+ /* This is not really a continuous distribution -- at least
357
+ half of the cases have identical values. Skip it */
358
+
359
+ if ( ! CheckMsg )
360
+ {
361
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
362
+ CheckMsg = true;
363
+ }
364
+
365
+ fprintf(Of, F_TooManyIdentical);
366
+ SpecialStatus[ClassAtt] |= SKIP;
367
+ return;
368
+ }
369
+
370
+ /* Look for multimodal low tail */
371
+
372
+ for ( Tp = Fp ; Tp < Mid && ZScore(Tp) >= MAXTAIL ; Tp++ )
373
+ ;
374
+
375
+ if ( Tp - Fp > Tail )
376
+ {
377
+ if ( ! CheckMsg )
378
+ {
379
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
380
+ CheckMsg = true;
381
+ }
382
+
383
+ CValToStr(CVal(Case[Tp], ClassAtt), ClassAtt, CVS);
384
+ fprintf(Of, F_LowTail(Tp - Fp, CVS));
385
+ Fp = Tp;
386
+ LowT = true;
387
+ }
388
+
389
+ /* Ditto multimodal high tail */
390
+
391
+ for ( Tp = Lp ; Tp > Mid && ZScore(Tp) >= MAXTAIL ; Tp-- )
392
+ ;
393
+
394
+ if ( Lp - Tp > Tail )
395
+ {
396
+ if ( ! CheckMsg )
397
+ {
398
+ fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
399
+ CheckMsg = true;
400
+ }
401
+
402
+ CValToStr(CVal(Case[Tp], ClassAtt), ClassAtt, CVS);
403
+ fprintf(Of, F_HighTail(Lp - Tp, CVS));
404
+ Lp = Tp;
405
+ HighT = true;
406
+ }
407
+
408
+ /* Record tail information */
409
+
410
+ LowTail[ClassAtt] = CVal(Case[Fp], ClassAtt);
411
+ HighTail[ClassAtt] = CVal(Case[Lp], ClassAtt);
412
+
413
+ if ( SIFT && ( LowT || HighT ) )
414
+ {
415
+ fprintf(Sf, "3 %d %.8g %.8g\n", ClassAtt,
416
+ ( LowT ? CVal(Case[Fp], ClassAtt) : -MAXFLOAT ),
417
+ ( HighT ? CVal(Case[Lp], ClassAtt) : MAXFLOAT ) );
418
+ }
419
+
420
+ /* Carry out global check on remaining cases */
421
+
422
+ FindContinOutliers(Fp, Lp, true);
423
+ }
424
+
425
+
426
+
427
+ /*************************************************************************/
428
+ /* */
429
+ /* Check continuous values of ClassAtt for cases Fp to Lp */
430
+ /* */
431
+ /*************************************************************************/
432
+
433
+
434
+ void FindContinOutliers(CaseNo Fp, CaseNo Lp, Boolean Sorted)
435
+ /* ------------------ */
436
+ {
437
+ CaseNo Tail, Cases, LowTp=-1, HighTp=-1, GFp, i;
438
+ double Mean, SD, LowFrac, HighFrac, LowLim, HighLim;
439
+ Clust CLow=Nil, CHigh=Nil;
440
+ Boolean SavedCluster=false;
441
+
442
+ Cases = No(Fp, Lp);
443
+ if ( Cases < CMINITEMS ) return;
444
+
445
+ if ( ! Sorted )
446
+ {
447
+ Quicksort(Fp, Lp, ClassAtt);
448
+ }
449
+
450
+ TrimmedSDEstimate(Fp, Lp, &Mean, &SD);
451
+
452
+ /* Check low and high tails. A tail is anomalous if
453
+ * it does not contain too many cases
454
+ * the case before the tail has a Z-score <= MAXNORM
455
+ * there is a gap of at least MINABNORM - MAXNORM */
456
+
457
+ Tail = MaxAnoms(Cases);
458
+
459
+ LowTp = FindTail(Fp, Fp + Tail, 1, Mean, SD);
460
+ HighTp = FindTail(Lp, Lp - Tail, -1, Mean, SD);
461
+
462
+ if ( SIFT )
463
+ {
464
+ /* See whether we need to save this cluster for low or high test */
465
+
466
+ if ( LowTp >= 0 )
467
+ {
468
+ LowFrac = No(LowTp+1, Lp) / (double) Cases;
469
+ LowLim = CVal(Case[LowTp+1], ClassAtt);
470
+ }
471
+ else
472
+ {
473
+ LowFrac = LowLim = 0;
474
+ }
475
+
476
+ if ( HighTp > 0 )
477
+ {
478
+ HighFrac = No(Fp, HighTp-1) / (double) Cases;
479
+ HighLim = CVal(Case[HighTp-1], ClassAtt);
480
+ }
481
+ else
482
+ {
483
+ HighFrac = HighLim = 0;
484
+ }
485
+
486
+ if ( LowFrac > 0 || HighFrac > 0 )
487
+ {
488
+ SaveContinCluster(Mean, SD, Cases,
489
+ LowFrac, LowLim, HighFrac, HighLim);
490
+
491
+ SavedCluster = true;
492
+ }
493
+ }
494
+
495
+
496
+ if ( LowTp >= 0 || HighTp > 0 )
497
+ {
498
+ GFp = ( LowTp >= 0 ? LowTp+1 : Fp );
499
+
500
+ if ( LowTp >= 0 )
501
+ {
502
+ CLow = NewClust(Mean, SD,
503
+ CVal(Case[LowTp+1], ClassAtt),
504
+ No(Fp, LowTp), Cases);
505
+ }
506
+
507
+ if ( HighTp > 0 )
508
+ {
509
+ CHigh = NewClust(Mean, SD,
510
+ CVal(Case[HighTp-1], ClassAtt),
511
+ No(HighTp, Lp), Cases);
512
+
513
+ /* Move all anomalies to the front */
514
+
515
+ ForEach(i, HighTp, Lp)
516
+ {
517
+ Swap(i, GFp);
518
+ GFp++;
519
+ }
520
+ }
521
+
522
+ LabelContinOutliers(CLow, CHigh, Fp, GFp, Lp);
523
+ }
524
+
525
+ /* Clusters may have caveats discovered during LabelContinOutliers */
526
+
527
+ if ( SavedCluster )
528
+ {
529
+ ExtendSiftEntry("\n");
530
+ }
531
+ }
532
+
533
+
534
+
535
+ /*************************************************************************/
536
+ /* */
537
+ /* Note cases Fp to GFp-1 as outliers wrt values GFp to GLp of */
538
+ /* ClassAtt whose mean and SD are given */
539
+ /* */
540
+ /*************************************************************************/
541
+
542
+
543
+ void LabelContinOutliers(Clust CL, Clust CH, CaseNo Fp, CaseNo GFp, CaseNo GLp)
544
+ /* ------------------- */
545
+ {
546
+ CaseNo i;
547
+ double Z, Mean, SD, X;
548
+ Clust C, OldC;
549
+
550
+ C = ( CL ? CL : CH ); /* either will do since mean is the same */
551
+
552
+ Mean = C->Expect;
553
+ SD = C->SD;
554
+
555
+ /* Remove cases that already have a more interesting recorded
556
+ anomalous value */
557
+
558
+ ForEach(i, Fp, GFp-1)
559
+ {
560
+ /* Use Chebychev bounds to approximate certainty that this
561
+ case is an outlier */
562
+
563
+ Z = ZScore(i);
564
+ X = 1 / (Z * Z);
565
+
566
+ if ( (OldC = OutClust(Case[i])) &&
567
+ ( C->NCond > OldC->NCond ||
568
+ C->NCond == OldC->NCond && X >= OutXVal(Case[i]) ) )
569
+ {
570
+ Swap(i, Fp);
571
+ Fp++;
572
+ }
573
+ }
574
+
575
+ /* Remove possible anomalies that are non consistent with the
576
+ ordinary cases */
577
+
578
+ Fp = NoOtherDifference(Fp, GFp-1, GFp, GLp);
579
+
580
+ /* Finally, record remaining cases */
581
+
582
+ ForEach(i, Fp, GFp-1)
583
+ {
584
+ Z = ZScore(i);
585
+ Verbosity(1,
586
+ fprintf(Of, "****\tpotential outlier %g (%.1f sd) %s\n",
587
+ CVal(Case[i], ClassAtt), Z,
588
+ ( LabelAtt ? SVal(Case[i], LabelAtt) : "" )))
589
+
590
+ RecordOutlier(i, ( CClass(Case[i]) < Mean ? CL : CH ), 1 / (Z * Z));
591
+ }
592
+ }
593
+
594
+
595
+
596
+ /*************************************************************************/
597
+ /* */
598
+ /* Robust estimator of mean and SD. */
599
+ /* Idea: exclude high/low tails of data, and adjust computed */
600
+ /* mean and SD heuristically. */
601
+ /* Note: unknown and N/A values must be removed and cases must be */
602
+ /* sorted by ClassAtt before calling TrimmedSDEstimate. */
603
+ /* */
604
+ /*************************************************************************/
605
+
606
+
607
+ void TrimmedSDEstimate(CaseNo Fp, CaseNo Lp, double *Mean, double *SD)
608
+ /* ----------------- */
609
+ {
610
+ CaseNo i, Tail, Cases, Quart;
611
+ double Val, Sum=0, SumSq=0;
612
+
613
+ /* Set defaults */
614
+
615
+ *Mean = 0;
616
+ *SD = 1E38;
617
+
618
+ Tail = MaxAnoms(No(Fp, Lp));
619
+ Cases = No(Fp, Lp) - 2 * Tail;
620
+ Quart = No(Fp, Lp) / 4;
621
+
622
+ /* Don't try to estimate if too many values are the same */
623
+
624
+ if ( CClass(Case[Fp+Quart]) == CClass(Case[Lp-Quart]) )
625
+ {
626
+ return;
627
+ }
628
+
629
+ ForEach(i, Fp+Tail, Lp-Tail)
630
+ {
631
+ if ( NotApplic(Case[i], ClassAtt) )
632
+ {
633
+ Cases--;
634
+ }
635
+ else
636
+ {
637
+ Val = CClass(Case[i]);
638
+ Sum += Val;
639
+ SumSq += Val * Val;
640
+ }
641
+ }
642
+
643
+ if ( Cases < Tail )
644
+ {
645
+ return;
646
+ }
647
+
648
+ /* If there are N cases (excluding non-applicables) then adjust
649
+ SD by factor (N + Tail) / (N - Tail) */
650
+
651
+ *Mean = Sum / Cases;
652
+ *SD = SDEstimate(Cases, Sum, SumSq) *
653
+ (Cases + 3.0 * Tail ) / (Cases + Tail);
654
+ }
655
+
656
+
657
+
658
+ /*************************************************************************/
659
+ /* */
660
+ /* Find a tail containing potential anomalies between Fp and Lp-1. */
661
+ /* * case Lp must have a Z-score <= MAXNORM */
662
+ /* * there must be a gap >= MINABNORM-MAXNORM between the anomalous */
663
+ /* and non-anomalous values */
664
+ /* * the cluster cannot contain cases from omitted tails */
665
+ /* */
666
+ /*************************************************************************/
667
+
668
+
669
+ CaseNo FindTail(CaseNo Fp, CaseNo Lp, int I, double Mean, double SD)
670
+ /* -------- */
671
+ {
672
+ CaseNo i;
673
+ double Z;
674
+
675
+ if ( ZScore(Lp) > MAXNORM ) return -1;
676
+
677
+ /* Find the first gap */
678
+
679
+ for ( i = Lp ; i * I > Fp * I && (Z = ZScore(i)) <= MINABNORM ; i -= I )
680
+ {
681
+ if ( ZScore(i - I) - Z >= MINABNORM - MAXNORM )
682
+ {
683
+ break;
684
+ }
685
+ }
686
+
687
+ return ( Z > MINABNORM ? -1 : OmittedCases(I) ? -1 : i - I );
688
+ }
689
+
690
+
691
+
692
+ /*************************************************************************/
693
+ /* */
694
+ /* Check whether the current cluster includes cases from the */
695
+ /* excluded high/low tails */
696
+ /* */
697
+ /*************************************************************************/
698
+
699
+
700
+ Boolean OmittedCases(int HiLo)
701
+ /* ------------ */
702
+ {
703
+ CaseNo Fp, Lp;
704
+ CaseNo i;
705
+
706
+ if ( HiLo > 0 )
707
+ {
708
+ Fp = LowFp;
709
+ Lp = LowLp;
710
+ }
711
+ else
712
+ {
713
+ Fp = HighFp;
714
+ Lp = MaxCase;
715
+ }
716
+
717
+ ForEach(i, Fp, Lp)
718
+ {
719
+ if ( SatisfiesTests(Case[i]) )
720
+ {
721
+ return true;
722
+ }
723
+ }
724
+
725
+ return false;
726
+ }
727
+
728
+
729
+
730
+ /*************************************************************************/
731
+ /* */
732
+ /* See whether a case satisfies all current tests */
733
+ /* */
734
+ /*************************************************************************/
735
+
736
+
737
+ Boolean SatisfiesTests(Description Case)
738
+ /* -------------- */
739
+ {
740
+ Attribute Att;
741
+ DiscrValue Br;
742
+ int i;
743
+
744
+ ForEach(i, 0, GEnv.Level)
745
+ {
746
+ Att = GEnv.Test[i].Att;
747
+ Br = GEnv.Test[i].Br;
748
+
749
+ if ( Unknown(Case, Att) )
750
+ {
751
+ return false;
752
+ }
753
+ else
754
+ if ( Br == 1 )
755
+ {
756
+ if ( ! NotApplic(Case, Att) ) return false;
757
+ }
758
+ else
759
+ if ( NotApplic(Case, Att) )
760
+ {
761
+ return false;
762
+ }
763
+ else
764
+ if ( Continuous(Att) )
765
+ {
766
+ if ( ( Br == 2 ) != ( CVal(Case, Att) <= GEnv.Test[i].Cut ) )
767
+ {
768
+ return false;
769
+ }
770
+ }
771
+ else
772
+ if ( Ordered(Att) )
773
+ {
774
+ if ( ( Br == 2 ) != ( DVal(Case, Att) <= GEnv.Test[i].Cut ) )
775
+ {
776
+ return false;
777
+ }
778
+ }
779
+ else
780
+ if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
781
+ {
782
+ if ( ( Br == 2 ) != ( In(DVal(Case, Att), GEnv.Test[i].Left) != 0 ) )
783
+ {
784
+ return false;
785
+ }
786
+ }
787
+ else
788
+ if ( Br != DVal(Case, Att) )
789
+ {
790
+ return false;
791
+ }
792
+ }
793
+
794
+ return true;
795
+ }
796
+
797
+
798
+
799
+ /*************************************************************************/
800
+ /* */
801
+ /* Check discrete values of ClassAtt for cases Fp to Lp */
802
+ /* Idea: anomaly will appear as an odd case in nearly-pure subset */
803
+ /* */
804
+ /*************************************************************************/
805
+
806
+
807
+ void FindDiscrOutliers(CaseNo Fp, CaseNo Lp, CaseCount *Table)
808
+ /* ----------------- */
809
+ {
810
+ DiscrValue v, Majority=1;
811
+ CaseNo i, GFp, GLp;
812
+ CaseCount Cases, Anoms;
813
+ double X;
814
+ Clust C, OldC;
815
+ Boolean SomeSurprise=false, NeedCluster;
816
+
817
+ Cases = No(Fp, Lp);
818
+ if ( Cases < DMINITEMS ) return;
819
+
820
+ if ( ! Table )
821
+ {
822
+ FindClassFrequencies(Fp, Lp);
823
+ Table = GEnv.ClassFreq;
824
+ }
825
+
826
+ ForEach(v, 2, MaxAttVal[ClassAtt])
827
+ {
828
+ if ( Table[v] > 0 )
829
+ {
830
+ if ( ! Majority || Table[v] > Table[Majority] )
831
+ {
832
+ Majority = v;
833
+ }
834
+ }
835
+ }
836
+
837
+ /* Skip if too many anomalies */
838
+
839
+ Anoms = Cases - Table[Majority];
840
+
841
+ if ( Anoms > MaxAnoms(Cases) )
842
+ {
843
+ return;
844
+ }
845
+
846
+ /* Check whether any non-majority class is surprising */
847
+
848
+ for ( v = 1 ; ! SomeSurprise && v <= MaxAttVal[ClassAtt] ; v++ )
849
+ {
850
+ if ( v == Majority ) continue;
851
+
852
+ X = XDScore(Table[v], Cases, Anoms, Prior[ClassAtt][v]);
853
+ SomeSurprise = ( X <= 1.0 / (MINABNORM * MINABNORM) );
854
+ }
855
+ if ( ! SomeSurprise ) return;
856
+
857
+ if ( SIFT )
858
+ {
859
+ SaveDiscrCluster(Majority, Anoms, Cases, Table);
860
+ }
861
+
862
+ if ( ! Anoms )
863
+ {
864
+ if ( SIFT )
865
+ {
866
+ ExtendSiftEntry("\n");
867
+ }
868
+
869
+ return;
870
+ }
871
+
872
+ /* Need a new cluster if surprising non-zero frequencies */
873
+
874
+ NeedCluster = ( Table[--v] > 0 );
875
+ while ( ! NeedCluster && ++v <= MaxAttVal[ClassAtt] )
876
+ {
877
+ if ( v == Majority || ! Table[v] ) continue;
878
+
879
+ X = XDScore(Table[v], Cases, Anoms, Prior[ClassAtt][v]);
880
+ NeedCluster = ( X <= 1.0 / (MINABNORM * MINABNORM) );
881
+ }
882
+
883
+ if ( NeedCluster )
884
+ {
885
+ /* Move all majority-class cases to the front */
886
+
887
+ GFp = Fp;
888
+ GLp = (Fp = Group(ClassAtt, Majority, Fp, Lp, 0.0, Nil)) - 1;
889
+
890
+ C = NewClust(Majority, 0.0, 0.0, Anoms, Cases);
891
+
892
+ /* Remove cases whose surprise value is insufficient or
893
+ that already have a more interesting recorded anomalous value */
894
+
895
+ ForEach(i, Fp, Lp)
896
+ {
897
+ v = DClass(Case[i]);
898
+ X = DScore(Cases, Anoms, Prior[ClassAtt][v]);
899
+
900
+ if ( X > 1.0 / (MINABNORM * MINABNORM) ||
901
+ (OldC = OutClust(Case[i])) &&
902
+ ( C->NCond > OldC->NCond ||
903
+ C->NCond == OldC->NCond && X > OutXVal(Case[i]) ) )
904
+ {
905
+ Swap(i, Fp);
906
+ Fp++;
907
+ }
908
+ }
909
+
910
+ /* Remove possible anomalies that are non consistent with the
911
+ ordinary cases */
912
+
913
+ Fp = NoOtherDifference(Fp, Lp, GFp, GLp);
914
+
915
+ /* Finally, record remaining cases */
916
+
917
+ if ( Fp <= Lp )
918
+ {
919
+ ForEach(i, Fp, Lp)
920
+ {
921
+ v = DClass(Case[i]);
922
+ X = DScore(Cases, Anoms, Prior[ClassAtt][v]);
923
+
924
+ Verbosity(1,
925
+ fprintf(Of, "****\tpotential outlier %s (p=%.3f) %s\n",
926
+ AttValName[ClassAtt][DClass(Case[i])], X,
927
+ ( LabelAtt ? SVal(Case[i], LabelAtt) : "" )))
928
+
929
+ RecordOutlier(i, C, X);
930
+ }
931
+ }
932
+ }
933
+
934
+ if ( SIFT && SomeSurprise )
935
+ {
936
+ ExtendSiftEntry("\n");
937
+ }
938
+ }
939
+
940
+
941
+
942
+ /*************************************************************************/
943
+ /* */
944
+ /* Cases Fp through Lp have been identified as potential anoms */
945
+ /* in the cluster whose "normal" cases are GFp through GLp. */
946
+ /* Discard potential anomalies that appear to be inconsistent */
947
+ /* with the normals on some other attribute. */
948
+ /* If SIFT is set, record any caveats for the current cluster */
949
+ /* */
950
+ /*************************************************************************/
951
+
952
+
953
+ CaseNo NoOtherDifference(CaseNo Fp, CaseNo Lp, CaseNo GFp, CaseNo GLp)
954
+ /* ----------------- */
955
+ {
956
+ Attribute Att;
957
+ double Sum, SumSq, Mean, SD, CV;
958
+ CaseNo i, Cases, GCases;
959
+ DiscrValue v;
960
+ Boolean Caveat;
961
+ int Bytes;
962
+ char SE[100];
963
+
964
+ if ( GEnv.Level < 0 ||
965
+ Fp > Lp || (GCases = No(GFp, GLp)) < MINCONTEXT ) return Fp;
966
+
967
+ /* Use a sample if there are many normal cases */
968
+
969
+ if ( GCases > MaxDiscrVal * SAMPLEUNIT )
970
+ {
971
+ GCases = 0.5 * MaxDiscrVal * SAMPLEUNIT;
972
+ Sample(GFp, GLp, GCases);
973
+ GLp = GFp + GCases - 1;
974
+ }
975
+
976
+ ForEach(Att, 1, MaxAtt)
977
+ {
978
+ if ( Att == ClassAtt || Exclude(Att) ) continue;
979
+
980
+ if ( Fp > Lp ) return Fp;
981
+
982
+ Caveat = false;
983
+
984
+ if ( Continuous(Att) )
985
+ {
986
+ /* Find mean and variance of ordinary cases */
987
+
988
+ Sum = SumSq = Cases = 0;
989
+ ForEach(i, GFp, GLp)
990
+ {
991
+ if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )
992
+ {
993
+ CV = ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
994
+ CVal(Case[i], Att) );
995
+ Sum += CV;
996
+ SumSq += CV * CV;
997
+ Cases++;
998
+ }
999
+ }
1000
+
1001
+ /* Check that sufficient cases to give reliable SD */
1002
+
1003
+ if ( Cases >= MINCONTEXT )
1004
+ {
1005
+ Mean = Sum / Cases;
1006
+ SD = SDEstimate(Cases, Sum, SumSq);
1007
+
1008
+ /* Move filtered cases to the front */
1009
+
1010
+ ForEach (i, Fp, Lp)
1011
+ {
1012
+ if ( ! Unknown(Case[i], Att) &&
1013
+ ! NotApplic(Case[i], Att) &&
1014
+ fabs(Mean -
1015
+ ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
1016
+ CVal(Case[i], Att) ))
1017
+ / SD > MAXNORM )
1018
+ {
1019
+ Verbosity(2,
1020
+ fprintf(Of, "\t %d: difference %s %.2f SD\n",
1021
+ i, AttName[Att],
1022
+ (Mean -
1023
+ ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
1024
+ CVal(Case[i], Att) )) / SD))
1025
+ Swap(i, Fp);
1026
+ Fp++;
1027
+
1028
+ /* Record possible caveat */
1029
+
1030
+ if ( SIFT && ! Caveat )
1031
+ {
1032
+ Caveat = true;
1033
+
1034
+ sprintf(SE, " %d", Att);
1035
+ ExtendSiftEntry(SE);
1036
+ if ( UseLogs[Att] )
1037
+ {
1038
+ sprintf(SE, " %.8g %.8g",
1039
+ exp(Mean - MAXNORM * SD),
1040
+ exp(Mean + MAXNORM * SD));
1041
+ }
1042
+ else
1043
+ {
1044
+ sprintf(SE, " %.8g %.8g",
1045
+ Mean - MAXNORM * SD,
1046
+ Mean + MAXNORM * SD);
1047
+ }
1048
+
1049
+ ExtendSiftEntry(SE);
1050
+ }
1051
+ }
1052
+ }
1053
+ }
1054
+ }
1055
+ else
1056
+ {
1057
+ /* Discrete attribute
1058
+ NB: This doesn't differentiate between ordered and unordered
1059
+ discrete attributes -- perhaps it should */
1060
+
1061
+ ForEach(v, 0, MaxAttVal[Att])
1062
+ {
1063
+ GEnv.ValFreq[v] = 0;
1064
+ }
1065
+
1066
+ ForEach(i, GFp, GLp)
1067
+ {
1068
+ GEnv.ValFreq[XDVal(Case[i], Att)]++;
1069
+ }
1070
+
1071
+ /* A discrete attribute value is judged to inconsistent with
1072
+ the normals if its Laplace probability in the normals is
1073
+ less than 0.025 and its prior greater than 0.25 */
1074
+
1075
+ Bytes = (MaxAttVal[Att]>>3) + 1;
1076
+ ClearBits(Bytes, GEnv.Subset[0]);
1077
+
1078
+ ForEach(i, Fp, Lp)
1079
+ {
1080
+ v = XDVal(Case[i], Att);
1081
+ if ( Prior[Att][v] >= 0.25 &&
1082
+ (GEnv.ValFreq[v] + 1) / (double) (GCases + 2) < 0.025L )
1083
+ {
1084
+ Verbosity(2,
1085
+ fprintf(Of, "\t %d: difference %s=%s (%d/%d)\n",
1086
+ i, AttName[Att], AttValName[Att][v],
1087
+ GEnv.ValFreq[v], GCases))
1088
+ Swap(i, Fp);
1089
+ Fp++;
1090
+
1091
+ SetBit(v, GEnv.Subset[0]);
1092
+ Caveat = true;
1093
+ }
1094
+ }
1095
+
1096
+ if ( SIFT && Caveat )
1097
+ {
1098
+ sprintf(SE, " %d", Att);
1099
+ ExtendSiftEntry(SE);
1100
+ ForEach(v, 0, Bytes-1)
1101
+ {
1102
+ sprintf(SE, " %x", GEnv.Subset[0][v]);
1103
+ ExtendSiftEntry(SE);
1104
+ }
1105
+ }
1106
+ }
1107
+ }
1108
+
1109
+ return Fp;
1110
+ }