see5-installer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
#*************************************************************************#
|
2
|
+
#* *#
|
3
|
+
#* Makefile for GritBot *#
|
4
|
+
#* -------------------- *#
|
5
|
+
#* *#
|
6
|
+
#*************************************************************************#
|
7
|
+
|
8
|
+
|
9
|
+
CC	= gcc -ffloat-store
CFLAGS = -DVerbOpt -g -Wall -O0
LFLAGS = $(S)

# The recipes below use nothing beyond plain pipes and redirection, so the
# POSIX shell is sufficient; csh is frequently not installed and its absence
# would make every recipe fail.
SHELL  = /bin/sh


#	Definitions of file sets

src =\
	global.c\
	cluster.c\
	continatt.c\
	outlier.c\
	getdata.c\
	gritbot.c\
	sort.c\
	discratt.c\
	check.c\
	common.c\
	getnames.c\
	implicitatt.c\
	modelfiles.c\
	update.c\
	utility.c


isrc =\
	inspect.c\
	cluster.c\
	outlier.c\
	getdata.c\
	getnames.c\
	implicitatt.c\
	modelfiles.c\
	update.c\
	common.c\
	utility.c


obj =\
	 global.o\
	 gritbot.o\
	 getdata.o getnames.o implicitatt.o\
	 check.o cluster.o outlier.o\
	 common.o continatt.o discratt.o\
	 modelfiles.o\
	 sort.o utility.o update.o


# "all" names no file; $(MAKE) passes command-line flags and jobserver
# settings on to the sub-makes.

.PHONY: all

all:
	$(MAKE) gritbot
	$(MAKE) inspect


# debug version (including verbosity option)

gritbotdbg:\
	$(obj) defns.i text.i extern.i Makefile
	$(CC) -DVerbOpt -g -o gritbotdbg $(obj) -lm

inspectdbg:\
	$(isrc) defns.i text.i Makefile
	cat defns.i $(isrc)\
		| grep -E -v 'defns.i|extern.i' >insgt.c
	$(CC) $(CFLAGS) -DVerbOpt -DINSPECT -o inspectdbg insgt.c -lm

# production versions
#
# All sources are concatenated behind defns.i into a single translation
# unit; the grep drops every line mentioning defns.i or extern.i so that
# the per-file #include lines do not survive into the combined file.

gritbot:\
	$(src) defns.i text.i Makefile
	cat defns.i $(src)\
		| grep -E -v 'defns.i|extern.i' >gbotgt.c
	$(CC) $(LFLAGS) -O3 -o gritbot gbotgt.c -lm
	strip gritbot
	rm gbotgt.c

inspect:\
	$(isrc) defns.i text.i Makefile
	cat defns.i $(isrc)\
		| grep -E -v 'defns.i|extern.i' >insgt.c
	$(CC) $(LFLAGS) -DINSPECT -O3 -o inspect insgt.c -lm
	strip inspect
	rm insgt.c


$(obj):		Makefile defns.i extern.i


.c.o:
	$(CC) $(CFLAGS) -c $<
|
data/ext/gritbot/check.c
ADDED
@@ -0,0 +1,1110 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of GritBot GPL Edition, a single-threaded version */
|
6
|
+
/* of GritBot release 2.01. */
|
7
|
+
/* */
|
8
|
+
/* GritBot GPL Edition is free software: you can redistribute it */
|
9
|
+
/* and/or modify it under the terms of the GNU General Public License */
|
10
|
+
/* as published by the Free Software Foundation, either version 3 of */
|
11
|
+
/* the License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* GritBot GPL Edition is distributed in the hope that it will be */
|
14
|
+
/* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
|
15
|
+
/* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
16
|
+
/* GNU General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with GritBot GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Principal routines to check data */
|
30
|
+
/* -------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
Boolean CheckMsg; /* message printed for current attribute */
|
39
|
+
CaseNo LowFp, LowLp, HighFp; /* pointers to omitted low/high tails */
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
/*************************************************************************/
|
44
|
+
/* */
|
45
|
+
/* Check each attribute in turn */
|
46
|
+
/* */
|
47
|
+
/*************************************************************************/
|
48
|
+
|
49
|
+
|
50
|
+
void CheckData()
|
51
|
+
/* --------- */
|
52
|
+
{
|
53
|
+
CaseNo i, Fp, Lp;
|
54
|
+
|
55
|
+
NotifyStage(PRELIM);
|
56
|
+
|
57
|
+
/* Set up tables, determine discrete value priors, etc. */
|
58
|
+
|
59
|
+
InitialiseDAC();
|
60
|
+
|
61
|
+
/* Initialise random numbers for sampling */
|
62
|
+
|
63
|
+
ResetKR(0);
|
64
|
+
|
65
|
+
DMINITEMS = CMINITEMS = Max(35, 0.005 * (MaxCase+1));
|
66
|
+
|
67
|
+
LowFp = 0;
|
68
|
+
LowLp = -1;
|
69
|
+
HighFp = MaxCase+1;
|
70
|
+
|
71
|
+
if ( SIFT )
|
72
|
+
{
|
73
|
+
CheckFile(".sift", true);
|
74
|
+
}
|
75
|
+
|
76
|
+
/* Set ClassAtt to each attribute in turn.
|
77
|
+
Check distribution type, remove tails, and perform global check */
|
78
|
+
|
79
|
+
ForEach(ClassAtt, 1, MaxAtt)
|
80
|
+
{
|
81
|
+
if ( Exclude(ClassAtt) )
|
82
|
+
{
|
83
|
+
continue;
|
84
|
+
}
|
85
|
+
|
86
|
+
Verbosity(1, fprintf(Of, "\n==========\n%s\n", AttName[ClassAtt]))
|
87
|
+
|
88
|
+
Progress(-ClassAtt);
|
89
|
+
CheckMsg = false;
|
90
|
+
|
91
|
+
/* Delete missing values and set SomeMiss[].
|
92
|
+
SomeNA[] is set in CheckContin (for continuous atts)
|
93
|
+
and in InitialiseDAC (for discrete atts) */
|
94
|
+
|
95
|
+
Fp = SkipMissing(ClassAtt, 0, MaxCase);
|
96
|
+
if ( (SomeMiss[ClassAtt] = ( Fp > 0 )) && ! Skip(ClassAtt) )
|
97
|
+
{
|
98
|
+
fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
|
99
|
+
fprintf(Of, F_ExcludeMissing(Fp));
|
100
|
+
CheckMsg = true;
|
101
|
+
}
|
102
|
+
|
103
|
+
if ( Fp < MaxCase )
|
104
|
+
{
|
105
|
+
TargetSaved = false;
|
106
|
+
GEnv.Level = -1;
|
107
|
+
|
108
|
+
if ( Continuous(ClassAtt) )
|
109
|
+
{
|
110
|
+
CheckContin(Fp);
|
111
|
+
}
|
112
|
+
else
|
113
|
+
if ( SIFT )
|
114
|
+
{
|
115
|
+
/* Check for possible entries for sift file */
|
116
|
+
|
117
|
+
ForEach(i, Fp, MaxCase)
|
118
|
+
{
|
119
|
+
Case[i][0] = Case[i][ClassAtt];
|
120
|
+
}
|
121
|
+
|
122
|
+
FindDiscrOutliers(Fp, MaxCase, Nil);
|
123
|
+
}
|
124
|
+
|
125
|
+
/* Dump any sift entries */
|
126
|
+
|
127
|
+
if ( SIFT && GEnv.SiftSize )
|
128
|
+
{
|
129
|
+
fprintf(Sf, "1 %d\n%s", ClassAtt, GEnv.SiftEntry);
|
130
|
+
GEnv.SiftSize = 0;
|
131
|
+
}
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
|
136
|
+
/* Search for subsets and test */
|
137
|
+
|
138
|
+
NotifyStage(CHECKING);
|
139
|
+
|
140
|
+
ForEach(ClassAtt, 1, MaxAtt)
|
141
|
+
{
|
142
|
+
if ( Skip(ClassAtt) ) continue;
|
143
|
+
|
144
|
+
Verbosity(1, fprintf(Of, "\n==========\n%s\n", AttName[ClassAtt]))
|
145
|
+
|
146
|
+
Progress(-ClassAtt);
|
147
|
+
|
148
|
+
/* Restore original order */
|
149
|
+
|
150
|
+
memcpy(Case, SaveCase, (MaxCase+1) * sizeof(Description));
|
151
|
+
|
152
|
+
/* Remove missing values and tails of continuous attributes */
|
153
|
+
|
154
|
+
Fp = ( SomeMiss[ClassAtt] ? SkipMissing(ClassAtt, 0, MaxCase) : 0 );
|
155
|
+
Lp = MaxCase;
|
156
|
+
|
157
|
+
if ( Continuous(ClassAtt) )
|
158
|
+
{
|
159
|
+
/* Remove N/A values */
|
160
|
+
|
161
|
+
if ( SomeNA[ClassAtt] ) Fp = Group(ClassAtt, 1, Fp, Lp, 0, Nil);
|
162
|
+
|
163
|
+
/* Put low tails low and high tails high */
|
164
|
+
|
165
|
+
LowFp = Fp;
|
166
|
+
|
167
|
+
ForEach(i, Fp, Lp)
|
168
|
+
{
|
169
|
+
if ( CVal(Case[i], ClassAtt) < LowTail[ClassAtt] )
|
170
|
+
{
|
171
|
+
Swap(i, Fp);
|
172
|
+
Fp++;
|
173
|
+
}
|
174
|
+
else
|
175
|
+
if ( CVal(Case[i], ClassAtt) > HighTail[ClassAtt] )
|
176
|
+
{
|
177
|
+
Swap(i, Lp);
|
178
|
+
Lp--;
|
179
|
+
i--;
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
LowLp = Fp-1;
|
184
|
+
HighFp = Lp+1;
|
185
|
+
}
|
186
|
+
|
187
|
+
if ( Fp > 0 ) Progress(Fp);
|
188
|
+
|
189
|
+
/* Copy class values */
|
190
|
+
|
191
|
+
if ( Continuous(ClassAtt) && UseLogs[ClassAtt] )
|
192
|
+
{
|
193
|
+
ForEach(i, Fp, Lp)
|
194
|
+
{
|
195
|
+
CClass(Case[i]) = log(CVal(Case[i], ClassAtt));
|
196
|
+
}
|
197
|
+
}
|
198
|
+
else
|
199
|
+
{
|
200
|
+
ForEach(i, Fp, Lp)
|
201
|
+
{
|
202
|
+
Case[i][0] = Case[i][ClassAtt];
|
203
|
+
}
|
204
|
+
}
|
205
|
+
|
206
|
+
SampleSize = SAMPLEUNIT *
|
207
|
+
( Continuous(ClassAtt) ? 5 :
|
208
|
+
SomeNA[ClassAtt] ? MaxAttVal[ClassAtt] :
|
209
|
+
MaxAttVal[ClassAtt] - 1 );
|
210
|
+
Split(Fp, Lp, 0, Nil, 0, &T);
|
211
|
+
|
212
|
+
TargetSaved = false;
|
213
|
+
LastLevel = -1;
|
214
|
+
|
215
|
+
ReleaseTree(T, 0);
|
216
|
+
T = Nil;
|
217
|
+
}
|
218
|
+
|
219
|
+
if ( SIFT )
|
220
|
+
{
|
221
|
+
fprintf(Sf, "0\n");
|
222
|
+
fclose(Sf);
|
223
|
+
Sf = 0;
|
224
|
+
}
|
225
|
+
}
|
226
|
+
|
227
|
+
|
228
|
+
|
229
|
+
/*************************************************************************/
|
230
|
+
/* */
|
231
|
+
/* Check a continuous attribute */
|
232
|
+
/* - decide whether to apply the log transformation */
|
233
|
+
/* - exclude possibly multimodal tails */
|
234
|
+
/* - check for global outliers */
|
235
|
+
/* */
|
236
|
+
/*************************************************************************/
|
237
|
+
|
238
|
+
|
239
|
+
/*  Preliminary check of continuous attribute ClassAtt over cases
    Fp..MaxCase: set SomeNA[] and UseLogs[], load class values, exclude
    over-long low/high tails (recording LowTail[] / HighTail[]) and run
    the global outlier check on what is left.

    NOTE(review): ZScore() is used here without being handed a mean or
    SD, so it presumably picks up the locals Mean and SD by name --
    confirm in defns.i before renaming either.  */

void CheckContin(CaseNo Fp)
/*   -----------  */
{
    CaseNo	i, Mid, Quart, Tail, Tp, Lp;
    CaseCount	Cases, Middle;
    double	R1, R2, Mean, SD, Sum=0, SumSq=0, Cv;
    char	CVS[20];
    Boolean	LowT=false, HighT=false;

    /*  Move non-applicable values in front of Fp; Tp remembers where
	the range started so they can be counted  */

    Tp = Fp;
    ForEach(i, Fp, MaxCase)
    {
	if ( NotApplic(Case[i], ClassAtt) )
	{
	    Swap(Fp, i);
	    Fp++;
	}
    }

    SomeNA[ClassAtt] = ( Fp > Tp );

    if ( SomeNA[ClassAtt] && ! Skip(ClassAtt) )
    {
	if ( ! CheckMsg )
	{
	    fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
	    CheckMsg = true;
	}

	fprintf(Of, F_ExcludeNA(Fp - Tp));
    }

    /*  Give up when too few cases are left  */

    if ( Fp > MaxCase - CMINITEMS ) return;

    Quicksort(Fp, MaxCase, ClassAtt);

    Mid   = (MaxCase + Fp) / 2;
    Quart = No(Fp, MaxCase) / 4;

    /*  Asymmetry test for a possible log distribution.  With q1, q2, q3
	the values at Fp+Quart, Mid and MaxCase-Quart:

	    R1 = (log(q2)-log(q1)) / (log(q3)-log(q2))
	    R2 = (q2-q1) / (q3-q2)

	Logs are used when R2 < 1 and R1 is nearer to 1 than R2.  The
	test is attempted only if the smallest value exceeds Epsilon
	and q3 > q2  */

    UseLogs[ClassAtt] = false;

    if ( CVal(Case[Fp], ClassAtt) > Epsilon &&
	 CVal(Case[MaxCase-Quart], ClassAtt) > CVal(Case[Mid], ClassAtt) )
    {
	R1 = ( log(CVal(Case[Mid], ClassAtt)) -
	       log(CVal(Case[Fp+Quart], ClassAtt)) ) /
	     ( log(CVal(Case[MaxCase-Quart], ClassAtt)) -
	       log(CVal(Case[Mid], ClassAtt)) );
	R2 = (CVal(Case[Mid], ClassAtt) - CVal(Case[Fp+Quart], ClassAtt)) /
	     (CVal(Case[MaxCase-Quart], ClassAtt) - CVal(Case[Mid], ClassAtt));

	UseLogs[ClassAtt] = R2 < 1 && fabs(R1-1) < fabs(R2-1);

	if ( UseLogs[ClassAtt] )
	{
	    Verbosity(1, fprintf(Of, "    Using log distribution\n"))

	    if ( SIFT )
	    {
		fprintf(Sf, "2 %d\n", ClassAtt);
	    }
	}
    }

    /*  Attributes that are not to be checked need nothing more  */

    if ( Skip(ClassAtt) ) return;

    /*  Class value is the attribute value or its log  */

    if ( UseLogs[ClassAtt] )
    {
	ForEach(i, Fp, MaxCase)
	{
	    CClass(Case[i]) = log(CVal(Case[i], ClassAtt));
	}
    }
    else
    {
	ForEach(i, Fp, MaxCase)
	{
	    CClass(Case[i]) = CVal(Case[i], ClassAtt);
	}
    }

    /*  Multimodal tails  */

    Lp    = MaxCase;
    Cases = No(Fp, Lp);
    Tail  = MaxAnoms(Cases);

    /*  The central half must span more than one value, otherwise this
	is not really a continuous distribution (at least half of the
	cases have identical values) and the attribute is skipped  */

    if ( ! ( CClass(Case[Fp + Quart + Tail]) <
	     CClass(Case[Lp - Quart - Tail]) ) )
    {
	if ( ! CheckMsg )
	{
	    fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
	    CheckMsg = true;
	}

	fprintf(Of, F_TooManyIdentical);
	SpecialStatus[ClassAtt] |= SKIP;
	return;
    }

    /*  SD is estimated from the central half and scaled by 2.5  */

    ForEach(i, Fp + Quart, Lp - Quart)
    {
	Cv     = CClass(Case[i]);
	Sum   += Cv;
	SumSq += Cv * Cv;
    }

    Middle = No(Fp, Lp) - 2 * Quart;
    Mean   = Sum / Middle;
    SD     = 2.5 * SDEstimate(Middle, Sum, SumSq);

    /*  Low tail: leading cases whose Z-score is at least MAXTAIL.  The
	tail is excluded only if it is longer than Tail  */

    Tp = Fp;
    while ( Tp < Mid && ZScore(Tp) >= MAXTAIL )
    {
	Tp++;
    }

    if ( Tp - Fp > Tail )
    {
	if ( ! CheckMsg )
	{
	    fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
	    CheckMsg = true;
	}

	CValToStr(CVal(Case[Tp], ClassAtt), ClassAtt, CVS);
	fprintf(Of, F_LowTail(Tp - Fp, CVS));
	Fp   = Tp;
	LowT = true;
    }

    /*  High tail, scanning down from the last case  */

    Tp = Lp;
    while ( Tp > Mid && ZScore(Tp) >= MAXTAIL )
    {
	Tp--;
    }

    if ( Lp - Tp > Tail )
    {
	if ( ! CheckMsg )
	{
	    fprintf(Of, F_WhileCheck, AttName[ClassAtt]);
	    CheckMsg = true;
	}

	CValToStr(CVal(Case[Tp], ClassAtt), ClassAtt, CVS);
	fprintf(Of, F_HighTail(Lp - Tp, CVS));
	Lp    = Tp;
	HighT = true;
    }

    /*  Remember the extreme values that were retained  */

    LowTail[ClassAtt]  = CVal(Case[Fp], ClassAtt);
    HighTail[ClassAtt] = CVal(Case[Lp], ClassAtt);

    if ( SIFT && ( LowT || HighT ) )
    {
	fprintf(Sf, "3 %d %.8g %.8g\n", ClassAtt,
		    ( LowT ? CVal(Case[Fp], ClassAtt) : -MAXFLOAT ),
		    ( HighT ? CVal(Case[Lp], ClassAtt) : MAXFLOAT ) );
    }

    /*  Global check on the remaining (already sorted) cases  */

    FindContinOutliers(Fp, Lp, true);
}
|
424
|
+
|
425
|
+
|
426
|
+
|
427
|
+
/*************************************************************************/
|
428
|
+
/* */
|
429
|
+
/* Check continuous values of ClassAtt for cases Fp to Lp */
|
430
|
+
/* */
|
431
|
+
/*************************************************************************/
|
432
|
+
|
433
|
+
|
434
|
+
/*  Look for anomalous low and high tails of ClassAtt among cases Fp..Lp.
    Sorted tells whether the cases are already ordered by ClassAtt; if
    not they are sorted here.  Does nothing when there are fewer than
    CMINITEMS cases.  */

void FindContinOutliers(CaseNo Fp, CaseNo Lp, Boolean Sorted)
/*   ------------------  */
{
    CaseNo	Tail, Cases, LowTp=-1, HighTp=-1, GFp, i;
    double	Mean, SD, LowFrac=0, HighFrac=0, LowLim=0, HighLim=0;
    Clust	CLow=Nil, CHigh=Nil;
    Boolean	SavedCluster=false, HasLow, HasHigh;

    Cases = No(Fp, Lp);
    if ( Cases < CMINITEMS ) return;

    if ( ! Sorted )
    {
	Quicksort(Fp, Lp, ClassAtt);
    }

    TrimmedSDEstimate(Fp, Lp, &Mean, &SD);

    /*  A tail is anomalous if
	*  it does not contain too many cases
	*  the case before the tail has a Z-score <= MAXNORM
	*  there is a gap of at least MINABNORM - MAXNORM  */

    Tail   = MaxAnoms(Cases);
    LowTp  = FindTail(Fp, Fp + Tail, 1, Mean, SD);
    HighTp = FindTail(Lp, Lp - Tail, -1, Mean, SD);

    /*  Note the different tests: low tail wants >= 0, high tail > 0  */

    HasLow  = ( LowTp >= 0 );
    HasHigh = ( HighTp > 0 );

    if ( SIFT )
    {
	/*  Decide whether the cluster must be saved for a low or
	    high test; fractions and limits stay 0 for an absent tail  */

	if ( HasLow )
	{
	    LowFrac = No(LowTp+1, Lp) / (double) Cases;
	    LowLim  = CVal(Case[LowTp+1], ClassAtt);
	}

	if ( HasHigh )
	{
	    HighFrac = No(Fp, HighTp-1) / (double) Cases;
	    HighLim  = CVal(Case[HighTp-1], ClassAtt);
	}

	if ( LowFrac > 0 || HighFrac > 0 )
	{
	    SaveContinCluster(Mean, SD, Cases,
			      LowFrac, LowLim, HighFrac, HighLim);

	    SavedCluster = true;
	}
    }

    if ( HasLow || HasHigh )
    {
	/*  GFp will end up as the first non-anomalous case  */

	GFp = ( HasLow ? LowTp+1 : Fp );

	if ( HasLow )
	{
	    CLow = NewClust(Mean, SD,
			    CVal(Case[LowTp+1], ClassAtt),
			    No(Fp, LowTp), Cases);
	}

	if ( HasHigh )
	{
	    CHigh = NewClust(Mean, SD,
			     CVal(Case[HighTp-1], ClassAtt),
			     No(HighTp, Lp), Cases);

	    /*  Bring the high-tail cases forward so that every
		anomaly precedes GFp  */

	    ForEach(i, HighTp, Lp)
	    {
		Swap(i, GFp);
		GFp++;
	    }
	}

	LabelContinOutliers(CLow, CHigh, Fp, GFp, Lp);
    }

    /*  Close the sift entry only now, since LabelContinOutliers may
	have discovered caveats for the cluster  */

    if ( SavedCluster )
    {
	ExtendSiftEntry("\n");
    }
}
|
532
|
+
|
533
|
+
|
534
|
+
|
535
|
+
/*************************************************************************/
|
536
|
+
/* */
|
537
|
+
/* Note cases Fp to Lp as outliers wrt values GFp to GLp of */
|
538
|
+
/* ClassAtt whose mean and SD are given */
|
539
|
+
/* */
|
540
|
+
/*************************************************************************/
|
541
|
+
|
542
|
+
|
543
|
+
/*  Record cases Fp..GFp-1 as outliers relative to cases GFp..GLp.
    CL and CH are the clusters for the low and high tails; either may
    be Nil but one of them is dereferenced, so not both.

    NOTE(review): ZScore() is given only a case number, so it presumably
    refers to the locals Mean and SD by name -- confirm in defns.i
    before renaming them.  */

void LabelContinOutliers(Clust CL, Clust CH, CaseNo Fp, CaseNo GFp, CaseNo GLp)
/*   -------------------  */
{
    CaseNo	i;
    double	Z, Mean, SD, X;
    Clust	C, OldC;

    /*  Either cluster supplies the mean and SD -- they are the same  */

    if ( CL )
    {
	C = CL;
    }
    else
    {
	C = CH;
    }

    Mean = C->Expect;
    SD   = C->SD;

    /*  Cases that already carry a more interesting anomalous value
	are swapped to the front and stepped over  */

    ForEach(i, Fp, GFp-1)
    {
	/*  1/Z^2 is the Chebychev bound used to approximate the
	    certainty that this case is an outlier  */

	Z = ZScore(i);
	X = 1 / (Z * Z);

	OldC = OutClust(Case[i]);

	if ( OldC &&
	     ( C->NCond > OldC->NCond ||
	       ( C->NCond == OldC->NCond && X >= OutXVal(Case[i]) ) ) )
	{
	    Swap(i, Fp);
	    Fp++;
	}
    }

    /*  Drop candidates that are not consistent with the ordinary cases  */

    Fp = NoOtherDifference(Fp, GFp-1, GFp, GLp);

    /*  Whatever is left gets recorded against the low or high cluster
	according to which side of the mean it lies  */

    ForEach(i, Fp, GFp-1)
    {
	Z = ZScore(i);

	Verbosity(1,
	    fprintf(Of, "****\tpotential outlier %g (%.1f sd) %s\n",
			CVal(Case[i], ClassAtt), Z,
			( LabelAtt ? SVal(Case[i], LabelAtt) : "" )))

	RecordOutlier(i, ( CClass(Case[i]) < Mean ? CL : CH ), 1 / (Z * Z));
    }
}
|
593
|
+
|
594
|
+
|
595
|
+
|
596
|
+
/*************************************************************************/
|
597
|
+
/* */
|
598
|
+
/* Robust estimator of mean and SD. */
|
599
|
+
/* Idea: exclude high/low tails of data, and adjust computed */
|
600
|
+
/* mean and SD heuristically. */
|
601
|
+
/* Note: unknown and N/A values must be removed and cases must be */
|
602
|
+
/* sorted by ClassAtt before calling TrimmedSDEstimate. */
|
603
|
+
/* */
|
604
|
+
/*************************************************************************/
|
605
|
+
|
606
|
+
|
607
|
+
/*  Estimate the mean and SD of the class values of cases Fp..Lp after
    trimming MaxAnoms() cases from each end.  *Mean and *SD are left at
    0 and 1E38 when no estimate is made (values at the quartile points
    are equal, or too few cases remain).  Cases must already be sorted
    by ClassAtt.  */

void TrimmedSDEstimate(CaseNo Fp, CaseNo Lp, double *Mean, double *SD)
/*   -----------------  */
{
    CaseNo	i, Tail, Cases, Quart, Total;
    double	Val, Sum=0, SumSq=0;

    /*  Results returned when no estimate is possible  */

    *Mean = 0;
    *SD   = 1E38;

    Total = No(Fp, Lp);
    Tail  = MaxAnoms(Total);
    Cases = Total - 2 * Tail;
    Quart = Total / 4;

    /*  Too many identical values: the quartile points coincide  */

    if ( CClass(Case[Fp+Quart]) == CClass(Case[Lp-Quart]) ) return;

    /*  Accumulate over the trimmed range; N/A cases are not summed and
	reduce the count  */

    ForEach(i, Fp+Tail, Lp-Tail)
    {
	if ( NotApplic(Case[i], ClassAtt) )
	{
	    Cases--;
	    continue;
	}

	Val    = CClass(Case[i]);
	Sum   += Val;
	SumSq += Val * Val;
    }

    if ( Cases < Tail ) return;

    /*  With N applicable cases before trimming (Cases = N - 2 * Tail)
	the SD is scaled by (N + Tail) / (N - Tail)  */

    *Mean = Sum / Cases;
    *SD   = SDEstimate(Cases, Sum, SumSq) *
	    (Cases + 3.0 * Tail ) / (Cases + Tail);
}
|
655
|
+
|
656
|
+
|
657
|
+
|
658
|
+
/*************************************************************************/
|
659
|
+
/* */
|
660
|
+
/* Find a tail containing potential anomalies between Fp and Lp-1. */
|
661
|
+
/* * case Lp must have a Z-score <= MAXNORM */
|
662
|
+
/* * there must be a gap >= MINABNORM-MAXNORM between the anomalous */
|
663
|
+
/* and non-anomalous values */
|
664
|
+
/* * the cluster cannot contain cases from omitted tails */
|
665
|
+
/* */
|
666
|
+
/*************************************************************************/
|
667
|
+
|
668
|
+
|
669
|
+
/*  Search for a tail of potential anomalies lying between Fp and the case
    next to Lp, stepping from Lp towards Fp.  I is +1 when Fp is the low
    end (Lp >= Fp) and -1 when Fp is the high end (Lp <= Fp).

    Returns the case number of the tail case nearest to Lp, or -1 when
      *  case Lp itself has a Z-score above MAXNORM,
      *  a case with Z-score above MINABNORM is reached before a gap of
	 at least MINABNORM - MAXNORM is found, or
      *  OmittedCases(I) reports that omitted-tail cases are involved.
    When the scan reaches Fp without finding a gap the result is Fp - I,
    i.e. an empty tail.

    NOTE(review): ZScore() is handed only a case number, so it presumably
    uses the parameters Mean and SD by name -- confirm in defns.i.  */

CaseNo FindTail(CaseNo Fp, CaseNo Lp, int I, double Mean, double SD)
/*     --------  */
{
    CaseNo	i;
    double	Z;

    /*  Z starts out as the score of the boundary case.  Previously Z was
	assigned only inside the loop condition, so when the range was
	empty (Lp == Fp) the final test read an indeterminate value.
	The loop's first iteration recomputes the same score, so results
	are unchanged whenever the loop runs  */

    Z = ZScore(Lp);

    if ( Z > MAXNORM ) return -1;

    /*  Find the first gap  */

    for ( i = Lp ; i * I > Fp * I && (Z = ZScore(i)) <= MINABNORM ; i -= I )
    {
	if ( ZScore(i - I) - Z >= MINABNORM - MAXNORM )
	{
	    break;
	}
    }

    if ( Z > MINABNORM ) return -1;

    return ( OmittedCases(I) ? -1 : i - I );
}
|
689
|
+
|
690
|
+
|
691
|
+
|
692
|
+
/*************************************************************************/
|
693
|
+
/* */
|
694
|
+
/* Check whether the current cluster includes cases from the */
|
695
|
+
/* excluded high/low tails */
|
696
|
+
/* */
|
697
|
+
/*************************************************************************/
|
698
|
+
|
699
|
+
|
700
|
+
Boolean OmittedCases(int HiLo)
|
701
|
+
/* ------------ */
|
702
|
+
{
|
703
|
+
CaseNo Fp, Lp;
|
704
|
+
CaseNo i;
|
705
|
+
|
706
|
+
if ( HiLo > 0 )
|
707
|
+
{
|
708
|
+
Fp = LowFp;
|
709
|
+
Lp = LowLp;
|
710
|
+
}
|
711
|
+
else
|
712
|
+
{
|
713
|
+
Fp = HighFp;
|
714
|
+
Lp = MaxCase;
|
715
|
+
}
|
716
|
+
|
717
|
+
ForEach(i, Fp, Lp)
|
718
|
+
{
|
719
|
+
if ( SatisfiesTests(Case[i]) )
|
720
|
+
{
|
721
|
+
return true;
|
722
|
+
}
|
723
|
+
}
|
724
|
+
|
725
|
+
return false;
|
726
|
+
}
|
727
|
+
|
728
|
+
|
729
|
+
|
730
|
+
/*************************************************************************/
|
731
|
+
/* */
|
732
|
+
/* See whether a case satisfies all current tests */
|
733
|
+
/* */
|
734
|
+
/*************************************************************************/
|
735
|
+
|
736
|
+
|
737
|
+
/*  Return true if the case passes every test GEnv.Test[0..GEnv.Level].
    Note that the parameter is called Case and hides the global of the
    same name inside this function.  */

Boolean SatisfiesTests(Description Case)
/*      --------------  */
{
    Attribute	Att;
    DiscrValue	Br;
    int		i;

    ForEach(i, 0, GEnv.Level)
    {
	Att = GEnv.Test[i].Att;
	Br  = GEnv.Test[i].Br;

	/*  An unknown value never satisfies a test  */

	if ( Unknown(Case, Att) ) return false;

	/*  Branch 1 is taken by N/A values and by nothing else  */

	if ( Br == 1 )
	{
	    if ( ! NotApplic(Case, Att) ) return false;

	    continue;
	}

	if ( NotApplic(Case, Att) ) return false;

	if ( Continuous(Att) )
	{
	    /*  Branch 2 holds values <= Cut  */

	    if ( ( Br == 2 ) != ( CVal(Case, Att) <= GEnv.Test[i].Cut ) )
	    {
		return false;
	    }
	}
	else
	if ( Ordered(Att) )
	{
	    /*  Same threshold form, applied to the discrete value  */

	    if ( ( Br == 2 ) != ( DVal(Case, Att) <= GEnv.Test[i].Cut ) )
	    {
		return false;
	    }
	}
	else
	if ( Continuous(ClassAtt) && MaxAttVal[Att] > 3 )
	{
	    /*  Subset test: branch 2 holds values in the Left set  */

	    if ( ( Br == 2 ) != ( In(DVal(Case, Att), GEnv.Test[i].Left) != 0 ) )
	    {
		return false;
	    }
	}
	else
	if ( Br != DVal(Case, Att) )
	{
	    /*  Plain discrete test: branch is the value itself  */

	    return false;
	}
    }

    return true;
}
|
796
|
+
|
797
|
+
|
798
|
+
|
799
|
+
/*************************************************************************/
|
800
|
+
/* */
|
801
|
+
/* Check discrete values of ClassAtt for cases Fp to Lp */
|
802
|
+
/* Idea: anomaly will appear as an odd case in nearly-pure subset */
|
803
|
+
/* */
|
804
|
+
/*************************************************************************/
|
805
|
+
|
806
|
+
|
807
|
+
/*  Look for anomalous discrete values of ClassAtt among cases Fp..Lp.
    Table holds the per-value case counts; when it is Nil the counts
    are computed here with FindClassFrequencies().  Does nothing when
    there are fewer than DMINITEMS cases.  */

void FindDiscrOutliers(CaseNo Fp, CaseNo Lp, CaseCount *Table)
/*   -----------------  */
{
    DiscrValue	v, Majority=1;
    CaseNo	i, GFp, GLp;
    CaseCount	Cases, Anoms;
    double	X;
    Clust	C, OldC;
    Boolean	SomeSurprise=false, NeedCluster;

    Cases = No(Fp, Lp);
    if ( Cases < DMINITEMS ) return;

    if ( ! Table )
    {
	FindClassFrequencies(Fp, Lp);
	Table = GEnv.ClassFreq;
    }

    /*  Most frequent value.  Majority starts at value 1 and is replaced
	only by a later value that is strictly more frequent  */

    ForEach(v, 2, MaxAttVal[ClassAtt])
    {
	if ( Table[v] > 0 && Table[v] > Table[Majority] )
	{
	    Majority = v;
	}
    }

    /*  Everything outside the majority counts as an anomaly; stop if
	there are too many  */

    Anoms = Cases - Table[Majority];

    if ( Anoms > MaxAnoms(Cases) ) return;

    /*  Is the frequency of any non-majority value surprising?  The
	loop stops as soon as one is found; v has then already been
	advanced one past that value  */

    for ( v = 1 ; ! SomeSurprise && v <= MaxAttVal[ClassAtt] ; v++ )
    {
	if ( v == Majority ) continue;

	X = XDScore(Table[v], Cases, Anoms, Prior[ClassAtt][v]);
	SomeSurprise = ( X <= 1.0 / (MINABNORM * MINABNORM) );
    }

    if ( ! SomeSurprise ) return;

    if ( SIFT )
    {
	SaveDiscrCluster(Majority, Anoms, Cases, Table);
    }

    if ( ! Anoms )
    {
	/*  Nothing to label; just close the sift entry  */

	if ( SIFT )
	{
	    ExtendSiftEntry("\n");
	}

	return;
    }

    /*  A cluster is needed only if some surprising value actually
	occurs.  Step back to the value found above, then continue the
	scan from there over values with non-zero counts  */

    v--;
    NeedCluster = ( Table[v] > 0 );

    while ( ! NeedCluster && ++v <= MaxAttVal[ClassAtt] )
    {
	if ( v == Majority || ! Table[v] ) continue;

	X = XDScore(Table[v], Cases, Anoms, Prior[ClassAtt][v]);
	NeedCluster = ( X <= 1.0 / (MINABNORM * MINABNORM) );
    }

    if ( NeedCluster )
    {
	/*  Group the majority cases at the front as GFp..GLp; Fp then
	    marks the first candidate anomaly  */

	GFp = Fp;
	Fp  = Group(ClassAtt, Majority, Fp, Lp, 0.0, Nil);
	GLp = Fp - 1;

	C = NewClust(Majority, 0.0, 0.0, Anoms, Cases);

	/*  Step over cases whose surprise value is insufficient or
	    that already have a more interesting recorded anomalous
	    value  */

	ForEach(i, Fp, Lp)
	{
	    v = DClass(Case[i]);
	    X = DScore(Cases, Anoms, Prior[ClassAtt][v]);

	    if ( X > 1.0 / (MINABNORM * MINABNORM) ||
		 ( (OldC = OutClust(Case[i])) &&
		   ( C->NCond > OldC->NCond ||
		     ( C->NCond == OldC->NCond &&
		       X > OutXVal(Case[i]) ) ) ) )
	    {
		Swap(i, Fp);
		Fp++;
	    }
	}

	/*  Drop candidates that are not consistent with the ordinary
	    cases  */

	Fp = NoOtherDifference(Fp, Lp, GFp, GLp);

	/*  Record whatever remains  */

	if ( Fp <= Lp )
	{
	    ForEach(i, Fp, Lp)
	    {
		v = DClass(Case[i]);
		X = DScore(Cases, Anoms, Prior[ClassAtt][v]);

		Verbosity(1,
		    fprintf(Of, "****\tpotential outlier %s (p=%.3f) %s\n",
				AttValName[ClassAtt][DClass(Case[i])], X,
				( LabelAtt ? SVal(Case[i], LabelAtt) : "" )))

		RecordOutlier(i, C, X);
	    }
	}
    }

    if ( SIFT && SomeSurprise )
    {
	ExtendSiftEntry("\n");
    }
}
|
939
|
+
|
940
|
+
|
941
|
+
|
942
|
+
/*************************************************************************/
|
943
|
+
/* */
|
944
|
+
/* Cases Fp through Lp have been identified as potential anoms */
|
945
|
+
/* in the cluster whose "normal" cases are GFp through GLp. */
|
946
|
+
/* Discard potential anomalies that appear to be inconsistent */
|
947
|
+
/* with the normals on some other attribute. */
|
948
|
+
/* If SIFT is set, record any caveats for the current cluster */
|
949
|
+
/* */
|
950
|
+
/*************************************************************************/
|
951
|
+
|
952
|
+
|
953
|
+
CaseNo NoOtherDifference(CaseNo Fp, CaseNo Lp, CaseNo GFp, CaseNo GLp)
/*     -----------------  */
{
    /*  Filter the candidate anomalies Case[Fp..Lp] against the cluster's
        "normal" cases Case[GFp..GLp].

        For every attribute other than ClassAtt that is not excluded, a
        candidate whose value looks inconsistent with the normals is swapped
        to position Fp and Fp is advanced, so on return the surviving
        candidates occupy Case[Fp..Lp], where Fp is the returned value.
        The result exceeds Lp when no candidate survives.

        When SIFT is set, a caveat describing each attribute that caused a
        candidate to be discarded is appended through ExtendSiftEntry().

        Note that Sample() is applied to Case[GFp..GLp] when there are many
        normals, so the order of those cases may change.  */

    Attribute   Att;
    double      Sum, SumSq, Mean, SD, CV;
    CaseNo      i, Cases, GCases;
    DiscrValue  v;
    Boolean     Caveat;
    int         Bytes;
    char        SE[100];

    /*  Return the range unchanged if GEnv.Level is negative, there are
        no candidates, or there are fewer than MINCONTEXT normals  */

    if ( GEnv.Level < 0 ||
         Fp > Lp || (GCases = No(GFp, GLp)) < MINCONTEXT ) return Fp;

    /*  Use a sample if there are many normal cases  */

    if ( GCases > MaxDiscrVal * SAMPLEUNIT )
    {
        GCases = 0.5 * MaxDiscrVal * SAMPLEUNIT;
        Sample(GFp, GLp, GCases);
        GLp = GFp + GCases - 1;
    }

    ForEach(Att, 1, MaxAtt)
    {
        if ( Att == ClassAtt || Exclude(Att) ) continue;

        /*  Stop as soon as every candidate has been discarded  */

        if ( Fp > Lp ) return Fp;

        Caveat = false;

        if ( Continuous(Att) )
        {
            /*  Find mean and variance of ordinary cases, working on
                log values when UseLogs[Att] is set  */

            Sum = SumSq = Cases = 0;
            ForEach(i, GFp, GLp)
            {
                if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )
                {
                    CV = ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
                                          CVal(Case[i], Att) );
                    Sum   += CV;
                    SumSq += CV * CV;
                    Cases++;
                }
            }

            /*  Check that sufficient cases to give reliable SD  */

            if ( Cases >= MINCONTEXT )
            {
                Mean = Sum / Cases;
                SD   = SDEstimate(Cases, Sum, SumSq);

                /*  Move filtered cases to the front  */

                ForEach(i, Fp, Lp)
                {
                    if ( Unknown(Case[i], Att) ||
                         NotApplic(Case[i], Att) ) continue;

                    /*  Transform the candidate's value once, in the same
                        way as the normals, and reuse it below  */

                    CV = ( UseLogs[Att] ? log(CVal(Case[i], Att)) :
                                          CVal(Case[i], Att) );

                    if ( fabs(Mean - CV) / SD > MAXNORM )
                    {
                        Verbosity(2,
                            fprintf(Of, "\t %d: difference %s %.2f SD\n",
                                        i, AttName[Att],
                                        (Mean - CV) / SD))

                        /*  Discard this candidate  */

                        Swap(i, Fp);
                        Fp++;

                        /*  Record possible caveat: the attribute number
                            and the limits Mean +/- MAXNORM * SD, once
                            per attribute  */

                        if ( SIFT && ! Caveat )
                        {
                            Caveat = true;

                            /*  snprintf is bounded by sizeof SE, so the
                                text cannot overrun the buffer  */

                            snprintf(SE, sizeof SE, " %d", Att);
                            ExtendSiftEntry(SE);

                            if ( UseLogs[Att] )
                            {
                                /*  Limits are converted back from
                                    log scale  */

                                snprintf(SE, sizeof SE, " %.8g %.8g",
                                         exp(Mean - MAXNORM * SD),
                                         exp(Mean + MAXNORM * SD));
                            }
                            else
                            {
                                snprintf(SE, sizeof SE, " %.8g %.8g",
                                         Mean - MAXNORM * SD,
                                         Mean + MAXNORM * SD);
                            }

                            ExtendSiftEntry(SE);
                        }
                    }
                }
            }
        }
        else
        {
            /*  Discrete attribute
                NB: This doesn't differentiate between ordered and unordered
                discrete attributes -- perhaps it should  */

            /*  Count the frequency of each value among the normals  */

            ForEach(v, 0, MaxAttVal[Att])
            {
                GEnv.ValFreq[v] = 0;
            }

            ForEach(i, GFp, GLp)
            {
                GEnv.ValFreq[XDVal(Case[i], Att)]++;
            }

            /*  A discrete attribute value is judged to be inconsistent with
                the normals if its Laplace probability in the normals is
                less than 0.025 and its prior is at least 0.25  */

            /*  GEnv.Subset[0] collects one bit per offending value  */

            Bytes = (MaxAttVal[Att]>>3) + 1;
            ClearBits(Bytes, GEnv.Subset[0]);

            ForEach(i, Fp, Lp)
            {
                v = XDVal(Case[i], Att);
                if ( Prior[Att][v] >= 0.25 &&
                     (GEnv.ValFreq[v] + 1) / (double) (GCases + 2) < 0.025L )
                {
                    Verbosity(2,
                        fprintf(Of, "\t %d: difference %s=%s (%d/%d)\n",
                                    i, AttName[Att], AttValName[Att][v],
                                    GEnv.ValFreq[v], GCases))

                    /*  Discard this candidate  */

                    Swap(i, Fp);
                    Fp++;

                    SetBit(v, GEnv.Subset[0]);
                    Caveat = true;
                }
            }

            /*  Record caveat: the attribute number followed by the bytes
                of the offending-value bitmap in hex  */

            if ( SIFT && Caveat )
            {
                snprintf(SE, sizeof SE, " %d", Att);
                ExtendSiftEntry(SE);

                ForEach(v, 0, Bytes-1)
                {
                    snprintf(SE, sizeof SE, " %x", GEnv.Subset[0][v]);
                    ExtendSiftEntry(SE);
                }
            }
        }
    }

    return Fp;
}
|