see5-installer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
@@ -0,0 +1,412 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of GritBot GPL Edition, a single-threaded version */
|
6
|
+
/* of GritBot release 2.01. */
|
7
|
+
/* */
|
8
|
+
/* GritBot GPL Edition is free software: you can redistribute it */
|
9
|
+
/* and/or modify it under the terms of the GNU General Public License */
|
10
|
+
/* as published by the Free Software Foundation, either version 3 of */
|
11
|
+
/* the License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* GritBot GPL Edition is distributed in the hope that it will be */
|
14
|
+
/* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
|
15
|
+
/* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
16
|
+
/* GNU General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with GritBot GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Divide-and-Conquer for continuous attributes */
|
30
|
+
/* -------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
|
39
|
+
/*************************************************************************/
|
40
|
+
/* */
|
41
|
+
/* Known values of continuous attributes are divided into */
|
42
|
+
/* three groups: */
|
43
|
+
/* (1) N/A */
|
44
|
+
/* (2) values less than a threshold */
|
45
|
+
/* (3) values greater than a threshold */
|
46
|
+
/* This routine finds the best threshold for items Fp through Lp */
|
47
|
+
/* and sets Gain[] and Bar[] */
|
48
|
+
/* */
|
49
|
+
/*************************************************************************/
|
50
|
+
|
51
|
+
|
52
|
+
void CEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
/*   --------------  */
{
    /*  Evaluates a threshold split on continuous attribute Att over
	Case[Fp..Lp] using the continuous class value CClass():

	 *  cases whose Att value is N/A are swapped to the front of the
	    range, so that afterwards Fp..Xp-1 are N/A and Xp..Lp are not
	 *  Xp..Lp is passed to Quicksort() for ordering on Att
	 *  each cut between adjacent distinct values that leaves at least
	    CMINITEMS*GEnv.FRAC cases on either side is scored with
	    ContinGain()

	If the best gain exceeds Epsilon, GEnv.Gain[Att] and GEnv.Bar[Att]
	are set; otherwise they are left untouched.  When not sampling
	(GEnv.FRAC >= 1) each of the three branches is also passed to
	FindContinOutliers().  */

    CaseNo	i, BestI=Fp, Xp;	/* BestI is only used once a cut
					   has been accepted; the initial
					   value just keeps it defined */
    double	Val, ThisGain, BestGain=-1E-6;

    /*  Special case when very few values  */

    if ( No(Fp, Lp) < 2 * (CMINITEMS*GEnv.FRAC) )
    {
	Verbosity(2,
	    fprintf(Of, "\tAtt %s: insufficient cases with known values\n",
			AttName[Att]))
	return;
    }

    GEnv.BrFreq[1] = GEnv.BrFreq[2] = GEnv.BrFreq[3] = 0;

    GEnv.BrSum[1] = GEnv.BrSumSq[1] =
    GEnv.BrSum[2] = GEnv.BrSumSq[2] =
    GEnv.BrSum[3] = GEnv.BrSumSq[3] = 0;

    /*  Omit and count N/A values and count base values.  Branch 1
	collects the N/A cases; everything else starts in branch 3  */

    Xp = Fp;
    ForEach(i, Fp, Lp)
    {
	Val = CClass(Case[i]);

	if ( NotApplic(Case[i],Att) )
	{
	    GEnv.BrFreq[1]++;
	    GEnv.BrSum[1]   += Val;
	    GEnv.BrSumSq[1] += Val * Val;

	    Swap(i, Xp);
	    Xp++;
	}
	else
	{
	    GEnv.BrFreq[3]++;
	    GEnv.BrSum[3]   += Val;
	    GEnv.BrSumSq[3] += Val * Val;
	}
    }

    /*  Sort all applicable values  */

    Quicksort(Xp, Lp, Att);

    /*  Try possible cuts between items i and i+1, and determine the
	information and gain of the split in each case.  Each step moves
	case i from branch 3 to branch 2  */

    ForEach(i, Xp, Lp - (CMINITEMS*GEnv.FRAC))
    {
	Val = CClass(Case[i]);

	GEnv.BrFreq[2]++;
	GEnv.BrFreq[3]--;

	GEnv.BrSum[2]   += Val;
	GEnv.BrSum[3]   -= Val;
	GEnv.BrSumSq[2] += Val * Val;
	GEnv.BrSumSq[3] -= Val * Val;

	/*  A cut is only possible where the attribute value changes and
	    enough cases already lie below it  */

	if ( CVal(Case[i+1], Att) > CVal(Case[i], Att) &&
	     i >= Xp+(CMINITEMS*GEnv.FRAC)-1 )
	{
	    ThisGain = ContinGain();
	    if ( ThisGain > BestGain + Epsilon )
	    {
		BestGain = ThisGain;
		BestI    = i;
	    }
	}
    }

    /*  Set the best break point and gain  */

    if ( BestGain > Epsilon )
    {
	GEnv.Gain[Att] = BestGain;
	GEnv.Bar[Att]  = Between(CVal(Case[BestI],Att),
				 CVal(Case[BestI+1],Att));

	Verbosity(2,
	    fprintf(Of, "\tAtt %s: cut=%.3f, gain %.3f\n",
			AttName[Att], GEnv.Bar[Att], GEnv.Gain[Att]))

	/*  If not sampling, check subsets now  */

	if ( GEnv.FRAC >= 1.0 )
	{
	    /*  Branch 1: the N/A cases, if any  */

	    if ( Xp > Fp )
	    {
		NoteTest(Att, 1, GEnv.Bar[Att], Nil);
		FindContinOutliers(Fp, Xp-1, false);
	    }

	    /*  Branch 2: values below the cut.  These start at Xp, not
		Fp -- the cases before Xp are N/A and were covered by
		branch 1  */

	    NoteTest(Att, 2, GEnv.Bar[Att], Nil);
	    FindContinOutliers(Xp, BestI, false);

	    /*  Branch 3: values above the cut  */

	    NoteTest(Att, 3, GEnv.Bar[Att], Nil);
	    FindContinOutliers(BestI+1, Lp, false);
	}
    }
    else
    {
	Verbosity(2, fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]))
    }
}
|
163
|
+
|
164
|
+
|
165
|
+
|
166
|
+
/*************************************************************************/
|
167
|
+
/* */
|
168
|
+
/* Find the lowest-precision value in the range Low to High */
|
169
|
+
/* */
|
170
|
+
/*************************************************************************/
|
171
|
+
|
172
|
+
|
173
|
+
ContValue Between(ContValue Low, ContValue High)
/*        -------  */
{
    /*  Picks a threshold for the interval Low..High that needs as few
	digits as possible.  An interval with Low <= 0 < High yields 0.
	Otherwise the midpoint is rounded to units of 10^6, 10^5, ...
	down to 10^-5, and the first rounded value v with
	Low <= v < High - Slack is returned.  The search is abandoned as
	soon as fmod() of both Low and High by the current unit is below
	1E-6; in that case, or if no unit succeeds, Low is returned.  */

    ContValue	Exponent, Step, Mid, Candidate, Slack;

    if ( Low <= 0 && High > 0 )
    {
	return 0.0;
    }

    /*  Keep the threshold slightly away from High  */

    Slack = 0.005L * (High - Low);
    Mid   = (Low + High) / 2;

    Exponent = 6;
    while ( Exponent > -6 )
    {
	Step      = pow(10.0L, Exponent);
	Candidate = rint(Mid / Step) * Step;

	if ( Candidate >= Low && Candidate < High - Slack )
	{
	    return Candidate;
	}

	if ( fmod(Low, Step) < 1E-6 && fmod(High, Step) < 1E-6 )
	{
	    break;
	}

	Exponent--;
    }

    /*  Nothing suitable found, so fall back on the low value  */

    return Low;
}
|
199
|
+
|
200
|
+
|
201
|
+
|
202
|
+
/*************************************************************************/
|
203
|
+
/* */
|
204
|
+
/* Set Gain[] for discrete partition of items Fp to Lp */
|
205
|
+
/* */
|
206
|
+
/*************************************************************************/
|
207
|
+
|
208
|
+
|
209
|
+
void CEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp)
/*   -------------  */
{
    /*  Chooses the evaluation routine for discrete attribute Att: an
	attribute with MaxAttVal[Att] == 3 is handled as a binary split,
	anything else goes through the subset search.  The outcome is
	whatever the chosen routine leaves in GEnv.Gain[Att]; at
	verbosity 2 or more it is also reported on Of.  */

    if ( MaxAttVal[Att] != 3 )
    {
	EvalSubsetSplit(Att, Fp, Lp);
    }
    else
    {
	EvalBinarySplit(Att, Fp, Lp);
    }

    Verbosity(2,
	if ( ! ( GEnv.Gain[Att] > Epsilon ) )
	{
	    fprintf(Of, "\tAtt %s: no gain\n", AttName[Att]);
	}
	else
	{
	    fprintf(Of, "\tAtt %s: gain %.3f\n", AttName[Att], GEnv.Gain[Att]);
	})
}
|
231
|
+
|
232
|
+
|
233
|
+
|
234
|
+
/*************************************************************************/
|
235
|
+
/* */
|
236
|
+
/* Special case of binary split */
|
237
|
+
/* */
|
238
|
+
/*************************************************************************/
|
239
|
+
|
240
|
+
|
241
|
+
void EvalBinarySplit(Attribute Att, CaseNo Fp, CaseNo Lp)
/*   ---------------  */
{
    /*  Scores the split in which attribute values 1, 2 and 3 each form
	their own branch.  Branch totals are taken straight from the
	per-attribute tables in GEnv.  GEnv.Gain[Att] receives the gain,
	or None when the gain is below Epsilon.  When not sampling
	(GEnv.FRAC >= 1) and the gain exceeds Epsilon, the split is
	passed to CheckPotentialClusters().  */

    DiscrValue	Br;

    for ( Br = 1 ; Br <= 3 ; ++Br )
    {
	GEnv.BrFreq[Br]  = GEnv.DFreq[Att][Br][0];
	GEnv.BrSum[Br]   = GEnv.DValSum[Att][Br];
	GEnv.BrSumSq[Br] = GEnv.DValSumSq[Att][Br];
    }

    GEnv.Gain[Att] = ContinGain();

    if ( GEnv.Gain[Att] < Epsilon )
    {
	/*  Too small to matter, and so never checked for clusters  */

	GEnv.Gain[Att] = None;
	return;
    }

    if ( GEnv.FRAC >= 1 && GEnv.Gain[Att] > Epsilon )
    {
	CheckPotentialClusters(Att, 3, Fp, Lp, 0.0, Nil, Nil);
    }
}
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
/*************************************************************************/
|
265
|
+
/* */
|
266
|
+
/* Divide attribute values into three subsets (one being N/A) */
|
267
|
+
/* */
|
268
|
+
/*************************************************************************/
|
269
|
+
|
270
|
+
|
271
|
+
void EvalSubsetSplit(Attribute Att, CaseNo Fp, CaseNo Lp)
/*   ---------------  */
{
    /*  Searches for the best three-way split of discrete attribute Att:
	branch 1 holds value 1, and the remaining values are divided
	between branch 2 ("left") and branch 3 ("right").

	The search is greedy.  Every value that occurs starts on the
	left; each pass moves one value to the right and rescores the
	split with ContinGain(), provided both branches still hold at
	least CMINITEMS*GEnv.FRAC cases.  For ordered attributes the
	value moved is the highest-numbered one still on the left;
	otherwise it is the left value with the greatest mean
	(ValSum / ValFreq).

	Whenever a pass improves on the best gain so far by more than
	Epsilon, GEnv.Gain[Att], GEnv.Bar[Att] and GEnv.Subset[Att] are
	overwritten.  When not sampling (GEnv.FRAC >= 1) and
	GEnv.Gain[Att] exceeds Epsilon, the result is passed to
	CheckPotentialClusters().  */

    DiscrValue	Val, Pick, Pass, Last=MaxAttVal[Att];
    double	Gain, TopGain=-1E-6;
    int		SetBytes;

    /*  Fetch the per-value totals for this attribute  */

    for ( Val = 1 ; Val <= Last ; ++Val )
    {
	GEnv.ValFreq[Val]  = GEnv.DFreq[Att][Val][0];
	GEnv.ValSum[Val]   = GEnv.DValSum[Att][Val];
	GEnv.ValSumSq[Val] = GEnv.DValSumSq[Att][Val];
    }

    /*  Branch 1 is value 1; branches 2 and 3 start empty  */

    GEnv.BrFreq[1]  = GEnv.ValFreq[1];
    GEnv.BrSum[1]   = GEnv.ValSum[1];
    GEnv.BrSumSq[1] = GEnv.ValSumSq[1];

    GEnv.BrFreq[2]  = 0;
    GEnv.BrSum[2]   = 0;
    GEnv.BrSumSq[2] = 0;

    GEnv.BrFreq[3]  = 0;
    GEnv.BrSum[3]   = 0;
    GEnv.BrSumSq[3] = 0;

    /*  Put every other value into branch 2, marking the ones that
	actually occur as being on the left  */

    for ( Val = 2 ; Val <= Last ; ++Val )
    {
	GEnv.BrFreq[2]  += GEnv.ValFreq[Val];
	GEnv.BrSum[2]   += GEnv.ValSum[Val];
	GEnv.BrSumSq[2] += GEnv.ValSumSq[Val];

	GEnv.Left[Val] = ( GEnv.ValFreq[Val] > 0 );
    }

    SetBytes = (Last >> 3) + 1;

    for ( Pass = 2 ; Pass <= Last ; ++Pass )
    {
	/*  Select the value to move across  */

	if ( Ordered(Att) )
	{
	    Pick = Last;
	    while ( Pick > 1 && ! GEnv.Left[Pick] )
	    {
		Pick--;
	    }
	}
	else
	{
	    Pick = 0;

	    for ( Val = 2 ; Val <= Last ; ++Val )
	    {
		if ( ! GEnv.Left[Val] ) continue;

		if ( ! Pick ||
		     GEnv.ValSum[Val] / GEnv.ValFreq[Val] >
		     GEnv.ValSum[Pick] / GEnv.ValFreq[Pick] )
		{
		    Pick = Val;
		}
	    }
	}

	/*  Stop once nothing remains on the left  */

	if ( Pick < 2 ) break;

	/*  Shift the selected value from branch 2 to branch 3  */

	GEnv.Left[Pick] = false;

	GEnv.BrFreq[2]  -= GEnv.ValFreq[Pick];
	GEnv.BrSum[2]   -= GEnv.ValSum[Pick];
	GEnv.BrSumSq[2] -= GEnv.ValSumSq[Pick];
	GEnv.BrFreq[3]  += GEnv.ValFreq[Pick];
	GEnv.BrSum[3]   += GEnv.ValSum[Pick];
	GEnv.BrSumSq[3] += GEnv.ValSumSq[Pick];

	/*  Both branches must be big enough before the split is scored  */

	if ( ! ( GEnv.BrFreq[2] >= (CMINITEMS*GEnv.FRAC) &&
		 GEnv.BrFreq[3] >= (CMINITEMS*GEnv.FRAC) ) )
	{
	    continue;
	}

	Gain = ContinGain();
	if ( ! ( Gain > TopGain + Epsilon ) )
	{
	    continue;
	}

	/*  New best: remember the gain and the current left subset  */

	GEnv.Gain[Att] = TopGain = Gain;
	GEnv.Bar[Att]  = Pick-1;

	ClearBits(SetBytes, GEnv.Subset[Att]);

	for ( Val = 2 ; Val <= Last ; ++Val )
	{
	    if ( GEnv.Left[Val] )
	    {
		SetBit(Val, GEnv.Subset[Att]);
	    }
	}
    }

    if ( GEnv.FRAC >= 1 && GEnv.Gain[Att] > Epsilon )
    {
	CheckPotentialClusters(Att, 3, Fp, Lp, GEnv.Bar[Att], GEnv.Subset[Att],
			       Nil);
    }
}
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
double SDEstimate(CaseCount N, double Sum, double SumSq)
/*     ----------  */
{
    /*  Estimates a standard deviation from N values given their sum and
	sum of squares: sqrt( (SumSq - Sum^2/N + 1E-3) / (N-1) ).
	The formula divides by N and by N-1, so it is only meaningful
	for N >= 2.  */

    double	SumSqDev, Variance;

    /*  Sum of squared deviations, with 1E-3 added as in the formula
	above  */

    SumSqDev = SumSq - Sum * Sum / N + 1E-3;
    Variance = SumSqDev / (N - 1);

    return sqrt(Variance);
}
|
384
|
+
|
385
|
+
|
386
|
+
|
387
|
+
/*************************************************************************/
|
388
|
+
/* */
|
389
|
+
/* Compute continuous gain for three branches */
|
390
|
+
/* */
|
391
|
+
/*************************************************************************/
|
392
|
+
|
393
|
+
|
394
|
+
double ContinGain()
|
395
|
+
/* ---------- */
|
396
|
+
{
|
397
|
+
double Resid=0;
|
398
|
+
DiscrValue v;
|
399
|
+
CaseCount Cases=0;
|
400
|
+
|
401
|
+
ForEach(v, 1, 3)
|
402
|
+
{
|
403
|
+
if ( GEnv.BrFreq[v] > 1 )
|
404
|
+
{
|
405
|
+
Cases += GEnv.BrFreq[v];
|
406
|
+
Resid += GEnv.BrFreq[v] *
|
407
|
+
SDEstimate(GEnv.BrFreq[v], GEnv.BrSum[v], GEnv.BrSumSq[v]);
|
408
|
+
}
|
409
|
+
}
|
410
|
+
|
411
|
+
return GEnv.PSD - Resid / Cases;
|
412
|
+
}
|
data/ext/gritbot/defns.i
ADDED
@@ -0,0 +1,623 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of GritBot GPL Edition, a single-threaded version */
|
6
|
+
/* of GritBot release 2.01. */
|
7
|
+
/* */
|
8
|
+
/* GritBot GPL Edition is free software: you can redistribute it */
|
9
|
+
/* and/or modify it under the terms of the GNU General Public License */
|
10
|
+
/* as published by the Free Software Foundation, either version 3 of */
|
11
|
+
/* the License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* GritBot GPL Edition is distributed in the hope that it will be */
|
14
|
+
/* useful, but WITHOUT ANY WARRANTY; without even the implied warranty */
|
15
|
+
/* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
|
16
|
+
/* GNU General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with GritBot GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Definitions */
|
30
|
+
/* ----------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#define RELEASE "2.01 GPL Edition"
|
36
|
+
|
37
|
+
#include <stdio.h>
|
38
|
+
#include <math.h>
|
39
|
+
#include <string.h>
|
40
|
+
#include <stdlib.h>
|
41
|
+
#include <time.h>
|
42
|
+
#include <ctype.h>
|
43
|
+
#include <limits.h>
|
44
|
+
#include <float.h>
|
45
|
+
|
46
|
+
#include "text.i"
|
47
|
+
|
48
|
+
|
49
|
+
|
50
|
+
/*************************************************************************/
|
51
|
+
/* */
|
52
|
+
/* Definitions dependent on cc options */
|
53
|
+
/* */
|
54
|
+
/*************************************************************************/
|
55
|
+
|
56
|
+
|
57
|
+
#define Goodbye(x) exit(x)
|
58
|
+
#define Of stdout
|
59
|
+
|
60
|
+
#include <values.h>
|
61
|
+
|
62
|
+
#ifdef VerbOpt
|
63
|
+
#include <assert.h>
|
64
|
+
#define Verbosity(d,s) if(VERBOSITY >= d) {s;}
|
65
|
+
#define Free(x) {free(x); x = 0;}
|
66
|
+
#else
|
67
|
+
#define assert(x)
|
68
|
+
#define Verbosity(d,s)
|
69
|
+
#define Free(x) free(x)
|
70
|
+
#endif
|
71
|
+
|
72
|
+
|
73
|
+
/*************************************************************************/
|
74
|
+
/* */
|
75
|
+
/* Constants, macros etc. */
|
76
|
+
/* */
|
77
|
+
/*************************************************************************/
|
78
|
+
|
79
|
+
|
80
|
+
#define MAXFRAC 0.01 /* max proportion of outliers in group */
|
81
|
+
#define MAXNORM 2.67 /* max SDs for non-outlier */
|
82
|
+
#define MAXTAIL 5.34 /* max SDs for 'ordinary' tail */
|
83
|
+
#define SAMPLEUNIT 2000 /* min sample per split */
|
84
|
+
#define SAMPLEFACTOR 5 /* threshold for using sampling */
|
85
|
+
#define MINCONTEXT 25 /* min cases to detect other difference */
|
86
|
+
|
87
|
+
#define Nil 0 /* null pointer */
|
88
|
+
#define false 0
|
89
|
+
#define true 1
|
90
|
+
#define None -1
|
91
|
+
#define Epsilon 1E-6
|
92
|
+
|
93
|
+
#define EXCLUDE 1 /* special attribute status: do not use */
|
94
|
+
#define SKIP 2 /* do not check */
|
95
|
+
#define DISCRETE 4 /* ditto: collect values as data read */
|
96
|
+
#define ORDERED 8 /* ditto: ordered discrete values */
|
97
|
+
#define DATEVAL 16 /* ditto: YYYY/MM/DD or YYYY-MM-DD */
|
98
|
+
#define STIMEVAL 32 /* ditto: HH:MM:SS */
|
99
|
+
#define TSTMPVAL 64 /* date time */
|
100
|
+
|
101
|
+
/* unknown and N/A values are represented by
|
102
|
+
unlikely floating-point numbers
|
103
|
+
(octal 01600000000 and 01) */
|
104
|
+
#define UNKNOWN 01600000000 /* 1.5777218104420236e-30 */
|
105
|
+
#define NA 01 /* 1.4012984643248171e-45 */
|
106
|
+
|
107
|
+
#define BrDiscr 1
|
108
|
+
#define BrThresh 2
|
109
|
+
#define BrSubset 3
|
110
|
+
|
111
|
+
#define AllocZero(N,T) (T *) Pcalloc(N, sizeof(T))
|
112
|
+
#define Alloc(N,T) AllocZero(N,T) /*for safety */
|
113
|
+
#define Realloc(V,N,T) V = (T *) Prealloc(V, (N)*sizeof(T))
|
114
|
+
|
115
|
+
/*  Larger / smaller of two values.  Every use of an argument is
    parenthesised so that operands such as assignments or conditional
    expressions keep their meaning.  Arguments are still evaluated more
    than once, so they should be free of side effects that must happen
    only once.  */

#define	 Max(a,b)		((a)>(b) ? (a) : (b))
#define	 Min(a,b)		((a)<(b) ? (a) : (b))
|
117
|
+
|
118
|
+
#define Log2 0.69314718055994530942
|
119
|
+
|
120
|
+
/*  Bit-set helpers.  A set s is addressed as an array of bytes, with
    member b held in bit (b & 07) of byte (b >> 3); n is a byte count.
    The set argument is parenthesised so that expressions such as
    p + 1 can be passed safely.  */

#define	 Bit(b)			(1 << (b))
#define	 In(b,s)		(((s)[(b) >> 3]) & Bit((b) & 07))
#define	 ClearBits(n,s)		memset((s),0,(n))
#define	 CopyBits(n,f,t)	memcpy((t),(f),(n))
#define	 SetBit(b,s)		((s)[(b) >> 3] |= Bit((b) & 07))

/*  NB: ResetBit uses exclusive-or, so it toggles the bit; it clears
    the bit only when the bit is currently set  */

#define	 ResetBit(b,s)		((s)[(b) >> 3] ^= Bit((b) & 07))
|
126
|
+
|
127
|
+
#define ForEach(v,f,l) for(v=f ; v<=l ; ++v)
|
128
|
+
|
129
|
+
#define Swap(a,b) {Description _xab; _xab=Case[a]; Case[a]=Case[b]; Case[b]=_xab;}
|
130
|
+
|
131
|
+
#define StatBit(a,b) (SpecialStatus[a]&(b))
|
132
|
+
#define Exclude(a) StatBit(a,EXCLUDE)
|
133
|
+
#define Skip(a) StatBit(a,EXCLUDE|SKIP)
|
134
|
+
#define Discrete(a) (MaxAttVal[a] || StatBit(a,DISCRETE))
|
135
|
+
#define Continuous(a) (! MaxAttVal[a] && ! StatBit(a,DISCRETE))
|
136
|
+
#define Ordered(a) StatBit(a,ORDERED)
|
137
|
+
#define DateVal(a) StatBit(a,DATEVAL)
|
138
|
+
#define TimeVal(a) StatBit(a,STIMEVAL)
|
139
|
+
#define TStampVal(a) StatBit(a,TSTMPVAL)
|
140
|
+
|
141
|
+
/*  Release p if it is non-null.  free() already ignores a null pointer,
    so no test is needed here; leaving out the bare `if' also means the
    macro can be used as the body of an if ... else without capturing
    the else.  */

#define	 FreeUnlessNil(p)	free(p)
|
142
|
+
|
143
|
+
#define CheckClose(f) if(f) {fclose(f); f=Nil;}
|
144
|
+
|
145
|
+
/*  True if character s is a blank, newline, carriage return or tab.
    The argument is parenthesised at each use; it may be evaluated up
    to four times.  */

#define	 Space(s)	((s) == ' ' || (s) == '\n' || (s) == '\r' || (s) == '\t')
|
146
|
+
#define SkipComment while ( (c = InChar(f)) != '\n' && c != EOF )
|
147
|
+
|
148
|
+
#define rint(x) floor((x)+0.5) /* for consistency across platforms */
|
149
|
+
#define P1(x) (rint((x)*10) / 10)
|
150
|
+
|
151
|
+
#define No(f,l) ((l)-(f)+1)
|
152
|
+
#define SplitVal(g,i) ( Continuous(ClassAtt) ? (g) : (g) / ((i) + 1E-6) )
|
153
|
+
|
154
|
+
|
155
|
+
#define NOFILE 0
|
156
|
+
#define BADATTNAME 1
|
157
|
+
#define EOFINATT 2
|
158
|
+
#define SINGLEATTVAL 3
|
159
|
+
#define BADATTVAL 4
|
160
|
+
#define BADNUMBER 25
|
161
|
+
#define BADCLASS 5
|
162
|
+
#define DUPATTNAME 6
|
163
|
+
#define NOMEM 8
|
164
|
+
#define TOOMANYVALS 9
|
165
|
+
#define BADDISCRETE 10
|
166
|
+
#define UNKNOWNATT 11
|
167
|
+
#define LONGNAME 13
|
168
|
+
#define HITEOF 14
|
169
|
+
#define MISSNAME 15
|
170
|
+
#define BADDATE 16
|
171
|
+
#define BADTIME 17
|
172
|
+
#define BADDEF1 20
|
173
|
+
#define BADDEF2 21
|
174
|
+
#define BADDEF3 22
|
175
|
+
#define SAMEATT 23
|
176
|
+
#define BADTSTMP 24
|
177
|
+
#define BADSIFT 30
|
178
|
+
|
179
|
+
#define READDATA 1
|
180
|
+
#define READTEST 2
|
181
|
+
#define READCASES 3
|
182
|
+
#define PRELIM 4
|
183
|
+
#define CHECKING 5
|
184
|
+
#define REPORTING 6
|
185
|
+
#define CLEANUP 7
|
186
|
+
|
187
|
+
#define CONT_GT 1 /* CONT_GLT = CONT_GT | CONT_LT */
|
188
|
+
#define CONT_LT 2
|
189
|
+
#define CONT_GLT 3
|
190
|
+
#define CONT_NA 4
|
191
|
+
#define DISCR_GT 5 /* DISCR_GLT = DISCR_GT | DISCR_LT */
|
192
|
+
#define DISCR_LT 6
|
193
|
+
#define DISCR_GLT 7
|
194
|
+
#define DISCR_VAL 8
|
195
|
+
#define DISCR_SET 9
|
196
|
+
|
197
|
+
|
198
|
+
/*************************************************************************/
|
199
|
+
/* */
|
200
|
+
/* Type definitions */
|
201
|
+
/* */
|
202
|
+
/*************************************************************************/
|
203
|
+
|
204
|
+
|
205
|
+
typedef unsigned char Boolean, BranchType, *Set;
|
206
|
+
typedef char *String;
|
207
|
+
|
208
|
+
typedef int CaseNo, /* data item number */
|
209
|
+
CaseCount; /* count of items */
|
210
|
+
|
211
|
+
typedef int DiscrValue, /* discrete attribute value (0 = ?) */
|
212
|
+
Attribute; /* attribute number, 1..MaxAtt */
|
213
|
+
|
214
|
+
|
215
|
+
/* defining USEDOUBLE allows for DP
|
216
|
+
attribute values, but will not affect
|
217
|
+
use of saved analyses */
|
218
|
+
#ifdef USEDOUBLE
|
219
|
+
typedef double ContValue; /* continuous attribute value */
|
220
|
+
#define PREC 14 /* precision */
|
221
|
+
#define MARKER MAXDOUBLE
|
222
|
+
#else
|
223
|
+
typedef float ContValue; /* continuous attribute value */
|
224
|
+
#define PREC 7 /* precision */
|
225
|
+
#define MARKER MAXFLOAT
|
226
|
+
#endif
|
227
|
+
|
228
|
+
|
229
|
+
typedef union _def_val
|
230
|
+
{
|
231
|
+
String _s_val; /* att val for comparison */
|
232
|
+
ContValue _n_val; /* number for arith */
|
233
|
+
}
|
234
|
+
DefVal;
|
235
|
+
|
236
|
+
typedef struct _def_elt
|
237
|
+
{
|
238
|
+
short _op_code; /* type of element */
|
239
|
+
DefVal _operand; /* string or numeric value */
|
240
|
+
}
|
241
|
+
DefElt, *Definition;
|
242
|
+
|
243
|
+
typedef struct _elt_rec
|
244
|
+
{
|
245
|
+
int Fi, /* index of first char of element */
|
246
|
+
Li; /* last ditto */
|
247
|
+
char Type; /* 'B', 'S', or 'N' */
|
248
|
+
}
|
249
|
+
EltRec;
|
250
|
+
|
251
|
+
|
252
|
+
#define DefOp(DE) DE._op_code
|
253
|
+
#define DefSVal(DE) DE._operand._s_val
|
254
|
+
#define DefNVal(DE) DE._operand._n_val
|
255
|
+
|
256
|
+
#define OP_ATT 0 /* opcodes */
|
257
|
+
#define OP_NUM 1
|
258
|
+
#define OP_STR 2
|
259
|
+
#define OP_MISS 3
|
260
|
+
#define OP_AND 10
|
261
|
+
#define OP_OR 11
|
262
|
+
#define OP_EQ 20
|
263
|
+
#define OP_NE 21
|
264
|
+
#define OP_GT 22
|
265
|
+
#define OP_GE 23
|
266
|
+
#define OP_LT 24
|
267
|
+
#define OP_LE 25
|
268
|
+
#define OP_SEQ 26
|
269
|
+
#define OP_SNE 27
|
270
|
+
#define OP_PLUS 30
|
271
|
+
#define OP_MINUS 31
|
272
|
+
#define OP_UMINUS 32
|
273
|
+
#define OP_MULT 33
|
274
|
+
#define OP_DIV 34
|
275
|
+
#define OP_MOD 35
|
276
|
+
#define OP_POW 36
|
277
|
+
#define OP_SIN 40
|
278
|
+
#define OP_COS 41
|
279
|
+
#define OP_TAN 42
|
280
|
+
#define OP_LOG 43
|
281
|
+
#define OP_EXP 44
|
282
|
+
#define OP_INT 45
|
283
|
+
#define OP_END 99
|
284
|
+
|
285
|
+
|
286
|
+
typedef struct _testrec
|
287
|
+
{
|
288
|
+
Attribute Att; /* attribute tested */
|
289
|
+
DiscrValue Br; /* branch of test */
|
290
|
+
ContValue Cut; /* threshold (if relevant) */
|
291
|
+
Set Left; /* values for left br (if relevant) */
|
292
|
+
}
|
293
|
+
TestRec;
|
294
|
+
|
295
|
+
typedef struct _clustcondrec
|
296
|
+
{
|
297
|
+
int Type; /* type of test */
|
298
|
+
Attribute Att; /* attribute tested */
|
299
|
+
ContValue Low, /* low thresh or start of range */
|
300
|
+
High; /* high threshold or end of range */
|
301
|
+
Set Values; /* value subset if required */
|
302
|
+
}
|
303
|
+
ClustCond;
|
304
|
+
|
305
|
+
typedef struct _clustrec
|
306
|
+
{
|
307
|
+
Attribute Att; /* focus attribute */
|
308
|
+
ClustCond *Cond; /* group conditions */
|
309
|
+
int NCond; /* number of group conditions */
|
310
|
+
ContValue Expect, /* mean | (int) modal value */
|
311
|
+
SD, /* sd (trimmed) */
|
312
|
+
Limit; /* low / high value for normal cases */
|
313
|
+
float Frac; /* proportion of "normal" cases */
|
314
|
+
CaseCount GpSize; /* size of group */
|
315
|
+
}
|
316
|
+
ClustRec, *Clust;
|
317
|
+
|
318
|
+
typedef union _attribute_value
|
319
|
+
{
|
320
|
+
DiscrValue _discr_val;
|
321
|
+
ContValue _cont_val;
|
322
|
+
String _string_val;
|
323
|
+
Clust _clust;
|
324
|
+
}
|
325
|
+
AttValue, *Description;
|
326
|
+
|
327
|
+
typedef struct _sort_pair
|
328
|
+
{
|
329
|
+
ContValue C;
|
330
|
+
Description D;
|
331
|
+
}
|
332
|
+
SortPair;
|
333
|
+
|
334
|
+
typedef struct _caveat_rec
|
335
|
+
{
|
336
|
+
Attribute Att;
|
337
|
+
Set Subset;
|
338
|
+
float Low, High;
|
339
|
+
}
|
340
|
+
CaveatRec;
|
341
|
+
|
342
|
+
typedef struct _treerec *Tree;
|
343
|
+
typedef struct _treerec
|
344
|
+
{
|
345
|
+
BranchType NodeType; /* 0 | BrDiscr | BrThresh | BrSubset */
|
346
|
+
Attribute Tested; /* attribute referenced in test */
|
347
|
+
int Forks; /* number of branches at this node */
|
348
|
+
ContValue Cut; /* threshold for continuous attribute */
|
349
|
+
Set Left; /* subset of values for first branch */
|
350
|
+
Tree *Branch, /* Branch[x] = subtree for outcome x */
|
351
|
+
Parent; /* parent node */
|
352
|
+
DiscrValue Br; /* branch from parent */
|
353
|
+
String SiftEntry; /* text for sift file */
|
354
|
+
}
|
355
|
+
TreeRec;
|
356
|
+
|
357
|
+
typedef struct _env_rec
|
358
|
+
{
|
359
|
+
CaseCount **Freq,
|
360
|
+
**BestFreq,
|
361
|
+
*ValFreq,
|
362
|
+
*ClassFreq,
|
363
|
+
BrFreq[4];
|
364
|
+
Boolean *Left,
|
365
|
+
*Possible;
|
366
|
+
double *Gain,
|
367
|
+
*Info,
|
368
|
+
*ValSum,
|
369
|
+
*ValSumSq,
|
370
|
+
BrSum[4],
|
371
|
+
BrSumSq[4],
|
372
|
+
BaseInfo,
|
373
|
+
FRAC,
|
374
|
+
PSD;
|
375
|
+
Set *Subset;
|
376
|
+
ContValue *Bar;
|
377
|
+
int Level,
|
378
|
+
MaxLevel,
|
379
|
+
*Tested;
|
380
|
+
TestRec *Test;
|
381
|
+
String SiftEntry;
|
382
|
+
int SiftSize,
|
383
|
+
SiftSpace;
|
384
|
+
Attribute *DList; /* current discrete atts */
|
385
|
+
CaseCount ***DFreq; /* DFreq[a][][] = Freq[] for att a */
|
386
|
+
double **DValSum, /* ValSum[] for att a */
|
387
|
+
**DValSumSq; /* ValSumSq[] for att a */
|
388
|
+
}
|
389
|
+
EnvRec;
|
390
|
+
|
391
|
+
|
392
|
+
#define CVal(Case,Attribute) Case[Attribute]._cont_val
|
393
|
+
#define DVal(Case,Attribute) Case[Attribute]._discr_val
|
394
|
+
#define XDVal(Case,Att) (Case[Att]._discr_val & 077777777)
|
395
|
+
#define SVal(Case,Attribute) Case[Attribute]._string_val
|
396
|
+
|
397
|
+
#define CClass(Case) (*Case)._cont_val
|
398
|
+
#define DClass(Case) ((*Case)._discr_val & 077777777)
|
399
|
+
|
400
|
+
#define Unknown(Case,Att) (DVal(Case,Att)==UNKNOWN)
|
401
|
+
#define UnknownVal(AV) (AV._discr_val==UNKNOWN)
|
402
|
+
#define NotApplic(Case,Att) (DVal(Case,Att)==NA)
|
403
|
+
#define NotApplicVal(AV) (AV._discr_val==NA)
|
404
|
+
|
405
|
+
#define OutXVal(Case) Case[MaxAtt+1]._cont_val
|
406
|
+
#define OutClust(Case) Case[MaxAtt+2]._clust
|
407
|
+
|
408
|
+
#define ZScore(i) (fabs(CClass(Case[i])-Mean) / SD)
|
409
|
+
#define MaxAnoms(N) (MAXFRAC*(N)+2*sqrt((N)*MAXFRAC*(1-MAXFRAC))+1)
|
410
|
+
|
411
|
+
#define DScore(n,a,p) ((a) / ((double) (n)*(p)))
|
412
|
+
|
413
|
+
/* XDScore is a specialised version for possibly non-occurring vals */
|
414
|
+
#define XDScore(f,n,a,p) ((f) ? (a) / ((double) (n)*(p)) :\
|
415
|
+
(p) ? (1 / ((n)+2.0)) / (p) : (1 / ((n)+2.0)))
|
416
|
+
|
417
|
+
|
418
|
+
/*************************************************************************/
|
419
|
+
/* */
|
420
|
+
/* Function prototypes */
|
421
|
+
/* */
|
422
|
+
/*************************************************************************/
|
423
|
+
|
424
|
+
/* getnames.c */
|
425
|
+
|
426
|
+
Boolean ReadName(FILE *f, String s, int n, char ColonOpt);
|
427
|
+
void GetNames(FILE *Nf);
|
428
|
+
void ExplicitAtt(FILE *Nf);
|
429
|
+
int Which(String Val, String *List, int First, int Last);
|
430
|
+
String CopyString(String S);
|
431
|
+
void FreeNames(void);
|
432
|
+
int InChar(FILE *f);
|
433
|
+
|
434
|
+
/* implicitatt.c */
|
435
|
+
|
436
|
+
void ImplicitAtt(FILE *Nf);
|
437
|
+
void ReadDefinition(FILE *f);
|
438
|
+
void Append(char c);
|
439
|
+
Boolean Expression(void);
|
440
|
+
Boolean Conjunct(void);
|
441
|
+
Boolean SExpression(void);
|
442
|
+
Boolean AExpression(void);
|
443
|
+
Boolean Term(void);
|
444
|
+
Boolean Factor(void);
|
445
|
+
Boolean Primary(void);
|
446
|
+
Boolean Atom(void);
|
447
|
+
Boolean Find(String S);
|
448
|
+
int FindOne(String *Alt);
|
449
|
+
Attribute FindAttName(void);
|
450
|
+
void DefSyntaxError(String Msg);
|
451
|
+
void DefSemanticsError(int Fi, String Msg, int OpCode);
|
452
|
+
void Dump(char OpCode, ContValue F, String S, int Fi);
|
453
|
+
void DumpOp(char OpCode, int Fi);
|
454
|
+
Boolean UpdateTStack(char OpCode, ContValue F, String S, int Fi);
|
455
|
+
AttValue EvaluateDef(Definition D, Description Case);
|
456
|
+
|
457
|
+
/* getdata.c */
|
458
|
+
|
459
|
+
void GetData(FILE *Df, Boolean Train);
|
460
|
+
Description GetDescription(FILE *Df, Boolean Train);
|
461
|
+
void FreeData(void);
|
462
|
+
void FreeCase(Description DVec);
|
463
|
+
void CheckValue(Description DVec, Attribute Att);
|
464
|
+
|
465
|
+
/* check.c */
|
466
|
+
|
467
|
+
void CheckData(void);
|
468
|
+
void CheckContin(CaseNo Fp);
|
469
|
+
void FindContinOutliers(CaseNo Fp, CaseNo Lp, Boolean Sorted);
|
470
|
+
void LabelContinOutliers(Clust CL, Clust CH, CaseNo Fp, CaseNo GFp,
|
471
|
+
CaseNo GLp);
|
472
|
+
void TrimmedSDEstimate(CaseNo Fp, CaseNo Lp, double *Mean, double *SD);
|
473
|
+
CaseNo FindTail(CaseNo Fp, CaseNo Lp, int Inc, double Mean, double SD);
|
474
|
+
Boolean OmittedCases(int HiLo);
|
475
|
+
Boolean SatisfiesTests(Description Case);
|
476
|
+
void FindDiscrOutliers(CaseNo Fp, CaseNo Lp, CaseCount *Table);
|
477
|
+
CaseNo NoOtherDifference(CaseNo Fp, CaseNo Lp, CaseNo GFp, CaseNo GLp);
|
478
|
+
void InitialiseEnvData(void);
|
479
|
+
void FreeEnvData(void);
|
480
|
+
|
481
|
+
/* cluster.c */
|
482
|
+
|
483
|
+
Clust NewClust(ContValue Expect, ContValue SD, ContValue Limit,
|
484
|
+
CaseCount Anoms, CaseCount GpSize);
|
485
|
+
void SaveClustConds(Clust C);
|
486
|
+
void FormatContinCond(Attribute Att, ClustCond *CC);
|
487
|
+
void FormatOrderedCond(Attribute Att, ClustCond *CC);
|
488
|
+
void FormatSubsetCond(Attribute Att, ClustCond *CC);
|
489
|
+
void FormatValCond(Attribute Att, ClustCond *CC);
|
490
|
+
void FreeClust(Clust C);
|
491
|
+
|
492
|
+
/* outlier.c */
|
493
|
+
|
494
|
+
void RecordOutlier(CaseNo i, Clust C, float XVal);
|
495
|
+
void ReportOutliers(void);
|
496
|
+
void PrintAttVal(Description Case, Attribute Att);
|
497
|
+
void PrintOutlier(CaseNo i, Clust C, ContValue SVal);
|
498
|
+
void PrintContinCond(Attribute Att, ContValue Lo, ContValue Hi, CaseNo N);
|
499
|
+
void PrintOrderedCond(Attribute Att, DiscrValue Lo, DiscrValue Hi, CaseNo N);
|
500
|
+
void PrintSubsetCond(Attribute Att, Set Values, CaseNo N);
|
501
|
+
void PrintValCond(Attribute Att, DiscrValue v);
|
502
|
+
|
503
|
+
|
504
|
+
/* common.c */
|
505
|
+
|
506
|
+
void InitialiseDAC(void);
|
507
|
+
void FreeDAC(void);
|
508
|
+
void Split(CaseNo Fp, CaseNo Lp, int CondAtts, Tree Parent,
|
509
|
+
DiscrValue Br, Tree *Result);
|
510
|
+
void RecoverContext(Tree T, DiscrValue Br);
|
511
|
+
void DiscreteAttInfo(CaseNo Fp, CaseNo Lp, int CondAtts);
|
512
|
+
void ChooseSplitWithSampling(CaseNo Fp, CaseNo Lp, int CondAtts);
|
513
|
+
void Sample(CaseNo Fp, CaseNo Lp, CaseCount N);
|
514
|
+
void SampleScan(CaseNo Fp, CaseNo Lp, int CondAtts, Boolean Second);
|
515
|
+
void ChooseSplit(CaseNo Fp, CaseNo Lp, int CondAtts);
|
516
|
+
void FindBestAtt(Attribute *BestAtt, double *BestVal);
|
517
|
+
void CheckSplit(Attribute Att, CaseNo Fp, CaseNo Lp);
|
518
|
+
void Divide(Tree Node, CaseNo Fp, CaseNo Lp, int CondAtts);
|
519
|
+
void NoteTest(Attribute Att, DiscrValue Br, ContValue Cut, Set Left);
|
520
|
+
CaseNo SkipMissing(Attribute Att, CaseNo Fp, CaseNo Lp);
|
521
|
+
CaseNo Group(Attribute Att, DiscrValue V, CaseNo Fp, CaseNo Lp,
|
522
|
+
ContValue Cut, Set Left);
|
523
|
+
void CheckPotentialClusters(Attribute Att, DiscrValue Forks,
|
524
|
+
CaseNo Fp, CaseNo Lp, ContValue B, Set S,
|
525
|
+
CaseCount **FT);
|
526
|
+
void ShowContext(CaseNo i);
|
527
|
+
Tree Leaf(Tree Parent, DiscrValue Br);
|
528
|
+
void ReleaseTree(Tree T, int Level);
|
529
|
+
void OutputConditions(void);
|
530
|
+
|
531
|
+
/* continatt.c */
|
532
|
+
|
533
|
+
void CEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
|
534
|
+
ContValue Between(ContValue Low, ContValue High);
|
535
|
+
void CEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
|
536
|
+
void EvalBinarySplit(Attribute Att, CaseNo Fp, CaseNo Lp);
|
537
|
+
void EvalSubsetSplit(Attribute Att, CaseNo Fp, CaseNo Lp);
|
538
|
+
double SDEstimate(CaseCount N, double Sum, double SumSq);
|
539
|
+
double ContinGain(void);
|
540
|
+
|
541
|
+
/* discratt.c */
|
542
|
+
|
543
|
+
void DEvalContinAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
|
544
|
+
void DEvalDiscrAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
|
545
|
+
void DEvalOrderedAtt(Attribute Att, CaseNo Fp, CaseNo Lp);
|
546
|
+
void ComputeFrequencies(Attribute Att, CaseNo Fp, CaseNo Lp);
|
547
|
+
void FindClassFrequencies(CaseNo Fp, CaseNo Lp);
|
548
|
+
double DiscrGain(DiscrValue MaxVal, CaseCount TotalCases);
|
549
|
+
double TotalInfo(CaseCount V[], DiscrValue MinVal, DiscrValue MaxVal);
|
550
|
+
|
551
|
+
/* sort.c */
|
552
|
+
|
553
|
+
void Quicksort(CaseNo Fp, CaseNo Lp, Attribute Att);
|
554
|
+
void Cachesort(CaseNo Fp, CaseNo Lp);
|
555
|
+
|
556
|
+
/* modelfiles.c */
|
557
|
+
|
558
|
+
void CheckFile(String Extension, Boolean Write);
|
559
|
+
void WriteFilePrefix(String Extension);
|
560
|
+
void SaveCondition(void);
|
561
|
+
void SaveDiscrCluster(DiscrValue Expect, CaseCount Anoms,
|
562
|
+
CaseCount Cases, CaseCount *Freq);
|
563
|
+
void SaveContinCluster(float Mean, float SD, CaseCount Cases,
|
564
|
+
float LFrac, float LLim, float HFrac, float HLim);
|
565
|
+
void SaveNames(void);
|
566
|
+
void AsciiOut(String Pre, String S);
|
567
|
+
void ReadFilePrefix(String Extension);
|
568
|
+
int ReadProp(char *Delim);
|
569
|
+
String RemoveQuotes(String S);
|
570
|
+
void ExtendSiftEntry(String S);
|
571
|
+
void ProcessSift(void);
|
572
|
+
void Case1(void);
|
573
|
+
void Case2(void);
|
574
|
+
void Case3(void);
|
575
|
+
void Case11(void);
|
576
|
+
void Case12(void);
|
577
|
+
void Case13(void);
|
578
|
+
void Case21(void);
|
579
|
+
void Case22(void);
|
580
|
+
void ReadCaveats(void);
|
581
|
+
Boolean CheckCaveats(Description Case);
|
582
|
+
void FoundPossibleAnom(CaseNo i, Clust C, float Xv);
|
583
|
+
void Filter(Attribute Att, DiscrValue Br, ContValue Cut, Set Left);
|
584
|
+
void SetTestedAtts(void);
|
585
|
+
|
586
|
+
|
587
|
+
/* utility.c */
|
588
|
+
|
589
|
+
void PrintHeader(String Title);
|
590
|
+
char ProcessOption(int Argc, char **Argv, char *Str);
|
591
|
+
void *Pmalloc(size_t Bytes);
|
592
|
+
void *Prealloc(void *Present, size_t Bytes);
|
593
|
+
/* Allocation helper declared next to Pmalloc() and Prealloc(); returns an
   untyped pointer.  Presumably calloc-style (Number items of Size bytes
   each) -- confirm against the definition in utility.c.
   NOTE(review): Size is `unsigned int` here, whereas Number and the Bytes
   arguments of Pmalloc()/Prealloc() are `size_t`.  If the definition can be
   changed in step, consider `size_t Size` for consistency. */
void *Pcalloc(size_t Number, unsigned int Size);
|
594
|
+
void FreeVector(void **V, int First, int Last);
|
595
|
+
Description NewCase(void);
|
596
|
+
void MemTrim(void);
|
597
|
+
void FreeCases(void);
|
598
|
+
void FreeLastCase(Description Case);
|
599
|
+
double KRandom(void);
|
600
|
+
void ResetKR(int KRInit);
|
601
|
+
void Error(int ErrNo, String S1, String S2);
|
602
|
+
String CaseLabel(CaseNo N);
|
603
|
+
FILE * GetFile(String Extension, String RW);
|
604
|
+
double ExecTime(void);
|
605
|
+
int Denominator(ContValue Val);
|
606
|
+
int FracBase(Attribute Att);
|
607
|
+
int GetInt(String S, int N);
|
608
|
+
int DateToDay(String DS);
|
609
|
+
void DayToDate(int DI, String Date);
|
610
|
+
int TimeToSecs(String TS);
|
611
|
+
void SecsToTime(int Secs, String Time);
|
612
|
+
void SetTSBase(int y);
|
613
|
+
int TStampToMins(String TS);
|
614
|
+
void CValToStr(ContValue CV, Attribute Att, String DS);
|
615
|
+
void CleanupSift(void);
|
616
|
+
void Cleanup(void);
|
617
|
+
void Check(float Val, float Low, float High);
|
618
|
+
|
619
|
+
|
620
|
+
/* update.c */
|
621
|
+
|
622
|
+
/* Declared under the update.c heading; takes a single int and returns
   nothing.
   NOTE(review): the parameter is unnamed, unlike the other prototypes in
   this listing -- consider copying the name used by the definition so the
   declaration documents what the int represents. */
void NotifyStage(int);
|
623
|
+
/* Declared under the update.c heading; takes a single int and returns
   nothing.
   NOTE(review): the parameter is unnamed, unlike the other prototypes in
   this listing -- consider copying the name used by the definition so the
   declaration documents what the int represents. */
void Progress(int);
|