see5-installer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/report.c
ADDED
@@ -0,0 +1,345 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Program to produce average results from an xval */
|
30
|
+
/* ----------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
#include <math.h>
|
35
|
+
#include <stdio.h>
|
36
|
+
#include <stdlib.h>
|
37
|
+
|
38
|
+
void PrintSummary(float **Val, int No, char *Title);
|
39
|
+
float SE(float sum, float sumsq, int no);
|
40
|
+
|
41
|
+
int Boost=0, Composite=0, Costs=0, Rules;
|
42
|
+
|
43
|
+
#define SIZE 0
|
44
|
+
#define ERRP 1
|
45
|
+
#define COST 2
|
46
|
+
|
47
|
+
|
48
|
+
int main(int argc, char *argv[])
|
49
|
+
/* ---- */
|
50
|
+
{
|
51
|
+
char Line[100], *p;
|
52
|
+
int Cases, Folds, Repeats, f, r, i, N,
|
53
|
+
Size=0, Errs=0, Form, OK;
|
54
|
+
float ***Raw, **Average=0, FX, Tests, Cost=0;
|
55
|
+
|
56
|
+
sscanf(argv[1], "%d", &Cases);
|
57
|
+
sscanf(argv[2], "%d", &Folds);
|
58
|
+
sscanf(argv[3], "%d", &Repeats);
|
59
|
+
sscanf(argv[4], "%d", &Rules);
|
60
|
+
|
61
|
+
/* Assemble all data */
|
62
|
+
|
63
|
+
Raw = (float ***) calloc(Repeats, sizeof(float **));
|
64
|
+
if ( Repeats > 1 )
|
65
|
+
{
|
66
|
+
Average = (float **) calloc(Repeats, sizeof(float *));
|
67
|
+
}
|
68
|
+
|
69
|
+
/* Determine input type from the first line */
|
70
|
+
|
71
|
+
fgets(Line, 100, stdin);
|
72
|
+
|
73
|
+
/* Count the numbers on the line */
|
74
|
+
|
75
|
+
N = 0;
|
76
|
+
for ( p = Line ; *p ; )
|
77
|
+
{
|
78
|
+
if ( isdigit(*p) )
|
79
|
+
{
|
80
|
+
N++;
|
81
|
+
while ( isdigit(*p) || *p == '.' ) p++;
|
82
|
+
}
|
83
|
+
else
|
84
|
+
{
|
85
|
+
p++;
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
if ( ! memcmp(Line, "boost", 5) )
|
90
|
+
{
|
91
|
+
Boost = 1;
|
92
|
+
Costs = ( N == 3 );
|
93
|
+
}
|
94
|
+
else
|
95
|
+
if ( ! memcmp(Line, "composite", 9) )
|
96
|
+
{
|
97
|
+
Composite = 1;
|
98
|
+
Rules = 0;
|
99
|
+
Costs = ( N == 4 );
|
100
|
+
}
|
101
|
+
else
|
102
|
+
{
|
103
|
+
Costs = ( N == 4 );
|
104
|
+
}
|
105
|
+
Form = ( Composite ? 2 + Costs : Costs );
|
106
|
+
|
107
|
+
for ( r = 0 ; r < Repeats ; r++ )
|
108
|
+
{
|
109
|
+
Raw[r] = (float **) calloc(Folds, sizeof(float *));
|
110
|
+
if ( Repeats > 1 )
|
111
|
+
{
|
112
|
+
Average[r] = (float *) calloc(3, sizeof(float));
|
113
|
+
}
|
114
|
+
|
115
|
+
for ( f = 0 ; f < Folds ; f++ )
|
116
|
+
{
|
117
|
+
Raw[r][f] = (float *) calloc(3, sizeof(float));
|
118
|
+
|
119
|
+
if ( r + f != 0 && ! fgets(Line, 100, stdin) )
|
120
|
+
{
|
121
|
+
printf("\nExpecting %d lines\n", Folds * Repeats);
|
122
|
+
exit(1);
|
123
|
+
}
|
124
|
+
|
125
|
+
Tests = Cases / Folds + ( f >= Folds - Cases % Folds);
|
126
|
+
|
127
|
+
if ( ! memcmp(Line, "boost", 5) )
|
128
|
+
{
|
129
|
+
Boost = 1;
|
130
|
+
|
131
|
+
switch ( Form )
|
132
|
+
{
|
133
|
+
case 0:
|
134
|
+
N = sscanf(Line, "boost %d (%f%%)", &Errs, &FX);
|
135
|
+
OK = ( N == 2 );
|
136
|
+
break;
|
137
|
+
|
138
|
+
case 1:
|
139
|
+
N = sscanf(Line, "boost %d (%f%%) %f", &Errs, &FX, &Cost);
|
140
|
+
OK = ( N == 3 );
|
141
|
+
}
|
142
|
+
}
|
143
|
+
else
|
144
|
+
{
|
145
|
+
switch ( Form )
|
146
|
+
{
|
147
|
+
case 0:
|
148
|
+
N = sscanf(Line, "%d %d (%f%%)", &Size, &Errs, &FX);
|
149
|
+
OK = ( N == 3 );
|
150
|
+
break;
|
151
|
+
|
152
|
+
case 1:
|
153
|
+
N = sscanf(Line, "%d %d (%f%%) %f",
|
154
|
+
&Size, &Errs, &FX, &Cost);
|
155
|
+
OK = ( N == 4 );
|
156
|
+
break;
|
157
|
+
|
158
|
+
case 2:
|
159
|
+
N = sscanf(Line+18, "%d %d (%f%%) %f",
|
160
|
+
&Size, &Errs, &FX, &Cost);
|
161
|
+
OK = ( N == 4 );
|
162
|
+
break;
|
163
|
+
}
|
164
|
+
}
|
165
|
+
|
166
|
+
if ( ! OK )
|
167
|
+
{
|
168
|
+
printf("\nCannot parse line\n\t%s", Line);
|
169
|
+
exit(1);
|
170
|
+
}
|
171
|
+
|
172
|
+
Raw[r][f][SIZE] = Size;
|
173
|
+
Raw[r][f][ERRP] = (100.0 * Errs) / Tests;
|
174
|
+
Raw[r][f][COST] = Cost;
|
175
|
+
|
176
|
+
if ( Average )
|
177
|
+
{
|
178
|
+
for ( i = 0 ; i < 3 ; i++ )
|
179
|
+
{
|
180
|
+
Average[r][i] += Raw[r][f][i];
|
181
|
+
}
|
182
|
+
}
|
183
|
+
}
|
184
|
+
|
185
|
+
if ( Average )
|
186
|
+
{
|
187
|
+
for ( i = 0 ; i < 3 ; i++ )
|
188
|
+
{
|
189
|
+
Average[r][i] /= Folds;
|
190
|
+
}
|
191
|
+
}
|
192
|
+
}
|
193
|
+
|
194
|
+
/* Check that amount of data is correct */
|
195
|
+
|
196
|
+
if ( fgets(Line, 100, stdin) )
|
197
|
+
{
|
198
|
+
printf("\nExpecting %d lines\n", Folds * Repeats * 2);
|
199
|
+
exit(1);
|
200
|
+
}
|
201
|
+
|
202
|
+
if ( Average )
|
203
|
+
{
|
204
|
+
PrintSummary(Average, Repeats, "XVal");
|
205
|
+
}
|
206
|
+
else
|
207
|
+
{
|
208
|
+
PrintSummary(Raw[SIZE], Folds, "Fold");
|
209
|
+
}
|
210
|
+
|
211
|
+
return 0;
|
212
|
+
}
|
213
|
+
|
214
|
+
|
215
|
+
char
|
216
|
+
*StdP[] = { " Decision Tree ",
|
217
|
+
" ---------------- ",
|
218
|
+
" Size Errors " },
|
219
|
+
|
220
|
+
*StdPC[] = { " Decision Tree ",
|
221
|
+
" ----------------------- ",
|
222
|
+
" Size Errors Cost " },
|
223
|
+
|
224
|
+
*Extra[] = { " Rules ",
|
225
|
+
" ----------------",
|
226
|
+
" No Errors" },
|
227
|
+
|
228
|
+
*ExtraC[]= { " Rules ",
|
229
|
+
" -----------------------",
|
230
|
+
" No Errors Cost" };
|
231
|
+
|
232
|
+
void PrintSummary(float **Val, int No, char *Title)
|
233
|
+
/* ------------ */
|
234
|
+
{
|
235
|
+
int i, j;
|
236
|
+
float Sum[3], SumSq[3];
|
237
|
+
|
238
|
+
for ( i = 0 ; i < 3 ; i++ )
|
239
|
+
{
|
240
|
+
Sum[i] = SumSq[i] = 0;
|
241
|
+
}
|
242
|
+
|
243
|
+
for ( i = 0 ; i <= 2 ; i++ )
|
244
|
+
{
|
245
|
+
switch ( i )
|
246
|
+
{
|
247
|
+
case 0:
|
248
|
+
printf("\n\t%s ", Title);
|
249
|
+
break;
|
250
|
+
|
251
|
+
case 1:
|
252
|
+
printf("\t---- ");
|
253
|
+
break;
|
254
|
+
|
255
|
+
case 2:
|
256
|
+
printf("\t ");
|
257
|
+
}
|
258
|
+
|
259
|
+
printf("%s\n", ( Composite ?
|
260
|
+
( Costs ? ExtraC[i] : Extra[i] ) :
|
261
|
+
Rules ?
|
262
|
+
( Costs ? ExtraC[i] : Extra[i] ) :
|
263
|
+
( Costs ? StdPC[i] : StdP[i] ) ));
|
264
|
+
}
|
265
|
+
printf("\n");
|
266
|
+
|
267
|
+
for ( j = 0 ; j < No ; j++ )
|
268
|
+
{
|
269
|
+
for ( i = 0 ; i < 3 ; i++ )
|
270
|
+
{
|
271
|
+
Sum[i] += Val[j][i];
|
272
|
+
SumSq[i] += Val[j][i] * Val[j][i];
|
273
|
+
}
|
274
|
+
|
275
|
+
printf("\t%3d ", j+1);
|
276
|
+
|
277
|
+
if ( Boost )
|
278
|
+
{
|
279
|
+
printf(" *");
|
280
|
+
}
|
281
|
+
else
|
282
|
+
{
|
283
|
+
printf("%8.1f", Val[j][SIZE]);
|
284
|
+
}
|
285
|
+
|
286
|
+
printf(" %4.1f%% ", Val[j][ERRP]);
|
287
|
+
|
288
|
+
if ( Costs )
|
289
|
+
{
|
290
|
+
printf("%5.2f ", Val[j][COST]);
|
291
|
+
}
|
292
|
+
|
293
|
+
printf("\n");
|
294
|
+
}
|
295
|
+
|
296
|
+
printf("\n\tMean ");
|
297
|
+
|
298
|
+
if ( Boost )
|
299
|
+
{
|
300
|
+
printf(" ");
|
301
|
+
}
|
302
|
+
else
|
303
|
+
{
|
304
|
+
printf("%8.1f", Sum[SIZE] / No);
|
305
|
+
}
|
306
|
+
|
307
|
+
printf(" %4.1f%% ", Sum[ERRP] / No);
|
308
|
+
|
309
|
+
if ( Costs )
|
310
|
+
{
|
311
|
+
printf("%5.2f ", Sum[COST] / No);
|
312
|
+
}
|
313
|
+
|
314
|
+
printf("\n\tSE ");
|
315
|
+
|
316
|
+
if ( Boost )
|
317
|
+
{
|
318
|
+
printf(" ");
|
319
|
+
}
|
320
|
+
else
|
321
|
+
{
|
322
|
+
printf("%8.1f", SE(Sum[SIZE], SumSq[SIZE], No));
|
323
|
+
}
|
324
|
+
|
325
|
+
printf(" %4.1f%% ", SE(Sum[ERRP], SumSq[ERRP], No));
|
326
|
+
|
327
|
+
if ( Costs )
|
328
|
+
{
|
329
|
+
printf("%5.2f ", SE(Sum[COST], SumSq[COST], No));
|
330
|
+
}
|
331
|
+
|
332
|
+
printf("\n");
|
333
|
+
}
|
334
|
+
|
335
|
+
|
336
|
+
|
337
|
+
float SE(float sum, float sumsq, int no)
|
338
|
+
/* -- */
|
339
|
+
{
|
340
|
+
float mean;
|
341
|
+
|
342
|
+
mean = sum / no;
|
343
|
+
|
344
|
+
return sqrt( ((sumsq - no * mean * mean) / (no - 1)) / no );
|
345
|
+
}
|
data/ext/c5.0/rules.c
ADDED
@@ -0,0 +1,579 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Miscellaneous routines for rule handling */
|
30
|
+
/* ---------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
|
39
|
+
/*************************************************************************/
|
40
|
+
/* */
|
41
|
+
/* Add a new rule to the current ruleset, by updating Rule[], */
|
42
|
+
/* NRules and, if necessary, RuleSpace */
|
43
|
+
/* */
|
44
|
+
/*************************************************************************/
|
45
|
+
|
46
|
+
|
47
|
+
Boolean NewRule(Condition Cond[], int NCond, ClassNo TargetClass,
|
48
|
+
Boolean *Deleted, CRule Existing,
|
49
|
+
CaseCount Cover, CaseCount Correct, float Prior)
|
50
|
+
/* ------- */
|
51
|
+
{
|
52
|
+
int d, dd, id, r, Size=0, Bytes;
|
53
|
+
CaseNo i;
|
54
|
+
CRule R;
|
55
|
+
Condition *Lhs;
|
56
|
+
Boolean Exclude=false;
|
57
|
+
int Vote;
|
58
|
+
|
59
|
+
/* Sort and copy the conditions if required */
|
60
|
+
|
61
|
+
if ( ! Existing )
|
62
|
+
{
|
63
|
+
ForEach(d, 1, NCond)
|
64
|
+
{
|
65
|
+
if ( ! Deleted[d] ) Size++;
|
66
|
+
}
|
67
|
+
|
68
|
+
Lhs = Alloc(Size+1, Condition);
|
69
|
+
|
70
|
+
/* Sort conditions in print order */
|
71
|
+
|
72
|
+
ForEach(d, 1, Size)
|
73
|
+
{
|
74
|
+
dd = 0;
|
75
|
+
ForEach(id, 1, NCond)
|
76
|
+
{
|
77
|
+
if ( ! Deleted[id] && ( ! dd || Before(Cond[id], Cond[dd]) ) )
|
78
|
+
{
|
79
|
+
dd = id;
|
80
|
+
}
|
81
|
+
}
|
82
|
+
|
83
|
+
Lhs[d] = Alloc(1, CondRec);
|
84
|
+
memcpy(Lhs[d], Cond[dd], sizeof(CondRec));
|
85
|
+
if ( Lhs[d]->NodeType == BrSubset )
|
86
|
+
{
|
87
|
+
Bytes = (MaxAttVal[Lhs[d]->Tested]>>3) + 1;
|
88
|
+
Lhs[d]->Subset = Alloc(Bytes, Byte);
|
89
|
+
memcpy(Lhs[d]->Subset, Cond[dd]->Subset, Bytes);
|
90
|
+
}
|
91
|
+
|
92
|
+
Deleted[dd] = true;
|
93
|
+
}
|
94
|
+
}
|
95
|
+
else
|
96
|
+
{
|
97
|
+
Lhs = Cond;
|
98
|
+
Size = NCond;
|
99
|
+
}
|
100
|
+
|
101
|
+
Vote = 1000 * (Correct + 1.0) / (Cover + 2.0) + 0.5;
|
102
|
+
|
103
|
+
/* See if rule already exists */
|
104
|
+
|
105
|
+
for ( r = 1 ; ! Exclude && r <= NRules ; r++ )
|
106
|
+
{
|
107
|
+
if ( SameRule(r, Lhs, Size, TargetClass) )
|
108
|
+
{
|
109
|
+
Verbosity(1, fprintf(Of, "\tduplicates rule %d\n", r))
|
110
|
+
|
111
|
+
/* Keep the most optimistic error estimate */
|
112
|
+
|
113
|
+
if ( Vote > Rule[r]->Vote )
|
114
|
+
{
|
115
|
+
Rule[r]->Vote = Vote;
|
116
|
+
}
|
117
|
+
|
118
|
+
Exclude = true;
|
119
|
+
}
|
120
|
+
}
|
121
|
+
|
122
|
+
if ( Exclude )
|
123
|
+
{
|
124
|
+
if ( ! Existing )
|
125
|
+
{
|
126
|
+
ForEach(d, 1, Size)
|
127
|
+
{
|
128
|
+
if ( Lhs[d]->NodeType == BrSubset ) Free(Lhs[d]->Subset);
|
129
|
+
}
|
130
|
+
FreeVector((void **) Lhs, 1, Size);
|
131
|
+
}
|
132
|
+
|
133
|
+
return false;
|
134
|
+
}
|
135
|
+
|
136
|
+
/* Make sure there is enough room for the new rule */
|
137
|
+
|
138
|
+
NRules++;
|
139
|
+
if ( NRules >= RuleSpace )
|
140
|
+
{
|
141
|
+
RuleSpace += 100;
|
142
|
+
if ( RuleSpace > 100 )
|
143
|
+
{
|
144
|
+
Realloc(Rule, RuleSpace, CRule);
|
145
|
+
Realloc(Fires, RuleSpace, Byte *);
|
146
|
+
ForEach(r, RuleSpace-100, RuleSpace-1)
|
147
|
+
{
|
148
|
+
Fires[r] = Nil;
|
149
|
+
}
|
150
|
+
}
|
151
|
+
else
|
152
|
+
{
|
153
|
+
Rule = Alloc(RuleSpace, CRule);
|
154
|
+
Fires = AllocZero(RuleSpace, Byte *);
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
/* Form the new rule */
|
159
|
+
|
160
|
+
Rule[NRules] = R = Alloc(1, RuleRec);
|
161
|
+
|
162
|
+
R->TNo = ( Existing ? Existing->TNo : Trial );
|
163
|
+
R->RNo = ( Existing ? Existing->RNo : NRules );
|
164
|
+
R->Size = Size;
|
165
|
+
R->Lhs = Lhs;
|
166
|
+
R->Rhs = TargetClass;
|
167
|
+
R->Cover = Cover;
|
168
|
+
R->Correct = Correct;
|
169
|
+
R->Prior = Prior;
|
170
|
+
R->Vote = Vote;
|
171
|
+
|
172
|
+
/* Record entry in Fires and CovBy */
|
173
|
+
|
174
|
+
ListSort(List, 1, List[0]);
|
175
|
+
Fires[NRules] = Compress(List);
|
176
|
+
|
177
|
+
ForEach(i, 1, List[0])
|
178
|
+
{
|
179
|
+
CovBy[List[i]]++;
|
180
|
+
}
|
181
|
+
|
182
|
+
Verbosity(1, if ( ! Existing ) PrintRule(R))
|
183
|
+
|
184
|
+
return true;
|
185
|
+
}
|
186
|
+
|
187
|
+
|
188
|
+
|
189
|
+
/*************************************************************************/
|
190
|
+
/* */
|
191
|
+
/* Compress list of ascending integers. */
|
192
|
+
/* */
|
193
|
+
/* The first integer occupies 4 bytes. Each subsequent integer is */
|
194
|
+
/* represented as the increment on the previous and is encoded as */
|
195
|
+
/* one or more bytes b0 + b1 + .... where */
|
196
|
+
/* if byte b < 128, value is b */
|
197
|
+
/* if byte b = 128 + x, value is x * 128 */
|
198
|
+
/* */
|
199
|
+
/* For example, an increment 4321 (= 33 * 128 + 97) is encoded as */
|
200
|
+
/* two bytes [128 + 33] [97] */
|
201
|
+
/* */
|
202
|
+
/*************************************************************************/
|
203
|
+
|
204
|
+
|
205
|
+
Byte *Compress(int *L)
|
206
|
+
/* -------- */
|
207
|
+
{
|
208
|
+
int i, Last=0, Entry, Blocks;
|
209
|
+
Byte *p, *Compressed;
|
210
|
+
|
211
|
+
/* Copy first integer (uncompressed) */
|
212
|
+
|
213
|
+
memcpy(CBuffer, L, 4);
|
214
|
+
p = CBuffer + 4;
|
215
|
+
|
216
|
+
ForEach(i, 1, L[0])
|
217
|
+
{
|
218
|
+
Entry = L[i] - Last;
|
219
|
+
Last = L[i];
|
220
|
+
|
221
|
+
/* Place any necessary skip bytes */
|
222
|
+
|
223
|
+
while ( Entry > 127 )
|
224
|
+
{
|
225
|
+
Blocks = (Entry >> 7);
|
226
|
+
if ( Blocks > 127 ) Blocks = 127;
|
227
|
+
Entry -= Blocks * 128;
|
228
|
+
*p++ = Blocks + 128;
|
229
|
+
}
|
230
|
+
|
231
|
+
*p++ = Entry;
|
232
|
+
}
|
233
|
+
|
234
|
+
Compressed = Alloc(p - CBuffer, Byte);
|
235
|
+
memcpy(Compressed, CBuffer, p - CBuffer);
|
236
|
+
|
237
|
+
return Compressed;
|
238
|
+
}
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
void Uncompress(Byte *CL, int *UCL)
|
243
|
+
/* ---------- */
|
244
|
+
{
|
245
|
+
int i, Entry=0;
|
246
|
+
Byte *p;
|
247
|
+
|
248
|
+
memcpy(UCL, CL, 4);
|
249
|
+
p = CL + 4;
|
250
|
+
|
251
|
+
ForEach(i, 1, UCL[0])
|
252
|
+
{
|
253
|
+
while ( (*p) & 128 )
|
254
|
+
{
|
255
|
+
Entry += ((*p++) & 127) * 128;
|
256
|
+
}
|
257
|
+
|
258
|
+
Entry = UCL[i] = Entry + *p++;
|
259
|
+
}
|
260
|
+
}
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
/*************************************************************************/
|
265
|
+
/* */
|
266
|
+
/* Sort list in preparation for Compress() */
|
267
|
+
/* */
|
268
|
+
/*************************************************************************/
|
269
|
+
|
270
|
+
|
271
|
+
void ListSort(int *L, int Fp, int Lp)
|
272
|
+
/* -------- */
|
273
|
+
{
|
274
|
+
int i, High, Middle, Thresh, Temp;
|
275
|
+
|
276
|
+
if ( Fp < Lp )
|
277
|
+
{
|
278
|
+
Thresh = L[(Fp+Lp) / 2];
|
279
|
+
|
280
|
+
/* Divide cases into three groups:
|
281
|
+
Fp .. Middle-1: values < Thresh
|
282
|
+
Middle .. High: values = Thresh
|
283
|
+
High+1 .. Lp: values > Thresh */
|
284
|
+
|
285
|
+
for ( Middle = Fp ; L[Middle] < Thresh ; Middle++ )
|
286
|
+
;
|
287
|
+
|
288
|
+
for ( High = Lp ; L[High] > Thresh ; High-- )
|
289
|
+
;
|
290
|
+
|
291
|
+
for ( i = Middle ; i <= High ; )
|
292
|
+
{
|
293
|
+
if ( L[i] < Thresh )
|
294
|
+
{
|
295
|
+
Temp = L[Middle];
|
296
|
+
L[Middle] = L[i];
|
297
|
+
L[i] = Temp;
|
298
|
+
Middle++;
|
299
|
+
i++;
|
300
|
+
}
|
301
|
+
else
|
302
|
+
if ( L[i] > Thresh )
|
303
|
+
{
|
304
|
+
Temp = L[High];
|
305
|
+
L[High] = L[i];
|
306
|
+
L[i] = Temp;
|
307
|
+
High--;
|
308
|
+
}
|
309
|
+
else
|
310
|
+
{
|
311
|
+
i++;
|
312
|
+
}
|
313
|
+
}
|
314
|
+
|
315
|
+
/* Sort the first and third groups */
|
316
|
+
|
317
|
+
ListSort(L, Fp, Middle-1);
|
318
|
+
ListSort(L, High+1, Lp);
|
319
|
+
}
|
320
|
+
}
|
321
|
+
|
322
|
+
|
323
|
+
|
324
|
+
/*************************************************************************/
|
325
|
+
/* */
|
326
|
+
/* Decide whether the given rule duplicates rule r */
|
327
|
+
/* */
|
328
|
+
/*************************************************************************/
|
329
|
+
|
330
|
+
|
331
|
+
Boolean SameRule(RuleNo r, Condition Cond[], int NConds, ClassNo TargetClass)
|
332
|
+
/* -------- */
|
333
|
+
{
|
334
|
+
int d, i, Bytes;
|
335
|
+
|
336
|
+
if ( Rule[r]->Size != NConds || Rule[r]->Rhs != TargetClass )
|
337
|
+
{
|
338
|
+
return false;
|
339
|
+
}
|
340
|
+
|
341
|
+
ForEach(d, 1, NConds)
|
342
|
+
{
|
343
|
+
if ( Rule[r]->Lhs[d]->NodeType != Cond[d]->NodeType ||
|
344
|
+
Rule[r]->Lhs[d]->Tested != Cond[d]->Tested )
|
345
|
+
{
|
346
|
+
return false;
|
347
|
+
}
|
348
|
+
|
349
|
+
switch ( Cond[d]->NodeType )
|
350
|
+
{
|
351
|
+
case BrDiscr:
|
352
|
+
if ( Rule[r]->Lhs[d]->TestValue != Cond[d]->TestValue )
|
353
|
+
{
|
354
|
+
return false;
|
355
|
+
}
|
356
|
+
break;
|
357
|
+
|
358
|
+
case BrThresh:
|
359
|
+
if ( Rule[r]->Lhs[d]->TestValue != Cond[d]->TestValue ||
|
360
|
+
Rule[r]->Lhs[d]->Cut != Cond[d]->Cut )
|
361
|
+
{
|
362
|
+
return false;
|
363
|
+
}
|
364
|
+
break;
|
365
|
+
|
366
|
+
case BrSubset:
|
367
|
+
Bytes = (MaxAttVal[Cond[d]->Tested]>>3) + 1;
|
368
|
+
ForEach(i, 0, Bytes-1)
|
369
|
+
{
|
370
|
+
if ( Rule[r]->Lhs[d]->Subset[i] != Cond[d]->Subset[i] )
|
371
|
+
{
|
372
|
+
return false;
|
373
|
+
}
|
374
|
+
}
|
375
|
+
}
|
376
|
+
}
|
377
|
+
|
378
|
+
return true;
|
379
|
+
}
|
380
|
+
|
381
|
+
|
382
|
+
|
383
|
+
/*************************************************************************/
|
384
|
+
/* */
|
385
|
+
/* Free space occupied by a rule and a ruleset */
|
386
|
+
/* */
|
387
|
+
/*************************************************************************/
|
388
|
+
|
389
|
+
|
390
|
+
void FreeRule(CRule R)
|
391
|
+
/* -------- */
|
392
|
+
{
|
393
|
+
int d;
|
394
|
+
|
395
|
+
ForEach(d, 1, R->Size)
|
396
|
+
{
|
397
|
+
if ( R->Lhs[d]->NodeType == BrSubset )
|
398
|
+
{
|
399
|
+
FreeUnlessNil(R->Lhs[d]->Subset);
|
400
|
+
}
|
401
|
+
FreeUnlessNil(R->Lhs[d]);
|
402
|
+
}
|
403
|
+
FreeUnlessNil(R->Lhs);
|
404
|
+
FreeUnlessNil(R);
|
405
|
+
}
|
406
|
+
|
407
|
+
|
408
|
+
|
409
|
+
void FreeRules(CRuleSet RS)
|
410
|
+
/* --------- */
|
411
|
+
{
|
412
|
+
int ri;
|
413
|
+
|
414
|
+
ForEach(ri, 1, RS->SNRules)
|
415
|
+
{
|
416
|
+
FreeRule(RS->SRule[ri]);
|
417
|
+
}
|
418
|
+
Free(RS->SRule);
|
419
|
+
FreeRuleTree(RS->RT);
|
420
|
+
Free(RS);
|
421
|
+
}
|
422
|
+
|
423
|
+
|
424
|
+
|
425
|
+
/*************************************************************************/
|
426
|
+
/* */
|
427
|
+
/* Print a ruleset */
|
428
|
+
/* */
|
429
|
+
/*************************************************************************/
|
430
|
+
|
431
|
+
|
432
|
+
void PrintRules(CRuleSet RS, String Msg)
|
433
|
+
/* ---------- */
|
434
|
+
{
|
435
|
+
int r;
|
436
|
+
|
437
|
+
fprintf(Of, "\n%s\n", Msg);
|
438
|
+
|
439
|
+
ForEach(r, 1, RS->SNRules)
|
440
|
+
{
|
441
|
+
PrintRule(RS->SRule[r]);
|
442
|
+
}
|
443
|
+
}
|
444
|
+
|
445
|
+
|
446
|
+
|
447
|
+
/*************************************************************************/
|
448
|
+
/* */
|
449
|
+
/* Print rule R */
|
450
|
+
/* */
|
451
|
+
/*************************************************************************/
|
452
|
+
|
453
|
+
|
454
|
+
void PrintRule(CRule R)
|
455
|
+
/* --------- */
|
456
|
+
{
|
457
|
+
int d;
|
458
|
+
|
459
|
+
fprintf(Of, T_RuleHeader);
|
460
|
+
if ( TRIALS > 1 ) fprintf(Of, "%d/", R->TNo);
|
461
|
+
fprintf(Of, "%d: (%.8g", R->RNo, P1(R->Cover));
|
462
|
+
if ( R->Correct < R->Cover - 0.1 )
|
463
|
+
{
|
464
|
+
fprintf(Of, "/%.8g", P1(R->Cover - R->Correct));
|
465
|
+
}
|
466
|
+
fprintf(Of, T_RuleLift, ((R->Correct + 1) / (R->Cover + 2)) / R->Prior);
|
467
|
+
|
468
|
+
ForEach(d, 1, R->Size)
|
469
|
+
{
|
470
|
+
PrintCondition(R->Lhs[d]);
|
471
|
+
}
|
472
|
+
|
473
|
+
fprintf(Of, "\t-> " T_class " %s [%.3f]\n",
|
474
|
+
ClassName[R->Rhs], R->Vote/1000.0);
|
475
|
+
}
|
476
|
+
|
477
|
+
|
478
|
+
|
479
|
+
/*************************************************************************/
|
480
|
+
/* */
|
481
|
+
/* Print a condition C of a rule */
|
482
|
+
/* */
|
483
|
+
/*************************************************************************/
|
484
|
+
|
485
|
+
|
486
|
+
void PrintCondition(Condition C)
|
487
|
+
/* -------------- */
|
488
|
+
{
|
489
|
+
DiscrValue v, pv, Last, Values;
|
490
|
+
Boolean First=true;
|
491
|
+
Attribute Att;
|
492
|
+
int Col, Base, Entry;
|
493
|
+
char CVS[20];
|
494
|
+
|
495
|
+
v = C->TestValue;
|
496
|
+
Att = C->Tested;
|
497
|
+
|
498
|
+
fprintf(Of, "\t%s", AttName[Att]);
|
499
|
+
|
500
|
+
if ( v < 0 )
|
501
|
+
{
|
502
|
+
fprintf(Of, T_IsUnknown);
|
503
|
+
return;
|
504
|
+
}
|
505
|
+
|
506
|
+
switch ( C->NodeType )
|
507
|
+
{
|
508
|
+
case BrDiscr:
|
509
|
+
fprintf(Of, " = %s\n", AttValName[Att][v]);
|
510
|
+
break;
|
511
|
+
|
512
|
+
case BrThresh:
|
513
|
+
if ( v == 1 )
|
514
|
+
{
|
515
|
+
fprintf(Of, " = N/A\n");
|
516
|
+
}
|
517
|
+
else
|
518
|
+
{
|
519
|
+
CValToStr(C->Cut, Att, CVS);
|
520
|
+
fprintf(Of, " %s %s\n", ( v == 2 ? "<=" : ">" ), CVS);
|
521
|
+
}
|
522
|
+
break;
|
523
|
+
|
524
|
+
case BrSubset:
|
525
|
+
/* Count values at this branch */
|
526
|
+
|
527
|
+
Values = Elements(Att, C->Subset, &Last);
|
528
|
+
if ( Values == 1 )
|
529
|
+
{
|
530
|
+
fprintf(Of, " = %s\n", AttValName[Att][Last]);
|
531
|
+
break;
|
532
|
+
}
|
533
|
+
|
534
|
+
if ( Ordered(Att) )
|
535
|
+
{
|
536
|
+
/* Find first value */
|
537
|
+
|
538
|
+
for ( pv = 1 ; ! In(pv, C->Subset) ; pv++ )
|
539
|
+
;
|
540
|
+
|
541
|
+
fprintf(Of, " %s [%s-%s]\n", T_InRange,
|
542
|
+
AttValName[Att][pv], AttValName[Att][Last]);
|
543
|
+
break;
|
544
|
+
}
|
545
|
+
|
546
|
+
/* Must keep track of position to break long lines */
|
547
|
+
|
548
|
+
fprintf(Of, " %s {", T_ElementOf);
|
549
|
+
Col = Base = CharWidth(AttName[Att]) + CharWidth(T_ElementOf) + 11;
|
550
|
+
|
551
|
+
ForEach(pv, 1, MaxAttVal[Att])
|
552
|
+
{
|
553
|
+
if ( In(pv, C->Subset) )
|
554
|
+
{
|
555
|
+
Entry = CharWidth(AttValName[Att][pv]);
|
556
|
+
|
557
|
+
if ( First )
|
558
|
+
{
|
559
|
+
First = false;
|
560
|
+
}
|
561
|
+
else
|
562
|
+
if ( Col + Entry + 2 >= Width )
|
563
|
+
{
|
564
|
+
Col = Base;
|
565
|
+
fprintf(Of, ",\n%*s", Col, "");
|
566
|
+
}
|
567
|
+
else
|
568
|
+
{
|
569
|
+
fprintf(Of, ", ");
|
570
|
+
Col += 2;
|
571
|
+
}
|
572
|
+
|
573
|
+
fprintf(Of, "%s", AttValName[Att][pv]);
|
574
|
+
Col += Entry;
|
575
|
+
}
|
576
|
+
}
|
577
|
+
fprintf(Of, "}\n");
|
578
|
+
}
|
579
|
+
}
|