see5-installer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/discr.c
ADDED
@@ -0,0 +1,307 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Evaluation of a test on a discrete valued attribute */
|
30
|
+
/* --------------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
#include "defns.i"
|
35
|
+
#include "extern.i"
|
36
|
+
|
37
|
+
|
38
|
+
/*************************************************************************/
|
39
|
+
/* */
|
40
|
+
/* Set Info[] and Gain[] for discrete partition of cases */
|
41
|
+
/* */
|
42
|
+
/*************************************************************************/
|
43
|
+
|
44
|
+
|
45
|
+
void EvalDiscreteAtt(Attribute Att, CaseCount Cases)
{
    /*  Evaluate a split of the current cases on every value of the
        discrete attribute Att.

        The frequency tables are refreshed with SetDiscrFreq().  Unless
        at least two values of Att each account for MINITEMS or more
        cases, the split is reported as poor and Gain[Att] / Info[Att]
        are not written.  Otherwise both entries are set from
        ComputeGain() and TotalInfo().

        Att    attribute being evaluated
        Cases  case count used as the divisor for the fraction of
               cases in GEnv.ValFreq[0] and for the split information  */

    DiscrValue  Top, val;
    CaseCount   Known;
    int         Usable=0;
    double      Base;

    SetDiscrFreq(Att);

    Top   = MaxAttVal[Att];
    Known = Cases - GEnv.ValFreq[0];

    /*  Count the values that are frequent enough to form a branch  */

    ForEach(val, 1, Top)
    {
        if ( GEnv.ValFreq[val] >= MINITEMS )
        {
            Usable += 1;
        }
    }

    if ( Usable < 2 )
    {
        Verbosity(2, fprintf(Of, "\tAtt %s: poor split\n", AttName[Att]))
        return;
    }

    /*  When nothing was counted under value 0 the global base info is
        used; otherwise it is recomputed from the cases with values
        1..Top only  */

    if ( ! GEnv.ValFreq[0] )
    {
        Base = GlobalBaseInfo;
    }
    else
    {
        Base = DiscrKnownBaseInfo(Known, Top);
    }

    Gain[Att] = ComputeGain(Base, GEnv.ValFreq[0] / Cases, Top, Known);
    Info[Att] = TotalInfo(GEnv.ValFreq, 0, Top) / Cases;

    Verbosity(2,
    {
        fprintf(Of, "\tAtt %s", AttName[Att]);
        Verbosity(3,
            PrintDistribution(Att, 0, Top, GEnv.Freq, GEnv.ValFreq, true))
        fprintf(Of, "\tinf %.3f, gain %.3f\n", Info[Att], Gain[Att]);
    })
}
|
85
|
+
|
86
|
+
|
87
|
+
|
88
|
+
/*************************************************************************/
|
89
|
+
/* */
|
90
|
+
/* Set Info[] and Gain[] for ordered split on cases */
|
91
|
+
/* */
|
92
|
+
/*************************************************************************/
|
93
|
+
|
94
|
+
|
95
|
+
void EvalOrderedAtt(Attribute Att, CaseCount Cases)
{
    /*  Evaluate three-way splits on the ordered discrete attribute Att.

        Rows 1, 2 and 3 of GEnv.Freq are arranged so that row 1 is
        value 1, row 2 is the "left" group (initially value 2 alone) and
        row 3 is the "right" group (initially values 3..max).  Each
        value from 3 upwards is then moved in turn from right to left,
        and the cut with the greatest gain is remembered.  If the
        best gain, after the penalty for multiple tries, is positive,
        Gain[Att], Info[Att] and Bar[Att] are set; otherwise they are
        left untouched.

        Note that the row pointers of GEnv.Freq are permuted here and
        are not restored before returning.

        Att    attribute being evaluated
        Cases  case count used as divisor for the fraction of cases in
               GEnv.ValFreq[0], the split info and the tries penalty  */

    DiscrValue  Top, val, BestVal;
    ClassNo     cls;
    CaseCount   Known;
    int         Cuts=0;
    double      *Spare, Part[4];
    double      Base, CutGain, BestInf, TopGain=None;

    SetDiscrFreq(Att);

    Top   = MaxAttVal[Att];
    Known = Cases - GEnv.ValFreq[0];

    if ( ! GEnv.ValFreq[0] )
    {
        Base = GlobalBaseInfo;
    }
    else
    {
        Base = DiscrKnownBaseInfo(Known, Top);
    }

    Verbosity(2, fprintf(Of, "\tAtt %s", AttName[Att]))
    Verbosity(3, PrintDistribution(Att, 0, Top, GEnv.Freq,
                                   GEnv.ValFreq, true))

    /*  Take the row just beyond the last value as a scratch row and
        clear it; it will become the aggregated "right" row  */

    Spare = GEnv.Freq[Top+1];
    ForEach(cls, 1, MaxClass)
    {
        Spare[cls] = 0;
    }

    Part[0] = GEnv.ValFreq[0];
    Part[1] = GEnv.ValFreq[1];
    Part[2] = GEnv.ValFreq[2];
    Part[3] = 0;

    /*  Working downwards, shift the row for each value 3..Top up one
        place and add its class frequencies into the scratch row  */

    val = Top;
    while ( val > 2 )
    {
        GEnv.Freq[val+1] = GEnv.Freq[val];

        ForEach(cls, 1, MaxClass)
        {
            Spare[cls] += GEnv.Freq[val][cls];
        }
        Part[3] += GEnv.ValFreq[val];

        val--;
    }

    GEnv.Freq[3] = Spare;

    /*  Examine each candidate cut, remembering the most gainful one.
        On entry to iteration val the left group holds values 2..val-1  */

    ForEach(val, 3, Top)
    {
        if ( GEnv.ValFreq[val] > 0 &&
             Part[2] >= MINITEMS && Part[3] >= MINITEMS )
        {
            Cuts++;
            CutGain = ComputeGain(Base, GEnv.ValFreq[0] / Cases, 3, Known);

            if ( CutGain > TopGain )
            {
                TopGain = CutGain;
                BestInf = TotalInfo(Part, 0, 3) / Cases;
                BestVal = val-1;
            }

            Verbosity(3,
            {
                fprintf(Of, "\t\tFrom %s (gain %.3f)",
                        AttValName[Att][val], CutGain);
                PrintDistribution(Att, 0, 3, GEnv.Freq, GEnv.ValFreq, false);
            })
        }

        /*  Transfer value val (whose row now lives at val+1) from the
            right group to the left group  */

        ForEach(cls, 1, MaxClass)
        {
            GEnv.Freq[2][cls] += GEnv.Freq[val+1][cls];
            GEnv.Freq[3][cls] -= GEnv.Freq[val+1][cls];
        }
        Part[2] += GEnv.ValFreq[val];
        Part[3] -= GEnv.ValFreq[val];
    }

    /*  Penalise the best gain when more than one cut was tried  */

    if ( Cuts > 1 )
    {
        TopGain -= Log(Cuts) / Cases;
    }

    /*  Nothing is recorded unless some cut gives a positive gain  */

    if ( TopGain <= 0 )
    {
        Verbosity(2, fprintf(Of, "\tno gain\n"))
        return;
    }

    Gain[Att] = TopGain;
    Info[Att] = BestInf;
    Bar[Att]  = BestVal;

    Verbosity(2,
        fprintf(Of, "\tcut=%g, inf %.3f, gain %.3f\n",
                Bar[Att], Info[Att], Gain[Att]))
}
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
/*************************************************************************/
|
200
|
+
/* */
|
201
|
+
/* Compute frequency tables Freq[][] and ValFreq[] for attribute */
|
202
|
+
/* Att for current cases */
|
203
|
+
/* */
|
204
|
+
/*************************************************************************/
|
205
|
+
|
206
|
+
|
207
|
+
void SetDiscrFreq(Attribute Att)
{
    /*  Fill GEnv.Freq[v][c] and GEnv.ValFreq[v] for attribute Att from
        the flat table DFreq[Att], for every value v in 0..MaxAttVal[Att]
        and every class c in 1..MaxClass.

        DFreq[Att] is read as consecutive groups of MaxClass entries,
        one group per value, with class c stored at offset c-1 within
        its group.  GEnv.ValFreq[v] receives the sum over classes.  */

    DiscrValue  val;
    ClassNo     cls;
    double      *Row;

    ForEach(val, 0, MaxAttVal[Att])
    {
        /*  Start of the group of class counts for this value  */

        Row = DFreq[Att] + val * MaxClass;

        GEnv.ValFreq[val] = 0;

        ForEach(cls, 1, MaxClass)
        {
            GEnv.Freq[val][cls] = Row[cls-1];
            GEnv.ValFreq[val]  += GEnv.Freq[val][cls];
        }
    }
}
|
228
|
+
|
229
|
+
|
230
|
+
|
231
|
+
/*************************************************************************/
|
232
|
+
/* */
|
233
|
+
/* Return the base info for cases with known values of a discrete */
|
234
|
+
/* attribute, using the frequency table Freq[][] */
|
235
|
+
/* */
|
236
|
+
/*************************************************************************/
|
237
|
+
|
238
|
+
|
239
|
+
double DiscrKnownBaseInfo(CaseCount KnownCases, DiscrValue MaxVal)
{
    /*  Return the base information of the cases counted under values
        1..MaxVal in GEnv.Freq (value 0 is excluded).

        As a side effect GEnv.ClassFreq[c] is overwritten, for each class
        c in 1..MaxClass, with the total of GEnv.Freq[v][c] over those
        values.  The result is TotalInfo() of these class totals divided
        by KnownCases, or 0.0 when KnownCases is below 1E-5.  */

    ClassNo     cls;
    DiscrValue  val;
    CaseCount   Sum;

    /*  Avoid dividing by a (near) zero count  */

    if ( KnownCases < 1E-5 )
    {
        return 0.0;
    }

    ForEach(cls, 1, MaxClass)
    {
        Sum = 0;

        ForEach(val, 1, MaxVal)
        {
            Sum += GEnv.Freq[val][cls];
        }

        GEnv.ClassFreq[cls] = Sum;
    }

    return TotalInfo(GEnv.ClassFreq, 1, MaxClass) / KnownCases;
}
|
260
|
+
|
261
|
+
|
262
|
+
|
263
|
+
/*************************************************************************/
|
264
|
+
/* */
|
265
|
+
/* Construct and return a node for a test on a discrete attribute */
|
266
|
+
/* */
|
267
|
+
/*************************************************************************/
|
268
|
+
|
269
|
+
|
270
|
+
void DiscreteTest(Tree Node, Attribute Att)
{
    /*  Turn Node into a test on discrete attribute Att.

        Ordered attribute:  the node gets three branches and becomes a
        BrSubset test.  Value 1 is placed alone in subset 1; each value
        from 2 upwards goes to subset 2 if it is no greater than the
        cut, and to subset 3 otherwise.  The cut is Bar[Att] + 0.1
        converted to DiscrValue, and that converted value is also what
        is stored in Node->Cut.

        Any other attribute:  the node gets one branch per attribute
        value and becomes a BrDiscr test.  */

    int         Branch, SetBytes;
    DiscrValue  val, Limit;

    if ( Ordered(Att) )
    {
        Sprout(Node, 3);

        Node->NodeType = BrSubset;
        Node->Tested   = Att;

        /*  One bit per value 0..MaxAttVal[Att], rounded up to bytes  */

        SetBytes = (MaxAttVal[Att]>>3) + 1;

        /*  Entries 1..3 are used; entry 0 stays zeroed  */

        Node->Subset = AllocZero(4, Set);

        ForEach(Branch, 1, 3)
        {
            Node->Subset[Branch] = AllocZero(SetBytes, Byte);
        }

        Limit     = Bar[Att] + 0.1;
        Node->Cut = Limit;

        SetBit(1, Node->Subset[1]);

        ForEach(val, 2, MaxAttVal[Att])
        {
            if ( val <= Limit )
            {
                SetBit(val, Node->Subset[2]);
            }
            else
            {
                SetBit(val, Node->Subset[3]);
            }
        }
    }
    else
    {
        Sprout(Node, MaxAttVal[Att]);

        Node->NodeType = BrDiscr;
        Node->Tested   = Att;
    }
}
|
data/ext/c5.0/extern.i
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
extern int VERBOSITY,
|
28
|
+
TRIALS,
|
29
|
+
FOLDS,
|
30
|
+
UTILITY,
|
31
|
+
NCPU;
|
32
|
+
|
33
|
+
extern Boolean SUBSET,
|
34
|
+
BOOST,
|
35
|
+
PROBTHRESH,
|
36
|
+
RULES,
|
37
|
+
XVAL,
|
38
|
+
NOCOSTS,
|
39
|
+
WINNOW,
|
40
|
+
GLOBAL;
|
41
|
+
|
42
|
+
extern CaseCount MINITEMS,
|
43
|
+
LEAFRATIO;
|
44
|
+
|
45
|
+
extern float CF,
|
46
|
+
SAMPLE;
|
47
|
+
|
48
|
+
extern Boolean LOCK;
|
49
|
+
|
50
|
+
extern Attribute ClassAtt,
|
51
|
+
LabelAtt,
|
52
|
+
CWtAtt;
|
53
|
+
|
54
|
+
extern double AvCWt;
|
55
|
+
|
56
|
+
extern String *ClassName,
|
57
|
+
*AttName,
|
58
|
+
**AttValName;
|
59
|
+
|
60
|
+
extern char *IgnoredVals;
|
61
|
+
extern int IValsSize,
|
62
|
+
IValsOffset;
|
63
|
+
|
64
|
+
extern int MaxAtt,
|
65
|
+
MaxClass,
|
66
|
+
MaxDiscrVal,
|
67
|
+
MaxLabel,
|
68
|
+
LineNo,
|
69
|
+
ErrMsgs,
|
70
|
+
AttExIn,
|
71
|
+
TSBase;
|
72
|
+
|
73
|
+
extern DiscrValue *MaxAttVal;
|
74
|
+
|
75
|
+
extern char *SpecialStatus;
|
76
|
+
|
77
|
+
extern Definition *AttDef;
|
78
|
+
extern Attribute **AttDefUses;
|
79
|
+
|
80
|
+
extern Boolean *SomeMiss,
|
81
|
+
*SomeNA,
|
82
|
+
Winnowed;
|
83
|
+
|
84
|
+
extern ContValue *ClassThresh;
|
85
|
+
|
86
|
+
extern CaseNo MaxCase;
|
87
|
+
|
88
|
+
extern DataRec *Case;
|
89
|
+
|
90
|
+
extern DataRec *SaveCase;
|
91
|
+
|
92
|
+
extern String FileStem;
|
93
|
+
|
94
|
+
extern Tree *Raw,
|
95
|
+
*Pruned,
|
96
|
+
WTree;
|
97
|
+
|
98
|
+
extern float Confidence,
|
99
|
+
SampleFrac,
|
100
|
+
*Vote,
|
101
|
+
*BVoteBlock,
|
102
|
+
**MCost,
|
103
|
+
**NCost,
|
104
|
+
*WeightMul;
|
105
|
+
|
106
|
+
extern CRule *MostSpec;
|
107
|
+
|
108
|
+
extern Boolean UnitWeights,
|
109
|
+
CostWeights;
|
110
|
+
|
111
|
+
extern int Trial,
|
112
|
+
MaxTree;
|
113
|
+
|
114
|
+
extern ClassNo *TrialPred;
|
115
|
+
|
116
|
+
extern double *ClassFreq,
|
117
|
+
**DFreq;
|
118
|
+
|
119
|
+
extern float *Gain,
|
120
|
+
*Info,
|
121
|
+
*EstMaxGR,
|
122
|
+
*ClassSum;
|
123
|
+
|
124
|
+
extern ContValue *Bar;
|
125
|
+
|
126
|
+
extern double GlobalBaseInfo,
|
127
|
+
**Bell;
|
128
|
+
|
129
|
+
extern Byte *Tested;
|
130
|
+
|
131
|
+
extern Set **Subset;
|
132
|
+
extern int *Subsets;
|
133
|
+
|
134
|
+
extern EnvRec GEnv;
|
135
|
+
|
136
|
+
extern CRule *Rule;
|
137
|
+
|
138
|
+
extern RuleNo NRules,
|
139
|
+
RuleSpace;
|
140
|
+
|
141
|
+
extern CRuleSet *RuleSet;
|
142
|
+
|
143
|
+
extern ClassNo Default;
|
144
|
+
|
145
|
+
extern Byte **Fires,
|
146
|
+
*CBuffer;
|
147
|
+
|
148
|
+
extern int *CovBy,
|
149
|
+
*List;
|
150
|
+
|
151
|
+
extern float AttTestBits,
|
152
|
+
*BranchBits;
|
153
|
+
extern int *AttValues,
|
154
|
+
*PossibleCuts;
|
155
|
+
|
156
|
+
extern double *LogCaseNo,
|
157
|
+
*LogFact;
|
158
|
+
|
159
|
+
extern int *UtilErr,
|
160
|
+
*UtilBand;
|
161
|
+
extern double *UtilCost;
|
162
|
+
|
163
|
+
extern int KRInit,
|
164
|
+
Now;
|
165
|
+
|
166
|
+
extern FILE *TRf;
|
167
|
+
extern char Fn[500];
|
168
|
+
|
169
|
+
extern FILE *Of;
|
170
|
+
|