see5-installer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/ext/c5.0/Makefile +86 -0
- data/ext/c5.0/attwinnow.c +394 -0
- data/ext/c5.0/c50.c +330 -0
- data/ext/c5.0/classify.c +700 -0
- data/ext/c5.0/confmat.c +195 -0
- data/ext/c5.0/construct.c +853 -0
- data/ext/c5.0/contin.c +613 -0
- data/ext/c5.0/defns.i +788 -0
- data/ext/c5.0/discr.c +307 -0
- data/ext/c5.0/extern.i +170 -0
- data/ext/c5.0/formrules.c +720 -0
- data/ext/c5.0/formtree.c +1158 -0
- data/ext/c5.0/getdata.c +521 -0
- data/ext/c5.0/getnames.c +733 -0
- data/ext/c5.0/global.c +211 -0
- data/ext/c5.0/gpl.txt +674 -0
- data/ext/c5.0/implicitatt.c +1112 -0
- data/ext/c5.0/info.c +146 -0
- data/ext/c5.0/mcost.c +138 -0
- data/ext/c5.0/modelfiles.c +952 -0
- data/ext/c5.0/p-thresh.c +313 -0
- data/ext/c5.0/prune.c +1069 -0
- data/ext/c5.0/report.c +345 -0
- data/ext/c5.0/rules.c +579 -0
- data/ext/c5.0/ruletree.c +398 -0
- data/ext/c5.0/siftrules.c +1285 -0
- data/ext/c5.0/sort.c +156 -0
- data/ext/c5.0/subset.c +599 -0
- data/ext/c5.0/text.i +223 -0
- data/ext/c5.0/trees.c +740 -0
- data/ext/c5.0/update.c +129 -0
- data/ext/c5.0/utility.c +1146 -0
- data/ext/c5.0/xval +150 -0
- data/ext/c5.0/xval.c +402 -0
- data/ext/gritbot/Makefile +98 -0
- data/ext/gritbot/check.c +1110 -0
- data/ext/gritbot/cluster.c +342 -0
- data/ext/gritbot/common.c +1269 -0
- data/ext/gritbot/continatt.c +412 -0
- data/ext/gritbot/defns.i +623 -0
- data/ext/gritbot/discratt.c +459 -0
- data/ext/gritbot/extern.i +101 -0
- data/ext/gritbot/getdata.c +329 -0
- data/ext/gritbot/getnames.c +573 -0
- data/ext/gritbot/global.c +104 -0
- data/ext/gritbot/gpl.txt +674 -0
- data/ext/gritbot/gritbot.c +295 -0
- data/ext/gritbot/implicitatt.c +1108 -0
- data/ext/gritbot/inspect.c +794 -0
- data/ext/gritbot/modelfiles.c +687 -0
- data/ext/gritbot/outlier.c +415 -0
- data/ext/gritbot/sort.c +130 -0
- data/ext/gritbot/text.i +159 -0
- data/ext/gritbot/update.c +126 -0
- data/ext/gritbot/utility.c +1029 -0
- data/ext/see5-installer/extconf.rb +25 -0
- data/lib/see5/installer.rb +10 -0
- data/lib/see5/installer/version.rb +7 -0
- data/see5-installer.gemspec +30 -0
- metadata +115 -0
data/ext/c5.0/text.i
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Text strings for UTF-8 internationalization */
|
30
|
+
/* ------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
/* General stuff */
|
37
|
+
|
38
|
+
//#define UTF8 /* uncomment if using UTF-8 */
|
39
|
+
|
40
|
+
/*  CharWidth(S) gives the printed width of string S as an int.
    With UTF8 defined it defers to UTF8CharWidth(); otherwise it is the
    byte length from strlen().  The whole expansion is parenthesized so
    that the macro behaves as a single operand in any expression  */

#ifdef UTF8
#define	 CharWidth(S)		(UTF8CharWidth(S))
#else
#define	 CharWidth(S)		((int) strlen(S))
#endif
|
45
|
+
|
46
|
+
|
47
|
+
/* Strings with width/format restrictions */
|
48
|
+
/* (W = width when printed, C = centered) */
|
49
|
+
|
50
|
+
#define F_Fold "Fold" /* W<8 */
|
51
|
+
#define F_UFold "----" /* W same */
|
52
|
+
#define F_Trial "Trial" /* W<8 */
|
53
|
+
#define F_UTrial "-----" /* W same */
|
54
|
+
#define F_DecisionTree16 " Decision Tree " /* W=16C */
|
55
|
+
#define F_SizeErrors "Size Errors" /* W=16 */
|
56
|
+
#define F_DecisionTree23 " Decision Tree " /* W=23C */
|
57
|
+
#define F_SizeErrorsCost "Size Errors Cost" /* W=23 */
|
58
|
+
#define F_Rules16 " Rules " /* W=16C */
|
59
|
+
#define F_NoErrors " No Errors" /* W=16 */
|
60
|
+
#define F_Rules23 " Rules " /* W=23C */
|
61
|
+
#define F_NoErrorsCost " No Errors Cost" /* W=23 */
|
62
|
+
#define F_Rules "Rules" /* W<8 */
|
63
|
+
#define F_URules "-----" /* W same */
|
64
|
+
#define F_Errors "Errors" /* W<8 */
|
65
|
+
#define F_UErrors "------" /* W same */
|
66
|
+
#define F_Cost "Cost" /* W<8 */
|
67
|
+
#define F_UCost "----" /* W same */
|
68
|
+
#define F_Boost "boost" /* W<8 */
|
69
|
+
|
70
|
+
|
71
|
+
/* Strings of arbitrary length */
|
72
|
+
|
73
|
+
#define T_See5 "See5"
|
74
|
+
#define T_C50 "C5.0"
|
75
|
+
#define TX_Release(n) "Release " n
|
76
|
+
|
77
|
+
#define T_OptHeader "\n Options:\n"
|
78
|
+
#define T_OptApplication "\tApplication `%s'\n"
|
79
|
+
#define T_OptBoost "\tBoosted classifiers\n"
|
80
|
+
#define T_OptProbThresh "\tProbability thresholds\n"
|
81
|
+
#define T_OptTrials "\t%d boosting trials\n"
|
82
|
+
#define T_OptSubsets "\tTests on discrete attribute groups\n"
|
83
|
+
#define T_OptMinCases "\tTests require 2 branches with >=%g cases\n"
|
84
|
+
#define T_OptCF "\tPruning confidence level %g%%\n"
|
85
|
+
#define T_OptRules "\tRule-based classifiers\n"
|
86
|
+
#define T_OptSampling "\tUse %g%% of data for training\n"
|
87
|
+
#define T_OptSeed "\tRandom seed %d\n"
|
88
|
+
#define T_OptUtility "\tRule utility ordering (1/%d's)\n"
|
89
|
+
#define T_OptNoCosts "\tFocus on errors (ignore costs file)\n"
|
90
|
+
#define T_OptWinnow "\tWinnow attributes\n"
|
91
|
+
#define T_OptNoGlobal "\tDo not use global tree pruning\n"
|
92
|
+
#define T_OptXval "\tCross-validate using %d folds\n"
|
93
|
+
#define T_UnregnizedOpt "\n ** Unrecognised option %s %s\n"
|
94
|
+
#define T_SummaryOpts " ** Summary of options for c5.0:\n"
|
95
|
+
#define T_ListOpts "\t-f <filestem>\tapplication filestem\n"\
|
96
|
+
"\t-r\t\tuse rule-based classifiers\n"\
|
97
|
+
"\t-u <bands>\torder rules by utility in"\
|
98
|
+
" bands\n"\
|
99
|
+
"\t-w\t\tinvoke attribute winnowing\n"\
|
100
|
+
"\t-b\t\tinvoke boosting\n"\
|
101
|
+
"\t-t <trials>\tnumber of boosting trials\n"\
|
102
|
+
"\t-p\t\tuse soft thresholds\n"\
|
103
|
+
"\t-e\t\tfocus on errors (ignore costs file)\n"\
|
104
|
+
"\t-s\t\tfind subset tests for discrete atts\n"\
|
105
|
+
"\t-g\t\tdo not use global tree pruning\n"\
|
106
|
+
"\t-m <cases>\trestrict allowable splits\n"\
|
107
|
+
"\t-c <percent>\tconfidence level (CF) for"\
|
108
|
+
" pruning\n"\
|
109
|
+
"\t-S <percent>\ttraining sample percentage\n"\
|
110
|
+
"\t-X <folds>\tcross-validate\n"\
|
111
|
+
"\t-I <integer>\trandom seed for sampling"\
|
112
|
+
" and cross-validation\n"\
|
113
|
+
"\t-h\t\tprint this message\n"
|
114
|
+
#define T_UBWarn " ** Warning (-u): rule ordering "\
|
115
|
+
"has no effect on boosting\n"
|
116
|
+
#define T_ClassVar "\nClass specified by attribute `%s'\n"
|
117
|
+
#define TX_ReadData(c,a,f) "\nRead %d cases (%d attributes) from"\
|
118
|
+
" %s.data\n", c, a, f
|
119
|
+
#define TX_ReadTest(c,f) "Read %d cases from %s.test\n", c, f
|
120
|
+
#define T_ReadCosts "Read misclassification costs from %s.costs\n"
|
121
|
+
#define T_CWtAtt "Using relative case weighting\n"
|
122
|
+
#define T_AttributesIn "\nAttributes included:\n"
|
123
|
+
#define T_AttributesOut "\nAttributes excluded:\n"
|
124
|
+
#define T_AttributesWinnowed "\n%d attribute%s winnowed\n"
|
125
|
+
#define T_EstImportance "Estimated importance of remaining"\
|
126
|
+
" attributes:\n\n"
|
127
|
+
#define T_NoWinnow "\nNo attributes winnowed\n"
|
128
|
+
#define T_EvalTrain "\n\nEvaluation on training data (%d cases):\n"
|
129
|
+
#define T_Usage "\n\n\tAttribute usage:\n\n"
|
130
|
+
#define T_EvalTest "\nEvaluation on test data (%d cases):\n"
|
131
|
+
#define T_Time "\n\nTime: %.1f secs\n"
|
132
|
+
|
133
|
+
#define T_IgnoreBadClass "*** ignoring cases with bad or unknown class\n"
|
134
|
+
|
135
|
+
#define T_Subtree "\nSubTree [S%d]\n"
|
136
|
+
#define T_ElementOf "in"
|
137
|
+
#define T_InRange "in"
|
138
|
+
#define T_RuleHeader "\nRule "
|
139
|
+
#define T_RuleLift ", lift %.1f)\n"
|
140
|
+
#define T_IsUnknown " is unknown\n"
|
141
|
+
|
142
|
+
#define TX_Reduced1(t) ( (t) > 1 ?\
|
143
|
+
"\n*** boosting reduced to %d trials since"\
|
144
|
+
" last classifier is very accurate\n" :\
|
145
|
+
"\n*** boosting reduced to %d trial since"\
|
146
|
+
" last classifier is very accurate\n" )
|
147
|
+
#define TX_Reduced2(t) ( (t) > 1 ?\
|
148
|
+
"\n*** boosting reduced to %d trials since"\
|
149
|
+
" last classifier is very inaccurate\n" :\
|
150
|
+
"\n*** boosting reduced to %d trial since"\
|
151
|
+
" last classifier is very inaccurate\n" )
|
152
|
+
#define T_Abandoned "\n*** boosting abandoned (too few classifiers)\n"
|
153
|
+
#define T_BoostingUnhelpful "\n*** warning: boosting may be unhelpful\n"
|
154
|
+
#define T_Composite "Composite ruleset:"
|
155
|
+
#define T_Tree "Decision tree:"
|
156
|
+
#define T_Rules "Rules:"
|
157
|
+
|
158
|
+
#define T_Default_class "Default class"
|
159
|
+
#define T_boost "boost"
|
160
|
+
#define T_composite_ruleset "composite ruleset"
|
161
|
+
#define T_Rule_utility_summary "Rule utility summary"
|
162
|
+
#define T_class "class"
|
163
|
+
#define T_classified_as "classified as"
|
164
|
+
#define T_Summary "Summary"
|
165
|
+
#define T_FoldsReduced "\n*** folds reduced to number of cases\n"
|
166
|
+
#define T_EvalHoldOut "\nEvaluation on hold-out data (%d cases):\n"
|
167
|
+
#define T_Summary "Summary"
|
168
|
+
#define T_Fold "Fold"
|
169
|
+
#define T_Mean "Mean"
|
170
|
+
#define T_SE "SE"
|
171
|
+
|
172
|
+
#define TX_Line(l,f) "*** line %d of `%s': ", l, f
|
173
|
+
#define E_NOFILE(f,e) "cannot open file %s%s\n", f, e
|
174
|
+
#define E_ForWrite " for writing"
|
175
|
+
#define E_BADCLASSTHRESH "bad class threshold `%s'\n"
|
176
|
+
#define E_LEQCLASSTHRESH "class threshold `%s' <= previous threshold\n"
|
177
|
+
#define E_BADATTNAME "`:' or `:=' expected after attribute name"\
|
178
|
+
" `%s'\n"
|
179
|
+
#define E_EOFINATT "unexpected eof while reading attribute `%s'\n"
|
180
|
+
#define E_SINGLEATTVAL(a,v) "attribute `%s' has only one value `%s'\n",\
|
181
|
+
a, v
|
182
|
+
#define E_DUPATTNAME "multiple attributes with name `%s'\n"
|
183
|
+
#define E_CWTATTERR "case weight attribute must be continuous\n"
|
184
|
+
#define E_BADATTVAL(v,a) "bad value of `%s' for attribute `%s'\n", v, a
|
185
|
+
#define E_BADNUMBER(a) "value of `%s' changed to `?'\n", a
|
186
|
+
#define E_BADCLASS "bad class value `%s'\n"
|
187
|
+
#define E_BADCOSTCLASS "bad class `%s'\n"
|
188
|
+
#define E_BADCOST "bad cost value `%s'\n"
|
189
|
+
#define E_NOMEM "unable to allocate sufficient memory\n"
|
190
|
+
#define E_TOOMANYVALS(a,n) "too many values for attribute `%s'"\
|
191
|
+
" (max %d)\n", a, n
|
192
|
+
#define E_BADDISCRETE "bad number of discrete values for attribute"\
|
193
|
+
" `%s'\n"
|
194
|
+
#define E_NOTARGET "target attribute `%s' not found or"\
|
195
|
+
" type `ignore'\n"
|
196
|
+
#define E_BADCTARGET "target attribute `%s' must be"\
|
197
|
+
" type `continuous'\n"
|
198
|
+
#define E_BADDTARGET "target attribute `%s' must be specified by"\
|
199
|
+
" a list of discrete values\n"
|
200
|
+
#define E_LONGNAME "overlength name: check data file formats\n"
|
201
|
+
#define E_HITEOF "unexpected end of file\n"
|
202
|
+
#define E_MISSNAME "missing name or value before `%s'\n"
|
203
|
+
#define E_BADTSTMP(d,a) "bad timestamp `%s' for attribute `%s'\n", d, a
|
204
|
+
#define E_BADDATE(d,a) "bad date `%s' for attribute `%s'\n", d, a
|
205
|
+
#define E_BADTIME(d,a) "bad time `%s' for attribute `%s'\n", d, a
|
206
|
+
#define E_UNKNOWNATT "unknown attribute name `%s'\n"
|
207
|
+
#define E_BADDEF1(a,s,x) "in definition of attribute `%s':\n"\
|
208
|
+
"\tat `%.12s': expect %s\n", a, s, x
|
209
|
+
#define E_BADDEF2(a,s,x) "in definition of attribute `%s':\n"\
|
210
|
+
"\t`%s': %s\n", a, s, x
|
211
|
+
#define E_SAMEATT(a,b) "[warning] attribute `%s' is identical to"\
|
212
|
+
" attribute `%s'\n", a, b
|
213
|
+
#define E_BADDEF3 "cannot define target attribute `%s'\n"
|
214
|
+
#define E_BADDEF4 "[warning] target attribute appears in"\
|
215
|
+
" definition of attribute `%s'\n"
|
216
|
+
#define EX_MODELFILE(f) "file %s incompatible with .names file\n", f
|
217
|
+
#define E_MFATT "undefined or excluded attribute"
|
218
|
+
#define E_MFATTVAL "undefined attribute value"
|
219
|
+
#define E_MFCLASS "undefined class"
|
220
|
+
#define E_MFEOF "unexpected eof"
|
221
|
+
#define T_ErrorLimit "\nError limit exceeded\n"
|
222
|
+
#define TX_IllegalValue(v,l,h) "\t** illegal value %g -- "\
|
223
|
+
"should be between %g and %g\n", v, l, h
|
data/ext/c5.0/trees.c
ADDED
@@ -0,0 +1,740 @@
|
|
1
|
+
/*************************************************************************/
|
2
|
+
/* */
|
3
|
+
/* Copyright 2010 Rulequest Research Pty Ltd. */
|
4
|
+
/* */
|
5
|
+
/* This file is part of C5.0 GPL Edition, a single-threaded version */
|
6
|
+
/* of C5.0 release 2.07. */
|
7
|
+
/* */
|
8
|
+
/* C5.0 GPL Edition is free software: you can redistribute it and/or */
|
9
|
+
/* modify it under the terms of the GNU General Public License as */
|
10
|
+
/* published by the Free Software Foundation, either version 3 of the */
|
11
|
+
/* License, or (at your option) any later version. */
|
12
|
+
/* */
|
13
|
+
/* C5.0 GPL Edition is distributed in the hope that it will be useful, */
|
14
|
+
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
|
15
|
+
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
|
16
|
+
/* General Public License for more details. */
|
17
|
+
/* */
|
18
|
+
/* You should have received a copy of the GNU General Public License */
|
19
|
+
/* (gpl.txt) along with C5.0 GPL Edition. If not, see */
|
20
|
+
/* */
|
21
|
+
/* <http://www.gnu.org/licenses/>. */
|
22
|
+
/* */
|
23
|
+
/*************************************************************************/
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
/*************************************************************************/
|
28
|
+
/* */
|
29
|
+
/* Routines for displaying, building, saving and restoring trees */
|
30
|
+
/* ------------------------------------------------------------- */
|
31
|
+
/* */
|
32
|
+
/*************************************************************************/
|
33
|
+
|
34
|
+
|
35
|
+
#include "defns.i"
|
36
|
+
#include "extern.i"
|
37
|
+
|
38
|
+
|
39
|
+
/*  TabSize  - columns printed per level of tree indentation
    Utility  - slot 0 of a node's ClassDist, used as scratch storage
	       for the node depth while a tree is being displayed
    Digits(n)- estimate of the columns needed to print count n; the
	       argument is fully parenthesized so that any expression
	       may be passed safely  */

#define	TabSize		4
#define	Utility		ClassDist[0]
#define	Digits(n)	((n) < 10 ? 3 : (int)(3 + log((n) - 1) / log(10.0)))
|
42
|
+
|
43
|
+
|
44
|
+
/* If lines look like getting too long while a tree is being
|
45
|
+
printed, subtrees are broken off and printed separately after
|
46
|
+
the main tree is finished */
|
47
|
+
|
48
|
+
int SubTree, /* highest subtree to be printed */
|
49
|
+
SubSpace=0; /* maximum subtree encountered */
|
50
|
+
Tree *SubDef=Nil; /* pointers to subtrees */
|
51
|
+
Boolean LastBranch[Width]; /* whether printing last branch of subtree */
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
/*************************************************************************/
|
56
|
+
/* */
|
57
|
+
/* Calculate the depth of nodes in a tree in Utility field */
|
58
|
+
/* */
|
59
|
+
/*************************************************************************/
|
60
|
+
|
61
|
+
|
62
|
+
void FindDepth(Tree T)
/*   ---------  */
{
    float	Deepest=0;
    DiscrValue	Br;
    Tree	Child;

    /*  Process every branch first so that each child's depth is
	available in its Utility field, remembering the largest.
	A leaf has no branches to visit, so Deepest stays at 0  */

    if ( T->NodeType )
    {
	ForEach(Br, 1, T->Forks)
	{
	    Child = T->Branch[Br];

	    FindDepth(Child);

	    if ( Child->Utility > Deepest ) Deepest = Child->Utility;
	}
    }

    /*  This node is one level deeper than its deepest branch  */

    T->Utility = Deepest + 1;
}
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
/*************************************************************************/
|
86
|
+
/* */
|
87
|
+
/* Display entire decision tree T */
|
88
|
+
/* */
|
89
|
+
/*************************************************************************/
|
90
|
+
|
91
|
+
|
92
|
+
void PrintTree(Tree T, String Title)
/*   ---------  */
{
    int		Sub;

    /*  Record node depths in the Utility fields; Show() compares
	them when choosing the order in which to print branches  */

    FindDepth(T);

    /*  No subtrees have been set aside yet  */

    SubTree = 0;

    /*  Title line, then the main tree starting at offset 0  */

    fprintf(Of, "\n%s\n", Title);
    Show(T, 0);
    fprintf(Of, "\n");

    /*  Now print the subtrees that Show() recorded in SubDef,
	each under its own numbered heading  */

    ForEach(Sub, 1, SubTree)
    {
	fprintf(Of, T_Subtree, Sub);
	Show(SubDef[Sub], 0);
	fprintf(Of, "\n");
    }
}
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
/*************************************************************************/
|
115
|
+
/* */
|
116
|
+
/* Display the tree T with offset Sh */
|
117
|
+
/* */
|
118
|
+
/*************************************************************************/
|
119
|
+
|
120
|
+
|
121
|
+
void Show(Tree T, int Sh)
/*   ----  */
{
    DiscrValue	Br, NBranches, Printed, Best, Start;
    CaseCount	Wrong=0.0;

    /*  A leaf is printed as " class (cases" with "/errors" appended
	when the leaf has at least MinLeaf cases and the error count
	reaches 0.05, followed by the closing parenthesis  */

    if ( ! T->NodeType )
    {
	fprintf(Of, " %s (%.8g", ClassName[T->Leaf], P1(T->Cases));

	if ( T->Cases >= MinLeaf )
	{
	    Wrong = T->Cases - T->ClassDist[T->Leaf];

	    if ( Wrong >= 0.05 )
	    {
		fprintf(Of, "/%.8g", P1(Wrong));
	    }
	}

	putc(')', Of);
	return;
    }

    /*  Internal node.  If it is not at the root and its longest line
	would exceed Width, set it aside as a numbered subtree and
	print just the reference  */

    if ( Sh && Sh * TabSize + MaxLine(T) > Width )
    {
	SubTree++;

	if ( SubTree >= SubSpace )
	{
	    /*  Grow the table of deferred subtrees by 100 entries  */

	    SubSpace += 100;

	    if ( ! SubDef )
	    {
		SubDef = Alloc(SubSpace, Tree);
	    }
	    else
	    {
		Realloc(SubDef, SubSpace, Tree);
	    }
	}

	SubDef[SubTree] = T;
	fprintf(Of, " [S%d]", SubTree);
	return;
    }

    NBranches = T->Forks;

    /*  Skip the N/A branch (branch 1) if it has no cases  */

    Start   = ( EmptyNA(T) ? 2 : 1 );
    Printed = Start - 1;

    /*  Repeatedly select and print the simplest remaining branch  */

    while ( Printed < NBranches )
    {
	Best = Start;

	ForEach(Br, 2, NBranches)
	{
	    if ( ( T->Branch[Br]->Utility < T->Branch[Best]->Utility ) ||
		 ( T->Branch[Br]->Utility == 1 && ! T->Branch[Br]->Cases ) )
	    {
		Best = Br;
	    }
	}

	Printed++;
	LastBranch[Sh+1] = ( Printed == NBranches );

	ShowBranch(Sh, T, Best, (int)( Printed == Start ));

	/*  Give the printed branch a huge depth so that it is not
	    selected again  */

	T->Branch[Best]->Utility = 1E10;
    }
}
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
/*************************************************************************/
|
196
|
+
/* */
|
197
|
+
/* Print a node T with offset Sh, branch value v, and continue */
|
198
|
+
/* */
|
199
|
+
/*************************************************************************/
|
200
|
+
|
201
|
+
|
202
|
+
void ShowBranch(int Sh, Tree T, DiscrValue v, DiscrValue BrNo)
/*   ----------  */
{
    DiscrValue	Pv, Last;
    Attribute	Att;
    Boolean	FirstValue;
    int		TextWidth, Skip, Values, i, Extra;
    char	CVS1[20], CVS2[20];

    /*  Prints the test that leads to branch v of node T at offset Sh,
	then continues with the subtree below that branch.  BrNo is
	passed through to Indent() and is 1 for the first branch
	printed at this node  */

    Att = T->Tested;

    switch ( T->NodeType )
    {
	case BrDiscr:

	    Indent(Sh, BrNo);

	    fprintf(Of, "%s = %s:", AttName[Att], AttValName[Att][v]);

	    break;

	case BrThresh:

	    Indent(Sh, BrNo);

	    fprintf(Of, "%s", AttName[Att]);

	    if ( v == 1 )
	    {
		fprintf(Of, " = N/A:");
	    }
	    else
	    if ( T->Lower != T->Upper )
	    {
		/*  Lower and Upper differ: show the relevant bound with
		    Mid in parentheses  */

		if ( v == 2 )
		{
		    CValToStr(T->Lower, Att, CVS1);
		    CValToStr(T->Mid  , Att, CVS2);
		    fprintf(Of, " <= %s (%s):", CVS1, CVS2);
		}
		else
		{
		    CValToStr(T->Upper, Att, CVS1);
		    CValToStr(T->Mid  , Att, CVS2);
		    fprintf(Of, " >= %s (%s):", CVS1, CVS2);
		}
	    }
	    else
	    {
		CValToStr(T->Cut, Att, CVS1);
		fprintf(Of, " %s %s:", ( v == 2 ? "<=" : ">" ), CVS1);
	    }

	    break;

	case BrSubset:

	    /*  Count values at this branch; an empty subset prints
		nothing at all  */

	    Values = Elements(Att, T->Subset[v], &Last);
	    if ( ! Values ) return;

	    Indent(Sh, BrNo);

	    if ( Values == 1 )
	    {
		fprintf(Of, "%s = %s:", AttName[Att], AttValName[Att][Last]);
		break;
	    }

	    if ( Ordered(Att) )
	    {
		/*  Find first value; the subset is known to be
		    non-empty here so the scan terminates  */

		for ( Pv = 1 ; ! In(Pv, T->Subset[v]) ; Pv++ )
		    ;

		fprintf(Of, "%s %s [%s-%s]:", AttName[Att], T_InRange,
			AttValName[Att][Pv], AttValName[Att][Last]);
		break;
	    }

	    /*  General case: "att in {v1,v2,...}:" wrapped so that
		lines stay within Width.  Skip is the column (relative
		to the indent) at which the value list starts  */

	    fprintf(Of, "%s %s {", AttName[Att], T_ElementOf);
	    FirstValue = true;
	    Skip = CharWidth(AttName[Att]) + CharWidth(T_ElementOf) + 3;
	    TextWidth = Skip + Sh * TabSize;

	    ForEach(Pv, 1, Last)
	    {
		if ( In(Pv, T->Subset[v]) )
		{
		    /*  Find number of characters after this element  */

		    if ( Pv != Last || T->Branch[v]->NodeType )
		    {
			Extra = 1;				/* for ":" */
		    }
		    else
		    {
			/*  NOTE(review): the counts below come from the
			    node T itself, not from the leaf T->Branch[v]
			    that Show() goes on to print -- confirm that
			    this is intended  */

			Extra = 2				/* for ": " */
				+ CharWidth(ClassName[T->Branch[v]->Leaf])
				+ 3				/* for " ()" */
				+ Digits(T->Cases)
				+ ( T->Errors < 0.05 ? 0 :
				    1				/* for "/" */
				    + Digits(T->Errors) );
		    }

		    if ( ! FirstValue &&
			 TextWidth + CharWidth(AttValName[Att][Pv]) +
			 Extra + 1 > Width )
		    {
			/*  Start a continuation line.  The prefix must be
			    exactly four columns wide because the padding
			    loop that follows starts counting at 5  */

			Indent(Sh, 0);
			fprintf(Of, "%s",
				( LastBranch[Sh+1] && ! T->Branch[v]->NodeType ?
				  "    " : ":   " ));
			ForEach(i, 5, Skip) putc(' ', Of);

			TextWidth = Skip + Sh * TabSize;
			FirstValue = true;
		    }

		    fprintf(Of, "%s%c",
			    AttValName[Att][Pv], Pv == Last ? '}' : ',');
		    TextWidth += CharWidth(AttValName[Att][Pv]) + 1;
		    FirstValue = false;
		}
	    }
	    putc(':', Of);

	    break;

	default:

	    /*  Other node types print no test of their own  */

	    break;
    }

    Show(T->Branch[v], Sh+1);
}
|
335
|
+
|
336
|
+
|
337
|
+
|
338
|
+
/*************************************************************************/
|
339
|
+
/* */
|
340
|
+
/* Count the elements in a subset and record the last */
|
341
|
+
/* */
|
342
|
+
/*************************************************************************/
|
343
|
+
|
344
|
+
|
345
|
+
DiscrValue Elements(Attribute Att, Set S, DiscrValue *Last)
/*         --------  */
{
    DiscrValue	Member, Count=0;

    /*  Examine every value of Att from 1 up to MaxAttVal[Att].  For
	each one in S, bump the count and note it through Last, so
	Last ends up holding the highest member.  If S contains none
	of the values, *Last is not written  */

    ForEach(Member, 1, MaxAttVal[Att])
    {
	if ( ! In(Member, S) ) continue;

	Count++;
	*Last = Member;
    }

    return Count;
}
|
361
|
+
|
362
|
+
|
363
|
+
|
364
|
+
/*************************************************************************/
|
365
|
+
/* */
|
366
|
+
/* Find the approximate maximum single line size for non-leaf */
|
367
|
+
/* subtree T */
|
368
|
+
/* */
|
369
|
+
/*************************************************************************/
|
370
|
+
|
371
|
+
|
372
|
+
int MaxLine(Tree T)
/*  -------  */
{
    Attribute	Att;
    DiscrValue	v, vv;
    int		Ll=0, One, MaxLl=0;

    /*  Returns an estimate of the widest line needed to print the
	branches of non-leaf node T: the width of the tested attribute's
	name, plus 4, plus the largest per-branch allowance found below.

	Ll starts at 0 so that it is never read uninitialized should no
	arm of the switches below assign it; it is deliberately not
	reset for each branch, so a branch with no matching arm keeps
	the previous branch's value  */

    Att = T->Tested;

    /*  First find the max length of the line excluding tested att  */

    ForEach(v, 1, T->Forks)
    {
	switch ( T->NodeType )
	{
	    case BrThresh:

		/*  Fixed allowances by attribute type; the larger
		    figure applies when Lower and Upper differ  */

		if ( TStampVal(Att) )
		{
		    Ll = ( T->Lower != T->Upper ? 41 : 19 );
		}
		else
		if ( DateVal(Att) )
		{
		    Ll = ( T->Lower != T->Upper ? 23 : 10 );
		}
		else
		if ( TimeVal(Att) )
		{
		    Ll = ( T->Lower != T->Upper ? 19 : 8 );
		}
		else
		{
		    Ll = ( T->Lower != T->Upper ? 11 : 4 );
		}
		break;

	    case BrDiscr:

		if ( Ordered(Att) )
		{
		    vv = T->Cut;

		    switch ( v )
		    {
			case 1:
			    Ll = 3;
			    break;

			case 2:
			    Ll = CharWidth(AttValName[Att][2]);
			    if ( vv != 2 )
			    {
				Ll += CharWidth(AttValName[Att][vv])+1;
			    }
			    break;

			case 3:
			    Ll = CharWidth(AttValName[Att][MaxAttVal[Att]]);
			    if ( vv != MaxAttVal[Att] - 1 )
			    {
				Ll += CharWidth(AttValName[Att][vv+1])+1;
			    }
			    break;

			default:
			    break;
		    }
		}
		else
		{
		    Ll = CharWidth(AttValName[Att][v]) + 1;
		}
		break;

	    case BrSubset:	/* difficult! */

		/*  Use the longest single value name in the subset,
		    plus 6  */

		Ll = 0;
		ForEach(vv, 1, MaxAttVal[Att])
		{
		    if ( In(vv,T->Subset[v]) )
		    {
			One = CharWidth(AttValName[Att][vv]) + 6;
			if ( One > Ll ) Ll = One;
		    }
		}
		break;

	    default:
		break;
	}

	/*  Check whether ends in leaf; if so allow for the class name
	    plus 6 (branch 1 only counts when it has more than 0.01
	    cases)  */

	if ( ! T->Branch[v]->NodeType &&
	     ( v > 1 || T->Branch[v]->Cases > 0.01 ) )
	{
	    Ll += CharWidth(ClassName[T->Branch[v]->Leaf]) + 6;
	}

	if ( Ll > MaxLl ) MaxLl = Ll;
    }

    return CharWidth(AttName[Att]) + 4 + MaxLl;
}
|
466
|
+
|
467
|
+
|
468
|
+
|
469
|
+
/*************************************************************************/
|
470
|
+
/* */
|
471
|
+
/* Indent Sh columns */
|
472
|
+
/* */
|
473
|
+
/*************************************************************************/
|
474
|
+
|
475
|
+
|
476
|
+
void Indent(int Sh, int BrNo)
|
477
|
+
/* ------ */
|
478
|
+
{
|
479
|
+
int i;
|
480
|
+
|
481
|
+
fprintf(Of, "\n");
|
482
|
+
for ( i = 1 ; i <= Sh ; i++ )
|
483
|
+
{
|
484
|
+
fprintf(Of, "%s", ( i == Sh && BrNo == 1 ? ":..." :
|
485
|
+
LastBranch[i] ? " " : ": " ));
|
486
|
+
}
|
487
|
+
}
|
488
|
+
|
489
|
+
|
490
|
+
|
491
|
+
/*************************************************************************/
|
492
|
+
/* */
|
493
|
+
/* Free up space taken up by tree T */
|
494
|
+
/* */
|
495
|
+
/*************************************************************************/
|
496
|
+
|
497
|
+
|
498
|
+
void FreeTree(Tree T)
/*   --------  */
{
    DiscrValue	Br;

    /*  Releases node T and everything below it; a null T is ignored  */

    if ( T )
    {
	if ( T->NodeType )
	{
	    /*  Internal node: release all subtrees, then the branch
		table itself  */

	    ForEach(Br, 1, T->Forks)
	    {
		FreeTree(T->Branch[Br]);
	    }

	    Free(T->Branch);

	    /*  Subset tests also own one value set per branch  */

	    if ( T->NodeType == BrSubset )
	    {
		FreeVector((void **) T->Subset, 1, T->Forks);
	    }
	}

	/*  Finally the class distribution and the node record  */

	Free(T->ClassDist);
	Free(T);
    }
}
|
524
|
+
|
525
|
+
|
526
|
+
|
527
|
+
/*************************************************************************/
|
528
|
+
/* */
|
529
|
+
/* Construct a leaf in a given node */
|
530
|
+
/* */
|
531
|
+
/*************************************************************************/
|
532
|
+
|
533
|
+
|
534
|
+
Tree Leaf(double *Freq, ClassNo NodeClass, CaseCount Cases, CaseCount Errors)
/*   ----  */
{
    Tree	New;
    ClassNo	Cl;

    /*  Allocate a zeroed node and fill in the leaf fields  */

    New = AllocZero(1, TreeRec);

    New->NodeType = 0;
    New->Leaf	  = NodeClass;
    New->Cases	  = Cases;
    New->Errors	  = Errors;

    /*  The class distribution has MaxClass+1 zeroed entries; entries
	1..MaxClass are copied from Freq when Freq is supplied  */

    New->ClassDist = AllocZero(MaxClass+1, CaseCount);

    if ( Freq )
    {
	ForEach(Cl, 1, MaxClass)
	{
	    New->ClassDist[Cl] = Freq[Cl];
	}
    }

    return New;
}
|
558
|
+
|
559
|
+
|
560
|
+
|
561
|
+
/*************************************************************************/
|
562
|
+
/* */
|
563
|
+
/* Insert branches in a node */
|
564
|
+
/* */
|
565
|
+
/*************************************************************************/
|
566
|
+
|
567
|
+
|
568
|
+
void Sprout(Tree T, DiscrValue Branches)
/*   ------  */
{
    /*  Give T a zeroed branch table with Branches+1 entries and
	record the number of forks  */

    T->Branch = AllocZero(Branches+1, Tree);
    T->Forks  = Branches;
}
|
574
|
+
|
575
|
+
|
576
|
+
|
577
|
+
/*************************************************************************/
|
578
|
+
/* */
|
579
|
+
/* Remove branches etc from a node */
|
580
|
+
/* */
|
581
|
+
/*************************************************************************/
|
582
|
+
|
583
|
+
|
584
|
+
void UnSprout(Tree T)
/*   --------  */
{
    DiscrValue	Br;

    /*  Release every subtree below T, then the branch table  */

    ForEach(Br, 1, T->Forks)
    {
	FreeTree(T->Branch[Br]);
    }

    Free(T->Branch);
    T->Branch = Nil;

    /*  A subset test also owns one value set per branch; these must
	be released while Forks still holds the branch count  */

    if ( T->NodeType == BrSubset )
    {
	FreeVector((void **) T->Subset, 1, T->Forks);
	T->Subset = Nil;
    }

    /*  Clear the node type and fork count  */

    T->NodeType = 0;
    T->Forks	= 0;
}
|
602
|
+
|
603
|
+
|
604
|
+
|
605
|
+
/*************************************************************************/
|
606
|
+
/* */
|
607
|
+
/* Count the non-null leaves in a tree */
|
608
|
+
/* */
|
609
|
+
/*************************************************************************/
|
610
|
+
|
611
|
+
|
612
|
+
int TreeSize(Tree T)
/*  --------  */
{
    int		Total=0;
    DiscrValue	Br;

    /*  A leaf counts as 1 only if it has at least MinLeaf cases  */

    if ( ! T->NodeType )
    {
	return ( T->Cases >= MinLeaf ? 1 : 0 );
    }

    /*  Otherwise add up the sizes of the branches, starting from
	branch 2 when EmptyNA(T) holds  */

    ForEach(Br, ( EmptyNA(T) ? 2 : 1 ), T->Forks)
    {
	Total += TreeSize(T->Branch[Br]);
    }

    return Total;
}
|
630
|
+
|
631
|
+
|
632
|
+
|
633
|
+
/*************************************************************************/
|
634
|
+
/* */
|
635
|
+
/* Count the non-null leaves in a tree that may contain */
|
636
|
+
/* compressed branches via CompressBranches() */
|
637
|
+
/* */
|
638
|
+
/*************************************************************************/
|
639
|
+
|
640
|
+
|
641
|
+
int ExpandedLeafCount(Tree T)
/*  -----------------  */
{
    int		Total=0;
    DiscrValue	Br, Ignored;
    Tree	Child;

    if ( T->NodeType )
    {
	ForEach(Br, 1, T->Forks)
	{
	    Child = T->Branch[Br];

	    /*  Branches with fewer than MinLeaf cases are not counted  */

	    if ( Child->Cases < MinLeaf ) continue;

	    /*  A leaf below a subset test counts once for each value
		in that branch's subset; anything else is counted
		recursively  */

	    if ( T->NodeType == BrSubset && ! Child->NodeType )
	    {
		Total += Elements(T->Tested, T->Subset[Br], &Ignored);
	    }
	    else
	    {
		Total += ExpandedLeafCount(Child);
	    }
	}

	return Total;
    }

    /*  T is itself a leaf  */

    return 1;
}
|
668
|
+
|
669
|
+
|
670
|
+
|
671
|
+
/*************************************************************************/
|
672
|
+
/* */
|
673
|
+
/* Find the maximum depth of a tree */
|
674
|
+
/* */
|
675
|
+
/*************************************************************************/
|
676
|
+
|
677
|
+
|
678
|
+
int TreeDepth(Tree T)
/*  ---------  */
{
    DiscrValue	Br;
    int		Below, Deepest=0;

    /*  Find the deepest subtree under T; a leaf has none, so its
	depth is 1  */

    if ( T->NodeType )
    {
	ForEach(Br, 1, T->Forks)
	{
	    Below = TreeDepth(T->Branch[Br]);

	    if ( Below > Deepest )
	    {
		Deepest = Below;
	    }
	}
    }

    return 1 + Deepest;
}
|
695
|
+
|
696
|
+
|
697
|
+
|
698
|
+
/*************************************************************************/
|
699
|
+
/* */
|
700
|
+
/* Return a copy of tree T */
|
701
|
+
/* */
|
702
|
+
/*************************************************************************/
|
703
|
+
|
704
|
+
|
705
|
+
Tree CopyTree(Tree T)
/*   --------  */
{
    DiscrValue	Br;
    Tree	Copy;
    int		SetBytes;

    /*  Begin with a bytewise copy of the node record; the pointer
	fields that are replaced below get fresh storage of their own  */

    Copy = Alloc(1, TreeRec);
    memcpy(Copy, T, sizeof(TreeRec));

    /*  Private copy of the class distribution (MaxClass+1 entries)  */

    Copy->ClassDist = Alloc(MaxClass+1, CaseCount);
    memcpy(Copy->ClassDist, T->ClassDist, (MaxClass + 1) * sizeof(CaseCount));

    /*  For a subset test, duplicate the value set of each branch.
	Each set occupies (MaxAttVal>>3)+1 bytes  */

    if ( T->NodeType == BrSubset )
    {
	SetBytes = (MaxAttVal[T->Tested]>>3) + 1;

	Copy->Subset = Alloc(T->Forks+1, Set);

	ForEach(Br, 1, T->Forks)
	{
	    Copy->Subset[Br] = Alloc(SetBytes, unsigned char);
	    memcpy(Copy->Subset[Br], T->Subset[Br], SetBytes);
	}
    }

    /*  For any internal node, copy each subtree recursively into a
	new branch table  */

    if ( T->NodeType )
    {
	Copy->Branch = AllocZero(T->Forks+1, Tree);

	ForEach(Br, 1, T->Forks)
	{
	    Copy->Branch[Br] = CopyTree(T->Branch[Br]);
	}
    }

    return Copy;
}
|