see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/text.i ADDED
@@ -0,0 +1,223 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Text strings for UTF-8 internationalization */
30
+ /* ------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+
36
+ /* General stuff */
37
+
38
//#define UTF8			/* uncomment if using UTF-8 */

/*  CharWidth(S) yields the printed width of string S as an int.
    With UTF8 defined it defers to UTF8CharWidth(); otherwise every
    byte counts as one column.  The expansion and its argument are
    fully parenthesized so the macro can be used safely inside any
    larger expression.  */

#ifdef UTF8
#define	 CharWidth(S)		UTF8CharWidth(S)
#else
#define	 CharWidth(S)		((int) strlen((S)))
#endif
45
+
46
+
47
+ /* Strings with width/format restrictions */
48
+ /* (W = width when printed, C = centered) */
49
+
50
/* Column headings whose printed width is constrained by the report layout. */
/* The trailing annotation on each line gives the required printed width:   */
/* "W<8" = fewer than 8 columns, "W=16" = exactly 16, "C" = centered,       */
/* "W same" = same width as the heading it underlines.                      */
/* NOTE(review): as shown here several literals are narrower than their     */
/* annotation (e.g. F_DecisionTree16 is 15 characters but marked W=16C,     */
/* F_SizeErrors is 11 but marked W=16) -- interior/leading padding looks    */
/* to have been collapsed; confirm against the pristine text.i.             */
+ #define F_Fold "Fold" /* W<8 */
51
+ #define F_UFold "----" /* W same */
52
+ #define F_Trial "Trial" /* W<8 */
53
+ #define F_UTrial "-----" /* W same */
54
+ #define F_DecisionTree16 " Decision Tree " /* W=16C */
55
+ #define F_SizeErrors "Size Errors" /* W=16 */
56
+ #define F_DecisionTree23 " Decision Tree " /* W=23C */
57
+ #define F_SizeErrorsCost "Size Errors Cost" /* W=23 */
58
+ #define F_Rules16 " Rules " /* W=16C */
59
+ #define F_NoErrors " No Errors" /* W=16 */
60
+ #define F_Rules23 " Rules " /* W=23C */
61
+ #define F_NoErrorsCost " No Errors Cost" /* W=23 */
62
+ #define F_Rules "Rules" /* W<8 */
63
+ #define F_URules "-----" /* W same */
64
+ #define F_Errors "Errors" /* W<8 */
65
+ #define F_UErrors "------" /* W same */
66
+ #define F_Cost "Cost" /* W<8 */
67
+ #define F_UCost "----" /* W same */
68
+ #define F_Boost "boost" /* W<8 */
69
+
70
+
71
+ /* Strings of arbitrary length */
72
+
73
+ #define T_See5 "See5"
74
+ #define T_C50 "C5.0"
75
+ #define TX_Release(n) "Release " n
76
+
77
+ #define T_OptHeader "\n Options:\n"
78
+ #define T_OptApplication "\tApplication `%s'\n"
79
+ #define T_OptBoost "\tBoosted classifiers\n"
80
+ #define T_OptProbThresh "\tProbability thresholds\n"
81
+ #define T_OptTrials "\t%d boosting trials\n"
82
+ #define T_OptSubsets "\tTests on discrete attribute groups\n"
83
+ #define T_OptMinCases "\tTests require 2 branches with >=%g cases\n"
84
+ #define T_OptCF "\tPruning confidence level %g%%\n"
85
+ #define T_OptRules "\tRule-based classifiers\n"
86
+ #define T_OptSampling "\tUse %g%% of data for training\n"
87
+ #define T_OptSeed "\tRandom seed %d\n"
88
+ #define T_OptUtility "\tRule utility ordering (1/%d's)\n"
89
+ #define T_OptNoCosts "\tFocus on errors (ignore costs file)\n"
90
+ #define T_OptWinnow "\tWinnow attributes\n"
91
+ #define T_OptNoGlobal "\tDo not use global tree pruning\n"
92
+ #define T_OptXval "\tCross-validate using %d folds\n"
93
+ #define T_UnregnizedOpt "\n ** Unrecognised option %s %s\n"
94
+ #define T_SummaryOpts " ** Summary of options for c5.0:\n"
95
+ #define T_ListOpts "\t-f <filestem>\tapplication filestem\n"\
96
+ "\t-r\t\tuse rule-based classifiers\n"\
97
+ "\t-u <bands>\torder rules by utility in"\
98
+ " bands\n"\
99
+ "\t-w\t\tinvoke attribute winnowing\n"\
100
+ "\t-b\t\tinvoke boosting\n"\
101
+ "\t-t <trials>\tnumber of boosting trials\n"\
102
+ "\t-p\t\tuse soft thresholds\n"\
103
+ "\t-e\t\tfocus on errors (ignore costs file)\n"\
104
+ "\t-s\t\tfind subset tests for discrete atts\n"\
105
+ "\t-g\t\tdo not use global tree pruning\n"\
106
+ "\t-m <cases>\trestrict allowable splits\n"\
107
+ "\t-c <percent>\tconfidence level (CF) for"\
108
+ " pruning\n"\
109
+ "\t-S <percent>\ttraining sample percentage\n"\
110
+ "\t-X <folds>\tcross-validate\n"\
111
+ "\t-I <integer>\trandom seed for sampling"\
112
+ " and cross-validation\n"\
113
+ "\t-h\t\tprint this message\n"
114
+ #define T_UBWarn " ** Warning (-u): rule ordering "\
115
+ "has no effect on boosting\n"
116
+ #define T_ClassVar "\nClass specified by attribute `%s'\n"
117
+ #define TX_ReadData(c,a,f) "\nRead %d cases (%d attributes) from"\
118
+ " %s.data\n", c, a, f
119
+ #define TX_ReadTest(c,f) "Read %d cases from %s.test\n", c, f
120
+ #define T_ReadCosts "Read misclassification costs from %s.costs\n"
121
+ #define T_CWtAtt "Using relative case weighting\n"
122
+ #define T_AttributesIn "\nAttributes included:\n"
123
+ #define T_AttributesOut "\nAttributes excluded:\n"
124
+ #define T_AttributesWinnowed "\n%d attribute%s winnowed\n"
125
+ #define T_EstImportance "Estimated importance of remaining"\
126
+ " attributes:\n\n"
127
+ #define T_NoWinnow "\nNo attributes winnowed\n"
128
+ #define T_EvalTrain "\n\nEvaluation on training data (%d cases):\n"
129
+ #define T_Usage "\n\n\tAttribute usage:\n\n"
130
+ #define T_EvalTest "\nEvaluation on test data (%d cases):\n"
131
+ #define T_Time "\n\nTime: %.1f secs\n"
132
+
133
+ #define T_IgnoreBadClass "*** ignoring cases with bad or unknown class\n"
134
+
135
+ #define T_Subtree "\nSubTree [S%d]\n"
136
+ #define T_ElementOf "in"
137
+ #define T_InRange "in"
138
+ #define T_RuleHeader "\nRule "
139
+ #define T_RuleLift ", lift %.1f)\n"
140
+ #define T_IsUnknown " is unknown\n"
141
+
142
+ #define TX_Reduced1(t) ( (t) > 1 ?\
143
+ "\n*** boosting reduced to %d trials since"\
144
+ " last classifier is very accurate\n" :\
145
+ "\n*** boosting reduced to %d trial since"\
146
+ " last classifier is very accurate\n" )
147
+ #define TX_Reduced2(t) ( (t) > 1 ?\
148
+ "\n*** boosting reduced to %d trials since"\
149
+ " last classifier is very inaccurate\n" :\
150
+ "\n*** boosting reduced to %d trial since"\
151
+ " last classifier is very inaccurate\n" )
152
+ #define T_Abandoned "\n*** boosting abandoned (too few classifiers)\n"
153
+ #define T_BoostingUnhelpful "\n*** warning: boosting may be unhelpful\n"
154
+ #define T_Composite "Composite ruleset:"
155
+ #define T_Tree "Decision tree:"
156
+ #define T_Rules "Rules:"
157
+
158
/*  Labels and notices for reports and summaries.
    T_Summary used to be defined twice in this group with identical text;
    the redundant second definition has been removed.  */

#define	 T_Default_class	"Default class"
#define	 T_boost		"boost"
#define	 T_composite_ruleset	"composite ruleset"
#define	 T_Rule_utility_summary	"Rule utility summary"
#define	 T_class		"class"
#define	 T_classified_as	"classified as"
#define	 T_Summary		"Summary"
#define	 T_FoldsReduced		"\n*** folds reduced to number of cases\n"
#define	 T_EvalHoldOut		"\nEvaluation on hold-out data (%d cases):\n"
#define	 T_Fold			"Fold"
#define	 T_Mean			"Mean"
#define	 T_SE			"SE"
171
+
172
+ #define TX_Line(l,f) "*** line %d of `%s': ", l, f
173
+ #define E_NOFILE(f,e) "cannot open file %s%s\n", f, e
174
+ #define E_ForWrite " for writing"
175
+ #define E_BADCLASSTHRESH "bad class threshold `%s'\n"
176
+ #define E_LEQCLASSTHRESH "class threshold `%s' <= previous threshold\n"
177
+ #define E_BADATTNAME "`:' or `:=' expected after attribute name"\
178
+ " `%s'\n"
179
+ #define E_EOFINATT "unexpected eof while reading attribute `%s'\n"
180
+ #define E_SINGLEATTVAL(a,v) "attribute `%s' has only one value `%s'\n",\
181
+ a, v
182
+ #define E_DUPATTNAME "multiple attributes with name `%s'\n"
183
+ #define E_CWTATTERR "case weight attribute must be continuous\n"
184
+ #define E_BADATTVAL(v,a) "bad value of `%s' for attribute `%s'\n", v, a
185
+ #define E_BADNUMBER(a) "value of `%s' changed to `?'\n", a
186
+ #define E_BADCLASS "bad class value `%s'\n"
187
+ #define E_BADCOSTCLASS "bad class `%s'\n"
188
+ #define E_BADCOST "bad cost value `%s'\n"
189
+ #define E_NOMEM "unable to allocate sufficient memory\n"
190
+ #define E_TOOMANYVALS(a,n) "too many values for attribute `%s'"\
191
+ " (max %d)\n", a, n
192
+ #define E_BADDISCRETE "bad number of discrete values for attribute"\
193
+ " `%s'\n"
194
+ #define E_NOTARGET "target attribute `%s' not found or"\
195
+ " type `ignore'\n"
196
+ #define E_BADCTARGET "target attribute `%s' must be"\
197
+ " type `continuous'\n"
198
+ #define E_BADDTARGET "target attribute `%s' must be specified by"\
199
+ " a list of discrete values\n"
200
+ #define E_LONGNAME "overlength name: check data file formats\n"
201
+ #define E_HITEOF "unexpected end of file\n"
202
+ #define E_MISSNAME "missing name or value before `%s'\n"
203
+ #define E_BADTSTMP(d,a) "bad timestamp `%s' for attribute `%s'\n", d, a
204
+ #define E_BADDATE(d,a) "bad date `%s' for attribute `%s'\n", d, a
205
+ #define E_BADTIME(d,a) "bad time `%s' for attribute `%s'\n", d, a
206
+ #define E_UNKNOWNATT "unknown attribute name `%s'\n"
207
+ #define E_BADDEF1(a,s,x) "in definition of attribute `%s':\n"\
208
+ "\tat `%.12s': expect %s\n", a, s, x
209
+ #define E_BADDEF2(a,s,x) "in definition of attribute `%s':\n"\
210
+ "\t`%s': %s\n", a, s, x
211
+ #define E_SAMEATT(a,b) "[warning] attribute `%s' is identical to"\
212
+ " attribute `%s'\n", a, b
213
+ #define E_BADDEF3 "cannot define target attribute `%s'\n"
214
+ #define E_BADDEF4 "[warning] target attribute appears in"\
215
+ " definition of attribute `%s'\n"
216
+ #define EX_MODELFILE(f) "file %s incompatible with .names file\n", f
217
+ #define E_MFATT "undefined or excluded attribute"
218
+ #define E_MFATTVAL "undefined attribute value"
219
+ #define E_MFCLASS "undefined class"
220
+ #define E_MFEOF "unexpected eof"
221
+ #define T_ErrorLimit "\nError limit exceeded\n"
222
+ #define TX_IllegalValue(v,l,h) "\t** illegal value %g -- "\
223
+ "should be between %g and %g\n", v, l, h
data/ext/c5.0/trees.c ADDED
@@ -0,0 +1,740 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Routines for displaying, building, saving and restoring trees */
30
+ /* ------------------------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
/*  Columns per indentation level when a tree is printed  */

#define	TabSize		4

/*  Element 0 of a node's ClassDist array is used as scratch space
    while printing: FindDepth() stores the node's depth there and
    Show() overwrites it once a branch has been printed  */

#define	Utility		ClassDist[0]

/*  Rough column allowance for printing count n, as used by
    ShowBranch(): 3 when n < 10, otherwise 3 + floor(log10(n-1)).
    The argument is parenthesized everywhere so that an expression
    argument (e.g. a conditional) is not split by operator precedence  */

#define	Digits(n)	((n) < 10 ? 3 : (int)(3 + log((n) - 1) / log(10.0)))
42
+
43
+
44
+ /* If lines look like getting too long while a tree is being
45
+ printed, subtrees are broken off and printed separately after
46
+ the main tree is finished */
47
+
48
+ int SubTree, /* highest subtree to be printed */
49
+ SubSpace=0; /* maximum subtree encountered */
50
+ Tree *SubDef=Nil; /* pointers to subtrees */
51
+ Boolean LastBranch[Width]; /* whether printing last branch of subtree */
52
+
53
+
54
+
55
+ /*************************************************************************/
56
+ /* */
57
+ /* Calculate the depth of nodes in a tree in Utility field */
58
+ /* */
59
+ /*************************************************************************/
60
+
61
+
62
+ void FindDepth(Tree T)
63
+ /* --------- */
64
+ {
65
+ float MaxDepth=0;
66
+ DiscrValue v;
67
+
68
+ if ( T->NodeType )
69
+ {
70
+ ForEach(v, 1, T->Forks)
71
+ {
72
+ FindDepth(T->Branch[v]);
73
+ if ( T->Branch[v]->Utility > MaxDepth )
74
+ {
75
+ MaxDepth = T->Branch[v]->Utility;
76
+ }
77
+ }
78
+ }
79
+
80
+ T->Utility = MaxDepth + 1;
81
+ }
82
+
83
+
84
+
85
+ /*************************************************************************/
86
+ /* */
87
+ /* Display entire decision tree T */
88
+ /* */
89
+ /*************************************************************************/
90
+
91
+
92
+ void PrintTree(Tree T, String Title)
93
+ /* --------- */
94
+ {
95
+ int s;
96
+
97
+ FindDepth(T);
98
+
99
+ SubTree=0;
100
+ fprintf(Of, "\n%s\n", Title);
101
+ Show(T, 0);
102
+ fprintf(Of, "\n");
103
+
104
+ ForEach(s, 1, SubTree)
105
+ {
106
+ fprintf(Of, T_Subtree, s);
107
+ Show(SubDef[s], 0);
108
+ fprintf(Of, "\n");
109
+ }
110
+ }
111
+
112
+
113
+
114
+ /*************************************************************************/
115
+ /* */
116
+ /* Display the tree T with offset Sh */
117
+ /* */
118
+ /*************************************************************************/
119
+
120
+
121
+ void Show(Tree T, int Sh)
122
+ /* ---- */
123
+ {
124
+ DiscrValue v, MaxV, BrNo, Simplest, First;
125
+ CaseCount Errors=0.0;
126
+
127
+ if ( T->NodeType )
128
+ {
129
+ /* See whether separate subtree needed */
130
+
131
+ if ( Sh && Sh * TabSize + MaxLine(T) > Width )
132
+ {
133
+ if ( ++SubTree >= SubSpace )
134
+ {
135
+ SubSpace += 100;
136
+ if ( SubDef )
137
+ {
138
+ Realloc(SubDef, SubSpace, Tree);
139
+ }
140
+ else
141
+ {
142
+ SubDef = Alloc(SubSpace, Tree);
143
+ }
144
+ }
145
+
146
+ SubDef[SubTree] = T;
147
+ fprintf(Of, " [S%d]", SubTree);
148
+ }
149
+ else
150
+ {
151
+ MaxV = T->Forks;
152
+
153
+ /* Skip N/A branch if no cases */
154
+
155
+ First = ( EmptyNA(T) ? 2 : 1 );
156
+ BrNo = First - 1;
157
+
158
+ /* Print simplest branches first */
159
+
160
+ while ( BrNo < MaxV )
161
+ {
162
+ Simplest = First;
163
+ ForEach(v, 2, MaxV)
164
+ {
165
+ if ( T->Branch[v]->Utility < T->Branch[Simplest]->Utility ||
166
+
167
+ T->Branch[v]->Utility == 1 && ! T->Branch[v]->Cases )
168
+ {
169
+ Simplest = v;
170
+ }
171
+ }
172
+
173
+ LastBranch[Sh+1] = ( ++BrNo == MaxV );
174
+ ShowBranch(Sh, T, Simplest, (int)( BrNo == First ));
175
+ T->Branch[Simplest]->Utility = 1E10;
176
+ }
177
+ }
178
+ }
179
+ else
180
+ {
181
+ fprintf(Of, " %s (%.8g", ClassName[T->Leaf], P1(T->Cases));
182
+ if ( T->Cases >= MinLeaf )
183
+ {
184
+ if ( (Errors = T->Cases - T->ClassDist[T->Leaf]) >= 0.05 )
185
+ {
186
+ fprintf(Of, "/%.8g", P1(Errors));
187
+ }
188
+ }
189
+ putc(')', Of);
190
+ }
191
+ }
192
+
193
+
194
+
195
+ /*************************************************************************/
196
+ /* */
197
+ /* Print a node T with offset Sh, branch value v, and continue */
198
+ /* */
199
+ /*************************************************************************/
200
+
201
+
202
+ void ShowBranch(int Sh, Tree T, DiscrValue v, DiscrValue BrNo)
203
+ /* ---------- */
204
+ {
205
+ DiscrValue Pv, Last;
206
+ Attribute Att;
207
+ Boolean FirstValue;
208
+ int TextWidth, Skip, Values, i, Extra;
209
+ char CVS1[20], CVS2[20];
210
+
211
+ Att = T->Tested;
212
+
213
+ switch ( T->NodeType )
214
+ {
215
+ case BrDiscr:
216
+
217
+ Indent(Sh, BrNo);
218
+
219
+ fprintf(Of, "%s = %s:", AttName[Att], AttValName[Att][v]);
220
+
221
+ break;
222
+
223
+ case BrThresh:
224
+
225
+ Indent(Sh, BrNo);
226
+
227
+ fprintf(Of, "%s", AttName[Att]);
228
+
229
+ if ( v == 1 )
230
+ {
231
+ fprintf(Of, " = N/A:");
232
+ }
233
+ else
234
+ if ( T->Lower != T->Upper )
235
+ {
236
+ if ( v == 2 )
237
+ {
238
+ CValToStr(T->Lower, Att, CVS1);
239
+ CValToStr(T->Mid , Att, CVS2);
240
+ fprintf(Of, " <= %s (%s):", CVS1, CVS2);
241
+ }
242
+ else
243
+ {
244
+ CValToStr(T->Upper, Att, CVS1);
245
+ CValToStr(T->Mid , Att, CVS2);
246
+ fprintf(Of, " >= %s (%s):", CVS1, CVS2);
247
+ }
248
+ }
249
+ else
250
+ {
251
+ CValToStr(T->Cut, Att, CVS1);
252
+ fprintf(Of, " %s %s:", ( v == 2 ? "<=" : ">" ), CVS1);
253
+ }
254
+
255
+ break;
256
+
257
+ case BrSubset:
258
+
259
+ /* Count values at this branch */
260
+
261
+ Values = Elements(Att, T->Subset[v], &Last);
262
+ if ( ! Values ) return;
263
+
264
+ Indent(Sh, BrNo);
265
+
266
+ if ( Values == 1 )
267
+ {
268
+ fprintf(Of, "%s = %s:", AttName[Att], AttValName[Att][Last]);
269
+ break;
270
+ }
271
+
272
+ if ( Ordered(Att) )
273
+ {
274
+ /* Find first value */
275
+
276
+ for ( Pv = 1 ; ! In(Pv, T->Subset[v]) ; Pv++ )
277
+ ;
278
+
279
+ fprintf(Of, "%s %s [%s-%s]:", AttName[Att], T_InRange,
280
+ AttValName[Att][Pv], AttValName[Att][Last]);
281
+ break;
282
+ }
283
+
284
+ fprintf(Of, "%s %s {", AttName[Att], T_ElementOf);
285
+ FirstValue = true;
286
+ Skip = CharWidth(AttName[Att]) + CharWidth(T_ElementOf) + 3;
287
+ TextWidth = Skip + Sh * TabSize;
288
+
289
+ ForEach(Pv, 1, Last)
290
+ {
291
+ if ( In(Pv, T->Subset[v]) )
292
+ {
293
+ /* Find number of characters after this element */
294
+
295
+ if ( Pv != Last || T->Branch[v]->NodeType )
296
+ {
297
+ Extra = 1; /* for ":" */
298
+ }
299
+ else
300
+ {
301
+ Extra = 2 /* for ": " */
302
+ + CharWidth(ClassName[T->Branch[v]->Leaf])
303
+ + 3 /* for " ()" */
304
+ + Digits(T->Cases)
305
+ + ( T->Errors < 0.05 ? 0 :
306
+ 1 /* for "/" */
307
+ + Digits(T->Errors) );
308
+ }
309
+
310
+ if ( ! FirstValue &&
311
+ TextWidth + CharWidth(AttValName[Att][Pv]) +
312
+ Extra + 1 > Width )
313
+ {
314
+ Indent(Sh, 0);
315
+ fprintf(Of, "%s",
316
+ ( LastBranch[Sh+1] && ! T->Branch[v]->NodeType ?
317
+ " " : ": " ));
318
+ ForEach(i, 5, Skip) putc(' ', Of);
319
+
320
+ TextWidth = Skip + Sh * TabSize;
321
+ FirstValue = true;
322
+ }
323
+
324
+ fprintf(Of, "%s%c",
325
+ AttValName[Att][Pv], Pv == Last ? '}' : ',');
326
+ TextWidth += CharWidth(AttValName[Att][Pv]) + 1;
327
+ FirstValue = false;
328
+ }
329
+ }
330
+ putc(':', Of);
331
+ }
332
+
333
+ Show(T->Branch[v], Sh+1);
334
+ }
335
+
336
+
337
+
338
+ /*************************************************************************/
339
+ /* */
340
+ /* Count the elements in a subset and record the last */
341
+ /* */
342
+ /*************************************************************************/
343
+
344
+
345
+ DiscrValue Elements(Attribute Att, Set S, DiscrValue *Last)
346
+ /* -------- */
347
+ {
348
+ DiscrValue Pv, Values=0;
349
+
350
+ ForEach(Pv, 1, MaxAttVal[Att])
351
+ {
352
+ if ( In(Pv, S) )
353
+ {
354
+ *Last = Pv;
355
+ Values++;
356
+ }
357
+ }
358
+
359
+ return Values;
360
+ }
361
+
362
+
363
+
364
+ /*************************************************************************/
365
+ /* */
366
+ /* Find the approximate maximum single line size for non-leaf */
367
+ /* subtree T */
368
+ /* */
369
+ /*************************************************************************/
370
+
371
+
372
+ int MaxLine(Tree T)
373
+ /* ------- */
374
+ {
375
+ Attribute Att;
376
+ DiscrValue v, vv;
377
+ int Ll, One, MaxLl=0;
378
+
379
+ Att = T->Tested;
380
+
381
+ /* First find the max length of the line excluding tested att */
382
+
383
+ ForEach(v, 1, T->Forks)
384
+ {
385
+ switch ( T->NodeType )
386
+ {
387
+ case BrThresh:
388
+ if ( TStampVal(Att) )
389
+ {
390
+ Ll = ( T->Lower != T->Upper ? 41 : 19 );
391
+ }
392
+ else
393
+ if ( DateVal(Att) )
394
+ {
395
+ Ll = ( T->Lower != T->Upper ? 23 : 10 );
396
+ }
397
+ else
398
+ if ( TimeVal(Att) )
399
+ {
400
+ Ll = ( T->Lower != T->Upper ? 19 : 8 );
401
+ }
402
+ else
403
+ {
404
+ Ll = ( T->Lower != T->Upper ? 11 : 4 );
405
+ }
406
+ break;
407
+
408
+ case BrDiscr:
409
+ if ( Ordered(Att) )
410
+ {
411
+ vv = T->Cut;
412
+
413
+ switch ( v )
414
+ {
415
+ case 1:
416
+ Ll = 3;
417
+ break;
418
+
419
+ case 2:
420
+ Ll = CharWidth(AttValName[Att][2]);
421
+ if ( vv != 2 )
422
+ {
423
+ Ll += CharWidth(AttValName[Att][vv])+1;
424
+ }
425
+ break;
426
+
427
+ case 3:
428
+ Ll = CharWidth(AttValName[Att][MaxAttVal[Att]]);
429
+ if ( vv != MaxAttVal[Att] - 1 )
430
+ {
431
+ Ll += CharWidth(AttValName[Att][vv+1])+1;
432
+ }
433
+ }
434
+ }
435
+ else
436
+ {
437
+ Ll = CharWidth(AttValName[Att][v]) + 1;
438
+ }
439
+ break;
440
+
441
+ case BrSubset: /* difficult! */
442
+ Ll = 0;
443
+ ForEach(vv, 1, MaxAttVal[Att])
444
+ {
445
+ if ( In(vv,T->Subset[v]) )
446
+ {
447
+ One = CharWidth(AttValName[Att][vv]) + 6;
448
+ if ( One > Ll ) Ll = One;
449
+ }
450
+ }
451
+ }
452
+
453
+ /* Check whether ends in leaf */
454
+
455
+ if ( ! T->Branch[v]->NodeType &&
456
+ ( v > 1 || T->Branch[v]->Cases > 0.01 ) )
457
+ {
458
+ Ll += CharWidth(ClassName[T->Branch[v]->Leaf]) + 6;
459
+ }
460
+
461
+ if ( Ll > MaxLl ) MaxLl = Ll;
462
+ }
463
+
464
+ return CharWidth(AttName[Att]) + 4 + MaxLl;
465
+ }
466
+
467
+
468
+
469
+ /*************************************************************************/
470
+ /* */
471
+ /* Indent Sh columns */
472
+ /* */
473
+ /*************************************************************************/
474
+
475
+
476
+ void Indent(int Sh, int BrNo)
477
+ /* ------ */
478
+ {
479
+ int i;
480
+
481
+ fprintf(Of, "\n");
482
+ for ( i = 1 ; i <= Sh ; i++ )
483
+ {
484
+ fprintf(Of, "%s", ( i == Sh && BrNo == 1 ? ":..." :
485
+ LastBranch[i] ? " " : ": " ));
486
+ }
487
+ }
488
+
489
+
490
+
491
+ /*************************************************************************/
492
+ /* */
493
+ /* Free up space taken up by tree T */
494
+ /* */
495
+ /*************************************************************************/
496
+
497
+
498
+ void FreeTree(Tree T)
499
+ /* -------- */
500
+ {
501
+ DiscrValue v;
502
+
503
+ if ( ! T ) return;
504
+
505
+ if ( T->NodeType )
506
+ {
507
+ ForEach(v, 1, T->Forks)
508
+ {
509
+ FreeTree(T->Branch[v]);
510
+ }
511
+
512
+ Free(T->Branch);
513
+
514
+ if ( T->NodeType == BrSubset )
515
+ {
516
+ FreeVector((void **) T->Subset, 1, T->Forks);
517
+ }
518
+
519
+ }
520
+
521
+ Free(T->ClassDist);
522
+ Free(T);
523
+ }
524
+
525
+
526
+
527
+ /*************************************************************************/
528
+ /* */
529
+ /* Construct a leaf in a given node */
530
+ /* */
531
+ /*************************************************************************/
532
+
533
+
534
+ Tree Leaf(double *Freq, ClassNo NodeClass, CaseCount Cases, CaseCount Errors)
535
+ /* ---- */
536
+ {
537
+ Tree Node;
538
+ ClassNo c;
539
+
540
+ Node = AllocZero(1, TreeRec);
541
+
542
+ Node->ClassDist = AllocZero(MaxClass+1, CaseCount);
543
+ if ( Freq )
544
+ {
545
+ ForEach(c, 1, MaxClass)
546
+ {
547
+ Node->ClassDist[c] = Freq[c];
548
+ }
549
+ }
550
+
551
+ Node->NodeType = 0;
552
+ Node->Leaf = NodeClass;
553
+ Node->Cases = Cases;
554
+ Node->Errors = Errors;
555
+
556
+ return Node;
557
+ }
558
+
559
+
560
+
561
+ /*************************************************************************/
562
+ /* */
563
+ /* Insert branches in a node */
564
+ /* */
565
+ /*************************************************************************/
566
+
567
+
568
+ void Sprout(Tree T, DiscrValue Branches)
569
+ /* ------ */
570
+ {
571
+ T->Forks = Branches;
572
+ T->Branch = AllocZero(Branches+1, Tree);
573
+ }
574
+
575
+
576
+
577
+ /*************************************************************************/
578
+ /* */
579
+ /* Remove branches etc from a node */
580
+ /* */
581
+ /*************************************************************************/
582
+
583
+
584
+ void UnSprout(Tree T)
585
+ /* -------- */
586
+ {
587
+ DiscrValue v;
588
+
589
+ ForEach(v, 1, T->Forks)
590
+ {
591
+ FreeTree(T->Branch[v]);
592
+ }
593
+ Free(T->Branch); T->Branch = Nil;
594
+
595
+ if ( T->NodeType == BrSubset )
596
+ {
597
+ FreeVector((void **) T->Subset, 1, T->Forks); T->Subset = Nil;
598
+ }
599
+
600
+ T->Forks = T->NodeType = 0;
601
+ }
602
+
603
+
604
+
605
+ /*************************************************************************/
606
+ /* */
607
+ /* Count the non-null leaves in a tree */
608
+ /* */
609
+ /*************************************************************************/
610
+
611
+
612
+ int TreeSize(Tree T)
613
+ /* -------- */
614
+ {
615
+ int Sum=0;
616
+ DiscrValue v;
617
+
618
+ if ( T->NodeType )
619
+ {
620
+ ForEach(v, ( EmptyNA(T) ? 2 : 1 ), T->Forks)
621
+ {
622
+ Sum += TreeSize(T->Branch[v]);
623
+ }
624
+
625
+ return Sum;
626
+ }
627
+
628
+ return ( T->Cases >= MinLeaf ? 1 : 0 );
629
+ }
630
+
631
+
632
+
633
+ /*************************************************************************/
634
+ /* */
635
+ /* Count the non-null leaves in a tree that may contain */
636
+ /* compressed branches via CompressBranches() */
637
+ /* */
638
+ /*************************************************************************/
639
+
640
+
641
+ int ExpandedLeafCount(Tree T)
642
+ /* ----------------- */
643
+ {
644
+ int Sum=0;
645
+ DiscrValue v, Dummy;
646
+
647
+ if ( ! T->NodeType )
648
+ {
649
+ return 1;
650
+ }
651
+
652
+ ForEach(v, 1, T->Forks)
653
+ {
654
+ if ( T->Branch[v]->Cases < MinLeaf ) continue;
655
+
656
+ if ( T->NodeType == BrSubset && ! T->Branch[v]->NodeType )
657
+ {
658
+ Sum += Elements(T->Tested, T->Subset[v], &Dummy);
659
+ }
660
+ else
661
+ {
662
+ Sum += ExpandedLeafCount(T->Branch[v]);
663
+ }
664
+ }
665
+
666
+ return Sum;
667
+ }
668
+
669
+
670
+
671
+ /*************************************************************************/
672
+ /* */
673
+ /* Find the maximum depth of a tree */
674
+ /* */
675
+ /*************************************************************************/
676
+
677
+
678
+ int TreeDepth(Tree T)
679
+ /* --------- */
680
+ {
681
+ DiscrValue v;
682
+ int Subtree, MaxSubtree=0;
683
+
684
+ if ( T->NodeType )
685
+ {
686
+ ForEach(v, 1, T->Forks)
687
+ {
688
+ Subtree = TreeDepth(T->Branch[v]);
689
+ if ( Subtree > MaxSubtree ) MaxSubtree = Subtree;
690
+ }
691
+ }
692
+
693
+ return MaxSubtree + 1;
694
+ }
695
+
696
+
697
+
698
+ /*************************************************************************/
699
+ /* */
700
+ /* Return a copy of tree T */
701
+ /* */
702
+ /*************************************************************************/
703
+
704
+
705
+ Tree CopyTree(Tree T)
706
+ /* -------- */
707
+ {
708
+ DiscrValue v;
709
+ Tree New;
710
+ int Bytes;
711
+
712
+ New = Alloc(1, TreeRec);
713
+ memcpy(New, T, sizeof(TreeRec));
714
+
715
+ New->ClassDist = Alloc(MaxClass+1, CaseCount);
716
+ memcpy(New->ClassDist, T->ClassDist, (MaxClass + 1) * sizeof(CaseCount));
717
+
718
+ if ( T->NodeType == BrSubset )
719
+ {
720
+ Bytes = (MaxAttVal[T->Tested]>>3) + 1;
721
+
722
+ New->Subset = Alloc(T->Forks+1, Set);
723
+ ForEach(v, 1, T->Forks)
724
+ {
725
+ New->Subset[v] = Alloc(Bytes, unsigned char);
726
+ memcpy(New->Subset[v], T->Subset[v], Bytes);
727
+ }
728
+ }
729
+
730
+ if ( T->NodeType )
731
+ {
732
+ New->Branch = AllocZero(T->Forks+1, Tree);
733
+ ForEach(v, 1, T->Forks)
734
+ {
735
+ New->Branch[v] = CopyTree(T->Branch[v]);
736
+ }
737
+ }
738
+
739
+ return New;
740
+ }