see5-installer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/info.c ADDED
@@ -0,0 +1,146 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Calculate information, information gain, and print dists */
30
+ /* -------------------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ /*************************************************************************/
40
+ /* */
41
+ /* Given Freq[][] and ValFreq[], compute the information gain. */
42
+ /* */
43
+ /*************************************************************************/
44
+
45
+
46
+ double ComputeGain(double BaseInfo, float UnknFrac, DiscrValue MaxVal,
47
+ CaseCount TotalCases)
48
+ /* ----------- */
49
+ {
50
+ DiscrValue v;
51
+ double ThisInfo=0.0;
52
+
53
+ /* Check whether all values are unknown or the same */
54
+
55
+ if ( ! TotalCases ) return None;
56
+
57
+ /* Compute total info after split, by summing the
58
+ info of each of the subsets formed by the test */
59
+
60
+ ForEach(v, 1, MaxVal)
61
+ {
62
+ ThisInfo += TotalInfo(GEnv.Freq[v], 1, MaxClass);
63
+ }
64
+ ThisInfo /= TotalCases;
65
+
66
+ /* Set the gain in information for all cases, adjusted for unknowns */
67
+
68
+ return ( BaseInfo <= ThisInfo ? 0.0 :
69
+ (1 - UnknFrac) * (BaseInfo - ThisInfo) );
70
+
71
+ }
72
+
73
+
74
+
75
+ /*************************************************************************/
76
+ /* */
77
+ /* Compute the total information in V[ MinVal..MaxVal ] */
78
+ /* */
79
+ /*************************************************************************/
80
+
81
+
82
+ double TotalInfo(double V[], DiscrValue MinVal, DiscrValue MaxVal)
83
+ /* --------- */
84
+ {
85
+ DiscrValue v;
86
+ double Sum=0.0, TotalCases=0;
87
+ CaseCount N;
88
+
89
+ ForEach(v, MinVal, MaxVal)
90
+ {
91
+ N = V[v];
92
+
93
+ Sum += N * Log(N);
94
+ TotalCases += N;
95
+ }
96
+
97
+ return TotalCases * Log(TotalCases) - Sum;
98
+ }
99
+
100
+
101
+
102
+ /*************************************************************************/
103
+ /* */
104
+ /* Print distribution table for given attribute */
105
+ /* */
106
+ /*************************************************************************/
107
+
108
+
109
+ void PrintDistribution(Attribute Att, DiscrValue MinVal, DiscrValue MaxVal,
110
+ double **Freq, double *ValFreq, Boolean ShowNames)
111
+ /* ----------------- */
112
+ {
113
+ DiscrValue v;
114
+ ClassNo c;
115
+ String Val;
116
+
117
+ fprintf(Of, "\n\t\t\t ");
118
+ ForEach(c, 1, MaxClass)
119
+ {
120
+ fprintf(Of, "%7.6s", ClassName[c]);
121
+ }
122
+ fprintf(Of, "\n");
123
+
124
+ ForEach(v, MinVal, MaxVal)
125
+ {
126
+ if ( ShowNames )
127
+ {
128
+ Val = ( ! v ? "unknown" :
129
+ MaxAttVal[Att] ? AttValName[Att][v] :
130
+ v == 1 ? "N/A" :
131
+ v == 2 ? "below" : "above" );
132
+ fprintf(Of, "\t\t[%-7.7s:", Val);
133
+ }
134
+ else
135
+ {
136
+ fprintf(Of, "\t\t[%-7d:", v);
137
+ }
138
+
139
+ ForEach(c, 1, MaxClass)
140
+ {
141
+ fprintf(Of, " %6.1f", Freq[v][c]);
142
+ }
143
+
144
+ fprintf(Of, "]\n");
145
+ }
146
+ }
data/ext/c5.0/mcost.c ADDED
@@ -0,0 +1,138 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Read variable misclassification costs */
30
+ /* ------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ void GetMCosts(FILE *Cf)
40
+ /* --------- */
41
+ {
42
+ ClassNo Pred, Real, p, r;
43
+ char Name[1000];
44
+ CaseNo i;
45
+ float Val, Sum=0;
46
+
47
+ LineNo = 0;
48
+
49
+ /* Read entries from cost file */
50
+
51
+ while ( ReadName(Cf, Name, 1000, ':') )
52
+ {
53
+ if ( ! (Pred = Which(Name, ClassName, 1, MaxClass)) )
54
+ {
55
+ Error(BADCOSTCLASS, Name, "");
56
+ }
57
+
58
+ if ( ! ReadName(Cf, Name, 1000, ':') ||
59
+ ! (Real = Which(Name, ClassName, 1, MaxClass)) )
60
+ {
61
+ Error(BADCOSTCLASS, Name, "");
62
+ }
63
+
64
+ if ( ! ReadName(Cf, Name, 1000, ':') ||
65
+ sscanf(Name, "%f", &Val) != 1 || Val < 0 )
66
+ {
67
+ Error(BADCOST, "", "");
68
+ Val = 1;
69
+ }
70
+
71
+ if ( Pred > 0 && Real > 0 && Pred != Real && Val != 1 )
72
+ {
73
+ /* Have a non-trivial cost entry */
74
+
75
+ if ( ! MCost )
76
+ {
77
+ /* Set up cost matrices */
78
+
79
+ MCost = Alloc(MaxClass+1, float *);
80
+ ForEach(p, 1, MaxClass)
81
+ {
82
+ MCost[p] = Alloc(MaxClass+1, float);
83
+ ForEach(r, 1, MaxClass)
84
+ {
85
+ MCost[p][r] = ( p == r ? 0.0 : 1.0 );
86
+ }
87
+ }
88
+ }
89
+
90
+ MCost[Pred][Real] = Val;
91
+ }
92
+ }
93
+ fclose(Cf);
94
+
95
+ /* Don't need weights etc. for predict or interpret, or
96
+ if not using cost weighting */
97
+
98
+ if ( ! (CostWeights = MaxClass == 2 && MaxCase >= 0 && MCost) )
99
+ {
100
+ return;
101
+ }
102
+
103
+ /* Determine class frequency distribution */
104
+
105
+ ClassFreq = AllocZero(MaxClass+1, double);
106
+
107
+ if ( CWtAtt )
108
+ {
109
+ AvCWt = 1; /* relative weights not yet set */
110
+ ForEach(i, 0, MaxCase)
111
+ {
112
+ ClassFreq[Class(Case[i])] += RelCWt(Case[i]);
113
+ }
114
+ }
115
+ else
116
+ {
117
+ ForEach(i, 0, MaxCase)
118
+ {
119
+ ClassFreq[Class(Case[i])]++;
120
+ }
121
+ }
122
+
123
+ /* Find normalised weight multipliers */
124
+
125
+ WeightMul = Alloc(3, float);
126
+
127
+ Sum = (ClassFreq[1] * MCost[2][1] + ClassFreq[2] * MCost[1][2]) /
128
+ (ClassFreq[1] + ClassFreq[2]);
129
+
130
+ WeightMul[1] = MCost[2][1] / Sum;
131
+ WeightMul[2] = MCost[1][2] / Sum;
132
+
133
+ /* Adjust MINITEMS to take account of case reweighting */
134
+
135
+ MINITEMS *= Min(WeightMul[1], WeightMul[2]);
136
+
137
+ Free(ClassFreq); ClassFreq = Nil;
138
+ }
data/ext/c5.0/modelfiles.c ADDED
@@ -0,0 +1,952 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Routines for saving and reading model files */
30
+ /* ------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+ int Entry;
39
+
40
+ char* Prop[]={"null",
41
+ "att",
42
+ "class",
43
+ "cut",
44
+ "conds",
45
+ "elts",
46
+ "entries",
47
+ "forks",
48
+ "freq",
49
+ "id",
50
+ "type",
51
+ "low",
52
+ "mid",
53
+ "high",
54
+ "result",
55
+ "rules",
56
+ "val",
57
+ "lift",
58
+ "cover",
59
+ "ok",
60
+ "default",
61
+ "costs",
62
+ "sample",
63
+ "init"
64
+ };
65
+
66
+ char PropName[20],
67
+ *PropVal=Nil,
68
+ *Unquoted;
69
+ int PropValSize=0;
70
+
71
+ #define PROPS 23
72
+
73
+ #define ERRORP 0
74
+ #define ATTP 1
75
+ #define CLASSP 2
76
+ #define CUTP 3
77
+ #define CONDSP 4
78
+ #define ELTSP 5
79
+ #define ENTRIESP 6
80
+ #define FORKSP 7
81
+ #define FREQP 8
82
+ #define IDP 9
83
+ #define TYPEP 10
84
+ #define LOWP 11
85
+ #define MIDP 12
86
+ #define HIGHP 13
87
+ #define RESULTP 14
88
+ #define RULESP 15
89
+ #define VALP 16
90
+ #define LIFTP 17
91
+ #define COVERP 18
92
+ #define OKP 19
93
+ #define DEFAULTP 20
94
+ #define COSTSP 21
95
+ #define SAMPLEP 22
96
+ #define INITP 23
97
+
98
+
99
+ /*************************************************************************/
100
+ /* */
101
+ /* Check whether file is open. If it is not, open it and */
102
+ /* read/write sampling information and discrete names */
103
+ /* */
104
+ /*************************************************************************/
105
+
106
+
107
+ void CheckFile(String Extension, Boolean Write)
108
+ /* --------- */
109
+ {
110
+ static char *LastExt="";
111
+
112
+ if ( ! TRf || strcmp(LastExt, Extension) )
113
+ {
114
+ LastExt = Extension;
115
+
116
+ if ( TRf )
117
+ {
118
+ fprintf(TRf, "\n");
119
+ fclose(TRf);
120
+ }
121
+
122
+ if ( Write )
123
+ {
124
+ WriteFilePrefix(Extension);
125
+ }
126
+ else
127
+ {
128
+ ReadFilePrefix(Extension);
129
+ }
130
+ }
131
+ }
132
+
133
+
134
+
135
+ /*************************************************************************/
136
+ /* */
137
+ /* Write information on system, sampling */
138
+ /* */
139
+ /*************************************************************************/
140
+
141
+
142
+ void WriteFilePrefix(String Extension)
143
+ /* --------------- */
144
+ {
145
+ time_t clock;
146
+ struct tm *now;
147
+
148
+ if ( ! (TRf = GetFile(Extension, "w")) )
149
+ {
150
+ Error(NOFILE, Fn, E_ForWrite);
151
+ }
152
+
153
+ clock = time(0);
154
+ now = localtime(&clock);
155
+ now->tm_mon++;
156
+ fprintf(TRf, "id=\"See5/C5.0 %s %d-%d%d-%d%d\"\n",
157
+ RELEASE,
158
+ now->tm_year + 1900,
159
+ now->tm_mon / 10, now->tm_mon % 10,
160
+ now->tm_mday / 10, now->tm_mday % 10);
161
+
162
+ if ( MCost )
163
+ {
164
+ fprintf(TRf, "costs=\"1\"\n");
165
+ }
166
+
167
+ if ( SAMPLE > 0 )
168
+ {
169
+ fprintf(TRf, "sample=\"%g\" init=\"%d\"\n", SAMPLE, KRInit);
170
+ }
171
+
172
+ SaveDiscreteNames();
173
+
174
+ fprintf(TRf, "entries=\"%d\"\n", TRIALS);
175
+ }
176
+
177
+
178
+
179
+ /*************************************************************************/
180
+ /* */
181
+ /* Read header information */
182
+ /* */
183
+ /*************************************************************************/
184
+
185
+
186
+ void ReadFilePrefix(String Extension)
187
+ /* -------------- */
188
+ {
189
+ if ( ! (TRf = GetFile(Extension, "r")) ) Error(NOFILE, Fn, "");
190
+
191
+ StreamIn((char *) &TRIALS, sizeof(int));
192
+ if ( memcmp((char *) &TRIALS, "id=", 3) != 0 )
193
+ {
194
+ printf("\nCannot read old format classifiers\n");
195
+ exit(1);
196
+ }
197
+ else
198
+ {
199
+ rewind(TRf);
200
+ ReadHeader();
201
+ }
202
+ }
203
+
204
+
205
+
206
+ /*************************************************************************/
207
+ /* */
208
+ /* Save attribute values read with "discrete N" */
209
+ /* */
210
+ /*************************************************************************/
211
+
212
+
213
+ void SaveDiscreteNames()
214
+ /* ----------------- */
215
+ {
216
+ Attribute Att;
217
+ DiscrValue v;
218
+
219
+ ForEach(Att, 1, MaxAtt)
220
+ {
221
+ if ( ! StatBit(Att, DISCRETE) || MaxAttVal[Att] < 2 ) continue;
222
+
223
+ AsciiOut("att=", AttName[Att]);
224
+ AsciiOut(" elts=", AttValName[Att][2]); /* skip N/A */
225
+
226
+ ForEach(v, 3, MaxAttVal[Att])
227
+ {
228
+ AsciiOut(",", AttValName[Att][v]);
229
+ }
230
+ fprintf(TRf, "\n");
231
+ }
232
+ }
233
+
234
+
235
+
236
+ /*************************************************************************/
237
+ /* */
238
+ /* Save entire decision tree T in file with extension Extension */
239
+ /* */
240
+ /*************************************************************************/
241
+
242
+
243
+ void SaveTree(Tree T, String Extension)
244
+ /* -------- */
245
+ {
246
+ CheckFile(Extension, true);
247
+
248
+ OutTree(T);
249
+ }
250
+
251
+
252
+
253
+ void OutTree(Tree T)
254
+ /* ------- */
255
+ {
256
+ DiscrValue v, vv;
257
+ ClassNo c;
258
+ Boolean First;
259
+
260
+ fprintf(TRf, "type=\"%d\"", T->NodeType);
261
+ AsciiOut(" class=", ClassName[T->Leaf]);
262
+ if ( T->Cases > 0 )
263
+ {
264
+ fprintf(TRf, " freq=\"%g", T->ClassDist[1]);
265
+ ForEach(c, 2, MaxClass)
266
+ {
267
+ fprintf(TRf, ",%g", T->ClassDist[c]);
268
+ }
269
+ fprintf(TRf, "\"");
270
+ }
271
+
272
+ if ( T->NodeType )
273
+ {
274
+ AsciiOut(" att=", AttName[T->Tested]);
275
+ fprintf(TRf, " forks=\"%d\"", T->Forks);
276
+
277
+ switch ( T->NodeType )
278
+ {
279
+ case BrDiscr:
280
+ break;
281
+
282
+ case BrThresh:
283
+ fprintf(TRf, " cut=\"%.*g\"", PREC+1, T->Cut);
284
+ if ( T->Upper > T->Cut )
285
+ {
286
+ fprintf(TRf, " low=\"%.*g\" mid=\"%.*g\" high=\"%.*g\"",
287
+ PREC, T->Lower, PREC, T->Mid, PREC, T->Upper);
288
+ }
289
+ break;
290
+
291
+ case BrSubset:
292
+ ForEach(v, 1, T->Forks)
293
+ {
294
+ First=true;
295
+ ForEach(vv, 1, MaxAttVal[T->Tested])
296
+ {
297
+ if ( In(vv, T->Subset[v]) )
298
+ {
299
+ if ( First )
300
+ {
301
+ AsciiOut(" elts=", AttValName[T->Tested][vv]);
302
+ First = false;
303
+ }
304
+ else
305
+ {
306
+ AsciiOut(",", AttValName[T->Tested][vv]);
307
+ }
308
+ }
309
+ }
310
+ /* Make sure have printed at least one element */
311
+
312
+ if ( First ) AsciiOut(" elts=", "N/A");
313
+ }
314
+ break;
315
+ }
316
+ fprintf(TRf, "\n");
317
+
318
+ ForEach(v, 1, T->Forks)
319
+ {
320
+ OutTree(T->Branch[v]);
321
+ }
322
+ }
323
+ else
324
+ {
325
+ fprintf(TRf, "\n");
326
+ }
327
+ }
328
+
329
+
330
+
331
+ /*************************************************************************/
332
+ /* */
333
+ /* Save the current ruleset in rules file */
334
+ /* */
335
+ /*************************************************************************/
336
+
337
+
338
+ void SaveRules(CRuleSet RS, String Extension)
339
+ /* --------- */
340
+ {
341
+ int ri, d;
342
+ CRule R;
343
+ Condition C;
344
+ DiscrValue v;
345
+ Boolean First;
346
+
347
+ CheckFile(Extension, true);
348
+
349
+ fprintf(TRf, "rules=\"%d\"", RS->SNRules);
350
+ AsciiOut(" default=", ClassName[RS->SDefault]);
351
+ fprintf(TRf, "\n");
352
+
353
+ ForEach(ri, 1, RS->SNRules)
354
+ {
355
+ R = RS->SRule[ri];
356
+ fprintf(TRf, "conds=\"%d\" cover=\"%g\" ok=\"%g\" lift=\"%g\"",
357
+ R->Size, R->Cover, R->Correct,
358
+ (R->Correct + 1) / ((R->Cover + 2) * R->Prior));
359
+ AsciiOut(" class=", ClassName[R->Rhs]);
360
+ fprintf(TRf, "\n");
361
+
362
+ ForEach(d, 1, R->Size)
363
+ {
364
+ C = R->Lhs[d];
365
+
366
+ fprintf(TRf, "type=\"%d\"", C->NodeType);
367
+ AsciiOut(" att=", AttName[C->Tested]);
368
+
369
+ switch ( C->NodeType )
370
+ {
371
+ case BrDiscr:
372
+ AsciiOut(" val=", AttValName[C->Tested][C->TestValue]);
373
+ break;
374
+
375
+ case BrThresh:
376
+ if ( C->TestValue == 1 ) /* N/A */
377
+ {
378
+ fprintf(TRf, " val=\"N/A\"");
379
+ }
380
+ else
381
+ {
382
+ fprintf(TRf, " cut=\"%.*g\" result=\"%c\"",
383
+ PREC+1, C->Cut,
384
+ ( C->TestValue == 2 ? '<' : '>' ));
385
+ }
386
+ break;
387
+
388
+ case BrSubset:
389
+ First=true;
390
+ ForEach(v, 1, MaxAttVal[C->Tested])
391
+ {
392
+ if ( In(v, C->Subset) )
393
+ {
394
+ if ( First )
395
+ {
396
+ AsciiOut(" elts=", AttValName[C->Tested][v]);
397
+ First = false;
398
+ }
399
+ else
400
+ {
401
+ AsciiOut(",", AttValName[C->Tested][v]);
402
+ }
403
+ }
404
+ }
405
+ break;
406
+ }
407
+
408
+ fprintf(TRf, "\n");
409
+ }
410
+ }
411
+ }
412
+
413
+
414
+
415
+ /*************************************************************************/
416
+ /* */
417
+ /* Write ASCII string with prefix, escaping any quotes */
418
+ /* */
419
+ /*************************************************************************/
420
+
421
+
422
+ void AsciiOut(String Pre, String S)
423
+ /* -------- */
424
+ {
425
+ fprintf(TRf, "%s\"", Pre);
426
+ while ( *S )
427
+ {
428
+ if ( *S == '"' || *S == '\\' ) fputc('\\', TRf);
429
+ fputc(*S++, TRf);
430
+ }
431
+ fputc('"', TRf);
432
+ }
433
+
434
+
435
+
436
+ /*************************************************************************/
437
+ /* */
438
+ /* Read the header information (id, saved names, models) */
439
+ /* */
440
+ /*************************************************************************/
441
+
442
+
443
+ void ReadHeader()
444
+ /* --------- */
445
+ {
446
+ Attribute Att;
447
+ DiscrValue v;
448
+ char *p, Dummy;
449
+ int Year, Month, Day;
450
+ FILE *F;
451
+
452
+ while ( true )
453
+ {
454
+ switch ( ReadProp(&Dummy) )
455
+ {
456
+ case ERRORP:
457
+ return;
458
+
459
+ case IDP:
460
+ /* Recover year run and set base date for timestamps */
461
+
462
+ if ( sscanf(PropVal + strlen(PropVal) - 11,
463
+ "%d-%d-%d\"", &Year, &Month, &Day) == 3 )
464
+ {
465
+ SetTSBase(Year);
466
+ }
467
+ break;
468
+
469
+ case COSTSP:
470
+ /* Recover costs file used to generate model */
471
+
472
+ if ( (F = GetFile(".costs", "r")) )
473
+ {
474
+ GetMCosts(F);
475
+ }
476
+ break;
477
+ case SAMPLEP:
478
+ sscanf(PropVal, "\"%f\"", &SAMPLE);
479
+ break;
480
+
481
+ case INITP:
482
+ sscanf(PropVal, "\"%d\"", &KRInit);
483
+ break;
484
+
485
+ case ATTP:
486
+ Unquoted = RemoveQuotes(PropVal);
487
+ Att = Which(Unquoted, AttName, 1, MaxAtt);
488
+ if ( ! Att || Exclude(Att) )
489
+ {
490
+ Error(MODELFILE, E_MFATT, Unquoted);
491
+ }
492
+ break;
493
+
494
+ case ELTSP:
495
+ MaxAttVal[Att] = 1;
496
+ AttValName[Att][1] = strdup("N/A");
497
+
498
+ for ( p = PropVal ; *p ; )
499
+ {
500
+ p = RemoveQuotes(p);
501
+ v = ++MaxAttVal[Att];
502
+ AttValName[Att][v] = strdup(p);
503
+
504
+ for ( p += strlen(p) ; *p != '"' ; p++ )
505
+ ;
506
+ p++;
507
+ if ( *p == ',' ) p++;
508
+ }
509
+ AttValName[Att][MaxAttVal[Att]+1] = "<other>";
510
+ MaxDiscrVal = Max(MaxDiscrVal, MaxAttVal[Att]+1);
511
+ break;
512
+
513
+ case ENTRIESP:
514
+ sscanf(PropVal, "\"%d\"", &TRIALS);
515
+ Entry = 0;
516
+ return;
517
+ }
518
+ }
519
+ }
520
+
521
+
522
+
523
+ /*************************************************************************/
524
+ /* */
525
+ /* Retrieve decision tree with extension Extension */
526
+ /* */
527
+ /*************************************************************************/
528
+
529
+
530
+ Tree GetTree(String Extension)
531
+ /* ------- */
532
+ {
533
+ CheckFile(Extension, false);
534
+
535
+ return InTree();
536
+ }
537
+
538
+
539
+
540
+ Tree InTree()
541
+ /* ------ */
542
+ {
543
+ Tree T;
544
+ DiscrValue v, Subset=0;
545
+ char Delim, *p;
546
+ ClassNo c;
547
+ int X;
548
+ double XD;
549
+
550
+ T = (Tree) AllocZero(1, TreeRec);
551
+
552
+ do
553
+ {
554
+ switch ( ReadProp(&Delim) )
555
+ {
556
+ case ERRORP:
557
+ return Nil;
558
+
559
+ case TYPEP:
560
+ sscanf(PropVal, "\"%d\"", &X); T->NodeType = X;
561
+ break;
562
+
563
+ case CLASSP:
564
+ Unquoted = RemoveQuotes(PropVal);
565
+ T->Leaf = Which(Unquoted, ClassName, 1, MaxClass);
566
+ if ( ! T->Leaf ) Error(MODELFILE, E_MFCLASS, Unquoted);
567
+ break;
568
+
569
+ case ATTP:
570
+ Unquoted = RemoveQuotes(PropVal);
571
+ T->Tested = Which(Unquoted, AttName, 1, MaxAtt);
572
+ if ( ! T->Tested || Exclude(T->Tested) )
573
+ {
574
+ Error(MODELFILE, E_MFATT, Unquoted);
575
+ }
576
+ break;
577
+
578
+ case CUTP:
579
+ sscanf(PropVal, "\"%lf\"", &XD); T->Cut = XD;
580
+ T->Lower = T->Mid = T->Upper = T->Cut;
581
+ break;
582
+
583
+ case LOWP:
584
+ sscanf(PropVal, "\"%lf\"", &XD); T->Lower = XD;
585
+ break;
586
+
587
+ case MIDP:
588
+ sscanf(PropVal, "\"%lf\"", &XD); T->Mid = XD;
589
+ break;
590
+
591
+ case HIGHP:
592
+ sscanf(PropVal, "\"%lf\"", &XD); T->Upper = XD;
593
+ break;
594
+
595
+ case FORKSP:
596
+ sscanf(PropVal, "\"%d\"", &T->Forks);
597
+ break;
598
+
599
+ case FREQP:
600
+ T->ClassDist = Alloc(MaxClass+1, CaseCount);
601
+ p = PropVal+1;
602
+
603
+ ForEach(c, 1, MaxClass)
604
+ {
605
+ T->ClassDist[c] = strtod(p, &p);
606
+ T->Cases += T->ClassDist[c];
607
+ p++;
608
+ }
609
+ break;
610
+
611
+ case ELTSP:
612
+ if ( ! Subset++ )
613
+ {
614
+ T->Subset = AllocZero(T->Forks+1, Set);
615
+ }
616
+
617
+ T->Subset[Subset] = MakeSubset(T->Tested);
618
+ break;
619
+ }
620
+ }
621
+ while ( Delim == ' ' );
622
+
623
+ if ( T->ClassDist )
624
+ {
625
+ T->Errors = T->Cases - T->ClassDist[T->Leaf];
626
+ }
627
+ else
628
+ {
629
+ T->ClassDist = Alloc(1, CaseCount);
630
+ }
631
+
632
+ if ( T->NodeType )
633
+ {
634
+ T->Branch = AllocZero(T->Forks+1, Tree);
635
+ ForEach(v, 1, T->Forks)
636
+ {
637
+ T->Branch[v] = InTree();
638
+ }
639
+ }
640
+
641
+ return T;
642
+ }
643
+
644
+
645
+
646
+ /*************************************************************************/
647
+ /* */
648
+ /* Retrieve ruleset with extension Extension */
649
+ /* (Separate functions for ruleset, single rule, single condition) */
650
+ /* */
651
+ /*************************************************************************/
652
+
653
+
654
+ CRuleSet GetRules(String Extension)
655
+ /* -------- */
656
+ {
657
+ CheckFile(Extension, false);
658
+
659
+ return InRules();
660
+ }
661
+
662
+
663
+
664
+ CRuleSet InRules()
665
+ /* ------- */
666
+ {
667
+ CRuleSet RS;
668
+ RuleNo r;
669
+ char Delim;
670
+
671
+ RS = Alloc(1, RuleSetRec);
672
+
673
+ do
674
+ {
675
+ switch ( ReadProp(&Delim) )
676
+ {
677
+ case ERRORP:
678
+ return Nil;
679
+
680
+ case RULESP:
681
+ sscanf(PropVal, "\"%d\"", &RS->SNRules);
682
+ CheckActiveSpace(RS->SNRules);
683
+ break;
684
+
685
+ case DEFAULTP:
686
+ Unquoted = RemoveQuotes(PropVal);
687
+ RS->SDefault = Which(Unquoted, ClassName, 1, MaxClass);
688
+ if ( ! RS->SDefault ) Error(MODELFILE, E_MFCLASS, Unquoted);
689
+ break;
690
+ }
691
+ }
692
+ while ( Delim == ' ' );
693
+
694
+ /* Read each rule */
695
+
696
+ RS->SRule = Alloc(RS->SNRules+1, CRule);
697
+ ForEach(r, 1, RS->SNRules)
698
+ {
699
+ if ( (RS->SRule[r] = InRule()) )
700
+ {
701
+ RS->SRule[r]->RNo = r;
702
+ RS->SRule[r]->TNo = Entry;
703
+ }
704
+ }
705
+ ConstructRuleTree(RS);
706
+ Entry++;
707
+ return RS;
708
+ }
709
+
710
+
711
+
712
+ CRule InRule()
713
+ /* ------ */
714
+ {
715
+ CRule R;
716
+ int d;
717
+ char Delim;
718
+ float Lift;
719
+
720
+ R = Alloc(1, RuleRec);
721
+
722
+ do
723
+ {
724
+ switch ( ReadProp(&Delim) )
725
+ {
726
+ case ERRORP:
727
+ return Nil;
728
+
729
+ case CONDSP:
730
+ sscanf(PropVal, "\"%d\"", &R->Size);
731
+ break;
732
+
733
+ case COVERP:
734
+ sscanf(PropVal, "\"%f\"", &R->Cover);
735
+ break;
736
+
737
+ case OKP:
738
+ sscanf(PropVal, "\"%f\"", &R->Correct);
739
+ break;
740
+
741
+ case LIFTP:
742
+ sscanf(PropVal, "\"%f\"", &Lift);
743
+ R->Prior = (R->Correct + 1) / ((R->Cover + 2) * Lift);
744
+ break;
745
+
746
+ case CLASSP:
747
+ Unquoted = RemoveQuotes(PropVal);
748
+ R->Rhs = Which(Unquoted, ClassName, 1, MaxClass);
749
+ if ( ! R->Rhs ) Error(MODELFILE, E_MFCLASS, Unquoted);
750
+ break;
751
+ }
752
+ }
753
+ while ( Delim == ' ' );
754
+
755
+ R->Lhs = Alloc(R->Size+1, Condition);
756
+ ForEach(d, 1, R->Size)
757
+ {
758
+ R->Lhs[d] = InCondition();
759
+ }
760
+
761
+ R->Vote = 1000 * (R->Correct + 1.0) / (R->Cover + 2.0) + 0.5;
762
+
763
+ return R;
764
+ }
765
+
766
+
767
+
768
+ Condition InCondition()
769
+ /* ----------- */
770
+ {
771
+ Condition C;
772
+ char Delim;
773
+ int X;
774
+ double XD;
775
+
776
+ C = Alloc(1, CondRec);
777
+
778
+ do
779
+ {
780
+ switch ( ReadProp(&Delim) )
781
+ {
782
+ case ERRORP:
783
+ return Nil;
784
+
785
+ case TYPEP:
786
+ sscanf(PropVal, "\"%d\"", &X); C->NodeType = X;
787
+ break;
788
+
789
+ case ATTP:
790
+ Unquoted = RemoveQuotes(PropVal);
791
+ C->Tested = Which(Unquoted, AttName, 1, MaxAtt);
792
+ if ( ! C->Tested || Exclude(C->Tested) )
793
+ {
794
+ Error(MODELFILE, E_MFATT, Unquoted);
795
+ }
796
+ break;
797
+
798
+ case CUTP:
799
+ sscanf(PropVal, "\"%lf\"", &XD); C->Cut = XD;
800
+ break;
801
+
802
+ case RESULTP:
803
+ C->TestValue = ( PropVal[1] == '<' ? 2 : 3 );
804
+ break;
805
+
806
+ case VALP:
807
+ if ( Continuous(C->Tested) )
808
+ {
809
+ C->TestValue = 1;
810
+ }
811
+ else
812
+ {
813
+ Unquoted = RemoveQuotes(PropVal);
814
+ C->TestValue = Which(Unquoted,
815
+ AttValName[C->Tested],
816
+ 1, MaxAttVal[C->Tested]);
817
+ if ( ! C->TestValue ) Error(MODELFILE, E_MFATTVAL, Unquoted);
818
+ }
819
+ break;
820
+
821
+ case ELTSP:
822
+ C->Subset = MakeSubset(C->Tested);
823
+ C->TestValue = 1;
824
+ break;
825
+ }
826
+ }
827
+ while ( Delim == ' ' );
828
+
829
+ return C;
830
+ }
831
+
832
+
833
+
834
+ /*************************************************************************/
835
+ /* */
836
+ /* ASCII reading utilities */
837
+ /* */
838
+ /*************************************************************************/
839
+
840
+
841
+ int ReadProp(char *Delim)
842
+ /* -------- */
843
+ {
844
+ int c, i;
845
+ char *p;
846
+ Boolean Quote=false;
847
+
848
+ for ( p = PropName ; (c = fgetc(TRf)) != '=' ; )
849
+ {
850
+ if ( p - PropName >= 19 || c == EOF )
851
+ {
852
+ Error(MODELFILE, E_MFEOF, "");
853
+ PropName[0] = PropVal[0] = *Delim = '\00';
854
+ return 0;
855
+ }
856
+ *p++ = c;
857
+ }
858
+ *p = '\00';
859
+
860
+ for ( p = PropVal ; ((c = fgetc(TRf)) != ' ' && c != '\n') || Quote ; )
861
+ {
862
+ if ( c == EOF )
863
+ {
864
+ Error(MODELFILE, E_MFEOF, "");
865
+ PropName[0] = PropVal[0] = '\00';
866
+ return 0;
867
+ }
868
+
869
+ if ( (i = p - PropVal) >= PropValSize )
870
+ {
871
+ Realloc(PropVal, (PropValSize += 10000) + 3, char);
872
+ p = PropVal + i;
873
+ }
874
+
875
+ *p++ = c;
876
+ if ( c == '\\' )
877
+ {
878
+ *p++ = fgetc(TRf);
879
+ }
880
+ else
881
+ if ( c == '"' )
882
+ {
883
+ Quote = ! Quote;
884
+ }
885
+ }
886
+ *p = '\00';
887
+ *Delim = c;
888
+
889
+ return Which(PropName, Prop, 1, PROPS);
890
+ }
891
+
892
+
893
+ String RemoveQuotes(String S)
894
+ /* ------------ */
895
+ {
896
+ char *p, *Start;
897
+
898
+ p = Start = S;
899
+
900
+ for ( S++ ; *S != '"' ; S++ )
901
+ {
902
+ if ( *S == '\\' ) S++;
903
+ *p++ = *S;
904
+ *S = '-';
905
+ }
906
+ *p = '\00';
907
+
908
+ return Start;
909
+ }
910
+
911
+
912
+
913
+ Set MakeSubset(Attribute Att)
914
+ /* ---------- */
915
+ {
916
+ int Bytes, b;
917
+ char *p;
918
+ Set S;
919
+
920
+ Bytes = (MaxAttVal[Att]>>3) + 1;
921
+ S = AllocZero(Bytes, Byte);
922
+
923
+ for ( p = PropVal ; *p ; )
924
+ {
925
+ p = RemoveQuotes(p);
926
+ b = Which(p, AttValName[Att], 1, MaxAttVal[Att]);
927
+ if ( ! b ) Error(MODELFILE, E_MFATTVAL, p);
928
+ SetBit(b, S);
929
+
930
+ for ( p += strlen(p) ; *p != '"' ; p++ )
931
+ ;
932
+ p++;
933
+ if ( *p == ',' ) p++;
934
+ }
935
+
936
+ return S;
937
+ }
938
+
939
+
940
+
941
+ /*************************************************************************/
942
+ /* */
943
+ /* Character stream read for binary routines */
944
+ /* */
945
+ /*************************************************************************/
946
+
947
+
948
+ void StreamIn(String S, int n)
949
+ /* -------- */
950
+ {
951
+ while ( n-- ) *S++ = getc(TRf);
952
+ }