see5-installer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +11 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +10 -0
  6. data/README.md +29 -0
  7. data/Rakefile +12 -0
  8. data/ext/c5.0/Makefile +86 -0
  9. data/ext/c5.0/attwinnow.c +394 -0
  10. data/ext/c5.0/c50.c +330 -0
  11. data/ext/c5.0/classify.c +700 -0
  12. data/ext/c5.0/confmat.c +195 -0
  13. data/ext/c5.0/construct.c +853 -0
  14. data/ext/c5.0/contin.c +613 -0
  15. data/ext/c5.0/defns.i +788 -0
  16. data/ext/c5.0/discr.c +307 -0
  17. data/ext/c5.0/extern.i +170 -0
  18. data/ext/c5.0/formrules.c +720 -0
  19. data/ext/c5.0/formtree.c +1158 -0
  20. data/ext/c5.0/getdata.c +521 -0
  21. data/ext/c5.0/getnames.c +733 -0
  22. data/ext/c5.0/global.c +211 -0
  23. data/ext/c5.0/gpl.txt +674 -0
  24. data/ext/c5.0/implicitatt.c +1112 -0
  25. data/ext/c5.0/info.c +146 -0
  26. data/ext/c5.0/mcost.c +138 -0
  27. data/ext/c5.0/modelfiles.c +952 -0
  28. data/ext/c5.0/p-thresh.c +313 -0
  29. data/ext/c5.0/prune.c +1069 -0
  30. data/ext/c5.0/report.c +345 -0
  31. data/ext/c5.0/rules.c +579 -0
  32. data/ext/c5.0/ruletree.c +398 -0
  33. data/ext/c5.0/siftrules.c +1285 -0
  34. data/ext/c5.0/sort.c +156 -0
  35. data/ext/c5.0/subset.c +599 -0
  36. data/ext/c5.0/text.i +223 -0
  37. data/ext/c5.0/trees.c +740 -0
  38. data/ext/c5.0/update.c +129 -0
  39. data/ext/c5.0/utility.c +1146 -0
  40. data/ext/c5.0/xval +150 -0
  41. data/ext/c5.0/xval.c +402 -0
  42. data/ext/gritbot/Makefile +98 -0
  43. data/ext/gritbot/check.c +1110 -0
  44. data/ext/gritbot/cluster.c +342 -0
  45. data/ext/gritbot/common.c +1269 -0
  46. data/ext/gritbot/continatt.c +412 -0
  47. data/ext/gritbot/defns.i +623 -0
  48. data/ext/gritbot/discratt.c +459 -0
  49. data/ext/gritbot/extern.i +101 -0
  50. data/ext/gritbot/getdata.c +329 -0
  51. data/ext/gritbot/getnames.c +573 -0
  52. data/ext/gritbot/global.c +104 -0
  53. data/ext/gritbot/gpl.txt +674 -0
  54. data/ext/gritbot/gritbot.c +295 -0
  55. data/ext/gritbot/implicitatt.c +1108 -0
  56. data/ext/gritbot/inspect.c +794 -0
  57. data/ext/gritbot/modelfiles.c +687 -0
  58. data/ext/gritbot/outlier.c +415 -0
  59. data/ext/gritbot/sort.c +130 -0
  60. data/ext/gritbot/text.i +159 -0
  61. data/ext/gritbot/update.c +126 -0
  62. data/ext/gritbot/utility.c +1029 -0
  63. data/ext/see5-installer/extconf.rb +25 -0
  64. data/lib/see5/installer.rb +10 -0
  65. data/lib/see5/installer/version.rb +7 -0
  66. data/see5-installer.gemspec +30 -0
  67. metadata +115 -0
data/ext/c5.0/info.c ADDED
@@ -0,0 +1,146 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Calculate information, information gain, and print dists */
30
+ /* -------------------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ /*************************************************************************/
40
+ /* */
41
+ /* Given Freq[][] and ValFreq[], compute the information gain. */
42
+ /* */
43
+ /*************************************************************************/
44
+
45
+
46
+ double ComputeGain(double BaseInfo, float UnknFrac, DiscrValue MaxVal,
47
+ CaseCount TotalCases)
48
+ /* ----------- */
49
+ {
50
+ DiscrValue v;
51
+ double ThisInfo=0.0;
52
+
53
+ /* Check whether all values are unknown or the same */
54
+
55
+ if ( ! TotalCases ) return None;
56
+
57
+ /* Compute total info after split, by summing the
58
+ info of each of the subsets formed by the test */
59
+
60
+ ForEach(v, 1, MaxVal)
61
+ {
62
+ ThisInfo += TotalInfo(GEnv.Freq[v], 1, MaxClass);
63
+ }
64
+ ThisInfo /= TotalCases;
65
+
66
+ /* Set the gain in information for all cases, adjusted for unknowns */
67
+
68
+ return ( BaseInfo <= ThisInfo ? 0.0 :
69
+ (1 - UnknFrac) * (BaseInfo - ThisInfo) );
70
+
71
+ }
72
+
73
+
74
+
75
+ /*************************************************************************/
76
+ /* */
77
+ /* Compute the total information in V[ MinVal..MaxVal ] */
78
+ /* */
79
+ /*************************************************************************/
80
+
81
+
82
+ double TotalInfo(double V[], DiscrValue MinVal, DiscrValue MaxVal)
83
+ /* --------- */
84
+ {
85
+ DiscrValue v;
86
+ double Sum=0.0, TotalCases=0;
87
+ CaseCount N;
88
+
89
+ ForEach(v, MinVal, MaxVal)
90
+ {
91
+ N = V[v];
92
+
93
+ Sum += N * Log(N);
94
+ TotalCases += N;
95
+ }
96
+
97
+ return TotalCases * Log(TotalCases) - Sum;
98
+ }
99
+
100
+
101
+
102
+ /*************************************************************************/
103
+ /* */
104
+ /* Print distribution table for given attribute */
105
+ /* */
106
+ /*************************************************************************/
107
+
108
+
109
+ void PrintDistribution(Attribute Att, DiscrValue MinVal, DiscrValue MaxVal,
110
+ double **Freq, double *ValFreq, Boolean ShowNames)
111
+ /* ----------------- */
112
+ {
113
+ DiscrValue v;
114
+ ClassNo c;
115
+ String Val;
116
+
117
+ fprintf(Of, "\n\t\t\t ");
118
+ ForEach(c, 1, MaxClass)
119
+ {
120
+ fprintf(Of, "%7.6s", ClassName[c]);
121
+ }
122
+ fprintf(Of, "\n");
123
+
124
+ ForEach(v, MinVal, MaxVal)
125
+ {
126
+ if ( ShowNames )
127
+ {
128
+ Val = ( ! v ? "unknown" :
129
+ MaxAttVal[Att] ? AttValName[Att][v] :
130
+ v == 1 ? "N/A" :
131
+ v == 2 ? "below" : "above" );
132
+ fprintf(Of, "\t\t[%-7.7s:", Val);
133
+ }
134
+ else
135
+ {
136
+ fprintf(Of, "\t\t[%-7d:", v);
137
+ }
138
+
139
+ ForEach(c, 1, MaxClass)
140
+ {
141
+ fprintf(Of, " %6.1f", Freq[v][c]);
142
+ }
143
+
144
+ fprintf(Of, "]\n");
145
+ }
146
+ }
data/ext/c5.0/mcost.c ADDED
@@ -0,0 +1,138 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Read variable misclassification costs */
30
+ /* ------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+
39
+ void GetMCosts(FILE *Cf)
40
+ /* --------- */
41
+ {
42
+ ClassNo Pred, Real, p, r;
43
+ char Name[1000];
44
+ CaseNo i;
45
+ float Val, Sum=0;
46
+
47
+ LineNo = 0;
48
+
49
+ /* Read entries from cost file */
50
+
51
+ while ( ReadName(Cf, Name, 1000, ':') )
52
+ {
53
+ if ( ! (Pred = Which(Name, ClassName, 1, MaxClass)) )
54
+ {
55
+ Error(BADCOSTCLASS, Name, "");
56
+ }
57
+
58
+ if ( ! ReadName(Cf, Name, 1000, ':') ||
59
+ ! (Real = Which(Name, ClassName, 1, MaxClass)) )
60
+ {
61
+ Error(BADCOSTCLASS, Name, "");
62
+ }
63
+
64
+ if ( ! ReadName(Cf, Name, 1000, ':') ||
65
+ sscanf(Name, "%f", &Val) != 1 || Val < 0 )
66
+ {
67
+ Error(BADCOST, "", "");
68
+ Val = 1;
69
+ }
70
+
71
+ if ( Pred > 0 && Real > 0 && Pred != Real && Val != 1 )
72
+ {
73
+ /* Have a non-trivial cost entry */
74
+
75
+ if ( ! MCost )
76
+ {
77
+ /* Set up cost matrices */
78
+
79
+ MCost = Alloc(MaxClass+1, float *);
80
+ ForEach(p, 1, MaxClass)
81
+ {
82
+ MCost[p] = Alloc(MaxClass+1, float);
83
+ ForEach(r, 1, MaxClass)
84
+ {
85
+ MCost[p][r] = ( p == r ? 0.0 : 1.0 );
86
+ }
87
+ }
88
+ }
89
+
90
+ MCost[Pred][Real] = Val;
91
+ }
92
+ }
93
+ fclose(Cf);
94
+
95
+ /* Don't need weights etc. for predict or interpret, or
96
+ if not using cost weighting */
97
+
98
+ if ( ! (CostWeights = MaxClass == 2 && MaxCase >= 0 && MCost) )
99
+ {
100
+ return;
101
+ }
102
+
103
+ /* Determine class frequency distribution */
104
+
105
+ ClassFreq = AllocZero(MaxClass+1, double);
106
+
107
+ if ( CWtAtt )
108
+ {
109
+ AvCWt = 1; /* relative weights not yet set */
110
+ ForEach(i, 0, MaxCase)
111
+ {
112
+ ClassFreq[Class(Case[i])] += RelCWt(Case[i]);
113
+ }
114
+ }
115
+ else
116
+ {
117
+ ForEach(i, 0, MaxCase)
118
+ {
119
+ ClassFreq[Class(Case[i])]++;
120
+ }
121
+ }
122
+
123
+ /* Find normalised weight multipliers */
124
+
125
+ WeightMul = Alloc(3, float);
126
+
127
+ Sum = (ClassFreq[1] * MCost[2][1] + ClassFreq[2] * MCost[1][2]) /
128
+ (ClassFreq[1] + ClassFreq[2]);
129
+
130
+ WeightMul[1] = MCost[2][1] / Sum;
131
+ WeightMul[2] = MCost[1][2] / Sum;
132
+
133
+ /* Adjust MINITEMS to take account of case reweighting */
134
+
135
+ MINITEMS *= Min(WeightMul[1], WeightMul[2]);
136
+
137
+ Free(ClassFreq); ClassFreq = Nil;
138
+ }
data/ext/c5.0/modelfiles.c ADDED
@@ -0,0 +1,952 @@
1
+ /*************************************************************************/
2
+ /* */
3
+ /* Copyright 2010 Rulequest Research Pty Ltd. */
4
+ /* */
5
+ /* This file is part of C5.0 GPL Edition, a single-threaded version */
6
+ /* of C5.0 release 2.07. */
7
+ /* */
8
+ /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9
+ /* modify it under the terms of the GNU General Public License as */
10
+ /* published by the Free Software Foundation, either version 3 of the */
11
+ /* License, or (at your option) any later version. */
12
+ /* */
13
+ /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16
+ /* General Public License for more details. */
17
+ /* */
18
+ /* You should have received a copy of the GNU General Public License */
19
+ /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20
+ /* */
21
+ /* <http://www.gnu.org/licenses/>. */
22
+ /* */
23
+ /*************************************************************************/
24
+
25
+
26
+
27
+ /*************************************************************************/
28
+ /* */
29
+ /* Routines for saving and reading model files */
30
+ /* ------------------------------------------- */
31
+ /* */
32
+ /*************************************************************************/
33
+
34
+
35
+ #include "defns.i"
36
+ #include "extern.i"
37
+
38
+ int Entry;
39
+
40
+ char* Prop[]={"null",
41
+ "att",
42
+ "class",
43
+ "cut",
44
+ "conds",
45
+ "elts",
46
+ "entries",
47
+ "forks",
48
+ "freq",
49
+ "id",
50
+ "type",
51
+ "low",
52
+ "mid",
53
+ "high",
54
+ "result",
55
+ "rules",
56
+ "val",
57
+ "lift",
58
+ "cover",
59
+ "ok",
60
+ "default",
61
+ "costs",
62
+ "sample",
63
+ "init"
64
+ };
65
+
66
+ char PropName[20],
67
+ *PropVal=Nil,
68
+ *Unquoted;
69
+ int PropValSize=0;
70
+
71
+ #define PROPS 23
72
+
73
+ #define ERRORP 0
74
+ #define ATTP 1
75
+ #define CLASSP 2
76
+ #define CUTP 3
77
+ #define CONDSP 4
78
+ #define ELTSP 5
79
+ #define ENTRIESP 6
80
+ #define FORKSP 7
81
+ #define FREQP 8
82
+ #define IDP 9
83
+ #define TYPEP 10
84
+ #define LOWP 11
85
+ #define MIDP 12
86
+ #define HIGHP 13
87
+ #define RESULTP 14
88
+ #define RULESP 15
89
+ #define VALP 16
90
+ #define LIFTP 17
91
+ #define COVERP 18
92
+ #define OKP 19
93
+ #define DEFAULTP 20
94
+ #define COSTSP 21
95
+ #define SAMPLEP 22
96
+ #define INITP 23
97
+
98
+
99
+ /*************************************************************************/
100
+ /* */
101
+ /* Check whether file is open. If it is not, open it and */
102
+ /* read/write sampling information and discrete names */
103
+ /* */
104
+ /*************************************************************************/
105
+
106
+
107
+ void CheckFile(String Extension, Boolean Write)
108
+ /* --------- */
109
+ {
110
+ static char *LastExt="";
111
+
112
+ if ( ! TRf || strcmp(LastExt, Extension) )
113
+ {
114
+ LastExt = Extension;
115
+
116
+ if ( TRf )
117
+ {
118
+ fprintf(TRf, "\n");
119
+ fclose(TRf);
120
+ }
121
+
122
+ if ( Write )
123
+ {
124
+ WriteFilePrefix(Extension);
125
+ }
126
+ else
127
+ {
128
+ ReadFilePrefix(Extension);
129
+ }
130
+ }
131
+ }
132
+
133
+
134
+
135
+ /*************************************************************************/
136
+ /* */
137
+ /* Write information on system, sampling */
138
+ /* */
139
+ /*************************************************************************/
140
+
141
+
142
+ void WriteFilePrefix(String Extension)
143
+ /* --------------- */
144
+ {
145
+ time_t clock;
146
+ struct tm *now;
147
+
148
+ if ( ! (TRf = GetFile(Extension, "w")) )
149
+ {
150
+ Error(NOFILE, Fn, E_ForWrite);
151
+ }
152
+
153
+ clock = time(0);
154
+ now = localtime(&clock);
155
+ now->tm_mon++;
156
+ fprintf(TRf, "id=\"See5/C5.0 %s %d-%d%d-%d%d\"\n",
157
+ RELEASE,
158
+ now->tm_year + 1900,
159
+ now->tm_mon / 10, now->tm_mon % 10,
160
+ now->tm_mday / 10, now->tm_mday % 10);
161
+
162
+ if ( MCost )
163
+ {
164
+ fprintf(TRf, "costs=\"1\"\n");
165
+ }
166
+
167
+ if ( SAMPLE > 0 )
168
+ {
169
+ fprintf(TRf, "sample=\"%g\" init=\"%d\"\n", SAMPLE, KRInit);
170
+ }
171
+
172
+ SaveDiscreteNames();
173
+
174
+ fprintf(TRf, "entries=\"%d\"\n", TRIALS);
175
+ }
176
+
177
+
178
+
179
+ /*************************************************************************/
180
+ /* */
181
+ /* Read header information */
182
+ /* */
183
+ /*************************************************************************/
184
+
185
+
186
+ void ReadFilePrefix(String Extension)
187
+ /* -------------- */
188
+ {
189
+ if ( ! (TRf = GetFile(Extension, "r")) ) Error(NOFILE, Fn, "");
190
+
191
+ StreamIn((char *) &TRIALS, sizeof(int));
192
+ if ( memcmp((char *) &TRIALS, "id=", 3) != 0 )
193
+ {
194
+ printf("\nCannot read old format classifiers\n");
195
+ exit(1);
196
+ }
197
+ else
198
+ {
199
+ rewind(TRf);
200
+ ReadHeader();
201
+ }
202
+ }
203
+
204
+
205
+
206
+ /*************************************************************************/
207
+ /* */
208
+ /* Save attribute values read with "discrete N" */
209
+ /* */
210
+ /*************************************************************************/
211
+
212
+
213
+ void SaveDiscreteNames()
214
+ /* ----------------- */
215
+ {
216
+ Attribute Att;
217
+ DiscrValue v;
218
+
219
+ ForEach(Att, 1, MaxAtt)
220
+ {
221
+ if ( ! StatBit(Att, DISCRETE) || MaxAttVal[Att] < 2 ) continue;
222
+
223
+ AsciiOut("att=", AttName[Att]);
224
+ AsciiOut(" elts=", AttValName[Att][2]); /* skip N/A */
225
+
226
+ ForEach(v, 3, MaxAttVal[Att])
227
+ {
228
+ AsciiOut(",", AttValName[Att][v]);
229
+ }
230
+ fprintf(TRf, "\n");
231
+ }
232
+ }
233
+
234
+
235
+
236
+ /*************************************************************************/
237
+ /* */
238
+ /* Save entire decision tree T in file with extension Extension */
239
+ /* */
240
+ /*************************************************************************/
241
+
242
+
243
+ void SaveTree(Tree T, String Extension)
244
+ /* -------- */
245
+ {
246
+ CheckFile(Extension, true);
247
+
248
+ OutTree(T);
249
+ }
250
+
251
+
252
+
253
+ void OutTree(Tree T)
254
+ /* ------- */
255
+ {
256
+ DiscrValue v, vv;
257
+ ClassNo c;
258
+ Boolean First;
259
+
260
+ fprintf(TRf, "type=\"%d\"", T->NodeType);
261
+ AsciiOut(" class=", ClassName[T->Leaf]);
262
+ if ( T->Cases > 0 )
263
+ {
264
+ fprintf(TRf, " freq=\"%g", T->ClassDist[1]);
265
+ ForEach(c, 2, MaxClass)
266
+ {
267
+ fprintf(TRf, ",%g", T->ClassDist[c]);
268
+ }
269
+ fprintf(TRf, "\"");
270
+ }
271
+
272
+ if ( T->NodeType )
273
+ {
274
+ AsciiOut(" att=", AttName[T->Tested]);
275
+ fprintf(TRf, " forks=\"%d\"", T->Forks);
276
+
277
+ switch ( T->NodeType )
278
+ {
279
+ case BrDiscr:
280
+ break;
281
+
282
+ case BrThresh:
283
+ fprintf(TRf, " cut=\"%.*g\"", PREC+1, T->Cut);
284
+ if ( T->Upper > T->Cut )
285
+ {
286
+ fprintf(TRf, " low=\"%.*g\" mid=\"%.*g\" high=\"%.*g\"",
287
+ PREC, T->Lower, PREC, T->Mid, PREC, T->Upper);
288
+ }
289
+ break;
290
+
291
+ case BrSubset:
292
+ ForEach(v, 1, T->Forks)
293
+ {
294
+ First=true;
295
+ ForEach(vv, 1, MaxAttVal[T->Tested])
296
+ {
297
+ if ( In(vv, T->Subset[v]) )
298
+ {
299
+ if ( First )
300
+ {
301
+ AsciiOut(" elts=", AttValName[T->Tested][vv]);
302
+ First = false;
303
+ }
304
+ else
305
+ {
306
+ AsciiOut(",", AttValName[T->Tested][vv]);
307
+ }
308
+ }
309
+ }
310
+ /* Make sure have printed at least one element */
311
+
312
+ if ( First ) AsciiOut(" elts=", "N/A");
313
+ }
314
+ break;
315
+ }
316
+ fprintf(TRf, "\n");
317
+
318
+ ForEach(v, 1, T->Forks)
319
+ {
320
+ OutTree(T->Branch[v]);
321
+ }
322
+ }
323
+ else
324
+ {
325
+ fprintf(TRf, "\n");
326
+ }
327
+ }
328
+
329
+
330
+
331
+ /*************************************************************************/
332
+ /* */
333
+ /* Save the current ruleset in rules file */
334
+ /* */
335
+ /*************************************************************************/
336
+
337
+
338
+ void SaveRules(CRuleSet RS, String Extension)
339
+ /* --------- */
340
+ {
341
+ int ri, d;
342
+ CRule R;
343
+ Condition C;
344
+ DiscrValue v;
345
+ Boolean First;
346
+
347
+ CheckFile(Extension, true);
348
+
349
+ fprintf(TRf, "rules=\"%d\"", RS->SNRules);
350
+ AsciiOut(" default=", ClassName[RS->SDefault]);
351
+ fprintf(TRf, "\n");
352
+
353
+ ForEach(ri, 1, RS->SNRules)
354
+ {
355
+ R = RS->SRule[ri];
356
+ fprintf(TRf, "conds=\"%d\" cover=\"%g\" ok=\"%g\" lift=\"%g\"",
357
+ R->Size, R->Cover, R->Correct,
358
+ (R->Correct + 1) / ((R->Cover + 2) * R->Prior));
359
+ AsciiOut(" class=", ClassName[R->Rhs]);
360
+ fprintf(TRf, "\n");
361
+
362
+ ForEach(d, 1, R->Size)
363
+ {
364
+ C = R->Lhs[d];
365
+
366
+ fprintf(TRf, "type=\"%d\"", C->NodeType);
367
+ AsciiOut(" att=", AttName[C->Tested]);
368
+
369
+ switch ( C->NodeType )
370
+ {
371
+ case BrDiscr:
372
+ AsciiOut(" val=", AttValName[C->Tested][C->TestValue]);
373
+ break;
374
+
375
+ case BrThresh:
376
+ if ( C->TestValue == 1 ) /* N/A */
377
+ {
378
+ fprintf(TRf, " val=\"N/A\"");
379
+ }
380
+ else
381
+ {
382
+ fprintf(TRf, " cut=\"%.*g\" result=\"%c\"",
383
+ PREC+1, C->Cut,
384
+ ( C->TestValue == 2 ? '<' : '>' ));
385
+ }
386
+ break;
387
+
388
+ case BrSubset:
389
+ First=true;
390
+ ForEach(v, 1, MaxAttVal[C->Tested])
391
+ {
392
+ if ( In(v, C->Subset) )
393
+ {
394
+ if ( First )
395
+ {
396
+ AsciiOut(" elts=", AttValName[C->Tested][v]);
397
+ First = false;
398
+ }
399
+ else
400
+ {
401
+ AsciiOut(",", AttValName[C->Tested][v]);
402
+ }
403
+ }
404
+ }
405
+ break;
406
+ }
407
+
408
+ fprintf(TRf, "\n");
409
+ }
410
+ }
411
+ }
412
+
413
+
414
+
415
+ /*************************************************************************/
416
+ /* */
417
+ /* Write ASCII string with prefix, escaping any quotes */
418
+ /* */
419
+ /*************************************************************************/
420
+
421
+
422
+ void AsciiOut(String Pre, String S)
423
+ /* -------- */
424
+ {
425
+ fprintf(TRf, "%s\"", Pre);
426
+ while ( *S )
427
+ {
428
+ if ( *S == '"' || *S == '\\' ) fputc('\\', TRf);
429
+ fputc(*S++, TRf);
430
+ }
431
+ fputc('"', TRf);
432
+ }
433
+
434
+
435
+
436
+ /*************************************************************************/
437
+ /* */
438
+ /* Read the header information (id, saved names, models) */
439
+ /* */
440
+ /*************************************************************************/
441
+
442
+
443
+ void ReadHeader()
444
+ /* --------- */
445
+ {
446
+ Attribute Att;
447
+ DiscrValue v;
448
+ char *p, Dummy;
449
+ int Year, Month, Day;
450
+ FILE *F;
451
+
452
+ while ( true )
453
+ {
454
+ switch ( ReadProp(&Dummy) )
455
+ {
456
+ case ERRORP:
457
+ return;
458
+
459
+ case IDP:
460
+ /* Recover year run and set base date for timestamps */
461
+
462
+ if ( sscanf(PropVal + strlen(PropVal) - 11,
463
+ "%d-%d-%d\"", &Year, &Month, &Day) == 3 )
464
+ {
465
+ SetTSBase(Year);
466
+ }
467
+ break;
468
+
469
+ case COSTSP:
470
+ /* Recover costs file used to generate model */
471
+
472
+ if ( (F = GetFile(".costs", "r")) )
473
+ {
474
+ GetMCosts(F);
475
+ }
476
+ break;
477
+ case SAMPLEP:
478
+ sscanf(PropVal, "\"%f\"", &SAMPLE);
479
+ break;
480
+
481
+ case INITP:
482
+ sscanf(PropVal, "\"%d\"", &KRInit);
483
+ break;
484
+
485
+ case ATTP:
486
+ Unquoted = RemoveQuotes(PropVal);
487
+ Att = Which(Unquoted, AttName, 1, MaxAtt);
488
+ if ( ! Att || Exclude(Att) )
489
+ {
490
+ Error(MODELFILE, E_MFATT, Unquoted);
491
+ }
492
+ break;
493
+
494
+ case ELTSP:
495
+ MaxAttVal[Att] = 1;
496
+ AttValName[Att][1] = strdup("N/A");
497
+
498
+ for ( p = PropVal ; *p ; )
499
+ {
500
+ p = RemoveQuotes(p);
501
+ v = ++MaxAttVal[Att];
502
+ AttValName[Att][v] = strdup(p);
503
+
504
+ for ( p += strlen(p) ; *p != '"' ; p++ )
505
+ ;
506
+ p++;
507
+ if ( *p == ',' ) p++;
508
+ }
509
+ AttValName[Att][MaxAttVal[Att]+1] = "<other>";
510
+ MaxDiscrVal = Max(MaxDiscrVal, MaxAttVal[Att]+1);
511
+ break;
512
+
513
+ case ENTRIESP:
514
+ sscanf(PropVal, "\"%d\"", &TRIALS);
515
+ Entry = 0;
516
+ return;
517
+ }
518
+ }
519
+ }
520
+
521
+
522
+
523
+ /*************************************************************************/
524
+ /* */
525
+ /* Retrieve decision tree with extension Extension */
526
+ /* */
527
+ /*************************************************************************/
528
+
529
+
530
+ Tree GetTree(String Extension)
531
+ /* ------- */
532
+ {
533
+ CheckFile(Extension, false);
534
+
535
+ return InTree();
536
+ }
537
+
538
+
539
+
540
+ Tree InTree()
541
+ /* ------ */
542
+ {
543
+ Tree T;
544
+ DiscrValue v, Subset=0;
545
+ char Delim, *p;
546
+ ClassNo c;
547
+ int X;
548
+ double XD;
549
+
550
+ T = (Tree) AllocZero(1, TreeRec);
551
+
552
+ do
553
+ {
554
+ switch ( ReadProp(&Delim) )
555
+ {
556
+ case ERRORP:
557
+ return Nil;
558
+
559
+ case TYPEP:
560
+ sscanf(PropVal, "\"%d\"", &X); T->NodeType = X;
561
+ break;
562
+
563
+ case CLASSP:
564
+ Unquoted = RemoveQuotes(PropVal);
565
+ T->Leaf = Which(Unquoted, ClassName, 1, MaxClass);
566
+ if ( ! T->Leaf ) Error(MODELFILE, E_MFCLASS, Unquoted);
567
+ break;
568
+
569
+ case ATTP:
570
+ Unquoted = RemoveQuotes(PropVal);
571
+ T->Tested = Which(Unquoted, AttName, 1, MaxAtt);
572
+ if ( ! T->Tested || Exclude(T->Tested) )
573
+ {
574
+ Error(MODELFILE, E_MFATT, Unquoted);
575
+ }
576
+ break;
577
+
578
+ case CUTP:
579
+ sscanf(PropVal, "\"%lf\"", &XD); T->Cut = XD;
580
+ T->Lower = T->Mid = T->Upper = T->Cut;
581
+ break;
582
+
583
+ case LOWP:
584
+ sscanf(PropVal, "\"%lf\"", &XD); T->Lower = XD;
585
+ break;
586
+
587
+ case MIDP:
588
+ sscanf(PropVal, "\"%lf\"", &XD); T->Mid = XD;
589
+ break;
590
+
591
+ case HIGHP:
592
+ sscanf(PropVal, "\"%lf\"", &XD); T->Upper = XD;
593
+ break;
594
+
595
+ case FORKSP:
596
+ sscanf(PropVal, "\"%d\"", &T->Forks);
597
+ break;
598
+
599
+ case FREQP:
600
+ T->ClassDist = Alloc(MaxClass+1, CaseCount);
601
+ p = PropVal+1;
602
+
603
+ ForEach(c, 1, MaxClass)
604
+ {
605
+ T->ClassDist[c] = strtod(p, &p);
606
+ T->Cases += T->ClassDist[c];
607
+ p++;
608
+ }
609
+ break;
610
+
611
+ case ELTSP:
612
+ if ( ! Subset++ )
613
+ {
614
+ T->Subset = AllocZero(T->Forks+1, Set);
615
+ }
616
+
617
+ T->Subset[Subset] = MakeSubset(T->Tested);
618
+ break;
619
+ }
620
+ }
621
+ while ( Delim == ' ' );
622
+
623
+ if ( T->ClassDist )
624
+ {
625
+ T->Errors = T->Cases - T->ClassDist[T->Leaf];
626
+ }
627
+ else
628
+ {
629
+ T->ClassDist = Alloc(1, CaseCount);
630
+ }
631
+
632
+ if ( T->NodeType )
633
+ {
634
+ T->Branch = AllocZero(T->Forks+1, Tree);
635
+ ForEach(v, 1, T->Forks)
636
+ {
637
+ T->Branch[v] = InTree();
638
+ }
639
+ }
640
+
641
+ return T;
642
+ }
643
+
644
+
645
+
646
+ /*************************************************************************/
647
+ /* */
648
+ /* Retrieve ruleset with extension Extension */
649
+ /* (Separate functions for ruleset, single rule, single condition) */
650
+ /* */
651
+ /*************************************************************************/
652
+
653
+
654
+ CRuleSet GetRules(String Extension)
655
+ /* -------- */
656
+ {
657
+ CheckFile(Extension, false);
658
+
659
+ return InRules();
660
+ }
661
+
662
+
663
+
664
+ CRuleSet InRules()
665
+ /* ------- */
666
+ {
667
+ CRuleSet RS;
668
+ RuleNo r;
669
+ char Delim;
670
+
671
+ RS = Alloc(1, RuleSetRec);
672
+
673
+ do
674
+ {
675
+ switch ( ReadProp(&Delim) )
676
+ {
677
+ case ERRORP:
678
+ return Nil;
679
+
680
+ case RULESP:
681
+ sscanf(PropVal, "\"%d\"", &RS->SNRules);
682
+ CheckActiveSpace(RS->SNRules);
683
+ break;
684
+
685
+ case DEFAULTP:
686
+ Unquoted = RemoveQuotes(PropVal);
687
+ RS->SDefault = Which(Unquoted, ClassName, 1, MaxClass);
688
+ if ( ! RS->SDefault ) Error(MODELFILE, E_MFCLASS, Unquoted);
689
+ break;
690
+ }
691
+ }
692
+ while ( Delim == ' ' );
693
+
694
+ /* Read each rule */
695
+
696
+ RS->SRule = Alloc(RS->SNRules+1, CRule);
697
+ ForEach(r, 1, RS->SNRules)
698
+ {
699
+ if ( (RS->SRule[r] = InRule()) )
700
+ {
701
+ RS->SRule[r]->RNo = r;
702
+ RS->SRule[r]->TNo = Entry;
703
+ }
704
+ }
705
+ ConstructRuleTree(RS);
706
+ Entry++;
707
+ return RS;
708
+ }
709
+
710
+
711
+
712
+ CRule InRule()
713
+ /* ------ */
714
+ {
715
+ CRule R;
716
+ int d;
717
+ char Delim;
718
+ float Lift;
719
+
720
+ R = Alloc(1, RuleRec);
721
+
722
+ do
723
+ {
724
+ switch ( ReadProp(&Delim) )
725
+ {
726
+ case ERRORP:
727
+ return Nil;
728
+
729
+ case CONDSP:
730
+ sscanf(PropVal, "\"%d\"", &R->Size);
731
+ break;
732
+
733
+ case COVERP:
734
+ sscanf(PropVal, "\"%f\"", &R->Cover);
735
+ break;
736
+
737
+ case OKP:
738
+ sscanf(PropVal, "\"%f\"", &R->Correct);
739
+ break;
740
+
741
+ case LIFTP:
742
+ sscanf(PropVal, "\"%f\"", &Lift);
743
+ R->Prior = (R->Correct + 1) / ((R->Cover + 2) * Lift);
744
+ break;
745
+
746
+ case CLASSP:
747
+ Unquoted = RemoveQuotes(PropVal);
748
+ R->Rhs = Which(Unquoted, ClassName, 1, MaxClass);
749
+ if ( ! R->Rhs ) Error(MODELFILE, E_MFCLASS, Unquoted);
750
+ break;
751
+ }
752
+ }
753
+ while ( Delim == ' ' );
754
+
755
+ R->Lhs = Alloc(R->Size+1, Condition);
756
+ ForEach(d, 1, R->Size)
757
+ {
758
+ R->Lhs[d] = InCondition();
759
+ }
760
+
761
+ R->Vote = 1000 * (R->Correct + 1.0) / (R->Cover + 2.0) + 0.5;
762
+
763
+ return R;
764
+ }
765
+
766
+
767
+
768
+ Condition InCondition()
769
+ /* ----------- */
770
+ {
771
+ Condition C;
772
+ char Delim;
773
+ int X;
774
+ double XD;
775
+
776
+ C = Alloc(1, CondRec);
777
+
778
+ do
779
+ {
780
+ switch ( ReadProp(&Delim) )
781
+ {
782
+ case ERRORP:
783
+ return Nil;
784
+
785
+ case TYPEP:
786
+ sscanf(PropVal, "\"%d\"", &X); C->NodeType = X;
787
+ break;
788
+
789
+ case ATTP:
790
+ Unquoted = RemoveQuotes(PropVal);
791
+ C->Tested = Which(Unquoted, AttName, 1, MaxAtt);
792
+ if ( ! C->Tested || Exclude(C->Tested) )
793
+ {
794
+ Error(MODELFILE, E_MFATT, Unquoted);
795
+ }
796
+ break;
797
+
798
+ case CUTP:
799
+ sscanf(PropVal, "\"%lf\"", &XD); C->Cut = XD;
800
+ break;
801
+
802
+ case RESULTP:
803
+ C->TestValue = ( PropVal[1] == '<' ? 2 : 3 );
804
+ break;
805
+
806
+ case VALP:
807
+ if ( Continuous(C->Tested) )
808
+ {
809
+ C->TestValue = 1;
810
+ }
811
+ else
812
+ {
813
+ Unquoted = RemoveQuotes(PropVal);
814
+ C->TestValue = Which(Unquoted,
815
+ AttValName[C->Tested],
816
+ 1, MaxAttVal[C->Tested]);
817
+ if ( ! C->TestValue ) Error(MODELFILE, E_MFATTVAL, Unquoted);
818
+ }
819
+ break;
820
+
821
+ case ELTSP:
822
+ C->Subset = MakeSubset(C->Tested);
823
+ C->TestValue = 1;
824
+ break;
825
+ }
826
+ }
827
+ while ( Delim == ' ' );
828
+
829
+ return C;
830
+ }
831
+
832
+
833
+
834
+ /*************************************************************************/
835
+ /* */
836
+ /* ASCII reading utilities */
837
+ /* */
838
+ /*************************************************************************/
839
+
840
+
841
+ int ReadProp(char *Delim)
842
+ /* -------- */
843
+ {
844
+ int c, i;
845
+ char *p;
846
+ Boolean Quote=false;
847
+
848
+ for ( p = PropName ; (c = fgetc(TRf)) != '=' ; )
849
+ {
850
+ if ( p - PropName >= 19 || c == EOF )
851
+ {
852
+ Error(MODELFILE, E_MFEOF, "");
853
+ PropName[0] = PropVal[0] = *Delim = '\00';
854
+ return 0;
855
+ }
856
+ *p++ = c;
857
+ }
858
+ *p = '\00';
859
+
860
+ for ( p = PropVal ; ((c = fgetc(TRf)) != ' ' && c != '\n') || Quote ; )
861
+ {
862
+ if ( c == EOF )
863
+ {
864
+ Error(MODELFILE, E_MFEOF, "");
865
+ PropName[0] = PropVal[0] = '\00';
866
+ return 0;
867
+ }
868
+
869
+ if ( (i = p - PropVal) >= PropValSize )
870
+ {
871
+ Realloc(PropVal, (PropValSize += 10000) + 3, char);
872
+ p = PropVal + i;
873
+ }
874
+
875
+ *p++ = c;
876
+ if ( c == '\\' )
877
+ {
878
+ *p++ = fgetc(TRf);
879
+ }
880
+ else
881
+ if ( c == '"' )
882
+ {
883
+ Quote = ! Quote;
884
+ }
885
+ }
886
+ *p = '\00';
887
+ *Delim = c;
888
+
889
+ return Which(PropName, Prop, 1, PROPS);
890
+ }
891
+
892
+
893
+ String RemoveQuotes(String S)
894
+ /* ------------ */
895
+ {
896
+ char *p, *Start;
897
+
898
+ p = Start = S;
899
+
900
+ for ( S++ ; *S != '"' ; S++ )
901
+ {
902
+ if ( *S == '\\' ) S++;
903
+ *p++ = *S;
904
+ *S = '-';
905
+ }
906
+ *p = '\00';
907
+
908
+ return Start;
909
+ }
910
+
911
+
912
+
913
+ Set MakeSubset(Attribute Att)
914
+ /* ---------- */
915
+ {
916
+ int Bytes, b;
917
+ char *p;
918
+ Set S;
919
+
920
+ Bytes = (MaxAttVal[Att]>>3) + 1;
921
+ S = AllocZero(Bytes, Byte);
922
+
923
+ for ( p = PropVal ; *p ; )
924
+ {
925
+ p = RemoveQuotes(p);
926
+ b = Which(p, AttValName[Att], 1, MaxAttVal[Att]);
927
+ if ( ! b ) Error(MODELFILE, E_MFATTVAL, p);
928
+ SetBit(b, S);
929
+
930
+ for ( p += strlen(p) ; *p != '"' ; p++ )
931
+ ;
932
+ p++;
933
+ if ( *p == ',' ) p++;
934
+ }
935
+
936
+ return S;
937
+ }
938
+
939
+
940
+
941
+ /*************************************************************************/
942
+ /* */
943
+ /* Character stream read for binary routines */
944
+ /* */
945
+ /*************************************************************************/
946
+
947
+
948
+ void StreamIn(String S, int n)
949
+ /* -------- */
950
+ {
951
+ while ( n-- ) *S++ = getc(TRf);
952
+ }