apriori 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (122) hide show
  1. data/History.txt +16 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +149 -0
  5. data/Rakefile +15 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +81 -0
  12. data/config/requirements.rb +29 -0
  13. data/examples/01_simple_example.rb +32 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori.rb +133 -0
  95. data/lib/apriori/adapter.rb +13 -0
  96. data/lib/apriori/association_rule.rb +89 -0
  97. data/lib/apriori/version.rb +9 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +13 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +248 -0
  118. data/website/index.txt +152 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +226 -0
@@ -0,0 +1,261 @@
1
+ /*----------------------------------------------------------------------
2
+ File : tract.h
3
+ Contents: item and transaction management
4
+ Author : Christian Borgelt
5
+ History : 2001.11.18 file created from file apriori.c
6
+ 2001.12.28 first version completed
7
+ 2002.01.02 ta_sort mapped to v_intsort
8
+ 2002.02.19 transaction tree functions added
9
+ 2003.07.17 functions is_filter, ta_filter, tas_filter added
10
+ 2003.08.21 parameter 'heap' added to tas_sort, tat_create
11
+ 2003.09.12 function tas_total added
12
+ 2003.09.20 empty transactions in input made possible
13
+ 2004.11.20 function tat_mark added
14
+ 2004.12.11 access functions for extended frequency added
15
+ 2004.12.15 function nim_trunc added
16
+ 2006.11.26 structures ISFMTR and ISEVAL added
17
+ 2007.02.13 adapted to modified tabscan module
18
+ 2008.06.30 support argument to ise_eval changed to double
19
+ ----------------------------------------------------------------------*/
20
+ #ifndef __TRACT__
21
+ #define __TRACT__
22
+ #ifndef NIMAPFN
23
+ #define NIMAPFN
24
+ #endif
25
+ #include "vecops.h"
26
+ #include "symtab.h"
27
+ #include "tabscan.h"
28
+
29
+ /*----------------------------------------------------------------------
30
+ Preprocessor Definitions
31
+ ----------------------------------------------------------------------*/
32
+ /* --- item appearance flags --- */
33
+ #define APP_NONE 0x00 /* item should be ignored */
34
+ #define APP_BODY 0x01 /* item may appear in rule body */
35
+ #define APP_HEAD 0x02 /* item may appear in rule head */
36
+ #define APP_BOTH (APP_HEAD|APP_BODY)
37
+
38
+ /* --- error codes --- */
39
+ #define E_NONE 0 /* no error */
40
+ #define E_NOMEM (-1) /* not enough memory */
41
+ #define E_FOPEN (-2) /* cannot open file */
42
+ #define E_FREAD (-3) /* read error on file */
43
+ #define E_FWRITE (-4) /* write error on file */
44
+
45
+ #define E_ITEMEXP (-16) /* item expected */
46
+ #define E_DUPITEM (-17) /* duplicate item */
47
+ #define E_APPEXP (-18) /* appearance indicator expected */
48
+ #define E_UNKAPP (-19) /* unknown appearance indicator */
49
+ #define E_FLDCNT (-20) /* too many fields */
50
+
51
+ /*----------------------------------------------------------------------
52
+ Type Definitions
53
+ ----------------------------------------------------------------------*/
54
+ typedef struct { /* --- an item --- */
55
+ int id; /* item identifier */
56
+ int frq; /* frequency in transactions */
57
+ int xfq; /* extended frequency (t.a. sizes) */
58
+ int app; /* appearance indicator */
59
+ } ITEM; /* (item) */
60
+
61
+ typedef struct { /* --- an item set --- */
62
+ NIMAP *nimap; /* name/identifier map */
63
+ TABSCAN *tscan; /* table scanner */
64
+ char chars[4]; /* special characters */
65
+ int tac; /* transaction counter */
66
+ int app; /* default appearance indicator */
67
+ int vsz; /* size of transaction buffer */
68
+ int cnt; /* number of items in transaction */
69
+ int *items; /* items in transaction */
70
+ } ITEMSET; /* (item set) */
71
+
72
+ typedef struct { /* --- an item set evaluator --- */
73
+ double logta; /* logarithm of num. of transactions */
74
+ double *logfs; /* logarithms of item frequencies */
75
+ double lsums[1]; /* sums of logarithms for prefixes */
76
+ } ISEVAL; /* (item set evaluator) */
77
+
78
+ typedef struct { /* --- item set formatter --- */
79
+ int cnt; /* number of formatted item names */
80
+ int len; /* length of description in buffer */
81
+ int *offs; /* prefix lengths in output buffer */
82
+ char *buf; /* output buffer */
83
+ const char *names[1]; /* formatted item names */
84
+ } ISFMTR; /* (item set formatter) */
85
+
86
+ typedef struct { /* --- a transaction --- */
87
+ int cnt; /* number of items */
88
+ int items[1]; /* item identifier vector */
89
+ } TRACT; /* (transaction) */
90
+
91
+ typedef struct { /* --- a transaction set --- */
92
+ ITEMSET *itemset; /* underlying item set */
93
+ int max; /* maximum number of items per t.a. */
94
+ int vsz; /* size of transaction vector */
95
+ int cnt; /* number of transactions */
96
+ int total; /* total number of items */
97
+ TRACT **tracts; /* transaction vector */
98
+ } TASET; /* (transaction set) */
99
+
100
+ typedef struct _tatree { /* --- a transaction tree (node) --- */
101
+ int cnt; /* number of transactions */
102
+ int max; /* size of largest transaction */
103
+ int size; /* node size (number of children) */
104
+ int items[1]; /* next items in rep. transactions */
105
+ } TATREE; /* (transaction tree) */
106
+
107
+ /*----------------------------------------------------------------------
108
+ Item Set Functions
109
+ ----------------------------------------------------------------------*/
110
+ extern ITEMSET* is_create (int cnt);
111
+ extern void is_delete (ITEMSET *iset);
112
+ extern TABSCAN* is_tabscan (ITEMSET *iset);
113
+ extern void is_chars (ITEMSET *iset, const char *blanks,
114
+ const char *fldseps,
115
+ const char *recseps,
116
+ const char *cominds);
117
+
118
+ extern int is_cnt (ITEMSET *iset);
119
+ extern int is_item (ITEMSET *iset, const char *name);
120
+ extern const char* is_name (ITEMSET *iset, int item);
121
+
122
+ extern int is_gettac (ITEMSET *iset);
123
+ extern int is_settac (ITEMSET *iset, int cnt);
124
+ extern int is_addtac (ITEMSET *iset, int cnt);
125
+ extern int is_getfrq (ITEMSET *iset, int item);
126
+ extern int is_setfrq (ITEMSET *iset, int item, int frq);
127
+ extern int is_addfrq (ITEMSET *iset, int item, int frq);
128
+ extern int is_getxfq (ITEMSET *iset, int item);
129
+ extern int is_setxfq (ITEMSET *iset, int item, int frq);
130
+ extern int is_getapp (ITEMSET *iset, int item);
131
+ extern int is_setapp (ITEMSET *iset, int item, int app);
132
+
133
+ extern int is_readapp (ITEMSET *iset, FILE *file);
134
+ extern int is_read (ITEMSET *iset, FILE *file);
135
+
136
+ extern int is_recode (ITEMSET *iset, int minfrq,
137
+ int dir, int *map);
138
+ extern void is_trunc (ITEMSET *iset, int cnt);
139
+ extern int is_filter (ITEMSET *iset, const char *marks);
140
+ extern int is_tsize (ITEMSET *iset);
141
+ extern int* is_tract (ITEMSET *iset);
142
+
143
+ /*----------------------------------------------------------------------
144
+ Item Set Evaluation Functions
145
+ ----------------------------------------------------------------------*/
146
+ extern ISEVAL* ise_create (ITEMSET *iset, int tacnt);
147
+ extern void ise_delete (ISEVAL *eval);
148
+ extern double ise_eval (ISEVAL *eval, int *ids, int cnt, int pre,
149
+ double supp);
150
+
151
+ /*----------------------------------------------------------------------
152
+ Item Set Formatting Functions
153
+ ----------------------------------------------------------------------*/
154
+ extern ISFMTR* isf_create (ITEMSET *iset, int scan);
155
+ extern void isf_delete (ISFMTR *fmt);
156
+ extern const char* isf_format (ISFMTR *fmt, int *ids, int cnt, int pre);
157
+ extern int isf_length (ISFMTR *fmt);
158
+ extern void isf_print (ISFMTR *fmt, FILE *out);
159
+
160
+ /*----------------------------------------------------------------------
161
+ Transaction Functions
162
+ ----------------------------------------------------------------------*/
163
+ extern void ta_sort (int *items, int n);
164
+ extern int ta_unique (int *items, int n);
165
+ extern int ta_filter (int *items, int n, const char *marks);
166
+
167
+ /*----------------------------------------------------------------------
168
+ Transaction Set Functions
169
+ ----------------------------------------------------------------------*/
170
+ extern TASET* tas_create (ITEMSET *itemset);
171
+ extern void tas_delete (TASET *taset, int delis);
172
+ extern ITEMSET* tas_itemset (TASET *taset);
173
+
174
+ extern int tas_cnt (TASET *taset);
175
+ extern int tas_add (TASET *taset, const int *items, int n);
176
+ extern int* tas_tract (TASET *taset, int index);
177
+ extern int tas_tsize (TASET *taset, int index);
178
+ extern int tas_total (TASET *taset);
179
+
180
+ extern void tas_recode (TASET *taset, int *map, int cnt);
181
+ extern int tas_filter (TASET *taset, const char *marks);
182
+ extern void tas_shuffle (TASET *taset, double randfn(void));
183
+ extern void tas_sort (TASET *taset, int heap);
184
+ extern int tas_occur (TASET *taset, const int *items, int n);
185
+
186
+ #ifndef NDEBUG
187
+ extern void tas_show (TASET *taset);
188
+ #endif
189
+
190
+ /*----------------------------------------------------------------------
191
+ Transaction Tree Functions
192
+ ----------------------------------------------------------------------*/
193
+ extern TATREE* tat_create (TASET *taset, int heap);
194
+ extern void tat_delete (TATREE *tat);
195
+ extern int tat_cnt (TATREE *tat);
196
+ extern int tat_max (TATREE *tat);
197
+ extern int tat_size (TATREE *tat);
198
+ extern int* tat_items (TATREE *tat);
199
+ extern int tat_item (TATREE *tat, int index);
200
+ extern TATREE* tat_child (TATREE *tat, int index);
201
+ extern void tat_mark (TATREE *tat);
202
+
203
+ #ifndef NDEBUG
204
+ extern void tat_show (TATREE *tat);
205
+ #endif
206
+
207
+ /*----------------------------------------------------------------------
208
+ Preprocessor Definitions
209
+ ----------------------------------------------------------------------*/
210
+ #define is_tabscan(s) ((s)->tscan)
211
+
212
+ #define is_cnt(s) nim_cnt((s)->nimap)
213
+ #define is_name(s,i) nim_name(nim_byid((s)->nimap, i))
214
+ #define is_gettac(s) ((s)->tac)
215
+ #define is_settac(s,n) ((s)->tac = (n))
216
+ #define is_addtac(s,n) ((s)->tac += (n))
217
+ #define is_getfrq(s,i) (((ITEM*)nim_byid((s)->nimap, i))->frq)
218
+ #define is_setfrq(s,i,f) (((ITEM*)nim_byid((s)->nimap, i))->frq = (f))
219
+ #define is_addfrq(s,i,f) (((ITEM*)nim_byid((s)->nimap, i))->frq += (f))
220
+ #define is_getxfq(s,i) (((ITEM*)nim_byid((s)->nimap, i))->xfq)
221
+ #define is_setxfq(s,i,f) (((ITEM*)nim_byid((s)->nimap, i))->xfq = (f))
222
+ #define is_getapp(s,i) (((ITEM*)nim_byid((s)->nimap, i))->app)
223
+ #define is_setapp(s,i,a) (((ITEM*)nim_byid((s)->nimap, i))->app = (a))
224
+
225
+ #define is_trunc(s,n) nim_trunc((s)->nimap, n)
226
+
227
+ #define is_tsize(s) ((s)->cnt)
228
+ #define is_tract(s) ((s)->items)
229
+
230
+ /*--------------------------------------------------------------------*/
231
+ #define ise_delete(e) free(e)
232
+
233
+ /*--------------------------------------------------------------------*/
234
+ #define isf_length(f) ((f)->len)
235
+ #define isf_print(f,o) fwrite((f)->buf, sizeof(char), (f)->len, o)
236
+
237
+ /*--------------------------------------------------------------------*/
238
+ #define ta_sort(v,n) v_intsort(v,n)
239
+
240
+ /*--------------------------------------------------------------------*/
241
+ #define tas_itemset(s) ((s)->itemset)
242
+ #define tas_cnt(s) ((s)->cnt)
243
+ #define tas_max(s) ((s)->max)
244
+
245
+ #define tas_tract(s,i) ((s)->tracts[i]->items)
246
+ #define tas_tsize(s,i) ((s)->tracts[i]->cnt)
247
+ #define tas_total(s) ((s)->total)
248
+
249
+ #define tas_shuffle(s,f) v_shuffle((s)->tracts, (s)->cnt, f)
250
+
251
+ /*--------------------------------------------------------------------*/
252
+ #define tat_cnt(t) ((t)->cnt)
253
+ #define tat_max(t) ((t)->max)
254
+ #define tat_size(t) ((t)->size)
255
+ #define tat_item(t,i) ((t)->items[i])
256
+ #define tat_items(t) ((t)->items)
257
+ #ifndef ARCH64
258
+ #define tat_child(t,i) (((TATREE**)((t)->items +(t)->size))[i])
259
+ #endif
260
+
261
+ #endif
@@ -0,0 +1,757 @@
1
+ /*----------------------------------------------------------------------
2
+ File : apriori_wrapper.c (adapted from apriori.c)
3
+ Contents: apriori algorithm for finding association rules
4
+ Author : Christian Borgelt
5
+ History : 1996.02.14 file created
6
+ 1996.07.26 output precision reduced
7
+ 1996.11.22 options -b, -f, and -r added
8
+ 1996.11.24 option -e added (add. evaluation measures)
9
+ 1997.08.18 normalized chi^2 measure added
10
+ option -m (minimal rule length) added
11
+ 1997.10.13 quiet version (no output to stdout or stderr)
12
+ 1998.01.27 adapted to changed ist_create() function
13
+ 1998.08.08 optional input file (item appearances) added
14
+ 1998.09.02 several assertions added
15
+ 1998.09.07 hyperedge mode (option -h) added
16
+ 1998.12.08 output of absolute support (option -a) added
17
+ float changed to double
18
+ 1998.12.09 conversion of names to a scanable form added
19
+ 1999.02.05 long int changed to int
20
+ 1999.02.09 input from stdin, output to stdout added
21
+ 1999.08.09 bug in check of support parameter (<= 0) fixed
22
+ 1999.11.05 rule evaluation measure EM_AIMP added
23
+ 1999.11.08 output of add. rule eval. measure value added
24
+ 2000.03.16 optional use of original rule support definition
25
+ 2001.04.01 option -h replaced by option -t (target type)
26
+ 2001.05.26 extended support output added (option -x)
27
+ 2001.06.09 extended support output for item sets added
28
+ 2001.08.15 module scan used for output formatting
29
+ 2001.11.18 item and transaction functions made a module
30
+ 2001.11.19 options -C, -l changed, option -y removed
31
+ 2001.12.28 adapted to module tract, some improvements
32
+ 2002.01.11 evaluation measures codes changed to letters
33
+ 2002.02.10 option -q extended by a direction parameter
34
+ 2002.02.11 memory usage minimization option added
35
+ 2002.06.09 arbitrary supp./conf. formats made possible
36
+ 2003.01.09 option -k (item separator) added
37
+ 2003.01.14 check for empty transaction set added
38
+ 2003.03.12 output of lift value (conf/prior) added
39
+ 2003.07.17 item filtering w.r.t. usage added (option -u)
40
+ 2003.07.17 sorting w.r.t. transaction size sum added
41
+ 2003.07.18 maximal itemset filter added
42
+ 2003.08.11 closed itemset filter added
43
+ 2003.08.15 item filtering for transaction tree added
44
+ 2003.08.16 parameter for transaction filtering added
45
+ 2003.08.18 dynamic filtering decision based on times added
46
+ 2003.08.21 option -j (heap sort for transactions) added
47
+ 2003.09.22 meaning of option -j reversed (heapsort default)
48
+ 2004.03.25 option -S added (maximal support of a set/rule)
49
+ 2004.05.09 additional selection measure for sets added
50
+ 2004.10.28 two unnecessary assignments removed
51
+ 2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
52
+ 2004.11.23 absolute/relative support output changed
53
+ 2004.12.09 semantics of option -p changed
54
+ 2005.01.25 bug in output of absolute/relative support fixed
55
+ 2005.01.31 another bug in this output fixed
56
+ 2005.06.20 use of flag for "no item sorting" corrected
57
+ 2007.02.13 adapted to modified module tabscan
58
+ 2008.03.13 additional hyperedge evaluation added
59
+ 2008.03.24 additional target added (association groups)
60
+ ----------------------------------------------------------------------*/
61
+ #include <stdio.h>
62
+ #include <stdlib.h>
63
+ #include <stdarg.h>
64
+ #include <string.h>
65
+ #include <limits.h>
66
+ #include <math.h>
67
+ #include <time.h>
68
+ #include <assert.h>
69
+ #include "apriori_wrapper.h"
70
+ #include "scan.h"
71
+ #include "tract.h"
72
+ #include "istree.h"
73
+ #ifdef STORAGE
74
+ #include "storage.h"
75
+ #endif
76
+
77
+ // #include "symbtab.h"
78
+
79
+ /*----------------------------------------------------------------------
80
+ Preprocessor Definitions
81
+ ----------------------------------------------------------------------*/
82
/* --- program identification (used in the startup/usage messages) --- */
#define PRGNAME     "apriori"
#define DESCRIPTION "find association rules with the apriori algorithm"
#define VERSION     "version 4.35 (2008.03.24) " \
                    "(c) 1996-2008 Christian Borgelt"

/* --- target types --- */
/* (these values are the indices into the ttypes[] name table) */
#define TT_SET        0         /* frequent item sets */
#define TT_CLSET      1         /* closed item sets */
#define TT_MFSET      2         /* maximal item sets */
#define TT_RULE       3         /* association rules */
#define TT_HEDGE      4         /* association hyperedges */
#define TT_GROUP      5         /* association groups */

/* --- error codes --- */
/* These continue the codes of tract.h (0 to -4 and -16 to -20).    */
/* The negated code is the index of its message in errmsgs[], so    */
/* the two lists must be kept in step.                              */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* too few/many arguments */
#define E_STDIN     (-8)        /* double assignment of stdin */
#define E_TARGET    (-9)        /* invalid target type */
#define E_SUPP     (-10)        /* invalid support */
#define E_CONF     (-11)        /* invalid confidence */
#define E_MEASURE  (-12)        /* invalid evaluation measure */
#define E_RULELEN  (-13)        /* invalid rule length */
#define E_NOTAS    (-14)        /* no items or transactions */
#define E_NOFREQ   (-15)        /* no frequent items */
#define E_UNKNOWN  (-21)        /* unknown error */

/* MSG(x) wraps a message output statement so that it can be removed */
/* at compile time (QUIET) or be followed by a flush of stderr.      */
/* NOTE(review): as written, fflush(stderr) is appended when FFLUSH  */
/* is NOT defined, and omitted when it is; the macro name and the    */
/* comment on the #else branch suggest the opposite may have been    */
/* intended -- confirm before relying on either setting.             */
#ifndef QUIET                   /* if not quiet version */
#ifdef FFLUSH
#define MSG(x)        x         /* print messages */
#else                           /* if to flush every output */
#define MSG(x)        x, fflush(stderr)
#endif
#else                           /* if quiet version */
#define MSG(x)                  /* suppress messages */
#endif

/* seconds elapsed since the clock() value t */
#define SEC_SINCE(t)  ((clock()-(t)) /(double)CLOCKS_PER_SEC)
/* record count of the item set's table scanner, reduced by one */
/* if the last delimiter reported by the scanner is TS_REC      */
#define RECCNT(s)     (ts_reccnt(is_tabscan(s)) \
                      - ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0))
/* buffer of the item set's table scanner */
#define BUFFER(s)     ts_buf(is_tabscan(s))
123
+
124
+ /*----------------------------------------------------------------------
125
+ Constants
126
+ ----------------------------------------------------------------------*/
127
#ifndef QUIET                   /* if not quiet version */
/* --- target types --- */
/* Display names of the target types; the position of an entry is */
/* the TT_* value it belongs to (the three item set targets share */
/* the name "set").                                                */
static const char *ttypes[] = {
  /* TT_SET     0 */  "set",
  /* TT_CLSET   1 */  "set",
  /* TT_MFSET   2 */  "set",
  /* TT_RULE    3 */  "rule",
  /* TT_HEDGE   4 */  "hyperedge",
  /* TT_GROUP   5 */  "group",
};

/* --- error messages --- */
/* printf-style message formats; error() looks a message up with   */
/* errmsgs[-code], so the position of each entry must equal the    */
/* negated error code and no entry may be removed or reordered.    */
/* The conversion specifications in a format determine which       */
/* additional arguments a call to error() has to supply.           */
static const char *errmsgs[] = {
  /* E_NONE      0 */  "no error\n",
  /* E_NOMEM    -1 */  "not enough memory\n",
  /* E_FOPEN    -2 */  "cannot open file %s\n",
  /* E_FREAD    -3 */  "read error on file %s\n",
  /* E_FWRITE   -4 */  "write error on file %s\n",
  /* E_OPTION   -5 */  "unknown option -%c\n",
  /* E_OPTARG   -6 */  "missing option argument\n",
  /* E_ARGCNT   -7 */  "wrong number of arguments\n",
  /* E_STDIN    -8 */  "double assignment of standard input\n",
  /* E_TARGET   -9 */  "invalid target type '%c'\n",
  /* E_SUPP    -10 */  "invalid minimal support %g%%\n",
  /* E_CONF    -11 */  "invalid minimal confidence %g%%\n",
  /* E_MEASURE -12 */  "invalid additional evaluation measure %c\n",
  /* E_RULELEN -13 */  "invalid set size/rule length %d\n",
  /* E_NOTAS   -14 */  "no items or transactions to work on\n",
  /* E_NOFREQ  -15 */  "no frequent items\n",
  /* E_ITEMEXP -16 */  "file %s, record %d: item expected\n",
  /* E_DUPITEM -17 */  "file %s, record %d: duplicate item %s\n",
  /* E_APPEXP  -18 */  "file %s, record %d: "
                       "appearance indicator expected\n",
  /* E_UNKAPP  -19 */  "file %s, record %d: "
                       "unknown appearance indicator %s\n",
  /* E_FLDCNT  -20 */  "file %s, record %d: too many fields\n",
  /* E_UNKNOWN -21 */  "unknown error\n"
};
#endif
166
+
167
+ /*----------------------------------------------------------------------
168
+ Global Variables
169
+ ----------------------------------------------------------------------*/
170
+ #ifndef QUIET
171
+ static char *prgname; /* program name for error messages */
172
+ #endif
173
+ static ITEMSET *itemset = NULL; /* item set */
174
+ static TASET *taset = NULL; /* transaction set */
175
+ static TATREE *tatree = NULL; /* transaction tree */
176
+ static ISTREE *istree = NULL; /* item set tree */
177
+ static FILE *in = NULL; /* input file */
178
+ static FILE *out = NULL; /* output file */
179
+
180
+ /*----------------------------------------------------------------------
181
+ Main Functions
182
+ ----------------------------------------------------------------------*/
183
+
184
/* Print the list of additional evaluation measures (option -e#) to */
/* stdout and terminate the process with exit status 0. In a QUIET  */
/* build nothing is printed, but the process is still terminated.   */
void help (void)
{                               /* --- print help on eval. measures */
  #ifndef QUIET
  static const char *const text[] = {
    "additional evaluation measures (option -e#)\n",
    "frequent item sets:\n",
    "d or 1: binary logarithm of support quotient\n",
    "association rules:\n",
    "d or 1: absolute confidence difference to prior\n",
    "q or 2: absolute difference of confidence quotient to 1\n",
    "a or 3: absolute difference of improvement value to 1\n",
    "i or 4: information difference to prior\n",
    "c or 5: normalized chi^2 measure\n",
    "p or 6: p-value computed from chi^2 measure\n",
  };
  size_t k;                     /* index of the current help line */

  fprintf(stderr, "\n");        /* terminate startup message */
  for (k = 0; k < sizeof(text)/sizeof(text[0]); k++)
    fputs(text[k], stdout);     /* lines contain no conversions */
  #endif
  exit(0);                      /* abort the program */
}  /* help() */
201
+
202
+ /*--------------------------------------------------------------------*/
203
+
204
+ static void error (int code, ...)
205
+ { /* --- print an error message */
206
+ #ifndef QUIET /* if not quiet version */
207
+ va_list args; /* list of variable arguments */
208
+ const char *msg; /* error message */
209
+
210
+ assert(prgname); /* check the program name */
211
+ if (code < E_UNKNOWN) code = E_UNKNOWN;
212
+ if (code < 0) { /* if to report an error, */
213
+ msg = errmsgs[-code]; /* get the error message */
214
+ if (!msg) msg = errmsgs[-E_UNKNOWN];
215
+ fprintf(stderr, "\n%s: ", prgname);
216
+ va_start(args, code); /* get variable arguments */
217
+ vfprintf(stderr, msg, args);/* print error message */
218
+ va_end(args); /* end argument evaluation */
219
+ }
220
+ #endif
221
+ #ifndef NDEBUG /* if debug version */
222
+ if (istree) ist_delete(istree); /* clean up memory */
223
+ if (tatree) tat_delete(tatree); /* and close files */
224
+ if (taset) tas_delete(taset, 0);
225
+ if (itemset) is_delete(itemset);
226
+ if (in && (in != stdin)) fclose(in);
227
+ if (out && (out != stdout)) fclose(out);
228
+ #endif
229
+ #ifdef STORAGE /* if storage debugging */
230
+ showmem("at end of program"); /* check memory usage */
231
+ #endif
232
+ exit(code); /* abort the program */
233
+ } /* error() */
234
+
235
+ /*--------------------------------------------------------------------*/
236
+
237
+ int do_apriori (int argc, char *argv[])
238
+ { /* --- main function */
239
+ int i, k = 0, n; /* loop variables, counters */
240
+ char *s; /* to traverse the options */
241
+ char **optarg = NULL; /* option argument */
242
+ char *fn_in = NULL; /* name of input file */
243
+ char *fn_out = NULL; /* name of output file */
244
+ char *fn_app = NULL; /* name of item appearances file */
245
+ char *blanks = NULL; /* blanks */
246
+ char *fldseps = NULL; /* field separators */
247
+ char *recseps = NULL; /* record separators */
248
+ char *comment = NULL; /* comment indicators */
249
+ char *used = NULL; /* item usage vector */
250
+ double supp = 0.1; /* minimal support (in percent) */
251
+ double smax = 1.0; /* maximal support (in percent) */
252
+ double conf = 0.8; /* minimal confidence (in percent) */
253
+ int mode = IST_BODY; /* search mode (rule support def.) */
254
+ int target = 'r'; /* target type (sets/rules/h.edges) */
255
+ int arem = 0; /* additional rule evaluation measure */
256
+ int lift = 0; /* flag for printing the lift */
257
+ double minval = 0.1; /* minimal evaluation measure value */
258
+ double lftval = 0; /* lift value (confidence/prior) */
259
+ int minlen = 1; /* minimal rule length */
260
+ int maxlen = INT_MAX; /* maximal rule length */
261
+ int load = 1; /* flag for loading transactions */
262
+ int sort = 2; /* flag for item sorting and recoding */
263
+ double filter = 0.1; /* item usage filtering parameter */
264
+ int tree = 1; /* flag for transaction tree */
265
+ int heap = 1; /* flag for heap sort vs. quick sort */
266
+ int c2scf = 0; /* flag for conv. to scanable form */
267
+ char *sep = " "; /* item separator for output */
268
+ char *fmt = "%.1f"; /* output format for support/conf. */
269
+ int sout = 1; /* flag for abs./rel. support output */
270
+ int ext = 0; /* flag for extended support output */
271
+ int aval = 0; /* flag for add. eval. measure value */
272
+ int maxcnt = 0; /* maximal number of items per set */
273
+ int tacnt; /* number of transactions */
274
+ int frq; /* frequency of an item set */
275
+ int *map, *set; /* identifier map, item set */
276
+ int verbose = 0; /* flag for verboseness */
277
+ const char *name; /* buffer for item names */
278
+ static char buf[4*TS_SIZE+4]; /* buffer for formatting */
279
+ clock_t t, tt, tc, x; /* timer for measurements */
280
+
281
+ #ifndef QUIET /* if not quiet version */
282
+ prgname = argv[0]; /* get program name for error msgs. */
283
+
284
+ /* --- print usage message --- */
285
+ if (argc > 1) { /* if arguments are given */
286
+ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
287
+ fprintf(stderr, VERSION); } /* print a startup message */
288
+ else { /* if no arguments given */
289
+ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
290
+ printf("%s\n", DESCRIPTION);
291
+ printf("%s\n", VERSION);
292
+ printf("-t# target type (default: association rules)\n"
293
+ " (s: item sets, c: closed item sets,"
294
+ " m: maximal item sets,\n"
295
+ " r: association rules,"
296
+ " h: association hyperedges)\n");
297
+ printf("-m# minimal number of items per set/rule/hyperedge "
298
+ "(default: %d)\n", minlen);
299
+ printf("-n# maximal number of items per set/rule/hyperedge "
300
+ "(default: no limit)\n");
301
+ printf("-s# minimal support of a set/rule/hyperedge "
302
+ "(default: %g%%)\n", supp *100);
303
+ printf("-S# maximal support of a set/rule/hyperedge "
304
+ "(default: %g%%)\n", smax *100);
305
+ printf("-c# minimal confidence of a rule/hyperedge "
306
+ "(default: %g%%)\n", conf *100);
307
+ printf("-o use original definition of the support of a rule "
308
+ "(body & head)\n");
309
+ printf("-k# item separator for output "
310
+ "(default: \"%s\")\n", sep);
311
+ printf("-p# output format for support/confidence "
312
+ "(default: \"%s\")\n", fmt);
313
+ printf("-x extended support output "
314
+ "(print both rule support types)\n");
315
+ printf("-a print absolute support "
316
+ "(number of transactions)\n");
317
+ printf("-y print lift value (confidence divided by prior)\n");
318
+ printf("-e# additional evaluation measure (default: none)\n");
319
+ printf("-! print a list of additional evaluation measures\n");
320
+ printf("-d# minimal value of additional evaluation measure "
321
+ "(default: %g%%)\n", minval *100);
322
+ printf("-v print value of additional "
323
+ "rule evaluation measure\n");
324
+ printf("-g write output in scanable form "
325
+ "(quote certain characters)\n");
326
+ printf("-l do not load transactions into memory "
327
+ "(work on input file)\n");
328
+ printf("-q# sort items w.r.t. their frequency (default: %d)\n"
329
+ " (1: ascending, -1: descending, 0: do not sort,\n"
330
+ " 2: ascending, -2: descending w.r.t. "
331
+ "transaction size sum)\n", sort);
332
+ printf("-u# filter unused items from transactions "
333
+ "(default: %g)\n", filter);
334
+ printf(" (0: do not filter items w.r.t. usage in sets,\n"
335
+ " <0: fraction of removed items for filtering,\n"
336
+ " >0: take execution times ratio into account)\n");
337
+ printf("-h do not organize transactions as a prefix tree\n");
338
+ printf("-j use quicksort to sort the transactions "
339
+ "(default: heapsort)\n");
340
+ printf("-z minimize memory usage "
341
+ "(default: maximize speed)\n");
342
+ printf("-b/f/r# blank characters, field and record separators\n"
343
+ " (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
344
+ printf("-C# comment characters (default: \"#\")\n");
345
+ printf("-V verbose\n");
346
+
347
+ printf("infile file to read transactions from\n");
348
+ printf("outfile file to write item sets/association rules"
349
+ "/hyperedges to\n");
350
+ printf("appfile file stating item appearances (optional)\n");
351
+ return 0; /* print a usage message */
352
+ } /* and abort the program */
353
+ #endif /* #ifndef QUIET */
354
+
355
+ /* --- evaluate arguments --- */
356
+ for (i = 1; i < argc; i++) { /* traverse arguments */
357
+ s = argv[i]; /* get option argument */
358
+ if (optarg) { *optarg = s; optarg = NULL; continue; }
359
+ if ((*s == '-') && *++s) { /* -- if argument is an option */
360
+ while (*s) { /* traverse options */
361
+ switch (*s++) { /* evaluate switches */
362
+ case '!': help(); break;
363
+ case 't': target = (*s) ? *s++ : 'r'; break;
364
+ case 'm': minlen = (int)strtol(s, &s, 0); break;
365
+ case 'n': maxlen = (int)strtol(s, &s, 0); break;
366
+ case 's': supp = 0.01*strtod(s, &s); break;
367
+ case 'S': smax = 0.01*strtod(s, &s); break;
368
+ case 'c': conf = 0.01*strtod(s, &s); break;
369
+ case 'o': mode |= IST_BOTH; break;
370
+ case 'k': optarg = &sep; break;
371
+ case 'p': optarg = &fmt; break;
372
+ case 'x': ext = 1; break;
373
+ case 'a': sout |= 2; break;
374
+ case 'y': lift = 1; break;
375
+ case 'e': arem = (*s) ? *s++ : 0; break;
376
+ case 'd': minval = 0.01*strtod(s, &s); break;
377
+ case 'v': aval = 1; break;
378
+ case 'g': c2scf = 1; break;
379
+ case 'l': load = 0; break;
380
+ case 'q': sort = (int)strtol(s, &s, 0); break;
381
+ case 'u': filter = strtod(s, &s); break;
382
+ case 'h': tree = 0; break;
383
+ case 'j': heap = 0; break;
384
+ case 'z': mode |= IST_MEMOPT; break;
385
+ case 'b': optarg = &blanks; break;
386
+ case 'f': optarg = &fldseps; break;
387
+ case 'r': optarg = &recseps; break;
388
+ case 'C': optarg = &comment; break;
389
+ case 'V': verbose = 1; break;
390
+ default : error(E_OPTION, *--s); break;
391
+ } /* set option variables */
392
+ if (optarg && *s) { *optarg = s; optarg = NULL; break; }
393
+ } } /* get option argument */
394
+ else { /* -- if argument is no option */
395
+ switch (k++) { /* evaluate non-options */
396
+ case 0: fn_in = s; break;
397
+ case 1: fn_out = s; break;
398
+ case 2: fn_app = s; break;
399
+ default: error(E_ARGCNT); break;
400
+ } /* note filenames */
401
+ }
402
+ }
403
+ if (optarg) error(E_OPTARG); /* check option argument */
404
+ if ((k < 2) || (k > 3)) /* and the number of arguments */
405
+ error(E_ARGCNT); /* (either in/out or in/out/app) */
406
+ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
407
+ error(E_STDIN); /* stdin must not be used twice */
408
+ switch (target) { /* check and translate target type */
409
+ case 's': target = TT_SET; break;
410
+ case 'c': target = TT_CLSET; break;
411
+ case 'm': target = TT_MFSET; break;
412
+ case 'r': target = TT_RULE; break;
413
+ case 'h': target = TT_HEDGE; break;
414
+ case 'g': target = TT_GROUP; break;
415
+ default : error(E_TARGET, (char)target); break;
416
+ }
417
+ if (supp > 1) /* check the minimal support */
418
+ error(E_SUPP, supp); /* (< 0: absolute number) */
419
+ if ((conf < 0) || (conf > 1))
420
+ error(E_CONF, conf); /* check the minimal confidence */
421
+ if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
422
+ if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
423
+ switch (arem) { /* check and translate measure */
424
+ case 0 : case '0': arem = EM_NONE; break;
425
+ case 'd': case '1': arem = EM_DIFF; break;
426
+ case 'q': case '2': arem = EM_QUOT; break;
427
+ case 'a': case '3': arem = EM_AIMP; break;
428
+ case 'i': case '4': arem = EM_INFO; break;
429
+ case 'c': case '5': arem = EM_CHI2; break;
430
+ case 'p': case '6': arem = EM_PVAL; break;
431
+ default : error(E_MEASURE, (char)arem); break;
432
+ }
433
+ if (target <= TT_MFSET) { /* in item set mode neutralize */
434
+ mode |= IST_BOTH; conf = 1;}/* rule specific settings */
435
+ if (arem == EM_NONE) /* if no add. rule eval. measure, */
436
+ aval = 0; /* clear the corresp. output flag */
437
+ if ((filter <= -1) || (filter >= 1)) filter = 0;
438
+
439
+ /* --- create item set and transaction set --- */
440
+ itemset = is_create(-1); /* create an item set and */
441
+ if (!itemset) error(E_NOMEM); /* set the special characters */
442
+ is_chars(itemset, blanks, fldseps, recseps, comment);
443
+ if (load) { /* if to load the transactions */
444
+ taset = tas_create(itemset);
445
+ if (!taset) error(E_NOMEM); /* create a transaction set */
446
+ } /* to store the transactions */
447
+ MSG(fprintf(stderr, "\n")); /* terminate the startup message */
448
+
449
+ /* --- read item appearances --- */
450
+ if (fn_app) { /* if item appearances are given */
451
+ t = clock(); /* start the timer */
452
+ if (*fn_app) /* if an app. file name is given, */
453
+ in = fopen(fn_app, "r"); /* open the item appearances file */
454
+ else { /* if no app. file name is given, */
455
+ in = stdin; fn_app = "<stdin>"; } /* read from std. input */
456
+ MSG(fprintf(stderr, "reading %s ... ", fn_app));
457
+ if (!in) error(E_FOPEN, fn_app);
458
+ k = is_readapp(itemset,in); /* read the item appearances */
459
+ if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
460
+ if (in != stdin) /* if not read from standard input, */
461
+ fclose(in); /* close the input file */
462
+ MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
463
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
464
+ } /* print a log message */
465
+
466
+ /* --- read transactions --- */
467
+ t = clock(); /* start the timer */
468
+ if (fn_in && *fn_in) /* if an input file name is given, */
469
+ in = fopen(fn_in, "r"); /* open input file for reading */
470
+ else { /* if no input file name is given, */
471
+ in = stdin; fn_in = "<stdin>"; } /* read from standard input */
472
+ MSG(fprintf(stderr, "reading %s ... \n", fn_in));
473
+ if (!in) error(E_FOPEN, fn_in);
474
+ while (1) { /* transaction read loop */
475
+ k = is_read(itemset, in); /* read the next transaction */
476
+ if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
477
+ if (k > 0) break; /* check for error and end of file */
478
+ k = is_tsize(itemset); /* update the maximal */
479
+ if (k > maxcnt) maxcnt = k; /* transaction size */
480
+ if (taset && (tas_add(taset, NULL, 0) != 0))
481
+ error(E_NOMEM); /* add the loaded transaction */
482
+ } /* to the transaction set */
483
+ if (taset) { /* if transactions have been loaded */
484
+ if (in != stdin) fclose(in);/* if not read from standard input, */
485
+ in = NULL; /* close the input file */
486
+ } /* clear the file variable */
487
+ n = is_cnt(itemset); /* get the number of items */
488
+ tacnt = is_gettac(itemset); /* and the number of transactions */
489
+ MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
490
+ MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
491
+ if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
492
+ MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
493
+ if (supp >= 0) /* if relative support is given */
494
+ supp = ceil(tacnt *supp); /* compute absolute support */
495
+ else { /* if absolute support is given, */
496
+ supp = ceil(-100 *supp); /* make the support value positive */
497
+ if (!(sout & 2)) sout = 2; /* switch to absolute support output */
498
+ } /* do the same with the max. support */
499
+ smax = floor(((smax >= 0) ? tacnt : -100) *smax);
500
+
501
+ /* --- sort and recode items --- */
502
+ MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
503
+ t = clock(); /* start the timer */
504
+ map = (int*)malloc(is_cnt(itemset) *sizeof(int));
505
+ if (!map) error(E_NOMEM); /* create an item identifier map */
506
+ k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
507
+ n = is_recode(itemset, k, sort, map);
508
+ if (taset) { /* sort and recode the items and */
509
+ tas_recode(taset, map,n); /* recode the loaded transactions */
510
+ maxcnt = tas_max(taset); /* get the new maximal t.a. size */
511
+ } /* (may be smaller than before) */
512
+ free(map); /* delete the item identifier map */
513
+ MSG(fprintf(stderr, "[%d item(s)] ", n));
514
+ MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
515
+ if (n <= 0) error(E_NOFREQ); /* print a log message and */
516
+ MSG(fprintf(stderr, "\n")); /* check the number of items */
517
+ if (maxlen > maxcnt) /* clamp the set/rule length */
518
+ maxlen = maxcnt; /* to the maximum set size */
519
+
520
+ /* --- create a transaction tree --- */
521
+ tt = 0; /* init. the tree construction time */
522
+ if (tree && taset) { /* if transactions were loaded */
523
+ MSG(fprintf(stderr, "creating transaction tree ... "));
524
+ t = clock(); /* start the timer */
525
+ tatree = tat_create(taset, heap);
526
+ if (!tatree) error(E_NOMEM);/* create a transaction tree */
527
+ if (filter == 0) { /* if a tree rebuild is not needed, */
528
+ tas_delete(taset, 0); taset = NULL; } /* delete transactions */
529
+ tt = clock() -t; /* note the time for the construction */
530
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
531
+ } /* print a log message */
532
+
533
+ /* --- create an item set tree --- */
534
+ t = clock(); tc = 0; /* start the timer */
535
+ istree = ist_create(itemset, mode, (int)supp, conf);
536
+ if (!istree) error(E_NOMEM); /* create an item set tree */
537
+
538
+ /* --- check item subsets --- */
539
+ if (filter) { /* if to filter unused items */
540
+ used = (char*)malloc(is_cnt(itemset) *sizeof(char));
541
+ if (!used) error(E_NOMEM); /* create a flag vector */
542
+ } /* for the items */
543
+ MSG(fprintf(stderr, "checking subsets of size 1"));
544
+ while (ist_height(istree) < maxlen) {
545
+ if (filter != 0) { /* if to filter w.r.t. item usage, */
546
+ i = ist_check(istree, used); /* check current item usage */
547
+ if (i < maxlen) maxlen = i; /* update the maximum size */
548
+ if (ist_height(istree) >= i) break;
549
+ } /* check the tree height */
550
+ k = ist_addlvl(istree); /* while max. height is not reached, */
551
+ if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
552
+ if (k != 0) break; /* if no level was added, abort */
553
+ MSG(fprintf(stderr, " %d", ist_height(istree)));
554
+ if (tatree) { /* if a transaction tree was created */
555
+ if (((filter < 0) /* if to filter w.r.t. item usage */
556
+ && (i < -filter *n)) /* and enough items were removed */
557
+ || ((filter > 0) /* or counting time is long enough */
558
+ && (i < n) && (i *(double)tt < filter *n *tc))) {
559
+ n = i; x = clock(); /* note the new number of items */
560
+ tas_filter(taset, used);/* and remove unnecessary items */
561
+ tat_delete(tatree); /* delete the transaction tree */
562
+ tatree = tat_create(taset, heap);
563
+ if (!tatree) error(E_NOMEM);
564
+ tt = clock() -x; /* rebuild the transaction tree and */
565
+ } /* note the new construction time */
566
+ x = clock(); /* count the transaction tree */
567
+ ist_countx(istree, tatree);
568
+ tc = clock() -x; } /* note the new count time */
569
+ else if (taset) { /* if transactions were loaded */
570
+ if (((filter < 0) /* if to filter w.r.t. item usage */
571
+ && (i <= -filter *n)) /* and enough items were removed */
572
+ || ((filter > 0) /* or counting time is long enough */
573
+ && (i *(double)tt <= filter *n *tc))) {
574
+ n = i; x = clock(); /* note the new number of items */
575
+ tas_filter(taset, used);/* and remove unnecessary items */
576
+ tt = clock() -t; /* from the transactions */
577
+ } /* note the filtering time */
578
+ for (i = tacnt; --i >= 0;)/* traverse and count transactions */
579
+ ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
580
+ tc = clock() -t; } /* note the new count time */
581
+ else { /* if to work on the input file, */
582
+ rewind(in); /* reset the file position */
583
+ for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
584
+ if (filter != 0) /* (re)read the transactions and */
585
+ is_filter(itemset, used); /* remove unnecessary items */
586
+ k = is_tsize(itemset); /* update the maximum size */
587
+ if (k > maxcnt) maxcnt = k; /* of a transaction */
588
+ ist_count(istree, is_tract(itemset), k);
589
+ } /* count the transaction in the tree */
590
+ if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
591
+ if (maxcnt < maxlen) /* update the maximal rule length */
592
+ maxlen = maxcnt; /* according to the max. t.a. size */
593
+ } /* (may be smaller than before) */
594
+ }
595
+ if (!taset && !tatree) { /* if transactions were not loaded */
596
+ if (in != stdin) fclose(in);/* if not read from standard input, */
597
+ in = NULL; /* close the input file */
598
+ } /* clear the file variable */
599
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
600
+
601
+ /* --- filter found item sets --- */
602
+ if ((target == TT_CLSET) || (target == TT_MFSET)) {
603
+ MSG(fprintf(stderr, "filtering %s item sets ... ",
604
+ (target == TT_MFSET) ? "maximal" : "closed"));
605
+ t = clock(); /* filter the item sets */
606
+ ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
607
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
608
+ } /* (filter takes longer than print) */
609
+
610
+ /* --- sort transactions --- */
611
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
612
+ if (!taset) /* transactions must be loaded */
613
+ ext = 0; /* for extended support output */
614
+ else if (ext) { /* if extended output is requested */
615
+ MSG(fprintf(stderr, "sorting transactions ... "));
616
+ t = clock(); /* start the timer */
617
+ tas_sort(taset, heap); /* sort the transactions */
618
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
619
+ } /* (sorting is necessary to find the */
620
+ } /* number of identical transactions) */
621
+
622
+ /* --- print item sets/rules/hyperedges --- */
623
+ t = clock(); /* start the timer */
624
+ if (fn_out && *fn_out) /* if an output file name is given, */
625
+ out = fopen(fn_out, "w"); /* open the output file */
626
+ else { /* if no output file name is given, */
627
+ out = stdout; fn_out = "<stdout>"; } /* write to std. output */
628
+ MSG(fprintf(stderr, "writing %s ... ", fn_out));
629
+ if (!out) error(E_FOPEN, fn_out);
630
+ ist_init(istree, minlen, arem, minval);
631
+ set = is_tract(itemset); /* get the transaction buffer */
632
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
633
+ for (n = 0; 1; ) { /* extract item sets from the tree */
634
+ k = ist_set(istree, set, &frq, &conf);
635
+ if (k <= 0) break; /* get the next frequent item set */
636
+ if (frq > smax) continue; /* check against maximal support */
637
+ for (i = 0; i < k; i++) { /* traverse the set's items */
638
+ name = is_name(itemset, set[i]);
639
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
640
+ fputs(name, out); /* print the name of the next item */
641
+ fputs((i < k-1) ? sep : " ", out);
642
+ } /* print a separator */
643
+ fputs(" (", out); /* print the item set's support */
644
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
645
+ if (sout & 2) fputc('/', out); }
646
+ if (sout & 2) { fprintf(out, "%d", frq); }
647
+ if (ext) { /* if to print the extended support */
648
+ frq = tas_occur(taset, set, k);
649
+ fputs(", ", out); /* get the number of occurrences */
650
+ fprintf(out, fmt, (frq/(double)tacnt) *100);
651
+ if (sout & 2) fprintf(out, "/%d", frq);
652
+ } /* print the extended support data */
653
+ if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
654
+ fputs(")\n", out); /* print the add. eval. measure, */
655
+ n++; /* terminate the support output, */
656
+ } } /* and count the item set */
657
+ else if (target == TT_RULE) { /* if to find association rules, */
658
+ for (n = 0; 1; ) { /* extract rules from tree */
659
+ k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
660
+ if (k <= 0) break; /* get the next association rule */
661
+ if (frq > smax) continue; /* check against maximal support */
662
+ for (i = 0; i < k; i++) { /* traverse the rule's items */
663
+ name = is_name(itemset, set[i]);
664
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
665
+ fputs(name, out); /* print the next item */
666
+ fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
667
+ } /* print a separator */
668
+ fputs(" (", out); /* print the rule evaluation */
669
+ if (sout & 1) supp = frq/(double)tacnt;
670
+ if (ext && !(mode & IST_HEAD)) {
671
+ if (sout & 1) { fprintf(out, fmt, supp *conf *100);
672
+ if (sout & 2) fputc('/', out); }
673
+ if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
674
+ fputs(", ", out); /* print the support of the rule */
675
+ } /* from the support of the body */
676
+ if (sout & 1) { fprintf(out, fmt, supp *100);
677
+ if (sout & 2) fputc('/', out); }
678
+ if (sout & 2) { fprintf(out, "%d", frq); }
679
+ fputs(", ", out); /* print the rule support */
680
+ if (ext && (mode & IST_HEAD)) {
681
+ if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
682
+ if (sout & 2) fputc('/', out); }
683
+ if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
684
+ fputs(", ", out); /* print the support of the body */
685
+ } /* from the support of the rule */
686
+ fprintf(out, fmt, conf *100); /* print the rule confidence */
687
+ if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
688
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
689
+ fputs(")\n", out); /* print the value of the additional */
690
+ n++; /* rule evaluation measure and */
691
+ } } /* count the association rule */
692
+ else if (target == TT_HEDGE){ /* if to find association hyperedges */
693
+ for (n = 0; 1; ) { /* extract hyperedges from tree */
694
+ k = ist_hedge(istree, set, &frq, &conf, &minval);
695
+ if (k <= 0) break; /* get the next hyperedge */
696
+ if (frq > smax) continue; /* check against maximal support */
697
+ for (i = 0; i < k; i++) { /* traverse the edge's items */
698
+ name = is_name(itemset, set[i]);
699
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
700
+ fputs(name, out); /* print the name of the next item */
701
+ fputs((i < k-1) ? sep : " ", out);
702
+ } /* print a separator */
703
+ fputs(" (", out); /* print the hyperedge evaluation */
704
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
705
+ if (sout & 2) fputc('/', out); }
706
+ if (sout & 2) { fprintf(out, "%d", frq); }
707
+ fputs(", ", out); fprintf(out, fmt, conf *100);
708
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
709
+ fputs(")\n", out); /* print support and confidence */
710
+ n++; /* of the hyperedge and */
711
+ } } /* count the hyperedge */
712
+ else { /* if to find association groups */
713
+ for (n = 0; 1; ) { /* extract groups from tree */
714
+ k = ist_group(istree, set, &frq, &minval);
715
+ if (k <= 0) break; /* get the next group */
716
+ if (frq > smax) continue; /* check against maximal support */
717
+ for (i = 0; i < k; i++) { /* traverse the group's items */
718
+ name = is_name(itemset, set[i]);
719
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
720
+ fputs(name, out); /* print the name of the next item */
721
+ fputs((i < k-1) ? sep : " ", out);
722
+ } /* print a separator */
723
+ fputs(" (", out); /* print the group evaluation */
724
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
725
+ if (sout & 2) fputc('/', out); }
726
+ if (sout & 2) { fprintf(out, "%d", frq); }
727
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
728
+ fputs(")\n", out); /* print support and add. measure */
729
+ n++; /* and count the group */
730
+ }
731
+ } /* if (target <= TT_MFSET) .. else .. */
732
+ if (fflush(out) != 0) error(E_FWRITE, fn_out);
733
+ if (out != stdout) fclose(out);
734
+ out = NULL; /* close the output file */
735
+ MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
736
+ MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
737
+ #ifdef BENCH
738
+ printf("number of support counters: %d\n", istree->sccnt);
739
+ printf("necessary support counters: %d\n", istree->scnec);
740
+ printf("number of child pointers : %d\n", istree->cpcnt);
741
+ printf("necessary child pointers : %d\n", istree->cpnec);
742
+ printf("allocated memory (bytes) : %d\n", istree->bytes);
743
+ #endif
744
+
745
+ /* --- clean up --- */
746
+ #ifndef NDEBUG /* if this is a debug version */
747
+ free(used); /* delete the item app. vector */
748
+ ist_delete(istree); /* delete the item set tree, */
749
+ if (tatree) tat_delete(tatree); /* the transaction tree, */
750
+ if (taset) tas_delete(taset, 0); /* the transaction set, */
751
+ is_delete(itemset); /* and the item set */
752
+ #endif
753
+ #ifdef STORAGE /* if storage debugging */
754
+ showmem("at end of program"); /* check memory usage */
755
+ #endif
756
+ return 0; /* return 'ok' */
757
+ } /* main() */