apriori 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/History.txt +16 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +149 -0
  5. data/Rakefile +15 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +81 -0
  12. data/config/requirements.rb +29 -0
  13. data/examples/01_simple_example.rb +32 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori.rb +133 -0
  95. data/lib/apriori/adapter.rb +13 -0
  96. data/lib/apriori/association_rule.rb +89 -0
  97. data/lib/apriori/version.rb +9 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +13 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +248 -0
  118. data/website/index.txt +152 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +226 -0
@@ -0,0 +1,261 @@
1
+ /*----------------------------------------------------------------------
2
+ File : tract.h
3
+ Contents: item and transaction management
4
+ Author : Christian Borgelt
5
+ History : 2001.11.18 file created from file apriori.c
6
+ 2001.12.28 first version completed
7
+ 2002.01.02 ta_sort mapped to v_intsort
8
+ 2002.02.19 transaction tree functions added
9
+ 2003.07.17 functions is_filter, ta_filter, tas_filter added
10
+ 2003.08.21 parameter 'heap' added to tas_sort, tat_create
11
+ 2003.09.12 function tas_total added
12
+ 2003.09.20 empty transactions in input made possible
13
+ 2004.11.20 function tat_mark added
14
+ 2004.12.11 access functions for extended frequency added
15
+ 2004.12.15 function nim_trunc added
16
+ 2006.11.26 structures ISFMTR and ISEVAL added
17
+ 2007.02.13 adapted to modified tabscan module
18
+ 2008.06.30 support argument to ise_eval changed to double
19
+ ----------------------------------------------------------------------*/
20
+ #ifndef __TRACT__
21
+ #define __TRACT__
22
+ #ifndef NIMAPFN
23
+ #define NIMAPFN
24
+ #endif
25
+ #include "vecops.h"
26
+ #include "symtab.h"
27
+ #include "tabscan.h"
28
+
29
+ /*----------------------------------------------------------------------
30
+ Preprocessor Definitions
31
+ ----------------------------------------------------------------------*/
32
+ /* --- item appearance flags --- */
33
+ #define APP_NONE 0x00 /* item should be ignored */
34
+ #define APP_BODY 0x01 /* item may appear in rule body */
35
+ #define APP_HEAD 0x02 /* item may appear in rule head */
36
+ #define APP_BOTH (APP_HEAD|APP_BODY) /* item may appear in rule body and in rule head */
37
+
38
+ /* --- error codes --- */
39
+ #define E_NONE 0 /* no error */
40
+ #define E_NOMEM (-1) /* not enough memory */
41
+ #define E_FOPEN (-2) /* cannot open file */
42
+ #define E_FREAD (-3) /* read error on file */
43
+ #define E_FWRITE (-4) /* write error on file */
44
+ /* codes -5 to -15 are not defined in this header; apriori.c defines its own error codes in that range */
45
+ #define E_ITEMEXP (-16) /* item expected */
46
+ #define E_DUPITEM (-17) /* duplicate item */
47
+ #define E_APPEXP (-18) /* appearance indicator expected */
48
+ #define E_UNKAPP (-19) /* unknown appearance indicator */
49
+ #define E_FLDCNT (-20) /* too many fields */
50
+
51
+ /*----------------------------------------------------------------------
52
+ Type Definitions
53
+ ----------------------------------------------------------------------*/
54
+ typedef struct { /* --- an item (the is_getfrq/is_getxfq/is_getapp macros cast name/identifier map entries to this type) --- */
55
+ int id; /* item identifier */
56
+ int frq; /* frequency in transactions */
57
+ int xfq; /* extended frequency (t.a. sizes; "t.a." abbreviates transaction) */
58
+ int app; /* appearance indicator (presumably one of the APP_xxx flags -- confirm) */
59
+ } ITEM; /* (item) */
60
+
61
+ typedef struct { /* --- an item set --- */
62
+ NIMAP *nimap; /* name/identifier map (entries are accessed as ITEMs by the is_xxx macros) */
63
+ TABSCAN *tscan; /* table scanner (returned by is_tabscan) */
64
+ char chars[4]; /* special characters */
65
+ int tac; /* transaction counter (read/written by is_gettac, is_settac, is_addtac) */
66
+ int app; /* default appearance indicator */
67
+ int vsz; /* size of transaction buffer */
68
+ int cnt; /* number of items in transaction (returned by is_tsize) */
69
+ int *items; /* items in transaction (returned by is_tract) */
70
+ } ITEMSET; /* (item set) */
71
+
72
+ typedef struct { /* --- an item set evaluator (ise_delete releases it with a single free()) --- */
73
+ double logta; /* logarithm of num. of transactions */
74
+ double *logfs; /* logarithms of item frequencies */
75
+ double lsums[1]; /* sums of logarithms for prefixes; NOTE(review): one-element trailing array, presumably over-allocated by ise_create -- confirm */
76
+ } ISEVAL; /* (item set evaluator) */
77
+
78
+ typedef struct { /* --- item set formatter --- */
79
+ int cnt; /* number of formatted item names */
80
+ int len; /* length of description in buffer (returned by isf_length) */
81
+ int *offs; /* prefix lengths in output buffer */
82
+ char *buf; /* output buffer (isf_print writes its first len characters) */
83
+ const char *names[1]; /* formatted item names; NOTE(review): one-element trailing array, presumably over-allocated by isf_create -- confirm */
84
+ } ISFMTR; /* (item set formatter) */
85
+
86
+ typedef struct { /* --- a transaction --- */
87
+ int cnt; /* number of items (returned by tas_tsize) */
88
+ int items[1]; /* item identifier vector (returned by tas_tract); NOTE(review): one-element trailing array, presumably over-allocated on creation -- confirm */
89
+ } TRACT; /* (transaction) */
90
+
91
+ typedef struct { /* --- a transaction set --- */
92
+ ITEMSET *itemset; /* underlying item set (returned by tas_itemset) */
93
+ int max; /* maximum number of items per t.a. (returned by tas_max) */
94
+ int vsz; /* size of transaction vector */
95
+ int cnt; /* number of transactions (returned by tas_cnt) */
96
+ int total; /* total number of items (returned by tas_total) */
97
+ TRACT **tracts; /* transaction vector (indexed by tas_tract and tas_tsize) */
98
+ } TASET; /* (transaction set) */
99
+
100
+ typedef struct _tatree { /* --- a transaction tree (node) --- */
101
+ int cnt; /* number of transactions (returned by tat_cnt) */
102
+ int max; /* size of largest transaction (returned by tat_max) */
103
+ int size; /* node size (number of children) */
104
+ int items[1]; /* next items in rep. transactions; the tat_child macro reads child pointers from the memory following the first `size` items */
105
+ } TATREE; /* (transaction tree) */
106
+
107
+ /*----------------------------------------------------------------------
108
+ Item Set Functions
109
+ ----------------------------------------------------------------------*/
110
+ extern ITEMSET* is_create (int cnt);
111
+ extern void is_delete (ITEMSET *iset);
112
+ extern TABSCAN* is_tabscan (ITEMSET *iset); /* macro form below: iset->tscan */
113
+ extern void is_chars (ITEMSET *iset, const char *blanks,
114
+ const char *fldseps,
115
+ const char *recseps,
116
+ const char *cominds);
117
+
118
+ extern int is_cnt (ITEMSET *iset); /* macro form below: nim_cnt() of the name/identifier map */
119
+ extern int is_item (ITEMSET *iset, const char *name);
120
+ extern const char* is_name (ITEMSET *iset, int item); /* macro form below: nim_name() of the map entry with this id */
121
+
122
+ extern int is_gettac (ITEMSET *iset); /* macro form below: reads iset->tac */
123
+ extern int is_settac (ITEMSET *iset, int cnt); /* macro form below: assigns iset->tac */
124
+ extern int is_addtac (ITEMSET *iset, int cnt); /* macro form below: adds to iset->tac */
125
+ extern int is_getfrq (ITEMSET *iset, int item); /* macro form below: reads ITEM.frq */
126
+ extern int is_setfrq (ITEMSET *iset, int item, int frq); /* macro form below: assigns ITEM.frq */
127
+ extern int is_addfrq (ITEMSET *iset, int item, int frq); /* macro form below: adds to ITEM.frq */
128
+ extern int is_getxfq (ITEMSET *iset, int item); /* macro form below: reads ITEM.xfq */
129
+ extern int is_setxfq (ITEMSET *iset, int item, int frq); /* macro form below: assigns ITEM.xfq */
130
+ extern int is_getapp (ITEMSET *iset, int item); /* macro form below: reads ITEM.app */
131
+ extern int is_setapp (ITEMSET *iset, int item, int app); /* macro form below: assigns ITEM.app */
132
+
133
+ extern int is_readapp (ITEMSET *iset, FILE *file);
134
+ extern int is_read (ITEMSET *iset, FILE *file);
135
+
136
+ extern int is_recode (ITEMSET *iset, int minfrq,
137
+ int dir, int *map);
138
+ extern void is_trunc (ITEMSET *iset, int cnt); /* macro form below: forwards to nim_trunc() */
139
+ extern int is_filter (ITEMSET *iset, const char *marks);
140
+ extern int is_tsize (ITEMSET *iset); /* macro form below: iset->cnt */
141
+ extern int* is_tract (ITEMSET *iset); /* macro form below: iset->items */
142
+
143
+ /*----------------------------------------------------------------------
144
+ Item Set Evaluation Functions
145
+ ----------------------------------------------------------------------*/
146
+ extern ISEVAL* ise_create (ITEMSET *iset, int tacnt);
147
+ extern void ise_delete (ISEVAL *eval); /* macro form below: free(eval) */
148
+ extern double ise_eval (ISEVAL *eval, int *ids, int cnt, int pre,
149
+ double supp); /* supp is a double (changed 2008.06.30 per the file history) */
150
+
151
+ /*----------------------------------------------------------------------
152
+ Item Set Formatting Functions
153
+ ----------------------------------------------------------------------*/
154
+ extern ISFMTR* isf_create (ITEMSET *iset, int scan);
155
+ extern void isf_delete (ISFMTR *fmt);
156
+ extern const char* isf_format (ISFMTR *fmt, int *ids, int cnt, int pre);
157
+ extern int isf_length (ISFMTR *fmt); /* macro form below: fmt->len */
158
+ extern void isf_print (ISFMTR *fmt, FILE *out); /* macro form below: fwrite() of fmt->buf, so the macro yields a size_t rather than void */
159
+
160
+ /*----------------------------------------------------------------------
161
+ Transaction Functions
162
+ ----------------------------------------------------------------------*/
163
+ extern void ta_sort (int *items, int n); /* macro form below: forwards to v_intsort() */
164
+ extern int ta_unique (int *items, int n);
165
+ extern int ta_filter (int *items, int n, const char *marks);
166
+
167
+ /*----------------------------------------------------------------------
168
+ Transaction Set Functions
169
+ ----------------------------------------------------------------------*/
170
+ extern TASET* tas_create (ITEMSET *itemset);
171
+ extern void tas_delete (TASET *taset, int delis);
172
+ extern ITEMSET* tas_itemset (TASET *taset); /* macro form below: taset->itemset */
173
+
174
+ extern int tas_cnt (TASET *taset); /* macro form below: taset->cnt */
175
+ extern int tas_add (TASET *taset, const int *items, int n);
176
+ extern int* tas_tract (TASET *taset, int index); /* macro form below: items of tracts[index], no bounds check */
177
+ extern int tas_tsize (TASET *taset, int index); /* macro form below: cnt of tracts[index], no bounds check */
178
+ extern int tas_total (TASET *taset); /* macro form below: taset->total */
179
+
180
+ extern void tas_recode (TASET *taset, int *map, int cnt);
181
+ extern int tas_filter (TASET *taset, const char *marks);
182
+ extern void tas_shuffle (TASET *taset, double randfn(void)); /* macro form below: forwards to v_shuffle() */
183
+ extern void tas_sort (TASET *taset, int heap);
184
+ extern int tas_occur (TASET *taset, const int *items, int n);
185
+
186
+ #ifndef NDEBUG /* declared only when NDEBUG is not defined */
187
+ extern void tas_show (TASET *taset);
188
+ #endif
189
+
190
+ /*----------------------------------------------------------------------
191
+ Transaction Tree Functions
192
+ ----------------------------------------------------------------------*/
193
+ extern TATREE* tat_create (TASET *taset, int heap);
194
+ extern void tat_delete (TATREE *tat);
195
+ extern int tat_cnt (TATREE *tat); /* macro form below: tat->cnt */
196
+ extern int tat_max (TATREE *tat); /* macro form below: tat->max */
197
+ extern int tat_size (TATREE *tat); /* macro form below: tat->size */
198
+ extern int* tat_items (TATREE *tat); /* macro form below: tat->items */
199
+ extern int tat_item (TATREE *tat, int index); /* macro form below: tat->items[index] */
200
+ extern TATREE* tat_child (TATREE *tat, int index); /* macro form below exists only when ARCH64 is not defined */
201
+ extern void tat_mark (TATREE *tat);
202
+
203
+ #ifndef NDEBUG /* declared only when NDEBUG is not defined */
204
+ extern void tat_show (TATREE *tat);
205
+ #endif
206
+
207
+ /*----------------------------------------------------------------------
208
+ Preprocessor Definitions
209
+ ----------------------------------------------------------------------*/
210
+ #define is_tabscan(s) ((s)->tscan) /* table scanner of item set s */
211
+
212
+ #define is_cnt(s) nim_cnt((s)->nimap) /* item count is taken from the name/identifier map */
213
+ #define is_name(s,i) nim_name(nim_byid((s)->nimap, i)) /* name of the map entry with identifier i */
214
+ #define is_gettac(s) ((s)->tac)
215
+ #define is_settac(s,n) ((s)->tac = (n)) /* expression value is the newly assigned counter */
216
+ #define is_addtac(s,n) ((s)->tac += (n)) /* expression value is the updated counter */
217
+ #define is_getfrq(s,i) (((ITEM*)nim_byid((s)->nimap, i))->frq) /* map entry for identifier i is accessed as an ITEM */
218
+ #define is_setfrq(s,i,f) (((ITEM*)nim_byid((s)->nimap, i))->frq = (f))
219
+ #define is_addfrq(s,i,f) (((ITEM*)nim_byid((s)->nimap, i))->frq += (f))
220
+ #define is_getxfq(s,i) (((ITEM*)nim_byid((s)->nimap, i))->xfq)
221
+ #define is_setxfq(s,i,f) (((ITEM*)nim_byid((s)->nimap, i))->xfq = (f))
222
+ #define is_getapp(s,i) (((ITEM*)nim_byid((s)->nimap, i))->app)
223
+ #define is_setapp(s,i,a) (((ITEM*)nim_byid((s)->nimap, i))->app = (a))
224
+
225
+ #define is_trunc(s,n) nim_trunc((s)->nimap, n) /* truncation is delegated to the name/identifier map */
226
+
227
+ #define is_tsize(s) ((s)->cnt) /* number of items in the item set's transaction buffer */
228
+ #define is_tract(s) ((s)->items) /* items in the item set's transaction buffer */
229
+
230
+ /*--------------------------------------------------------------------*/
231
+ #define ise_delete(e) free(e) /* the evaluator is released with a single free() */
232
+
233
+ /*--------------------------------------------------------------------*/
234
+ #define isf_length(f) ((f)->len)
235
+ #define isf_print(f,o) fwrite((f)->buf, sizeof(char), (f)->len, o) /* writes len chars of buf to o; yields fwrite's result although the prototype above is void */
236
+
237
+ /*--------------------------------------------------------------------*/
238
+ #define ta_sort(v,n) v_intsort(v,n) /* sorting is delegated to v_intsort (presumably declared in vecops.h -- confirm) */
239
+
240
+ /*--------------------------------------------------------------------*/
241
+ #define tas_itemset(s) ((s)->itemset)
242
+ #define tas_cnt(s) ((s)->cnt)
243
+ #define tas_max(s) ((s)->max) /* macro only: no extern prototype for tas_max is declared above */
244
+
245
+ #define tas_tract(s,i) ((s)->tracts[i]->items) /* no bounds check on i */
246
+ #define tas_tsize(s,i) ((s)->tracts[i]->cnt) /* no bounds check on i */
247
+ #define tas_total(s) ((s)->total)
248
+
249
+ #define tas_shuffle(s,f) v_shuffle((s)->tracts, (s)->cnt, f) /* passes the transaction pointer vector, its length and f to v_shuffle */
250
+
251
+ /*--------------------------------------------------------------------*/
252
+ #define tat_cnt(t) ((t)->cnt)
253
+ #define tat_max(t) ((t)->max)
254
+ #define tat_size(t) ((t)->size)
255
+ #define tat_item(t,i) ((t)->items[i]) /* no bounds check on i */
256
+ #define tat_items(t) ((t)->items)
257
+ #ifndef ARCH64 /* macro form only when ARCH64 is not defined; otherwise only the extern prototype of tat_child is visible */
258
+ #define tat_child(t,i) (((TATREE**)((t)->items +(t)->size))[i]) /* reinterprets the memory after the first `size` items as an array of child pointers */
259
+ #endif
260
+
261
+ #endif
@@ -0,0 +1,757 @@
1
+ /*----------------------------------------------------------------------
2
+ File : apriori.c
3
+ Contents: apriori algorithm for finding association rules
4
+ Author : Christian Borgelt
5
+ History : 1996.02.14 file created
6
+ 1996.07.26 output precision reduced
7
+ 1996.11.22 options -b, -f, and -r added
8
+ 1996.11.24 option -e added (add. evaluation measures)
9
+ 1997.08.18 normalized chi^2 measure added
10
+ option -m (minimal rule length) added
11
+ 1997.10.13 quiet version (no output to stdout or stderr)
12
+ 1998.01.27 adapted to changed ist_create() function
13
+ 1998.08.08 optional input file (item appearances) added
14
+ 1998.09.02 several assertions added
15
+ 1998.09.07 hyperedge mode (option -h) added
16
+ 1998.12.08 output of absolute support (option -a) added
17
+ float changed to double
18
+ 1998.12.09 conversion of names to a scanable form added
19
+ 1999.02.05 long int changed to int
20
+ 1999.02.09 input from stdin, output to stdout added
21
+ 1999.08.09 bug in check of support parameter (<= 0) fixed
22
+ 1999.11.05 rule evaluation measure EM_AIMP added
23
+ 1999.11.08 output of add. rule eval. measure value added
24
+ 2000.03.16 optional use of original rule support definition
25
+ 2001.04.01 option -h replaced by option -t (target type)
26
+ 2001.05.26 extended support output added (option -x)
27
+ 2001.06.09 extended support output for item sets added
28
+ 2001.08.15 module scan used for output formatting
29
+ 2001.11.18 item and transaction functions made a module
30
+ 2001.11.19 options -C, -l changed, option -y removed
31
+ 2001.12.28 adapted to module tract, some improvements
32
+ 2002.01.11 evaluation measures codes changed to letters
33
+ 2002.02.10 option -q extended by a direction parameter
34
+ 2002.02.11 memory usage minimization option added
35
+ 2002.06.09 arbitrary supp./conf. formats made possible
36
+ 2003.01.09 option -k (item separator) added
37
+ 2003.01.14 check for empty transaction set added
38
+ 2003.03.12 output of lift value (conf/prior) added
39
+ 2003.07.17 item filtering w.r.t. usage added (option -u)
40
+ 2003.07.17 sorting w.r.t. transaction size sum added
41
+ 2003.07.18 maximal itemset filter added
42
+ 2003.08.11 closed itemset filter added
43
+ 2003.08.15 item filtering for transaction tree added
44
+ 2003.08.16 parameter for transaction filtering added
45
+ 2003.08.18 dynamic filtering decision based on times added
46
+ 2003.08.21 option -j (heap sort for transactions) added
47
+ 2003.09.22 meaning of option -j reversed (heapsort default)
48
+ 2004.03.25 option -S added (maximal support of a set/rule)
49
+ 2004.05.09 additional selection measure for sets added
50
+ 2004.10.28 two unnecessary assignments removed
51
+ 2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
52
+ 2004.11.23 absolute/relative support output changed
53
+ 2004.12.09 semantics of option -p changed
54
+ 2005.01.25 bug in output of absolute/relative support fixed
55
+ 2005.01.31 another bug in this output fixed
56
+ 2005.06.20 use of flag for "no item sorting" corrected
57
+ 2007.02.13 adapted to modified module tabscan
58
+ 2008.03.13 additional hyperedge evaluation added
59
+ 2008.03.24 additional target added (association groups)
60
+ ----------------------------------------------------------------------*/
61
+ #include <stdio.h>
62
+ #include <stdlib.h>
63
+ #include <stdarg.h>
64
+ #include <string.h>
65
+ #include <limits.h>
66
+ #include <math.h>
67
+ #include <time.h>
68
+ #include <assert.h>
69
+ #include "apriori_wrapper.h"
70
+ #include "scan.h"
71
+ #include "tract.h"
72
+ #include "istree.h"
73
+ #ifdef STORAGE
74
+ #include "storage.h"
75
+ #endif
76
+
77
+ // #include "symbtab.h"
78
+
79
+ /*----------------------------------------------------------------------
80
+ Preprocessor Definitions
81
+ ----------------------------------------------------------------------*/
82
+ #define PRGNAME "apriori" /* program name */
83
+ #define DESCRIPTION "find association rules with the apriori algorithm" /* one-line description printed at startup and in the usage message */
84
+ #define VERSION "version 4.35 (2008.03.24) " \
85
+ "(c) 1996-2008 Christian Borgelt" /* NOTE(review): do_apriori passes VERSION to fprintf as the format string, so it must not contain '%' */
86
+
87
+ /* --- target types --- */
88
+ #define TT_SET 0 /* frequent item sets (the TT_xxx values match the entry order of ttypes[]) */
89
+ #define TT_CLSET 1 /* closed item sets */
90
+ #define TT_MFSET 2 /* maximal item sets */
91
+ #define TT_RULE 3 /* association rules */
92
+ #define TT_HEDGE 4 /* association hyperedges */
93
+ #define TT_GROUP 5 /* association groups */
94
+
95
+ /* --- error codes --- */
96
+ #define E_OPTION (-5) /* unknown option */
97
+ #define E_OPTARG (-6) /* missing option argument */
98
+ #define E_ARGCNT (-7) /* too few/many arguments */
99
+ #define E_STDIN (-8) /* double assignment of stdin */
100
+ #define E_TARGET (-9) /* invalid target type */
101
+ #define E_SUPP (-10) /* invalid support */
102
+ #define E_CONF (-11) /* invalid confidence */
103
+ #define E_MEASURE (-12) /* invalid evaluation measure */
104
+ #define E_RULELEN (-13) /* invalid rule length */
105
+ #define E_NOTAS (-14) /* no items or transactions */
106
+ #define E_NOFREQ (-15) /* no frequent items */
107
+ #define E_UNKNOWN (-21) /* unknown error; codes -16 to -20 are defined in tract.h, and error() maps any smaller code to this one in non-QUIET builds */
108
+
109
+ #ifndef QUIET /* if not quiet version */
110
+ #ifdef FFLUSH /* NOTE(review): with FFLUSH defined, MSG does NOT flush; the macro name suggests the opposite -- confirm the intended polarity */
111
+ #define MSG(x) x /* print messages */
112
+ #else /* if to flush every output */
113
+ #define MSG(x) x, fflush(stderr) /* comma expression: evaluate x, then flush stderr */
114
+ #endif
115
+ #else /* if quiet version */
116
+ #define MSG(x) /* suppress messages */
117
+ #endif
118
+
119
+ #define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC) /* seconds of clock() time elapsed since clock value t */
120
+ #define RECCNT(s) (ts_reccnt(is_tabscan(s)) \
121
+ - ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0)) /* scanner record count, reduced by one when ts_delim() reports TS_REC */
122
+ #define BUFFER(s) ts_buf(is_tabscan(s)) /* ts_buf() of the item set's table scanner */
123
+
124
+ /*----------------------------------------------------------------------
125
+ Constants
126
+ ----------------------------------------------------------------------*/
127
+ #ifndef QUIET /* if not quiet version */
128
+ /* --- target types --- */
129
+ static const char *ttypes[] = { /* target names in TT_xxx order; compiled only in non-QUIET builds */
130
+ /* TT_SET 0 */ "set",
131
+ /* TT_CLSET 1 */ "set", /* closed item sets share the name "set" */
132
+ /* TT_MFSET 2 */ "set", /* maximal item sets share the name "set" */
133
+ /* TT_RULE 3 */ "rule",
134
+ /* TT_HEDGE 4 */ "hyperedge",
135
+ /* TT_GROUP 5 */ "group",
136
+ };
137
+
138
+ /* --- error messages: printf-style formats indexed by the negated error code (error() uses errmsgs[-code]) --- */
139
+ static const char *errmsgs[] = {
140
+ /* E_NONE 0 */ "no error\n",
141
+ /* E_NOMEM -1 */ "not enough memory\n",
142
+ /* E_FOPEN -2 */ "cannot open file %s\n",
143
+ /* E_FREAD -3 */ "read error on file %s\n",
144
+ /* E_FWRITE -4 */ "write error on file %s\n",
145
+ /* E_OPTION -5 */ "unknown option -%c\n",
146
+ /* E_OPTARG -6 */ "missing option argument\n",
147
+ /* E_ARGCNT -7 */ "wrong number of arguments\n",
148
+ /* E_STDIN -8 */ "double assignment of standard input\n",
149
+ /* E_TARGET -9 */ "invalid target type '%c'\n",
150
+ /* E_SUPP -10 */ "invalid minimal support %g%%\n",
151
+ /* E_CONF -11 */ "invalid minimal confidence %g%%\n",
152
+ /* E_MEASURE -12 */ "invalid additional evaluation measure %c\n",
153
+ /* E_RULELEN -13 */ "invalid set size/rule length %d\n",
154
+ /* E_NOTAS -14 */ "no items or transactions to work on\n",
155
+ /* E_NOFREQ -15 */ "no frequent items\n",
156
+ /* E_ITEMEXP -16 */ "file %s, record %d: item expected\n",
157
+ /* E_DUPITEM -17 */ "file %s, record %d: duplicate item %s\n",
158
+ /* E_APPEXP -18 */ "file %s, record %d: "
159
+ "appearance indicator expected\n",
160
+ /* E_UNKAPP -19 */ "file %s, record %d: "
161
+ "unknown appearance indicator %s\n",
162
+ /* E_FLDCNT -20 */ "file %s, record %d: too many fields\n",
163
+ /* E_UNKNOWN -21 */ "unknown error\n"
164
+ };
165
+ #endif
166
+
167
+ /*----------------------------------------------------------------------
168
+ Global Variables
169
+ ----------------------------------------------------------------------*/
170
+ #ifndef QUIET
171
+ static char *prgname; /* program name for error messages (set to argv[0] by do_apriori) */
172
+ #endif
173
+ static ITEMSET *itemset = NULL; /* item set (error() deletes it unless NDEBUG is defined) */
174
+ static TASET *taset = NULL; /* transaction set (error() deletes it unless NDEBUG is defined) */
175
+ static TATREE *tatree = NULL; /* transaction tree (error() deletes it unless NDEBUG is defined) */
176
+ static ISTREE *istree = NULL; /* item set tree (error() deletes it unless NDEBUG is defined) */
177
+ static FILE *in = NULL; /* input file (error() closes it unless it is stdin or NDEBUG is defined) */
178
+ static FILE *out = NULL; /* output file (error() closes it unless it is stdout or NDEBUG is defined) */
179
+
180
+ /*----------------------------------------------------------------------
181
+ Main Functions
182
+ ----------------------------------------------------------------------*/
183
+
184
/* Print the list of additional evaluation measures (option -e#) to
   stdout, after ending the startup message on stderr with a newline,
   and then terminate the process with exit status 0.  In QUIET builds
   nothing is printed, but the process is still terminated. */
void help (void)
{
#ifndef QUIET
  /* help text, one output line per entry */
  static const char *const text[] = {
    "additional evaluation measures (option -e#)\n",
    "frequent item sets:\n",
    "d or 1: binary logarithm of support quotient\n",
    "association rules:\n",
    "d or 1: absolute confidence difference to prior\n",
    "q or 2: absolute difference of confidence quotient to 1\n",
    "a or 3: absolute difference of improvement value to 1\n",
    "i or 4: information difference to prior\n",
    "c or 5: normalized chi^2 measure\n",
    "p or 6: p-value computed from chi^2 measure\n",
  };
  size_t k;                     /* index into the help text */

  fputc('\n', stderr);          /* terminate startup message */
  for (k = 0; k < sizeof(text) / sizeof(text[0]); k++)
    fputs(text[k], stdout);
#endif
  exit(0);                      /* abort the program */
}  /* help() */
201
+
202
+ /*--------------------------------------------------------------------*/
203
+
204
+ static void error (int code, ...)
205
+ { /* --- print an error message */
206
+ #ifndef QUIET /* if not quiet version */
207
+ va_list args; /* list of variable arguments */
208
+ const char *msg; /* error message */
209
+
210
+ assert(prgname); /* check the program name */
211
+ if (code < E_UNKNOWN) code = E_UNKNOWN;
212
+ if (code < 0) { /* if to report an error, */
213
+ msg = errmsgs[-code]; /* get the error message */
214
+ if (!msg) msg = errmsgs[-E_UNKNOWN];
215
+ fprintf(stderr, "\n%s: ", prgname);
216
+ va_start(args, code); /* get variable arguments */
217
+ vfprintf(stderr, msg, args);/* print error message */
218
+ va_end(args); /* end argument evaluation */
219
+ }
220
+ #endif
221
+ #ifndef NDEBUG /* if debug version */
222
+ if (istree) ist_delete(istree); /* clean up memory */
223
+ if (tatree) tat_delete(tatree); /* and close files */
224
+ if (taset) tas_delete(taset, 0);
225
+ if (itemset) is_delete(itemset);
226
+ if (in && (in != stdin)) fclose(in);
227
+ if (out && (out != stdout)) fclose(out);
228
+ #endif
229
+ #ifdef STORAGE /* if storage debugging */
230
+ showmem("at end of program"); /* check memory usage */
231
+ #endif
232
+ exit(code); /* abort the program */
233
+ } /* error() */
234
+
235
+ /*--------------------------------------------------------------------*/
236
+
237
+ int do_apriori (int argc, char *argv[])
238
+ { /* --- main function */
239
+ int i, k = 0, n; /* loop variables, counters */
240
+ char *s; /* to traverse the options */
241
+ char **optarg = NULL; /* option argument */
242
+ char *fn_in = NULL; /* name of input file */
243
+ char *fn_out = NULL; /* name of output file */
244
+ char *fn_app = NULL; /* name of item appearances file */
245
+ char *blanks = NULL; /* blanks */
246
+ char *fldseps = NULL; /* field separators */
247
+ char *recseps = NULL; /* record separators */
248
+ char *comment = NULL; /* comment indicators */
249
+ char *used = NULL; /* item usage vector */
250
+ double supp = 0.1; /* minimal support (in percent) */
251
+ double smax = 1.0; /* maximal support (in percent) */
252
+ double conf = 0.8; /* minimal confidence (in percent) */
253
+ int mode = IST_BODY; /* search mode (rule support def.) */
254
+ int target = 'r'; /* target type (sets/rules/h.edges) */
255
+ int arem = 0; /* additional rule evaluation measure */
256
+ int lift = 0; /* flag for printing the lift */
257
+ double minval = 0.1; /* minimal evaluation measure value */
258
+ double lftval = 0; /* lift value (confidence/prior) */
259
+ int minlen = 1; /* minimal rule length */
260
+ int maxlen = INT_MAX; /* maximal rule length */
261
+ int load = 1; /* flag for loading transactions */
262
+ int sort = 2; /* flag for item sorting and recoding */
263
+ double filter = 0.1; /* item usage filtering parameter */
264
+ int tree = 1; /* flag for transaction tree */
265
+ int heap = 1; /* flag for heap sort vs. quick sort */
266
+ int c2scf = 0; /* flag for conv. to scanable form */
267
+ char *sep = " "; /* item separator for output */
268
+ char *fmt = "%.1f"; /* output format for support/conf. */
269
+ int sout = 1; /* flag for abs./rel. support output */
270
+ int ext = 0; /* flag for extended support output */
271
+ int aval = 0; /* flag for add. eval. measure value */
272
+ int maxcnt = 0; /* maximal number of items per set */
273
+ int tacnt; /* number of transactions */
274
+ int frq; /* frequency of an item set */
275
+ int *map, *set; /* identifier map, item set */
276
+ int verbose = 0; /* flag for verboseness */
277
+ const char *name; /* buffer for item names */
278
+ static char buf[4*TS_SIZE+4]; /* buffer for formatting */
279
+ clock_t t, tt, tc, x; /* timer for measurements */
280
+
281
+ #ifndef QUIET /* if not quiet version */
282
+ prgname = argv[0]; /* get program name for error msgs. */
283
+
284
+ /* --- print usage message --- */
285
+ if (argc > 1) { /* if arguments are given */
286
+ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
287
+ fprintf(stderr, VERSION); } /* print a startup message */
288
+ else { /* if no arguments given */
289
+ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
290
+ printf("%s\n", DESCRIPTION);
291
+ printf("%s\n", VERSION);
292
+ printf("-t# target type (default: association rules)\n"
293
+ " (s: item sets, c: closed item sets,"
294
+ " m: maximal item sets,\n"
295
+ " r: association rules,"
296
+ " h: association hyperedges)\n");
297
+ printf("-m# minimal number of items per set/rule/hyperedge "
298
+ "(default: %d)\n", minlen);
299
+ printf("-n# maximal number of items per set/rule/hyperedge "
300
+ "(default: no limit)\n");
301
+ printf("-s# minimal support of a set/rule/hyperedge "
302
+ "(default: %g%%)\n", supp *100);
303
+ printf("-S# maximal support of a set/rule/hyperedge "
304
+ "(default: %g%%)\n", smax *100);
305
+ printf("-c# minimal confidence of a rule/hyperedge "
306
+ "(default: %g%%)\n", conf *100);
307
+ printf("-o use original definition of the support of a rule "
308
+ "(body & head)\n");
309
+ printf("-k# item separator for output "
310
+ "(default: \"%s\")\n", sep);
311
+ printf("-p# output format for support/confidence "
312
+ "(default: \"%s\")\n", fmt);
313
+ printf("-x extended support output "
314
+ "(print both rule support types)\n");
315
+ printf("-a print absolute support "
316
+ "(number of transactions)\n");
317
+ printf("-y print lift value (confidence divided by prior)\n");
318
+ printf("-e# additional evaluation measure (default: none)\n");
319
+ printf("-! print a list of additional evaluation measures\n");
320
+ printf("-d# minimal value of additional evaluation measure "
321
+ "(default: %g%%)\n", minval *100);
322
+ printf("-v print value of additional "
323
+ "rule evaluation measure\n");
324
+ printf("-g write output in scanable form "
325
+ "(quote certain characters)\n");
326
+ printf("-l do not load transactions into memory "
327
+ "(work on input file)\n");
328
+ printf("-q# sort items w.r.t. their frequency (default: %d)\n"
329
+ " (1: ascending, -1: descending, 0: do not sort,\n"
330
+ " 2: ascending, -2: descending w.r.t. "
331
+ "transaction size sum)\n", sort);
332
+ printf("-u# filter unused items from transactions "
333
+ "(default: %g)\n", filter);
334
+ printf(" (0: do not filter items w.r.t. usage in sets,\n"
335
+ " <0: fraction of removed items for filtering,\n"
336
+ " >0: take execution times ratio into account)\n");
337
+ printf("-h do not organize transactions as a prefix tree\n");
338
+ printf("-j use quicksort to sort the transactions "
339
+ "(default: heapsort)\n");
340
+ printf("-z minimize memory usage "
341
+ "(default: maximize speed)\n");
342
+ printf("-b/f/r# blank characters, field and record separators\n"
343
+ " (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
344
+ printf("-C# comment characters (default: \"#\")\n");
345
+ printf("-V verbose\n");
346
+
347
+ printf("infile file to read transactions from\n");
348
+ printf("outfile file to write item sets/association rules"
349
+ "/hyperedges to\n");
350
+ printf("appfile file stating item appearances (optional)\n");
351
+ return 0; /* print a usage message */
352
+ } /* and abort the program */
353
+ #endif /* #ifndef QUIET */
354
+
355
+ /* --- evaluate arguments --- */
356
+ for (i = 1; i < argc; i++) { /* traverse arguments */
357
+ s = argv[i]; /* get option argument */
358
+ if (optarg) { *optarg = s; optarg = NULL; continue; }
359
+ if ((*s == '-') && *++s) { /* -- if argument is an option */
360
+ while (*s) { /* traverse options */
361
+ switch (*s++) { /* evaluate switches */
362
+ case '!': help(); break;
363
+ case 't': target = (*s) ? *s++ : 'r'; break;
364
+ case 'm': minlen = (int)strtol(s, &s, 0); break;
365
+ case 'n': maxlen = (int)strtol(s, &s, 0); break;
366
+ case 's': supp = 0.01*strtod(s, &s); break;
367
+ case 'S': smax = 0.01*strtod(s, &s); break;
368
+ case 'c': conf = 0.01*strtod(s, &s); break;
369
+ case 'o': mode |= IST_BOTH; break;
370
+ case 'k': optarg = &sep; break;
371
+ case 'p': optarg = &fmt; break;
372
+ case 'x': ext = 1; break;
373
+ case 'a': sout |= 2; break;
374
+ case 'y': lift = 1; break;
375
+ case 'e': arem = (*s) ? *s++ : 0; break;
376
+ case 'd': minval = 0.01*strtod(s, &s); break;
377
+ case 'v': aval = 1; break;
378
+ case 'g': c2scf = 1; break;
379
+ case 'l': load = 0; break;
380
+ case 'q': sort = (int)strtol(s, &s, 0); break;
381
+ case 'u': filter = strtod(s, &s); break;
382
+ case 'h': tree = 0; break;
383
+ case 'j': heap = 0; break;
384
+ case 'z': mode |= IST_MEMOPT; break;
385
+ case 'b': optarg = &blanks; break;
386
+ case 'f': optarg = &fldseps; break;
387
+ case 'r': optarg = &recseps; break;
388
+ case 'C': optarg = &comment; break;
389
+ case 'V': verbose = 1; break;
390
+ default : error(E_OPTION, *--s); break;
391
+ } /* set option variables */
392
+ if (optarg && *s) { *optarg = s; optarg = NULL; break; }
393
+ } } /* get option argument */
394
+ else { /* -- if argument is no option */
395
+ switch (k++) { /* evaluate non-options */
396
+ case 0: fn_in = s; break;
397
+ case 1: fn_out = s; break;
398
+ case 2: fn_app = s; break;
399
+ default: error(E_ARGCNT); break;
400
+ } /* note filenames */
401
+ }
402
+ }
403
+ if (optarg) error(E_OPTARG); /* check option argument */
404
+ if ((k < 2) || (k > 3)) /* and the number of arguments */
405
+ error(E_ARGCNT); /* (either in/out or in/out/app) */
406
+ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
407
+ error(E_STDIN); /* stdin must not be used twice */
408
+ switch (target) { /* check and translate target type */
409
+ case 's': target = TT_SET; break;
410
+ case 'c': target = TT_CLSET; break;
411
+ case 'm': target = TT_MFSET; break;
412
+ case 'r': target = TT_RULE; break;
413
+ case 'h': target = TT_HEDGE; break;
414
+ case 'g': target = TT_GROUP; break;
415
+ default : error(E_TARGET, (char)target); break;
416
+ }
417
+ if (supp > 1) /* check the minimal support */
418
+ error(E_SUPP, supp); /* (< 0: absolute number) */
419
+ if ((conf < 0) || (conf > 1))
420
+ error(E_CONF, conf); /* check the minimal confidence */
421
+ if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
422
+ if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
423
+ switch (arem) { /* check and translate measure */
424
+ case 0 : case '0': arem = EM_NONE; break;
425
+ case 'd': case '1': arem = EM_DIFF; break;
426
+ case 'q': case '2': arem = EM_QUOT; break;
427
+ case 'a': case '3': arem = EM_AIMP; break;
428
+ case 'i': case '4': arem = EM_INFO; break;
429
+ case 'c': case '5': arem = EM_CHI2; break;
430
+ case 'p': case '6': arem = EM_PVAL; break;
431
+ default : error(E_MEASURE, (char)arem); break;
432
+ }
433
+ if (target <= TT_MFSET) { /* in item set mode neutralize */
434
+ mode |= IST_BOTH; conf = 1;}/* rule specific settings */
435
+ if (arem == EM_NONE) /* if no add. rule eval. measure, */
436
+ aval = 0; /* clear the corresp. output flag */
437
+ if ((filter <= -1) || (filter >= 1)) filter = 0;
438
+
439
+ /* --- create item set and transaction set --- */
440
+ itemset = is_create(-1); /* create an item set and */
441
+ if (!itemset) error(E_NOMEM); /* set the special characters */
442
+ is_chars(itemset, blanks, fldseps, recseps, comment);
443
+ if (load) { /* if to load the transactions */
444
+ taset = tas_create(itemset);
445
+ if (!taset) error(E_NOMEM); /* create a transaction set */
446
+ } /* to store the transactions */
447
+ MSG(fprintf(stderr, "\n")); /* terminate the startup message */
448
+
449
+ /* --- read item appearances --- */
450
+ if (fn_app) { /* if item appearances are given */
451
+ t = clock(); /* start the timer */
452
+ if (*fn_app) /* if an app. file name is given, */
453
+ in = fopen(fn_app, "r"); /* open the item appearances file */
454
+ else { /* if no app. file name is given, */
455
+ in = stdin; fn_app = "<stdin>"; } /* read from std. input */
456
+ MSG(fprintf(stderr, "reading %s ... ", fn_app));
457
+ if (!in) error(E_FOPEN, fn_app);
458
+ k = is_readapp(itemset,in); /* read the item appearances */
459
+ if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
460
+ if (in != stdin) /* if not read from standard input, */
461
+ fclose(in); /* close the input file */
462
+ MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
463
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
464
+ } /* print a log message */
465
+
466
+ /* --- read transactions --- */
467
+ t = clock(); /* start the timer */
468
+ if (fn_in && *fn_in) /* if an input file name is given, */
469
+ in = fopen(fn_in, "r"); /* open input file for reading */
470
+ else { /* if no input file name is given, */
471
+ in = stdin; fn_in = "<stdin>"; } /* read from standard input */
472
+ MSG(fprintf(stderr, "reading %s ... \n", fn_in));
473
+ if (!in) error(E_FOPEN, fn_in);
474
+ while (1) { /* transaction read loop */
475
+ k = is_read(itemset, in); /* read the next transaction */
476
+ if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
477
+ if (k > 0) break; /* check for error and end of file */
478
+ k = is_tsize(itemset); /* update the maximal */
479
+ if (k > maxcnt) maxcnt = k; /* transaction size */
480
+ if (taset && (tas_add(taset, NULL, 0) != 0))
481
+ error(E_NOMEM); /* add the loaded transaction */
482
+ } /* to the transaction set */
483
+ if (taset) { /* if transactions have been loaded */
484
+ if (in != stdin) fclose(in);/* if not read from standard input, */
485
+ in = NULL; /* close the input file */
486
+ } /* clear the file variable */
487
+ n = is_cnt(itemset); /* get the number of items */
488
+ tacnt = is_gettac(itemset); /* and the number of transactions */
489
+ MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
490
+ MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
491
+ if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
492
+ MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
493
+ if (supp >= 0) /* if relative support is given */
494
+ supp = ceil(tacnt *supp); /* compute absolute support */
495
+ else { /* if absolute support is given, */
496
+ supp = ceil(-100 *supp); /* make the support value positive */
497
+ if (!(sout & 2)) sout = 2; /* switch to absolute support output */
498
+ } /* do the same with the max. support */
499
+ smax = floor(((smax >= 0) ? tacnt : -100) *smax);
500
+
501
+ /* --- sort and recode items --- */
502
+ MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
503
+ t = clock(); /* start the timer */
504
+ map = (int*)malloc(is_cnt(itemset) *sizeof(int));
505
+ if (!map) error(E_NOMEM); /* create an item identifier map */
506
+ k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
507
+ n = is_recode(itemset, k, sort, map);
508
+ if (taset) { /* sort and recode the items and */
509
+ tas_recode(taset, map,n); /* recode the loaded transactions */
510
+ maxcnt = tas_max(taset); /* get the new maximal t.a. size */
511
+ } /* (may be smaller than before) */
512
+ free(map); /* delete the item identifier map */
513
+ MSG(fprintf(stderr, "[%d item(s)] ", n));
514
+ MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
515
+ if (n <= 0) error(E_NOFREQ); /* print a log message and */
516
+ MSG(fprintf(stderr, "\n")); /* check the number of items */
517
+ if (maxlen > maxcnt) /* clamp the set/rule length */
518
+ maxlen = maxcnt; /* to the maximum set size */
519
+
520
+ /* --- create a transaction tree --- */
521
+ tt = 0; /* init. the tree construction time */
522
+ if (tree && taset) { /* if transactions were loaded */
523
+ MSG(fprintf(stderr, "creating transaction tree ... "));
524
+ t = clock(); /* start the timer */
525
+ tatree = tat_create(taset, heap);
526
+ if (!tatree) error(E_NOMEM);/* create a transaction tree */
527
+ if (filter == 0) { /* if a tree rebuild is not needed, */
528
+ tas_delete(taset, 0); taset = NULL; } /* delete transactions */
529
+ tt = clock() -t; /* note the time for the construction */
530
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
531
+ } /* print a log message */
532
+
533
+ /* --- create an item set tree --- */
534
+ t = clock(); tc = 0; /* start the timer */
535
+ istree = ist_create(itemset, mode, (int)supp, conf);
536
+ if (!istree) error(E_NOMEM); /* create an item set tree */
537
+
538
+ /* --- check item subsets --- */
539
+ if (filter) { /* if to filter unused items */
540
+ used = (char*)malloc(is_cnt(itemset) *sizeof(char));
541
+ if (!used) error(E_NOMEM); /* create a flag vector */
542
+ } /* for the items */
543
+ MSG(fprintf(stderr, "checking subsets of size 1"));
544
+ while (ist_height(istree) < maxlen) {
545
+ if (filter != 0) { /* if to filter w.r.t. item usage, */
546
+ i = ist_check(istree, used); /* check current item usage */
547
+ if (i < maxlen) maxlen = i; /* update the maximum size */
548
+ if (ist_height(istree) >= i) break;
549
+ } /* check the tree height */
550
+ k = ist_addlvl(istree); /* while max. height is not reached, */
551
+ if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
552
+ if (k != 0) break; /* if no level was added, abort */
553
+ MSG(fprintf(stderr, " %d", ist_height(istree)));
554
+ if (tatree) { /* if a transaction tree was created */
555
+ if (((filter < 0) /* if to filter w.r.t. item usage */
556
+ && (i < -filter *n)) /* and enough items were removed */
557
+ || ((filter > 0) /* or counting time is long enough */
558
+ && (i < n) && (i *(double)tt < filter *n *tc))) {
559
+ n = i; x = clock(); /* note the new number of items */
560
+ tas_filter(taset, used);/* and remove unnecessary items */
561
+ tat_delete(tatree); /* delete the transaction tree */
562
+ tatree = tat_create(taset, heap);
563
+ if (!tatree) error(E_NOMEM);
564
+ tt = clock() -x; /* rebuild the transaction tree and */
565
+ } /* note the new construction time */
566
+ x = clock(); /* count the transaction tree */
567
+ ist_countx(istree, tatree);
568
+ tc = clock() -x; } /* note the new count time */
569
+ else if (taset) { /* if transactions were loaded */
570
+ if (((filter < 0) /* if to filter w.r.t. item usage */
571
+ && (i <= -filter *n)) /* and enough items were removed */
572
+ || ((filter > 0) /* or counting time is long enough */
573
+ && (i *(double)tt <= filter *n *tc))) {
574
+ n = i; x = clock(); /* note the new number of items */
575
+ tas_filter(taset, used);/* and remove unnecessary items */
576
+ tt = clock() -t; /* from the transactions */
577
+ } /* note the filtering time */
578
+ for (i = tacnt; --i >= 0;)/* traverse and count transactions */
579
+ ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
580
+ tc = clock() -t; } /* note the new count time */
581
+ else { /* if to work on the input file, */
582
+ rewind(in); /* reset the file position */
583
+ for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
584
+ if (filter != 0) /* (re)read the transactions and */
585
+ is_filter(itemset, used); /* remove unnecessary items */
586
+ k = is_tsize(itemset); /* update the maximum size */
587
+ if (k > maxcnt) maxcnt = k; /* of a transaction */
588
+ ist_count(istree, is_tract(itemset), k);
589
+ } /* count the transaction in the tree */
590
+ if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
591
+ if (maxcnt < maxlen) /* update the maximal rule length */
592
+ maxlen = maxcnt; /* according to the max. t.a. size */
593
+ } /* (may be smaller than before) */
594
+ }
595
+ if (!taset && !tatree) { /* if transactions were not loaded */
596
+ if (in != stdin) fclose(in);/* if not read from standard input, */
597
+ in = NULL; /* close the input file */
598
+ } /* clear the file variable */
599
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
600
+
601
+ /* --- filter found item sets --- */
602
+ if ((target == TT_CLSET) || (target == TT_MFSET)) {
603
+ MSG(fprintf(stderr, "filtering %s item sets ... ",
604
+ (target == TT_MFSET) ? "maximal" : "closed"));
605
+ t = clock(); /* filter the item sets */
606
+ ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
607
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
608
+ } /* (filter takes longer than print) */
609
+
610
+ /* --- sort transactions --- */
611
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
612
+ if (!taset) /* transactions must be loaded */
613
+ ext = 0; /* for extended support output */
614
+ else if (ext) { /* if extended output is requested */
615
+ MSG(fprintf(stderr, "sorting transactions ... "));
616
+ t = clock(); /* start the timer */
617
+ tas_sort(taset, heap); /* sort the transactions */
618
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
619
+ } /* (sorting is necessary to find the */
620
+ } /* number of identical transactions) */
621
+
622
+ /* --- print item sets/rules/hyperedges --- */
623
+ t = clock(); /* start the timer */
624
+ if (fn_out && *fn_out) /* if an output file name is given, */
625
+ out = fopen(fn_out, "w"); /* open the output file */
626
+ else { /* if no output file name is given, */
627
+ out = stdout; fn_out = "<stdout>"; } /* write to std. output */
628
+ MSG(fprintf(stderr, "writing %s ... ", fn_out));
629
+ if (!out) error(E_FOPEN, fn_out);
630
+ ist_init(istree, minlen, arem, minval);
631
+ set = is_tract(itemset); /* get the transaction buffer */
632
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
633
+ for (n = 0; 1; ) { /* extract item sets from the tree */
634
+ k = ist_set(istree, set, &frq, &conf);
635
+ if (k <= 0) break; /* get the next frequent item set */
636
+ if (frq > smax) continue; /* check against maximal support */
637
+ for (i = 0; i < k; i++) { /* traverse the set's items */
638
+ name = is_name(itemset, set[i]);
639
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
640
+ fputs(name, out); /* print the name of the next item */
641
+ fputs((i < k-1) ? sep : " ", out);
642
+ } /* print a separator */
643
+ fputs(" (", out); /* print the item set's support */
644
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
645
+ if (sout & 2) fputc('/', out); }
646
+ if (sout & 2) { fprintf(out, "%d", frq); }
647
+ if (ext) { /* if to print the extended support */
648
+ frq = tas_occur(taset, set, k);
649
+ fputs(", ", out); /* get the number of occurrences */
650
+ fprintf(out, fmt, (frq/(double)tacnt) *100);
651
+ if (sout & 2) fprintf(out, "/%d", frq);
652
+ } /* print the extended support data */
653
+ if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
654
+ fputs(")\n", out); /* print the add. eval. measure, */
655
+ n++; /* terminate the support output, */
656
+ } } /* and count the item set */
657
+ else if (target == TT_RULE) { /* if to find association rules, */
658
+ for (n = 0; 1; ) { /* extract rules from tree */
659
+ k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
660
+ if (k <= 0) break; /* get the next association rule */
661
+ if (frq > smax) continue; /* check against maximal support */
662
+ for (i = 0; i < k; i++) { /* traverse the rule's items */
663
+ name = is_name(itemset, set[i]);
664
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
665
+ fputs(name, out); /* print the next item */
666
+ fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
667
+ } /* print a separator */
668
+ fputs(" (", out); /* print the rule evaluation */
669
+ if (sout & 1) supp = frq/(double)tacnt;
670
+ if (ext && !(mode & IST_HEAD)) {
671
+ if (sout & 1) { fprintf(out, fmt, supp *conf *100);
672
+ if (sout & 2) fputc('/', out); }
673
+ if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
674
+ fputs(", ", out); /* print the support of the rule */
675
+ } /* from the support of the body */
676
+ if (sout & 1) { fprintf(out, fmt, supp *100);
677
+ if (sout & 2) fputc('/', out); }
678
+ if (sout & 2) { fprintf(out, "%d", frq); }
679
+ fputs(", ", out); /* print the rule support */
680
+ if (ext && (mode & IST_HEAD)) {
681
+ if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
682
+ if (sout & 2) fputc('/', out); }
683
+ if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
684
+ fputs(", ", out); /* print the support of the body */
685
+ } /* from the support of the rule */
686
+ fprintf(out, fmt, conf *100); /* print the rule confidence */
687
+ if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
688
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
689
+ fputs(")\n", out); /* print the value of the additional */
690
+ n++; /* rule evaluation measure and */
691
+ } } /* count the association rule */
692
+ else if (target == TT_HEDGE){ /* if to find association hyperedges */
693
+ for (n = 0; 1; ) { /* extract hyperedges from tree */
694
+ k = ist_hedge(istree, set, &frq, &conf, &minval);
695
+ if (k <= 0) break; /* get the next hyperedge */
696
+ if (frq > smax) continue; /* check against maximal support */
697
+ for (i = 0; i < k; i++) { /* traverse the edge's items */
698
+ name = is_name(itemset, set[i]);
699
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
700
+ fputs(name, out); /* print the name of the next item */
701
+ fputs((i < k-1) ? sep : " ", out);
702
+ } /* print a separator */
703
+ fputs(" (", out); /* print the hyperedge evaluation */
704
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
705
+ if (sout & 2) fputc('/', out); }
706
+ if (sout & 2) { fprintf(out, "%d", frq); }
707
+ fputs(", ", out); fprintf(out, fmt, conf *100);
708
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
709
+ fputs(")\n", out); /* print support and confidence */
710
+ n++; /* of the hyperedge and */
711
+ } } /* count the hyperedge */
712
+ else { /* if to find association groups */
713
+ for (n = 0; 1; ) { /* extract groups from tree */
714
+ k = ist_group(istree, set, &frq, &minval);
715
+ if (k <= 0) break; /* get the next group */
716
+ if (frq > smax) continue; /* check against maximal support */
717
+ for (i = 0; i < k; i++) { /* traverse the group's items */
718
+ name = is_name(itemset, set[i]);
719
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
720
+ fputs(name, out); /* print the name of the next item */
721
+ fputs((i < k-1) ? sep : " ", out);
722
+ } /* print a separator */
723
+ fputs(" (", out); /* print the group evaluation */
724
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
725
+ if (sout & 2) fputc('/', out); }
726
+ if (sout & 2) { fprintf(out, "%d", frq); }
727
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
728
+ fputs(")\n", out); /* print support and add. measure */
729
+ n++; /* and count the group */
730
+ }
731
+ } /* if (target <= TT_MFSET) .. else .. */
732
+ if (fflush(out) != 0) error(E_FWRITE, fn_out);
733
+ if (out != stdout) fclose(out);
734
+ out = NULL; /* close the output file */
735
+ MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
736
+ MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
737
+ #ifdef BENCH
738
+ printf("number of support counters: %d\n", istree->sccnt);
739
+ printf("necessary support counters: %d\n", istree->scnec);
740
+ printf("number of child pointers : %d\n", istree->cpcnt);
741
+ printf("necessary child pointers : %d\n", istree->cpnec);
742
+ printf("allocated memory (bytes) : %d\n", istree->bytes);
743
+ #endif
744
+
745
+ /* --- clean up --- */
746
+ #ifndef NDEBUG /* if this is a debug version */
747
+ free(used); /* delete the item app. vector */
748
+ ist_delete(istree); /* delete the item set tree, */
749
+ if (tatree) tat_delete(tatree); /* the transaction tree, */
750
+ if (taset) tas_delete(taset, 0); /* the transaction set, */
751
+ is_delete(itemset); /* and the item set */
752
+ #endif
753
+ #ifdef STORAGE /* if storage debugging */
754
+ showmem("at end of program"); /* check memory usage */
755
+ #endif
756
+ return 0; /* return 'ok' */
757
+ } /* main() */