apriori-rails 0.2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +22 -0
- data/License.txt +20 -0
- data/Manifest.txt +121 -0
- data/README.txt +149 -0
- data/Rakefile +17 -0
- data/TODO.txt +60 -0
- data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
- data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
- data/attic/c_ext_test1/mytest.rb +10 -0
- data/attic/test.c +12 -0
- data/config/hoe.rb +88 -0
- data/config/requirements.rb +29 -0
- data/examples/01_simple_example.rb +39 -0
- data/examples/02_small_file_example.rb +17 -0
- data/examples/03_large_file_example.rb +22 -0
- data/examples/test_data/market_basket_basic_test.dat +9 -0
- data/ext/Apriori.c +149 -0
- data/ext/Makefile +149 -0
- data/ext/apriori/doc/apriori.html +1301 -0
- data/ext/apriori/doc/arem.gp +68 -0
- data/ext/apriori/doc/c_rev.gp +89 -0
- data/ext/apriori/doc/chi2.tex +156 -0
- data/ext/apriori/doc/copying +504 -0
- data/ext/apriori/doc/line.gif +0 -0
- data/ext/apriori/doc/uparrow.gif +0 -0
- data/ext/apriori/ex/flg2set +15 -0
- data/ext/apriori/ex/hdr2set +13 -0
- data/ext/apriori/ex/readme +71 -0
- data/ext/apriori/ex/row2set +7 -0
- data/ext/apriori/ex/rulesort +24 -0
- data/ext/apriori/ex/tab2set +9 -0
- data/ext/apriori/ex/test.app +2 -0
- data/ext/apriori/ex/test.rul +9 -0
- data/ext/apriori/ex/test1.rul +43 -0
- data/ext/apriori/ex/test1.tab +10 -0
- data/ext/apriori/ex/test2.tab +10 -0
- data/ext/apriori/ex/test3.tab +30 -0
- data/ext/apriori/ex/test4.tab +11 -0
- data/ext/apriori/ex/test5.tab +39 -0
- data/ext/apriori/ex/tid2set +23 -0
- data/ext/apriori/ex/xhdr2set +33 -0
- data/ext/apriori/src/apriori.c +750 -0
- data/ext/apriori/src/apriori.dsp +120 -0
- data/ext/apriori/src/apriori.dsw +29 -0
- data/ext/apriori/src/apriori.mak +99 -0
- data/ext/apriori/src/istree.c +1411 -0
- data/ext/apriori/src/istree.h +160 -0
- data/ext/apriori/src/makefile +105 -0
- data/ext/apriori/src/tract.c +870 -0
- data/ext/apriori/src/tract.h +261 -0
- data/ext/apriori_wrapper.c +757 -0
- data/ext/apriori_wrapper.h +10 -0
- data/ext/extconf.rb +32 -0
- data/ext/math/doc/copying +504 -0
- data/ext/math/src/chi2.c +151 -0
- data/ext/math/src/chi2.h +27 -0
- data/ext/math/src/choose.c +71 -0
- data/ext/math/src/choose.h +16 -0
- data/ext/math/src/gamma.c +446 -0
- data/ext/math/src/gamma.h +39 -0
- data/ext/math/src/intexp.c +35 -0
- data/ext/math/src/intexp.h +15 -0
- data/ext/math/src/makefile +164 -0
- data/ext/math/src/math.mak +48 -0
- data/ext/math/src/normal.c +387 -0
- data/ext/math/src/normal.h +44 -0
- data/ext/math/src/radfn.c +113 -0
- data/ext/math/src/radfn.h +34 -0
- data/ext/math/src/zeta.c +49 -0
- data/ext/math/src/zeta.h +15 -0
- data/ext/pre-clean.rb +8 -0
- data/ext/pre-setup.rb +9 -0
- data/ext/util/doc/copying +504 -0
- data/ext/util/src/listops.c +76 -0
- data/ext/util/src/listops.h +26 -0
- data/ext/util/src/makefile +103 -0
- data/ext/util/src/memsys.c +84 -0
- data/ext/util/src/memsys.h +42 -0
- data/ext/util/src/nstats.c +288 -0
- data/ext/util/src/nstats.h +69 -0
- data/ext/util/src/params.c +86 -0
- data/ext/util/src/params.h +19 -0
- data/ext/util/src/parse.c +133 -0
- data/ext/util/src/parse.h +81 -0
- data/ext/util/src/scan.c +767 -0
- data/ext/util/src/scan.h +111 -0
- data/ext/util/src/symtab.c +443 -0
- data/ext/util/src/symtab.h +121 -0
- data/ext/util/src/tabscan.c +279 -0
- data/ext/util/src/tabscan.h +99 -0
- data/ext/util/src/util.mak +91 -0
- data/ext/util/src/vecops.c +317 -0
- data/ext/util/src/vecops.h +42 -0
- data/lib/apriori.rb +133 -0
- data/lib/apriori/adapter.rb +13 -0
- data/lib/apriori/association_rule.rb +97 -0
- data/lib/apriori/version.rb +3 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/apriori.rake +20 -0
- data/tasks/attic.rake +28 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/install.rake +13 -0
- data/tasks/website.rake +17 -0
- data/test/apriori_test.rb +13 -0
- data/test/fixtures/market_basket_results_test.txt +5 -0
- data/test/fixtures/market_basket_string_test.txt +7 -0
- data/test/fixtures/results.txt +2 -0
- data/test/fixtures/sample.txt +7 -0
- data/test/test_helper.rb +5 -0
- data/test/unit/test_apriori.rb +68 -0
- data/test/unit/test_itemsets_and_parsing.rb +82 -0
- data/website/index.html +251 -0
- data/website/index.txt +154 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +142 -0
- data/website/template.html.erb +49 -0
- metadata +267 -0
@@ -0,0 +1,261 @@
|
|
1
|
+
/*----------------------------------------------------------------------
|
2
|
+
File : tract.h
|
3
|
+
Contents: item and transaction management
|
4
|
+
Author : Christian Borgelt
|
5
|
+
History : 2001.11.18 file created from file apriori.c
|
6
|
+
2001.12.28 first version completed
|
7
|
+
2002.01.02 ta_sort mapped to v_intsort
|
8
|
+
2002.02.19 transaction tree functions added
|
9
|
+
2003.07.17 functions is_filter, ta_filter, tas_filter added
|
10
|
+
2003.08.21 parameter 'heap' added to tas_sort, tat_create
|
11
|
+
2003.09.12 function tas_total added
|
12
|
+
2003.09.20 empty transactions in input made possible
|
13
|
+
2004.11.20 function tat_mark added
|
14
|
+
2004.12.11 access functions for extended frequency added
|
15
|
+
2004.12.15 function nim_trunc added
|
16
|
+
2006.11.26 structures ISFMTR and ISEVAL added
|
17
|
+
2007.02.13 adapted to modified tabscan module
|
18
|
+
2008.06.30 support argument to ise_eval changed to double
|
19
|
+
----------------------------------------------------------------------*/
|
20
|
+
#ifndef __TRACT__
|
21
|
+
#define __TRACT__
|
22
|
+
#ifndef NIMAPFN
|
23
|
+
#define NIMAPFN
|
24
|
+
#endif
|
25
|
+
#include "vecops.h"
|
26
|
+
#include "symtab.h"
|
27
|
+
#include "tabscan.h"
|
28
|
+
|
29
|
+
/*----------------------------------------------------------------------
|
30
|
+
Preprocessor Definitions
|
31
|
+
----------------------------------------------------------------------*/
|
32
|
+
/* --- item appearance flags --- */
|
33
|
+
#define APP_NONE 0x00 /* item should be ignored */
|
34
|
+
#define APP_BODY 0x01 /* item may appear in rule body */
|
35
|
+
#define APP_HEAD 0x02 /* item may appear in rule head */
|
36
|
+
#define APP_BOTH (APP_HEAD|APP_BODY)
|
37
|
+
|
38
|
+
/* --- error codes --- */
|
39
|
+
#define E_NONE 0 /* no error */
|
40
|
+
#define E_NOMEM (-1) /* not enough memory */
|
41
|
+
#define E_FOPEN (-2) /* cannot open file */
|
42
|
+
#define E_FREAD (-3) /* read error on file */
|
43
|
+
#define E_FWRITE (-4) /* write error on file */
|
44
|
+
|
45
|
+
#define E_ITEMEXP (-16) /* item expected */
|
46
|
+
#define E_DUPITEM (-17) /* duplicate item */
|
47
|
+
#define E_APPEXP (-18) /* appearance indicator expected */
|
48
|
+
#define E_UNKAPP (-19) /* unknown appearance indicator */
|
49
|
+
#define E_FLDCNT (-20) /* too many fields */
|
50
|
+
|
51
|
+
/*----------------------------------------------------------------------
|
52
|
+
Type Definitions
|
53
|
+
----------------------------------------------------------------------*/
|
54
|
+
/* ITEM: per-item record.  The is_getfrq/is_setfrq/is_getxfq/is_getapp
 * family of macros in this header obtains an ITEM by casting the
 * result of nim_byid() and then reads or writes the fields below. */
typedef struct {
  int id;    /* identifier of the item */
  int frq;   /* frequency of the item in the transactions */
  int xfq;   /* extended frequency (transaction sizes) */
  int app;   /* appearance indicator of the item */
} ITEM;
|
60
|
+
|
61
|
+
typedef struct { /* --- an item set --- */
|
62
|
+
NIMAP *nimap; /* name/identifier map */
|
63
|
+
TABSCAN *tscan; /* table scanner */
|
64
|
+
char chars[4]; /* special characters */
|
65
|
+
int tac; /* transaction counter */
|
66
|
+
int app; /* default appearance indicator */
|
67
|
+
int vsz; /* size of transaction buffer */
|
68
|
+
int cnt; /* number of items in transaction */
|
69
|
+
int *items; /* items in transaction */
|
70
|
+
} ITEMSET; /* (item set) */
|
71
|
+
|
72
|
+
/* ISEVAL: item set evaluator.  Released with ise_delete(), which this
 * header maps to free().
 * NOTE(review): lsums is declared with a single element; presumably the
 * object is over-allocated so that it can hold more -- confirm in tract.c. */
typedef struct {
  double  logta;      /* logarithm of the number of transactions */
  double *logfs;      /* logarithms of the item frequencies */
  double  lsums[1];   /* sums of logarithms for prefixes */
} ISEVAL;
|
77
|
+
|
78
|
+
/* ISFMTR: item set formatter.  isf_length() yields `len` and
 * isf_print() writes `len` characters of `buf` to a stream (see the
 * macros near the end of this header).
 * NOTE(review): names is declared with a single element; presumably the
 * object is over-allocated to hold `cnt` names -- confirm in tract.c. */
typedef struct {
  int         cnt;       /* number of formatted item names */
  int         len;       /* length of the description in the buffer */
  int        *offs;      /* prefix lengths in the output buffer */
  char       *buf;       /* output buffer */
  const char *names[1];  /* formatted item names */
} ISFMTR;
|
85
|
+
|
86
|
+
/* TRACT: a single transaction, i.e. a counted vector of item
 * identifiers.
 * NOTE(review): items is declared with a single element; presumably the
 * object is over-allocated to hold `cnt` identifiers -- confirm in
 * tract.c. */
typedef struct {
  int cnt;        /* number of items */
  int items[1];   /* vector of item identifiers */
} TRACT;
|
90
|
+
|
91
|
+
typedef struct { /* --- a transaction set --- */
|
92
|
+
ITEMSET *itemset; /* underlying item set */
|
93
|
+
int max; /* maximum number of items per t.a. */
|
94
|
+
int vsz; /* size of transaction vector */
|
95
|
+
int cnt; /* number of transactions */
|
96
|
+
int total; /* total number of items */
|
97
|
+
TRACT **tracts; /* transaction vector */
|
98
|
+
} TASET; /* (transaction set) */
|
99
|
+
|
100
|
+
/* TATREE: one node of a transaction tree.  In builds without ARCH64
 * the tat_child() macro of this header reads the child pointers from
 * the memory that directly follows the `size` entries of `items`. */
typedef struct _tatree {
  int cnt;        /* number of transactions */
  int max;        /* size of the largest transaction */
  int size;       /* node size (number of children) */
  int items[1];   /* next items in rep. transactions */
} TATREE;
|
106
|
+
|
107
|
+
/*----------------------------------------------------------------------
|
108
|
+
Item Set Functions
|
109
|
+
----------------------------------------------------------------------*/
|
110
|
+
extern ITEMSET* is_create (int cnt);
|
111
|
+
extern void is_delete (ITEMSET *iset);
|
112
|
+
extern TABSCAN* is_tabscan (ITEMSET *iset);
|
113
|
+
extern void is_chars (ITEMSET *iset, const char *blanks,
|
114
|
+
const char *fldseps,
|
115
|
+
const char *recseps,
|
116
|
+
const char *cominds);
|
117
|
+
|
118
|
+
extern int is_cnt (ITEMSET *iset);
|
119
|
+
extern int is_item (ITEMSET *iset, const char *name);
|
120
|
+
extern const char* is_name (ITEMSET *iset, int item);
|
121
|
+
|
122
|
+
extern int is_gettac (ITEMSET *iset);
|
123
|
+
extern int is_settac (ITEMSET *iset, int cnt);
|
124
|
+
extern int is_addtac (ITEMSET *iset, int cnt);
|
125
|
+
extern int is_getfrq (ITEMSET *iset, int item);
|
126
|
+
extern int is_setfrq (ITEMSET *iset, int item, int frq);
|
127
|
+
extern int is_addfrq (ITEMSET *iset, int item, int frq);
|
128
|
+
extern int is_getxfq (ITEMSET *iset, int item);
|
129
|
+
extern int is_setxfq (ITEMSET *iset, int item, int frq);
|
130
|
+
extern int is_getapp (ITEMSET *iset, int item);
|
131
|
+
extern int is_setapp (ITEMSET *iset, int item, int app);
|
132
|
+
|
133
|
+
extern int is_readapp (ITEMSET *iset, FILE *file);
|
134
|
+
extern int is_read (ITEMSET *iset, FILE *file);
|
135
|
+
|
136
|
+
extern int is_recode (ITEMSET *iset, int minfrq,
|
137
|
+
int dir, int *map);
|
138
|
+
extern void is_trunc (ITEMSET *iset, int cnt);
|
139
|
+
extern int is_filter (ITEMSET *iset, const char *marks);
|
140
|
+
extern int is_tsize (ITEMSET *iset);
|
141
|
+
extern int* is_tract (ITEMSET *iset);
|
142
|
+
|
143
|
+
/*----------------------------------------------------------------------
|
144
|
+
Item Set Evaluation Functions
|
145
|
+
----------------------------------------------------------------------*/
|
146
|
+
extern ISEVAL* ise_create (ITEMSET *iset, int tacnt);
|
147
|
+
extern void ise_delete (ISEVAL *eval);
|
148
|
+
extern double ise_eval (ISEVAL *eval, int *ids, int cnt, int pre,
|
149
|
+
double supp);
|
150
|
+
|
151
|
+
/*----------------------------------------------------------------------
|
152
|
+
Item Set Formatting Functions
|
153
|
+
----------------------------------------------------------------------*/
|
154
|
+
extern ISFMTR* isf_create (ITEMSET *iset, int scan);
|
155
|
+
extern void isf_delete (ISFMTR *fmt);
|
156
|
+
extern const char* isf_format (ISFMTR *fmt, int *ids, int cnt, int pre);
|
157
|
+
extern int isf_length (ISFMTR *fmt);
|
158
|
+
extern void isf_print (ISFMTR *fmt, FILE *out);
|
159
|
+
|
160
|
+
/*----------------------------------------------------------------------
|
161
|
+
Transaction Functions
|
162
|
+
----------------------------------------------------------------------*/
|
163
|
+
extern void ta_sort (int *items, int n);
|
164
|
+
extern int ta_unique (int *items, int n);
|
165
|
+
extern int ta_filter (int *items, int n, const char *marks);
|
166
|
+
|
167
|
+
/*----------------------------------------------------------------------
|
168
|
+
Transaction Set Functions
|
169
|
+
----------------------------------------------------------------------*/
|
170
|
+
extern TASET* tas_create (ITEMSET *itemset);
|
171
|
+
extern void tas_delete (TASET *taset, int delis);
|
172
|
+
extern ITEMSET* tas_itemset (TASET *taset);
|
173
|
+
|
174
|
+
extern int tas_cnt (TASET *taset);
|
175
|
+
extern int tas_add (TASET *taset, const int *items, int n);
|
176
|
+
extern int* tas_tract (TASET *taset, int index);
|
177
|
+
extern int tas_tsize (TASET *taset, int index);
|
178
|
+
extern int tas_total (TASET *taset);
|
179
|
+
|
180
|
+
extern void tas_recode (TASET *taset, int *map, int cnt);
|
181
|
+
extern int tas_filter (TASET *taset, const char *marks);
|
182
|
+
extern void tas_shuffle (TASET *taset, double randfn(void));
|
183
|
+
extern void tas_sort (TASET *taset, int heap);
|
184
|
+
extern int tas_occur (TASET *taset, const int *items, int n);
|
185
|
+
|
186
|
+
#ifndef NDEBUG
|
187
|
+
extern void tas_show (TASET *taset);
|
188
|
+
#endif
|
189
|
+
|
190
|
+
/*----------------------------------------------------------------------
|
191
|
+
Transaction Tree Functions
|
192
|
+
----------------------------------------------------------------------*/
|
193
|
+
extern TATREE* tat_create (TASET *taset, int heap);
|
194
|
+
extern void tat_delete (TATREE *tat);
|
195
|
+
extern int tat_cnt (TATREE *tat);
|
196
|
+
extern int tat_max (TATREE *tat);
|
197
|
+
extern int tat_size (TATREE *tat);
|
198
|
+
extern int* tat_items (TATREE *tat);
|
199
|
+
extern int tat_item (TATREE *tat, int index);
|
200
|
+
extern TATREE* tat_child (TATREE *tat, int index);
|
201
|
+
extern void tat_mark (TATREE *tat);
|
202
|
+
|
203
|
+
#ifndef NDEBUG
|
204
|
+
extern void tat_show (TATREE *tat);
|
205
|
+
#endif
|
206
|
+
|
207
|
+
/*----------------------------------------------------------------------
|
208
|
+
Preprocessor Definitions
|
209
|
+
----------------------------------------------------------------------*/
|
210
|
+
/* Macro implementations of the item set accessors declared above.
 * Every macro may evaluate its arguments more than once or not at all,
 * so arguments must be free of side effects. */

/* --- plain field access on the ITEMSET --- */
#define is_tabscan(s)      ((s)->tscan)
#define is_gettac(s)       ((s)->tac)
#define is_settac(s,n)     ((s)->tac  = (n))
#define is_addtac(s,n)     ((s)->tac += (n))
#define is_tsize(s)        ((s)->cnt)
#define is_tract(s)        ((s)->items)

/* --- operations forwarded to the name/identifier map --- */
#define is_cnt(s)          nim_cnt((s)->nimap)
#define is_name(s,i)       nim_name(nim_byid((s)->nimap, (i)))
#define is_trunc(s,n)      nim_trunc((s)->nimap, (n))

/* --- per-item fields: the map entry is accessed as an ITEM --- */
#define is_getfrq(s,i)     (((ITEM*)nim_byid((s)->nimap, (i)))->frq)
#define is_setfrq(s,i,f)   (((ITEM*)nim_byid((s)->nimap, (i)))->frq  = (f))
#define is_addfrq(s,i,f)   (((ITEM*)nim_byid((s)->nimap, (i)))->frq += (f))
#define is_getxfq(s,i)     (((ITEM*)nim_byid((s)->nimap, (i)))->xfq)
#define is_setxfq(s,i,f)   (((ITEM*)nim_byid((s)->nimap, (i)))->xfq  = (f))
#define is_getapp(s,i)     (((ITEM*)nim_byid((s)->nimap, (i)))->app)
#define is_setapp(s,i,a)   (((ITEM*)nim_byid((s)->nimap, (i)))->app  = (a))
|
229
|
+
|
230
|
+
/*--------------------------------------------------------------------*/
|
231
|
+
/* Macro implementations of the remaining accessors declared above.
 * Arguments may be evaluated more than once and must therefore be
 * free of side effects. */

/* --- item set evaluator: released with a plain free() --- */
#define ise_delete(e)      free(e)

/* --- item set formatter --- */
#define isf_length(f)      ((f)->len)
#define isf_print(f,o)     fwrite((f)->buf, sizeof(char), (f)->len, o)

/* --- transaction: sorting is delegated to v_intsort --- */
#define ta_sort(v,n)       v_intsort(v,n)

/* --- transaction set --- */
#define tas_itemset(s)     ((s)->itemset)
#define tas_cnt(s)         ((s)->cnt)
#define tas_max(s)         ((s)->max)
#define tas_total(s)       ((s)->total)
#define tas_tract(s,i)     ((s)->tracts[i]->items)
#define tas_tsize(s,i)     ((s)->tracts[i]->cnt)
#define tas_shuffle(s,f)   v_shuffle((s)->tracts, (s)->cnt, f)

/* --- transaction tree node --- */
#define tat_cnt(t)         ((t)->cnt)
#define tat_max(t)         ((t)->max)
#define tat_size(t)        ((t)->size)
#define tat_items(t)       ((t)->items)
#define tat_item(t,i)      ((t)->items[i])
#ifndef ARCH64  /* macro form only without ARCH64: children follow the items */
#define tat_child(t,i)     (((TATREE**)((t)->items +(t)->size))[i])
#endif
|
260
|
+
|
261
|
+
#endif
|
@@ -0,0 +1,757 @@
|
|
1
|
+
/*----------------------------------------------------------------------
|
2
|
+
File : apriori.c
|
3
|
+
Contents: apriori algorithm for finding association rules
|
4
|
+
Author : Christian Borgelt
|
5
|
+
History : 1996.02.14 file created
|
6
|
+
1996.07.26 output precision reduced
|
7
|
+
1996.11.22 options -b, -f, and -r added
|
8
|
+
1996.11.24 option -e added (add. evaluation measures)
|
9
|
+
1997.08.18 normalized chi^2 measure added
|
10
|
+
option -m (minimal rule length) added
|
11
|
+
1997.10.13 quiet version (no output to stdout or stderr)
|
12
|
+
1998.01.27 adapted to changed ist_create() function
|
13
|
+
1998.08.08 optional input file (item appearances) added
|
14
|
+
1998.09.02 several assertions added
|
15
|
+
1998.09.07 hyperedge mode (option -h) added
|
16
|
+
1998.12.08 output of absolute support (option -a) added
|
17
|
+
float changed to double
|
18
|
+
1998.12.09 conversion of names to a scannable form added
|
19
|
+
1999.02.05 long int changed to int
|
20
|
+
1999.02.09 input from stdin, output to stdout added
|
21
|
+
1999.08.09 bug in check of support parameter (<= 0) fixed
|
22
|
+
1999.11.05 rule evaluation measure EM_AIMP added
|
23
|
+
1999.11.08 output of add. rule eval. measure value added
|
24
|
+
2000.03.16 optional use of original rule support definition
|
25
|
+
2001.04.01 option -h replaced by option -t (target type)
|
26
|
+
2001.05.26 extended support output added (option -x)
|
27
|
+
2001.06.09 extended support output for item sets added
|
28
|
+
2001.08.15 module scan used for output formatting
|
29
|
+
2001.11.18 item and transaction functions made a module
|
30
|
+
2001.11.19 options -C, -l changed, option -y removed
|
31
|
+
2001.12.28 adapted to module tract, some improvements
|
32
|
+
2002.01.11 evaluation measures codes changed to letters
|
33
|
+
2002.02.10 option -q extended by a direction parameter
|
34
|
+
2002.02.11 memory usage minimization option added
|
35
|
+
2002.06.09 arbitrary supp./conf. formats made possible
|
36
|
+
2003.01.09 option -k (item separator) added
|
37
|
+
2003.01.14 check for empty transaction set added
|
38
|
+
2003.03.12 output of lift value (conf/prior) added
|
39
|
+
2003.07.17 item filtering w.r.t. usage added (option -u)
|
40
|
+
2003.07.17 sorting w.r.t. transaction size sum added
|
41
|
+
2003.07.18 maximal itemset filter added
|
42
|
+
2003.08.11 closed itemset filter added
|
43
|
+
2003.08.15 item filtering for transaction tree added
|
44
|
+
2003.08.16 parameter for transaction filtering added
|
45
|
+
2003.08.18 dynamic filtering decision based on times added
|
46
|
+
2003.08.21 option -j (heap sort for transactions) added
|
47
|
+
2003.09.22 meaning of option -j reversed (heapsort default)
|
48
|
+
2004.03.25 option -S added (maximal support of a set/rule)
|
49
|
+
2004.05.09 additional selection measure for sets added
|
50
|
+
2004.10.28 two unnecessary assignments removed
|
51
|
+
2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
|
52
|
+
2004.11.23 absolute/relative support output changed
|
53
|
+
2004.12.09 semantics of option -p changed
|
54
|
+
2005.01.25 bug in output of absolute/relative support fixed
|
55
|
+
2005.01.31 another bug in this output fixed
|
56
|
+
2005.06.20 use of flag for "no item sorting" corrected
|
57
|
+
2007.02.13 adapted to modified module tabscan
|
58
|
+
2008.03.13 additional hyperedge evaluation added
|
59
|
+
2008.03.24 additional target added (association groups)
|
60
|
+
----------------------------------------------------------------------*/
|
61
|
+
#include <stdio.h>
|
62
|
+
#include <stdlib.h>
|
63
|
+
#include <stdarg.h>
|
64
|
+
#include <string.h>
|
65
|
+
#include <limits.h>
|
66
|
+
#include <math.h>
|
67
|
+
#include <time.h>
|
68
|
+
#include <assert.h>
|
69
|
+
#include "apriori_wrapper.h"
|
70
|
+
#include "scan.h"
|
71
|
+
#include "tract.h"
|
72
|
+
#include "istree.h"
|
73
|
+
#ifdef STORAGE
|
74
|
+
#include "storage.h"
|
75
|
+
#endif
|
76
|
+
|
77
|
+
// #include "symbtab.h"
|
78
|
+
|
79
|
+
/*----------------------------------------------------------------------
|
80
|
+
Preprocessor Definitions
|
81
|
+
----------------------------------------------------------------------*/
|
82
|
+
#define PRGNAME "apriori"
|
83
|
+
#define DESCRIPTION "find association rules with the apriori algorithm"
|
84
|
+
#define VERSION "version 4.35 (2008.03.24) " \
|
85
|
+
"(c) 1996-2008 Christian Borgelt"
|
86
|
+
|
87
|
+
/* --- target types --- */
|
88
|
+
#define TT_SET 0 /* frequent item sets */
|
89
|
+
#define TT_CLSET 1 /* closed item sets */
|
90
|
+
#define TT_MFSET 2 /* maximal item sets */
|
91
|
+
#define TT_RULE 3 /* association rules */
|
92
|
+
#define TT_HEDGE 4 /* association hyperedges */
|
93
|
+
#define TT_GROUP 5 /* association groups */
|
94
|
+
|
95
|
+
/* --- error codes --- */
|
96
|
+
#define E_OPTION (-5) /* unknown option */
|
97
|
+
#define E_OPTARG (-6) /* missing option argument */
|
98
|
+
#define E_ARGCNT (-7) /* too few/many arguments */
|
99
|
+
#define E_STDIN (-8) /* double assignment of stdin */
|
100
|
+
#define E_TARGET (-9) /* invalid target type */
|
101
|
+
#define E_SUPP (-10) /* invalid support */
|
102
|
+
#define E_CONF (-11) /* invalid confidence */
|
103
|
+
#define E_MEASURE (-12) /* invalid evaluation measure */
|
104
|
+
#define E_RULELEN (-13) /* invalid rule length */
|
105
|
+
#define E_NOTAS (-14) /* no items or transactions */
|
106
|
+
#define E_NOFREQ (-15) /* no frequent items */
|
107
|
+
#define E_UNKNOWN (-21) /* unknown error */
|
108
|
+
|
109
|
+
/* MSG(x) wraps every progress/diagnostic output statement:
 *   QUIET defined        -> the statement is dropped entirely
 *   FFLUSH defined       -> the statement is emitted unchanged
 *   neither (default)    -> the statement is followed by fflush(stderr)
 * NOTE(review): going by its name, FFLUSH reads as if it should enable
 * the flush, yet defining it is what disables it -- confirm that this
 * is intended before relying on the macro name. */
#if defined(QUIET)
#define MSG(x)
#elif defined(FFLUSH)
#define MSG(x)        x
#else
#define MSG(x)        x, fflush(stderr)
#endif
|
118
|
+
|
119
|
+
#define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
|
120
|
+
#define RECCNT(s) (ts_reccnt(is_tabscan(s)) \
|
121
|
+
- ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0))
|
122
|
+
#define BUFFER(s) ts_buf(is_tabscan(s))
|
123
|
+
|
124
|
+
/*----------------------------------------------------------------------
|
125
|
+
Constants
|
126
|
+
----------------------------------------------------------------------*/
|
127
|
+
#ifndef QUIET                   /* tables are only needed for messages */

/* Display names of the target types, indexed by the TT_* code. */
static const char *ttypes[] = {
  "set",                        /* 0: TT_SET   */
  "set",                        /* 1: TT_CLSET */
  "set",                        /* 2: TT_MFSET */
  "rule",                       /* 3: TT_RULE  */
  "hyperedge",                  /* 4: TT_HEDGE */
  "group",                      /* 5: TT_GROUP */
};

/* printf-style error messages, indexed by the negated error code
 * (error() looks a message up as errmsgs[-code]). */
static const char *errmsgs[] = {
  "no error\n",                                  /*   0: E_NONE    */
  "not enough memory\n",                         /*  -1: E_NOMEM   */
  "cannot open file %s\n",                       /*  -2: E_FOPEN   */
  "read error on file %s\n",                     /*  -3: E_FREAD   */
  "write error on file %s\n",                    /*  -4: E_FWRITE  */
  "unknown option -%c\n",                        /*  -5: E_OPTION  */
  "missing option argument\n",                   /*  -6: E_OPTARG  */
  "wrong number of arguments\n",                 /*  -7: E_ARGCNT  */
  "double assignment of standard input\n",       /*  -8: E_STDIN   */
  "invalid target type '%c'\n",                  /*  -9: E_TARGET  */
  "invalid minimal support %g%%\n",              /* -10: E_SUPP    */
  "invalid minimal confidence %g%%\n",           /* -11: E_CONF    */
  "invalid additional evaluation measure %c\n",  /* -12: E_MEASURE */
  "invalid set size/rule length %d\n",           /* -13: E_RULELEN */
  "no items or transactions to work on\n",       /* -14: E_NOTAS   */
  "no frequent items\n",                         /* -15: E_NOFREQ  */
  "file %s, record %d: item expected\n",         /* -16: E_ITEMEXP */
  "file %s, record %d: duplicate item %s\n",     /* -17: E_DUPITEM */
  "file %s, record %d: "                         /* -18: E_APPEXP  */
    "appearance indicator expected\n",
  "file %s, record %d: "                         /* -19: E_UNKAPP  */
    "unknown appearance indicator %s\n",
  "file %s, record %d: too many fields\n",       /* -20: E_FLDCNT  */
  "unknown error\n"                              /* -21: E_UNKNOWN */
};

#endif  /* !QUIET */
|
166
|
+
|
167
|
+
/*----------------------------------------------------------------------
|
168
|
+
Global Variables
|
169
|
+
----------------------------------------------------------------------*/
|
170
|
+
#ifndef QUIET
|
171
|
+
static char *prgname; /* program name for error messages */
|
172
|
+
#endif
|
173
|
+
static ITEMSET *itemset = NULL; /* item set */
|
174
|
+
static TASET *taset = NULL; /* transaction set */
|
175
|
+
static TATREE *tatree = NULL; /* transaction tree */
|
176
|
+
static ISTREE *istree = NULL; /* item set tree */
|
177
|
+
static FILE *in = NULL; /* input file */
|
178
|
+
static FILE *out = NULL; /* output file */
|
179
|
+
|
180
|
+
/*----------------------------------------------------------------------
|
181
|
+
Main Functions
|
182
|
+
----------------------------------------------------------------------*/
|
183
|
+
|
184
|
+
/* help: list the additional evaluation measures (option -e#) on
 * standard output and terminate the process with exit status 0.
 * A single newline is written to stderr first so that the startup
 * message printed there is terminated.  With QUIET defined nothing is
 * printed, but the process still exits. */
void help (void)
{
  #ifndef QUIET
  fputc('\n', stderr);          /* end the startup message line */
  fputs("additional evaluation measures (option -e#)\n"
        "frequent item sets:\n"
        "d or 1: binary logarithm of support quotient\n"
        "association rules:\n"
        "d or 1: absolute confidence difference to prior\n"
        "q or 2: absolute difference of confidence quotient to 1\n"
        "a or 3: absolute difference of improvement value to 1\n"
        "i or 4: information difference to prior\n"
        "c or 5: normalized chi^2 measure\n"
        "p or 6: p-value computed from chi^2 measure\n", stdout);
  #endif
  exit(0);                      /* this function never returns */
}  /* help() */
|
201
|
+
|
202
|
+
/*--------------------------------------------------------------------*/
|
203
|
+
|
204
|
+
static void error (int code, ...)
|
205
|
+
{ /* --- print an error message */
|
206
|
+
#ifndef QUIET /* if not quiet version */
|
207
|
+
va_list args; /* list of variable arguments */
|
208
|
+
const char *msg; /* error message */
|
209
|
+
|
210
|
+
assert(prgname); /* check the program name */
|
211
|
+
if (code < E_UNKNOWN) code = E_UNKNOWN;
|
212
|
+
if (code < 0) { /* if to report an error, */
|
213
|
+
msg = errmsgs[-code]; /* get the error message */
|
214
|
+
if (!msg) msg = errmsgs[-E_UNKNOWN];
|
215
|
+
fprintf(stderr, "\n%s: ", prgname);
|
216
|
+
va_start(args, code); /* get variable arguments */
|
217
|
+
vfprintf(stderr, msg, args);/* print error message */
|
218
|
+
va_end(args); /* end argument evaluation */
|
219
|
+
}
|
220
|
+
#endif
|
221
|
+
#ifndef NDEBUG /* if debug version */
|
222
|
+
if (istree) ist_delete(istree); /* clean up memory */
|
223
|
+
if (tatree) tat_delete(tatree); /* and close files */
|
224
|
+
if (taset) tas_delete(taset, 0);
|
225
|
+
if (itemset) is_delete(itemset);
|
226
|
+
if (in && (in != stdin)) fclose(in);
|
227
|
+
if (out && (out != stdout)) fclose(out);
|
228
|
+
#endif
|
229
|
+
#ifdef STORAGE /* if storage debugging */
|
230
|
+
showmem("at end of program"); /* check memory usage */
|
231
|
+
#endif
|
232
|
+
exit(code); /* abort the program */
|
233
|
+
} /* error() */
|
234
|
+
|
235
|
+
/*--------------------------------------------------------------------*/
|
236
|
+
|
237
|
+
int do_apriori (int argc, char *argv[])
|
238
|
+
{ /* --- main function */
|
239
|
+
int i, k = 0, n; /* loop variables, counters */
|
240
|
+
char *s; /* to traverse the options */
|
241
|
+
char **optarg = NULL; /* option argument */
|
242
|
+
char *fn_in = NULL; /* name of input file */
|
243
|
+
char *fn_out = NULL; /* name of output file */
|
244
|
+
char *fn_app = NULL; /* name of item appearances file */
|
245
|
+
char *blanks = NULL; /* blanks */
|
246
|
+
char *fldseps = NULL; /* field separators */
|
247
|
+
char *recseps = NULL; /* record separators */
|
248
|
+
char *comment = NULL; /* comment indicators */
|
249
|
+
char *used = NULL; /* item usage vector */
|
250
|
+
double supp = 0.1; /* minimal support (in percent) */
|
251
|
+
double smax = 1.0; /* maximal support (in percent) */
|
252
|
+
double conf = 0.8; /* minimal confidence (in percent) */
|
253
|
+
int mode = IST_BODY; /* search mode (rule support def.) */
|
254
|
+
int target = 'r'; /* target type (sets/rules/h.edges) */
|
255
|
+
int arem = 0; /* additional rule evaluation measure */
|
256
|
+
int lift = 0; /* flag for printing the lift */
|
257
|
+
double minval = 0.1; /* minimal evaluation measure value */
|
258
|
+
double lftval = 0; /* lift value (confidence/prior) */
|
259
|
+
int minlen = 1; /* minimal rule length */
|
260
|
+
int maxlen = INT_MAX; /* maximal rule length */
|
261
|
+
int load = 1; /* flag for loading transactions */
|
262
|
+
int sort = 2; /* flag for item sorting and recoding */
|
263
|
+
double filter = 0.1; /* item usage filtering parameter */
|
264
|
+
int tree = 1; /* flag for transaction tree */
|
265
|
+
int heap = 1; /* flag for heap sort vs. quick sort */
|
266
|
+
int c2scf = 0; /* flag for conv. to scanable form */
|
267
|
+
char *sep = " "; /* item separator for output */
|
268
|
+
char *fmt = "%.1f"; /* output format for support/conf. */
|
269
|
+
int sout = 1; /* flag for abs./rel. support output */
|
270
|
+
int ext = 0; /* flag for extended support output */
|
271
|
+
int aval = 0; /* flag for add. eval. measure value */
|
272
|
+
int maxcnt = 0; /* maximal number of items per set */
|
273
|
+
int tacnt; /* number of transactions */
|
274
|
+
int frq; /* frequency of an item set */
|
275
|
+
int *map, *set; /* identifier map, item set */
|
276
|
+
int verbose = 0; /* flag for verboseness */
|
277
|
+
const char *name; /* buffer for item names */
|
278
|
+
static char buf[4*TS_SIZE+4]; /* buffer for formatting */
|
279
|
+
clock_t t, tt, tc, x; /* timer for measurements */
|
280
|
+
|
281
|
+
#ifndef QUIET /* if not quiet version */
|
282
|
+
prgname = argv[0]; /* get program name for error msgs. */
|
283
|
+
|
284
|
+
/* --- print usage message --- */
|
285
|
+
if (argc > 1) { /* if arguments are given */
|
286
|
+
fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
|
287
|
+
fprintf(stderr, VERSION); } /* print a startup message */
|
288
|
+
else { /* if no arguments given */
|
289
|
+
printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
|
290
|
+
printf("%s\n", DESCRIPTION);
|
291
|
+
printf("%s\n", VERSION);
|
292
|
+
printf("-t# target type (default: association rules)\n"
|
293
|
+
" (s: item sets, c: closed item sets,"
|
294
|
+
" m: maximal item sets,\n"
|
295
|
+
" r: association rules,"
|
296
|
+
" h: association hyperedges)\n");
|
297
|
+
printf("-m# minimal number of items per set/rule/hyperedge "
|
298
|
+
"(default: %d)\n", minlen);
|
299
|
+
printf("-n# maximal number of items per set/rule/hyperedge "
|
300
|
+
"(default: no limit)\n");
|
301
|
+
printf("-s# minimal support of a set/rule/hyperedge "
|
302
|
+
"(default: %g%%)\n", supp *100);
|
303
|
+
printf("-S# maximal support of a set/rule/hyperedge "
|
304
|
+
"(default: %g%%)\n", smax *100);
|
305
|
+
printf("-c# minimal confidence of a rule/hyperedge "
|
306
|
+
"(default: %g%%)\n", conf *100);
|
307
|
+
printf("-o use original definition of the support of a rule "
|
308
|
+
"(body & head)\n");
|
309
|
+
printf("-k# item separator for output "
|
310
|
+
"(default: \"%s\")\n", sep);
|
311
|
+
printf("-p# output format for support/confidence "
|
312
|
+
"(default: \"%s\")\n", fmt);
|
313
|
+
printf("-x extended support output "
|
314
|
+
"(print both rule support types)\n");
|
315
|
+
printf("-a print absolute support "
|
316
|
+
"(number of transactions)\n");
|
317
|
+
printf("-y print lift value (confidence divided by prior)\n");
|
318
|
+
printf("-e# additional evaluation measure (default: none)\n");
|
319
|
+
printf("-! print a list of additional evaluation measures\n");
|
320
|
+
printf("-d# minimal value of additional evaluation measure "
|
321
|
+
"(default: %g%%)\n", minval *100);
|
322
|
+
printf("-v print value of additional "
|
323
|
+
"rule evaluation measure\n");
|
324
|
+
printf("-g write output in scanable form "
|
325
|
+
"(quote certain characters)\n");
|
326
|
+
printf("-l do not load transactions into memory "
|
327
|
+
"(work on input file)\n");
|
328
|
+
printf("-q# sort items w.r.t. their frequency (default: %d)\n"
|
329
|
+
" (1: ascending, -1: descending, 0: do not sort,\n"
|
330
|
+
" 2: ascending, -2: descending w.r.t. "
|
331
|
+
"transaction size sum)\n", sort);
|
332
|
+
printf("-u# filter unused items from transactions "
|
333
|
+
"(default: %g)\n", filter);
|
334
|
+
printf(" (0: do not filter items w.r.t. usage in sets,\n"
|
335
|
+
" <0: fraction of removed items for filtering,\n"
|
336
|
+
" >0: take execution times ratio into account)\n");
|
337
|
+
printf("-h do not organize transactions as a prefix tree\n");
|
338
|
+
printf("-j use quicksort to sort the transactions "
|
339
|
+
"(default: heapsort)\n");
|
340
|
+
printf("-z minimize memory usage "
|
341
|
+
"(default: maximize speed)\n");
|
342
|
+
printf("-b/f/r# blank characters, field and record separators\n"
|
343
|
+
" (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
|
344
|
+
printf("-C# comment characters (default: \"#\")\n");
|
345
|
+
printf("-V verbose\n");
|
346
|
+
|
347
|
+
printf("infile file to read transactions from\n");
|
348
|
+
printf("outfile file to write item sets/association rules"
|
349
|
+
"/hyperedges to\n");
|
350
|
+
printf("appfile file stating item appearances (optional)\n");
|
351
|
+
return 0; /* print a usage message */
|
352
|
+
} /* and abort the program */
|
353
|
+
#endif /* #ifndef QUIET */
|
354
|
+
|
355
|
+
/* --- evaluate arguments --- */
|
356
|
+
for (i = 1; i < argc; i++) { /* traverse arguments */
|
357
|
+
s = argv[i]; /* get option argument */
|
358
|
+
if (optarg) { *optarg = s; optarg = NULL; continue; }
|
359
|
+
if ((*s == '-') && *++s) { /* -- if argument is an option */
|
360
|
+
while (*s) { /* traverse options */
|
361
|
+
switch (*s++) { /* evaluate switches */
|
362
|
+
case '!': help(); break;
|
363
|
+
case 't': target = (*s) ? *s++ : 'r'; break;
|
364
|
+
case 'm': minlen = (int)strtol(s, &s, 0); break;
|
365
|
+
case 'n': maxlen = (int)strtol(s, &s, 0); break;
|
366
|
+
case 's': supp = 0.01*strtod(s, &s); break;
|
367
|
+
case 'S': smax = 0.01*strtod(s, &s); break;
|
368
|
+
case 'c': conf = 0.01*strtod(s, &s); break;
|
369
|
+
case 'o': mode |= IST_BOTH; break;
|
370
|
+
case 'k': optarg = &sep; break;
|
371
|
+
case 'p': optarg = &fmt; break;
|
372
|
+
case 'x': ext = 1; break;
|
373
|
+
case 'a': sout |= 2; break;
|
374
|
+
case 'y': lift = 1; break;
|
375
|
+
case 'e': arem = (*s) ? *s++ : 0; break;
|
376
|
+
case 'd': minval = 0.01*strtod(s, &s); break;
|
377
|
+
case 'v': aval = 1; break;
|
378
|
+
case 'g': c2scf = 1; break;
|
379
|
+
case 'l': load = 0; break;
|
380
|
+
case 'q': sort = (int)strtol(s, &s, 0); break;
|
381
|
+
case 'u': filter = strtod(s, &s); break;
|
382
|
+
case 'h': tree = 0; break;
|
383
|
+
case 'j': heap = 0; break;
|
384
|
+
case 'z': mode |= IST_MEMOPT; break;
|
385
|
+
case 'b': optarg = &blanks; break;
|
386
|
+
case 'f': optarg = &fldseps; break;
|
387
|
+
case 'r': optarg = &recseps; break;
|
388
|
+
case 'C': optarg = &comment; break;
|
389
|
+
case 'V': verbose = 1; break;
|
390
|
+
default : error(E_OPTION, *--s); break;
|
391
|
+
} /* set option variables */
|
392
|
+
if (optarg && *s) { *optarg = s; optarg = NULL; break; }
|
393
|
+
} } /* get option argument */
|
394
|
+
else { /* -- if argument is no option */
|
395
|
+
switch (k++) { /* evaluate non-options */
|
396
|
+
case 0: fn_in = s; break;
|
397
|
+
case 1: fn_out = s; break;
|
398
|
+
case 2: fn_app = s; break;
|
399
|
+
default: error(E_ARGCNT); break;
|
400
|
+
} /* note filenames */
|
401
|
+
}
|
402
|
+
}
|
403
|
+
if (optarg) error(E_OPTARG); /* check option argument */
|
404
|
+
if ((k < 2) || (k > 3)) /* and the number of arguments */
|
405
|
+
error(E_ARGCNT); /* (either in/out or in/out/app) */
|
406
|
+
if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
|
407
|
+
error(E_STDIN); /* stdin must not be used twice */
|
408
|
+
switch (target) { /* check and translate target type */
|
409
|
+
case 's': target = TT_SET; break;
|
410
|
+
case 'c': target = TT_CLSET; break;
|
411
|
+
case 'm': target = TT_MFSET; break;
|
412
|
+
case 'r': target = TT_RULE; break;
|
413
|
+
case 'h': target = TT_HEDGE; break;
|
414
|
+
case 'g': target = TT_GROUP; break;
|
415
|
+
default : error(E_TARGET, (char)target); break;
|
416
|
+
}
|
417
|
+
if (supp > 1) /* check the minimal support */
|
418
|
+
error(E_SUPP, supp); /* (< 0: absolute number) */
|
419
|
+
if ((conf < 0) || (conf > 1))
|
420
|
+
error(E_CONF, conf); /* check the minimal confidence */
|
421
|
+
if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
|
422
|
+
if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
|
423
|
+
switch (arem) { /* check and translate measure */
|
424
|
+
case 0 : case '0': arem = EM_NONE; break;
|
425
|
+
case 'd': case '1': arem = EM_DIFF; break;
|
426
|
+
case 'q': case '2': arem = EM_QUOT; break;
|
427
|
+
case 'a': case '3': arem = EM_AIMP; break;
|
428
|
+
case 'i': case '4': arem = EM_INFO; break;
|
429
|
+
case 'c': case '5': arem = EM_CHI2; break;
|
430
|
+
case 'p': case '6': arem = EM_PVAL; break;
|
431
|
+
default : error(E_MEASURE, (char)arem); break;
|
432
|
+
}
|
433
|
+
if (target <= TT_MFSET) { /* in item set mode neutralize */
|
434
|
+
mode |= IST_BOTH; conf = 1;}/* rule specific settings */
|
435
|
+
if (arem == EM_NONE) /* if no add. rule eval. measure, */
|
436
|
+
aval = 0; /* clear the corresp. output flag */
|
437
|
+
if ((filter <= -1) || (filter >= 1)) filter = 0;
|
438
|
+
|
439
|
+
/* --- create item set and transaction set --- */
|
440
|
+
itemset = is_create(-1); /* create an item set and */
|
441
|
+
if (!itemset) error(E_NOMEM); /* set the special characters */
|
442
|
+
is_chars(itemset, blanks, fldseps, recseps, comment);
|
443
|
+
if (load) { /* if to load the transactions */
|
444
|
+
taset = tas_create(itemset);
|
445
|
+
if (!taset) error(E_NOMEM); /* create a transaction set */
|
446
|
+
} /* to store the transactions */
|
447
|
+
MSG(fprintf(stderr, "\n")); /* terminate the startup message */
|
448
|
+
|
449
|
+
/* --- read item appearances --- */
|
450
|
+
if (fn_app) { /* if item appearances are given */
|
451
|
+
t = clock(); /* start the timer */
|
452
|
+
if (*fn_app) /* if an app. file name is given, */
|
453
|
+
in = fopen(fn_app, "r"); /* open the item appearances file */
|
454
|
+
else { /* if no app. file name is given, */
|
455
|
+
in = stdin; fn_app = "<stdin>"; } /* read from std. input */
|
456
|
+
MSG(fprintf(stderr, "reading %s ... ", fn_app));
|
457
|
+
if (!in) error(E_FOPEN, fn_app);
|
458
|
+
k = is_readapp(itemset,in); /* read the item appearances */
|
459
|
+
if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
|
460
|
+
if (in != stdin) /* if not read from standard input, */
|
461
|
+
fclose(in); /* close the input file */
|
462
|
+
MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
|
463
|
+
MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
|
464
|
+
} /* print a log message */
|
465
|
+
|
466
|
+
/* --- read transactions --- */
|
467
|
+
t = clock(); /* start the timer */
|
468
|
+
if (fn_in && *fn_in) /* if an input file name is given, */
|
469
|
+
in = fopen(fn_in, "r"); /* open input file for reading */
|
470
|
+
else { /* if no input file name is given, */
|
471
|
+
in = stdin; fn_in = "<stdin>"; } /* read from standard input */
|
472
|
+
MSG(fprintf(stderr, "reading %s ... \n", fn_in));
|
473
|
+
if (!in) error(E_FOPEN, fn_in);
|
474
|
+
while (1) { /* transaction read loop */
|
475
|
+
k = is_read(itemset, in); /* read the next transaction */
|
476
|
+
if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
|
477
|
+
if (k > 0) break; /* check for error and end of file */
|
478
|
+
k = is_tsize(itemset); /* update the maximal */
|
479
|
+
if (k > maxcnt) maxcnt = k; /* transaction size */
|
480
|
+
if (taset && (tas_add(taset, NULL, 0) != 0))
|
481
|
+
error(E_NOMEM); /* add the loaded transaction */
|
482
|
+
} /* to the transaction set */
|
483
|
+
if (taset) { /* if transactions have been loaded */
|
484
|
+
if (in != stdin) fclose(in);/* if not read from standard input, */
|
485
|
+
in = NULL; /* close the input file */
|
486
|
+
} /* clear the file variable */
|
487
|
+
n = is_cnt(itemset); /* get the number of items */
|
488
|
+
tacnt = is_gettac(itemset); /* and the number of transactions */
|
489
|
+
MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
|
490
|
+
MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
|
491
|
+
if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
|
492
|
+
MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
|
493
|
+
if (supp >= 0) /* if relative support is given */
|
494
|
+
supp = ceil(tacnt *supp); /* compute absolute support */
|
495
|
+
else { /* if absolute support is given, */
|
496
|
+
supp = ceil(-100 *supp); /* make the support value positive */
|
497
|
+
if (!(sout & 2)) sout = 2; /* switch to absolute support output */
|
498
|
+
} /* do the same with the max. support */
|
499
|
+
smax = floor(((smax >= 0) ? tacnt : -100) *smax);
|
500
|
+
|
501
|
+
/* --- sort and recode items --- */
|
502
|
+
MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
|
503
|
+
t = clock(); /* start the timer */
|
504
|
+
map = (int*)malloc(is_cnt(itemset) *sizeof(int));
|
505
|
+
if (!map) error(E_NOMEM); /* create an item identifier map */
|
506
|
+
k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
|
507
|
+
n = is_recode(itemset, k, sort, map);
|
508
|
+
if (taset) { /* sort and recode the items and */
|
509
|
+
tas_recode(taset, map,n); /* recode the loaded transactions */
|
510
|
+
maxcnt = tas_max(taset); /* get the new maximal t.a. size */
|
511
|
+
} /* (may be smaller than before) */
|
512
|
+
free(map); /* delete the item identifier map */
|
513
|
+
MSG(fprintf(stderr, "[%d item(s)] ", n));
|
514
|
+
MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
|
515
|
+
if (n <= 0) error(E_NOFREQ); /* print a log message and */
|
516
|
+
MSG(fprintf(stderr, "\n")); /* check the number of items */
|
517
|
+
if (maxlen > maxcnt) /* clamp the set/rule length */
|
518
|
+
maxlen = maxcnt; /* to the maximum set size */
|
519
|
+
|
520
|
+
/* --- create a transaction tree --- */
|
521
|
+
tt = 0; /* init. the tree construction time */
|
522
|
+
if (tree && taset) { /* if transactions were loaded */
|
523
|
+
MSG(fprintf(stderr, "creating transaction tree ... "));
|
524
|
+
t = clock(); /* start the timer */
|
525
|
+
tatree = tat_create(taset, heap);
|
526
|
+
if (!tatree) error(E_NOMEM);/* create a transaction tree */
|
527
|
+
if (filter == 0) { /* if a tree rebuild is not needed, */
|
528
|
+
tas_delete(taset, 0); taset = NULL; } /* delete transactions */
|
529
|
+
tt = clock() -t; /* note the time for the construction */
|
530
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
531
|
+
} /* print a log message */
|
532
|
+
|
533
|
+
/* --- create an item set tree --- */
|
534
|
+
t = clock(); tc = 0; /* start the timer */
|
535
|
+
istree = ist_create(itemset, mode, (int)supp, conf);
|
536
|
+
if (!istree) error(E_NOMEM); /* create an item set tree */
|
537
|
+
|
538
|
+
/* --- check item subsets --- */
|
539
|
+
if (filter) { /* if to filter unused items */
|
540
|
+
used = (char*)malloc(is_cnt(itemset) *sizeof(char));
|
541
|
+
if (!used) error(E_NOMEM); /* create a flag vector */
|
542
|
+
} /* for the items */
|
543
|
+
MSG(fprintf(stderr, "checking subsets of size 1"));
|
544
|
+
while (ist_height(istree) < maxlen) {
|
545
|
+
if (filter != 0) { /* if to filter w.r.t. item usage, */
|
546
|
+
i = ist_check(istree, used); /* check current item usage */
|
547
|
+
if (i < maxlen) maxlen = i; /* update the maximum size */
|
548
|
+
if (ist_height(istree) >= i) break;
|
549
|
+
} /* check the tree height */
|
550
|
+
k = ist_addlvl(istree); /* while max. height is not reached, */
|
551
|
+
if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
|
552
|
+
if (k != 0) break; /* if no level was added, abort */
|
553
|
+
MSG(fprintf(stderr, " %d", ist_height(istree)));
|
554
|
+
if (tatree) { /* if a transaction tree was created */
|
555
|
+
if (((filter < 0) /* if to filter w.r.t. item usage */
|
556
|
+
&& (i < -filter *n)) /* and enough items were removed */
|
557
|
+
|| ((filter > 0) /* or counting time is long enough */
|
558
|
+
&& (i < n) && (i *(double)tt < filter *n *tc))) {
|
559
|
+
n = i; x = clock(); /* note the new number of items */
|
560
|
+
tas_filter(taset, used);/* and remove unnecessary items */
|
561
|
+
tat_delete(tatree); /* delete the transaction tree */
|
562
|
+
tatree = tat_create(taset, heap);
|
563
|
+
if (!tatree) error(E_NOMEM);
|
564
|
+
tt = clock() -x; /* rebuild the transaction tree and */
|
565
|
+
} /* note the new construction time */
|
566
|
+
x = clock(); /* count the transaction tree */
|
567
|
+
ist_countx(istree, tatree);
|
568
|
+
tc = clock() -x; } /* note the new count time */
|
569
|
+
else if (taset) { /* if transactions were loaded */
|
570
|
+
if (((filter < 0) /* if to filter w.r.t. item usage */
|
571
|
+
&& (i <= -filter *n)) /* and enough items were removed */
|
572
|
+
|| ((filter > 0) /* or counting time is long enough */
|
573
|
+
&& (i *(double)tt <= filter *n *tc))) {
|
574
|
+
n = i; x = clock(); /* note the new number of items */
|
575
|
+
tas_filter(taset, used);/* and remove unnecessary items */
|
576
|
+
tt = clock() -t; /* from the transactions */
|
577
|
+
} /* note the filtering time */
|
578
|
+
for (i = tacnt; --i >= 0;)/* traverse and count transactions */
|
579
|
+
ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
|
580
|
+
tc = clock() -t; } /* note the new count time */
|
581
|
+
else { /* if to work on the input file, */
|
582
|
+
rewind(in); /* reset the file position */
|
583
|
+
for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
|
584
|
+
if (filter != 0) /* (re)read the transactions and */
|
585
|
+
is_filter(itemset, used); /* remove unnecessary items */
|
586
|
+
k = is_tsize(itemset); /* update the maximum size */
|
587
|
+
if (k > maxcnt) maxcnt = k; /* of a transaction */
|
588
|
+
ist_count(istree, is_tract(itemset), k);
|
589
|
+
} /* count the transaction in the tree */
|
590
|
+
if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
|
591
|
+
if (maxcnt < maxlen) /* update the maximal rule length */
|
592
|
+
maxlen = maxcnt; /* according to the max. t.a. size */
|
593
|
+
} /* (may be smaller than before) */
|
594
|
+
}
|
595
|
+
if (!taset && !tatree) { /* if transactions were not loaded */
|
596
|
+
if (in != stdin) fclose(in);/* if not read from standard input, */
|
597
|
+
in = NULL; /* close the input file */
|
598
|
+
} /* clear the file variable */
|
599
|
+
MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
|
600
|
+
|
601
|
+
/* --- filter found item sets --- */
|
602
|
+
if ((target == TT_CLSET) || (target == TT_MFSET)) {
|
603
|
+
MSG(fprintf(stderr, "filtering %s item sets ... ",
|
604
|
+
(target == TT_MFSET) ? "maximal" : "closed"));
|
605
|
+
t = clock(); /* filter the item sets */
|
606
|
+
ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
|
607
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
608
|
+
} /* (filter takes longer than print) */
|
609
|
+
|
610
|
+
/* --- sort transactions --- */
|
611
|
+
if (target <= TT_MFSET) { /* if to find frequent item sets */
|
612
|
+
if (!taset) /* transactions must be loaded */
|
613
|
+
ext = 0; /* for extended support output */
|
614
|
+
else if (ext) { /* if extended output is requested */
|
615
|
+
MSG(fprintf(stderr, "sorting transactions ... "));
|
616
|
+
t = clock(); /* start the timer */
|
617
|
+
tas_sort(taset, heap); /* sort the transactions */
|
618
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
619
|
+
} /* (sorting is necessary to find the */
|
620
|
+
} /* number of identical transactions) */
|
621
|
+
|
622
|
+
/* --- print item sets/rules/hyperedges --- */
|
623
|
+
t = clock(); /* start the timer */
|
624
|
+
if (fn_out && *fn_out) /* if an output file name is given, */
|
625
|
+
out = fopen(fn_out, "w"); /* open the output file */
|
626
|
+
else { /* if no output file name is given, */
|
627
|
+
out = stdout; fn_out = "<stdout>"; } /* write to std. output */
|
628
|
+
MSG(fprintf(stderr, "writing %s ... ", fn_out));
|
629
|
+
if (!out) error(E_FOPEN, fn_out);
|
630
|
+
ist_init(istree, minlen, arem, minval);
|
631
|
+
set = is_tract(itemset); /* get the transaction buffer */
|
632
|
+
if (target <= TT_MFSET) { /* if to find frequent item sets */
|
633
|
+
for (n = 0; 1; ) { /* extract item sets from the tree */
|
634
|
+
k = ist_set(istree, set, &frq, &conf);
|
635
|
+
if (k <= 0) break; /* get the next frequent item set */
|
636
|
+
if (frq > smax) continue; /* check against maximal support */
|
637
|
+
for (i = 0; i < k; i++) { /* traverse the set's items */
|
638
|
+
name = is_name(itemset, set[i]);
|
639
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
640
|
+
fputs(name, out); /* print the name of the next item */
|
641
|
+
fputs((i < k-1) ? sep : " ", out);
|
642
|
+
} /* print a separator */
|
643
|
+
fputs(" (", out); /* print the item set's support */
|
644
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
645
|
+
if (sout & 2) fputc('/', out); }
|
646
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
647
|
+
if (ext) { /* if to print the extended support */
|
648
|
+
frq = tas_occur(taset, set, k);
|
649
|
+
fputs(", ", out); /* get the number of occurrences */
|
650
|
+
fprintf(out, fmt, (frq/(double)tacnt) *100);
|
651
|
+
if (sout & 2) fprintf(out, "/%d", frq);
|
652
|
+
} /* print the extended support data */
|
653
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
|
654
|
+
fputs(")\n", out); /* print the add. eval. measure, */
|
655
|
+
n++; /* terminate the support output, */
|
656
|
+
} } /* and count the item set */
|
657
|
+
else if (target == TT_RULE) { /* if to find association rules, */
|
658
|
+
for (n = 0; 1; ) { /* extract rules from tree */
|
659
|
+
k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
|
660
|
+
if (k <= 0) break; /* get the next association rule */
|
661
|
+
if (frq > smax) continue; /* check against maximal support */
|
662
|
+
for (i = 0; i < k; i++) { /* traverse the rule's items */
|
663
|
+
name = is_name(itemset, set[i]);
|
664
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
665
|
+
fputs(name, out); /* print the next item */
|
666
|
+
fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
|
667
|
+
} /* print a separator */
|
668
|
+
fputs(" (", out); /* print the rule evaluation */
|
669
|
+
if (sout & 1) supp = frq/(double)tacnt;
|
670
|
+
if (ext && !(mode & IST_HEAD)) {
|
671
|
+
if (sout & 1) { fprintf(out, fmt, supp *conf *100);
|
672
|
+
if (sout & 2) fputc('/', out); }
|
673
|
+
if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
|
674
|
+
fputs(", ", out); /* print the support of the rule */
|
675
|
+
} /* from the support of the body */
|
676
|
+
if (sout & 1) { fprintf(out, fmt, supp *100);
|
677
|
+
if (sout & 2) fputc('/', out); }
|
678
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
679
|
+
fputs(", ", out); /* print the rule support */
|
680
|
+
if (ext && (mode & IST_HEAD)) {
|
681
|
+
if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
|
682
|
+
if (sout & 2) fputc('/', out); }
|
683
|
+
if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
|
684
|
+
fputs(", ", out); /* print the support of the body */
|
685
|
+
} /* from the support of the rule */
|
686
|
+
fprintf(out, fmt, conf *100); /* print the rule confidence */
|
687
|
+
if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
|
688
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
689
|
+
fputs(")\n", out); /* print the value of the additional */
|
690
|
+
n++; /* rule evaluation measure and */
|
691
|
+
} } /* count the association rule */
|
692
|
+
else if (target == TT_HEDGE){ /* if to find association hyperedges */
|
693
|
+
for (n = 0; 1; ) { /* extract hyperedges from tree */
|
694
|
+
k = ist_hedge(istree, set, &frq, &conf, &minval);
|
695
|
+
if (k <= 0) break; /* get the next hyperedge */
|
696
|
+
if (frq > smax) continue; /* check against maximal support */
|
697
|
+
for (i = 0; i < k; i++) { /* traverse the edge's items */
|
698
|
+
name = is_name(itemset, set[i]);
|
699
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
700
|
+
fputs(name, out); /* print the name of the next item */
|
701
|
+
fputs((i < k-1) ? sep : " ", out);
|
702
|
+
} /* print a separator */
|
703
|
+
fputs(" (", out); /* print the hyperedge evaluation */
|
704
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
705
|
+
if (sout & 2) fputc('/', out); }
|
706
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
707
|
+
fputs(", ", out); fprintf(out, fmt, conf *100);
|
708
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
709
|
+
fputs(")\n", out); /* print support and confidence */
|
710
|
+
n++; /* of the hyperedge and */
|
711
|
+
} } /* count the hyperedge */
|
712
|
+
else { /* if to find association groups */
|
713
|
+
for (n = 0; 1; ) { /* extract groups from tree */
|
714
|
+
k = ist_group(istree, set, &frq, &minval);
|
715
|
+
if (k <= 0) break; /* get the next group */
|
716
|
+
if (frq > smax) continue; /* check against maximal support */
|
717
|
+
for (i = 0; i < k; i++) { /* traverse the group's items */
|
718
|
+
name = is_name(itemset, set[i]);
|
719
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
720
|
+
fputs(name, out); /* print the name of the next item */
|
721
|
+
fputs((i < k-1) ? sep : " ", out);
|
722
|
+
} /* print a separator */
|
723
|
+
fputs(" (", out); /* print the group evaluation */
|
724
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
725
|
+
if (sout & 2) fputc('/', out); }
|
726
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
727
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
728
|
+
fputs(")\n", out); /* print support and add. measure */
|
729
|
+
n++; /* and count the group */
|
730
|
+
}
|
731
|
+
} /* if (target <= TT_MFSET) .. else .. */
|
732
|
+
if (fflush(out) != 0) error(E_FWRITE, fn_out);
|
733
|
+
if (out != stdout) fclose(out);
|
734
|
+
out = NULL; /* close the output file */
|
735
|
+
MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
|
736
|
+
MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
|
737
|
+
#ifdef BENCH
|
738
|
+
printf("number of support counters: %d\n", istree->sccnt);
|
739
|
+
printf("necessary support counters: %d\n", istree->scnec);
|
740
|
+
printf("number of child pointers : %d\n", istree->cpcnt);
|
741
|
+
printf("necessary child pointers : %d\n", istree->cpnec);
|
742
|
+
printf("allocated memory (bytes) : %d\n", istree->bytes);
|
743
|
+
#endif
|
744
|
+
|
745
|
+
/* --- clean up --- */
|
746
|
+
#ifndef NDEBUG /* if this is a debug version */
|
747
|
+
free(used); /* delete the item app. vector */
|
748
|
+
ist_delete(istree); /* delete the item set tree, */
|
749
|
+
if (tatree) tat_delete(tatree); /* the transaction tree, */
|
750
|
+
if (taset) tas_delete(taset, 0); /* the transaction set, */
|
751
|
+
is_delete(itemset); /* and the item set */
|
752
|
+
#endif
|
753
|
+
#ifdef STORAGE /* if storage debugging */
|
754
|
+
showmem("at end of program"); /* check memory usage */
|
755
|
+
#endif
|
756
|
+
return 0; /* return 'ok' */
|
757
|
+
} /* main() */
|