apriori 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +16 -0
- data/License.txt +20 -0
- data/Manifest.txt +121 -0
- data/README.txt +149 -0
- data/Rakefile +15 -0
- data/TODO.txt +60 -0
- data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
- data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
- data/attic/c_ext_test1/mytest.rb +10 -0
- data/attic/test.c +12 -0
- data/config/hoe.rb +81 -0
- data/config/requirements.rb +29 -0
- data/examples/01_simple_example.rb +32 -0
- data/examples/02_small_file_example.rb +17 -0
- data/examples/03_large_file_example.rb +22 -0
- data/examples/test_data/market_basket_basic_test.dat +9 -0
- data/ext/Apriori.c +149 -0
- data/ext/Makefile +149 -0
- data/ext/apriori/doc/apriori.html +1301 -0
- data/ext/apriori/doc/arem.gp +68 -0
- data/ext/apriori/doc/c_rev.gp +89 -0
- data/ext/apriori/doc/chi2.tex +156 -0
- data/ext/apriori/doc/copying +504 -0
- data/ext/apriori/doc/line.gif +0 -0
- data/ext/apriori/doc/uparrow.gif +0 -0
- data/ext/apriori/ex/flg2set +15 -0
- data/ext/apriori/ex/hdr2set +13 -0
- data/ext/apriori/ex/readme +71 -0
- data/ext/apriori/ex/row2set +7 -0
- data/ext/apriori/ex/rulesort +24 -0
- data/ext/apriori/ex/tab2set +9 -0
- data/ext/apriori/ex/test.app +2 -0
- data/ext/apriori/ex/test.rul +9 -0
- data/ext/apriori/ex/test1.rul +43 -0
- data/ext/apriori/ex/test1.tab +10 -0
- data/ext/apriori/ex/test2.tab +10 -0
- data/ext/apriori/ex/test3.tab +30 -0
- data/ext/apriori/ex/test4.tab +11 -0
- data/ext/apriori/ex/test5.tab +39 -0
- data/ext/apriori/ex/tid2set +23 -0
- data/ext/apriori/ex/xhdr2set +33 -0
- data/ext/apriori/src/apriori.c +750 -0
- data/ext/apriori/src/apriori.dsp +120 -0
- data/ext/apriori/src/apriori.dsw +29 -0
- data/ext/apriori/src/apriori.mak +99 -0
- data/ext/apriori/src/istree.c +1411 -0
- data/ext/apriori/src/istree.h +160 -0
- data/ext/apriori/src/makefile +105 -0
- data/ext/apriori/src/tract.c +870 -0
- data/ext/apriori/src/tract.h +261 -0
- data/ext/apriori_wrapper.c +757 -0
- data/ext/apriori_wrapper.h +10 -0
- data/ext/extconf.rb +32 -0
- data/ext/math/doc/copying +504 -0
- data/ext/math/src/chi2.c +151 -0
- data/ext/math/src/chi2.h +27 -0
- data/ext/math/src/choose.c +71 -0
- data/ext/math/src/choose.h +16 -0
- data/ext/math/src/gamma.c +446 -0
- data/ext/math/src/gamma.h +39 -0
- data/ext/math/src/intexp.c +35 -0
- data/ext/math/src/intexp.h +15 -0
- data/ext/math/src/makefile +164 -0
- data/ext/math/src/math.mak +48 -0
- data/ext/math/src/normal.c +387 -0
- data/ext/math/src/normal.h +44 -0
- data/ext/math/src/radfn.c +113 -0
- data/ext/math/src/radfn.h +34 -0
- data/ext/math/src/zeta.c +49 -0
- data/ext/math/src/zeta.h +15 -0
- data/ext/pre-clean.rb +8 -0
- data/ext/pre-setup.rb +9 -0
- data/ext/util/doc/copying +504 -0
- data/ext/util/src/listops.c +76 -0
- data/ext/util/src/listops.h +26 -0
- data/ext/util/src/makefile +103 -0
- data/ext/util/src/memsys.c +84 -0
- data/ext/util/src/memsys.h +42 -0
- data/ext/util/src/nstats.c +288 -0
- data/ext/util/src/nstats.h +69 -0
- data/ext/util/src/params.c +86 -0
- data/ext/util/src/params.h +19 -0
- data/ext/util/src/parse.c +133 -0
- data/ext/util/src/parse.h +81 -0
- data/ext/util/src/scan.c +767 -0
- data/ext/util/src/scan.h +111 -0
- data/ext/util/src/symtab.c +443 -0
- data/ext/util/src/symtab.h +121 -0
- data/ext/util/src/tabscan.c +279 -0
- data/ext/util/src/tabscan.h +99 -0
- data/ext/util/src/util.mak +91 -0
- data/ext/util/src/vecops.c +317 -0
- data/ext/util/src/vecops.h +42 -0
- data/lib/apriori.rb +133 -0
- data/lib/apriori/adapter.rb +13 -0
- data/lib/apriori/association_rule.rb +89 -0
- data/lib/apriori/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/apriori.rake +20 -0
- data/tasks/attic.rake +28 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/install.rake +13 -0
- data/tasks/website.rake +17 -0
- data/test/apriori_test.rb +13 -0
- data/test/fixtures/market_basket_results_test.txt +5 -0
- data/test/fixtures/market_basket_string_test.txt +7 -0
- data/test/fixtures/results.txt +2 -0
- data/test/fixtures/sample.txt +7 -0
- data/test/test_helper.rb +5 -0
- data/test/unit/test_apriori.rb +68 -0
- data/test/unit/test_itemsets_and_parsing.rb +82 -0
- data/website/index.html +248 -0
- data/website/index.txt +152 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +142 -0
- data/website/template.html.erb +49 -0
- metadata +226 -0
Binary file
|
Binary file
|
@@ -0,0 +1,71 @@
|
|
1
|
+
The example files in this directory demonstrate how to use the
|
2
|
+
options -b, -f, and -r and the optional item appearances file.
|
3
|
+
This file also explains the conversion scripts, which can convert
|
4
|
+
different input formats into the format needed by the apriori program.
|
5
|
+
|
6
|
+
In the file test1.tab transactions are separated by newline characters
|
7
|
+
and the items of a transaction are separated by spaces. This is the
|
8
|
+
standard input format and hence the file can be processed directly:
|
9
|
+
apriori test1.tab test1.rul
|
10
|
+
|
11
|
+
In the file test2.tab the same transactions can be found, but several
|
12
|
+
different field separators are used. This file can be processed with:
|
13
|
+
apriori -f ",.;:" -l test2.tab test2.rul
|
14
|
+
|
15
|
+
The files test3.tab to test5.tab are in formats that cannot be
|
16
|
+
processed directly with the apriori program, but which may be common.
|
17
|
+
|
18
|
+
In the file test3.tab each line contains a transaction identifier and
|
19
|
+
an item, separated by a space. This file can be converted into the
|
20
|
+
standard input format with the script tid2set, i.e., with
|
21
|
+
tid2set test3.tab x.tab
|
22
|
+
Note, however, that the input file (here: test3.tab) must be sorted
|
23
|
+
w.r.t. the transaction identifier, so that items belonging to the
|
24
|
+
same transaction occupy consecutive lines/records.
|
25
|
+
|
26
|
+
In the file test4.tab the first line states the item names and the
|
27
|
+
following lines contain flags T (true) and F (false) depending on
|
28
|
+
whether the item is contained in the transaction represented by the
|
29
|
+
line or not. This format can be converted into the standard input
|
30
|
+
format with the script flg2set, i.e., with
|
31
|
+
flg2set test4.tab x.tab
|
32
|
+
|
33
|
+
In the file test5.tab there is one item per line and transactions
|
34
|
+
are separated by blank lines. This format can be converted into the
|
35
|
+
standard input format with the script row2set, i.e., with
|
36
|
+
row2set test5.tab x.tab
|
37
|
+
|
38
|
+
The additional scripts tab2set and hdr2set convert tables with column
|
39
|
+
numbers or column names into a format appropriate for the apriori
|
40
|
+
program. They are invoked in the same way as all other scripts
|
41
|
+
discussed above, i.e., with
|
42
|
+
tab2set a.tab b.tab
|
43
|
+
or
|
44
|
+
hdr2set a.tab b.tab
|
45
|
+
where a.tab is the name of the input file and b.tab the name of the
|
46
|
+
output file. The script tab2set replaces each table entry "x" of the
|
47
|
+
input file by "Xi=x", where i is the column number (starting with 1).
|
48
|
+
The script hdr2set reads the variable names from the first line of
|
49
|
+
the input file and then replaces each table entry "x" by "X=x", where
|
50
|
+
"X" is the variable name that was found in the corresponding column
|
51
|
+
of the first line. These scripts are handy if you want to process
|
52
|
+
tabular data by treating each table row as a transaction.
|
53
|
+
|
54
|
+
The file test.app demonstrates the use of item appearance indicators.
|
55
|
+
The first line of this file ('body') states that any item not explicitly
|
56
|
+
mentioned in this file may appear only in the body of a rule. The second
|
57
|
+
line says that item 2 may appear only in the head of a rule. Hence, by
|
58
|
+
processing the file test1.tab with
|
59
|
+
apriori test1.tab test.rul test.app
|
60
|
+
only rules with item 2 in the head are generated.
|
61
|
+
|
62
|
+
Note that any input may also be read from standard input and any output
|
63
|
+
may be sent to standard output, simply by specifying a '-' or an empty
|
64
|
+
string "" instead of a filename. For example
|
65
|
+
apriori test1.tab -
|
66
|
+
writes the rules directly to the terminal. They may be piped to any
|
67
|
+
other program, since all other messages of the apriori program are
|
68
|
+
written to standard error.
|
69
|
+
|
70
|
+
Enjoy,
|
71
|
+
Christian Borgelt
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
#-----------------------------------------------------------------------
|
3
|
+
# File : rulesort
|
4
|
+
# Contents: sort output of apriori
|
5
|
+
# Author : Christian Borgelt
|
6
|
+
# History : ??.??.1996 file created
|
7
|
+
# 27.02.1997 default settings moved to default case
|
8
|
+
# 26.03.2003 adapted to current apriori version
|
9
|
+
#-----------------------------------------------------------------------
|
10
|
+
case $1 in
|
11
|
+
'-1')
|
12
|
+
cmd='s/\(.*(\)\([0-9]*\.[0-9]*\)\(%[,/].*\)/\2#\1\2\3/'
|
13
|
+
sopt='-n -r'
|
14
|
+
shift;;
|
15
|
+
'-2')
|
16
|
+
cmd='s/\(.*(.*[,/] \)\([0-9]*\.[0-9]*\)\(%.*\)/\2#\1\2\3/'
|
17
|
+
sopt='-n -r'
|
18
|
+
shift;;
|
19
|
+
*)  # catch-all arm: a bare word such as "default" is a literal pattern and would only match an argument spelled "default"
|
20
|
+
cmd=''
|
21
|
+
sopt='-d'
|
22
|
+
;;
|
23
|
+
esac
|
24
|
+
sed "$cmd" $1 | sort $sopt | sed 's/^.*#//' > $2
|
@@ -0,0 +1,43 @@
|
|
1
|
+
5 <- (100.0, 30.0)
|
2
|
+
1 <- (100.0, 60.0)
|
3
|
+
2 <- (100.0, 70.0)
|
4
|
+
3 <- (100.0, 70.0)
|
5
|
+
4 <- (100.0, 70.0)
|
6
|
+
1 <- 5 (30.0, 33.3)
|
7
|
+
3 <- 5 (30.0, 33.3)
|
8
|
+
4 <- 5 (30.0, 100.0)
|
9
|
+
5 <- 4 (70.0, 42.9)
|
10
|
+
2 <- 1 (60.0, 83.3)
|
11
|
+
1 <- 2 (70.0, 71.4)
|
12
|
+
3 <- 1 (60.0, 66.7)
|
13
|
+
1 <- 3 (70.0, 57.1)
|
14
|
+
4 <- 1 (60.0, 66.7)
|
15
|
+
1 <- 4 (70.0, 57.1)
|
16
|
+
3 <- 2 (70.0, 85.7)
|
17
|
+
2 <- 3 (70.0, 85.7)
|
18
|
+
4 <- 2 (70.0, 57.1)
|
19
|
+
2 <- 4 (70.0, 57.1)
|
20
|
+
4 <- 3 (70.0, 57.1)
|
21
|
+
3 <- 4 (70.0, 57.1)
|
22
|
+
4 <- 5 1 (10.0, 100.0)
|
23
|
+
1 <- 5 4 (30.0, 33.3)
|
24
|
+
5 <- 1 4 (40.0, 25.0)
|
25
|
+
4 <- 5 3 (10.0, 100.0)
|
26
|
+
3 <- 5 4 (30.0, 33.3)
|
27
|
+
5 <- 3 4 (40.0, 25.0)
|
28
|
+
3 <- 1 2 (50.0, 80.0)
|
29
|
+
2 <- 1 3 (40.0, 100.0)
|
30
|
+
1 <- 2 3 (60.0, 66.7)
|
31
|
+
4 <- 1 2 (50.0, 60.0)
|
32
|
+
2 <- 1 4 (40.0, 75.0)
|
33
|
+
1 <- 2 4 (40.0, 75.0)
|
34
|
+
4 <- 1 3 (40.0, 50.0)
|
35
|
+
3 <- 1 4 (40.0, 50.0)
|
36
|
+
1 <- 3 4 (40.0, 50.0)
|
37
|
+
4 <- 2 3 (60.0, 50.0)
|
38
|
+
3 <- 2 4 (40.0, 75.0)
|
39
|
+
2 <- 3 4 (40.0, 75.0)
|
40
|
+
4 <- 1 2 3 (40.0, 50.0)
|
41
|
+
3 <- 1 2 4 (30.0, 66.7)
|
42
|
+
2 <- 1 3 4 (20.0, 100.0)
|
43
|
+
1 <- 2 3 4 (30.0, 66.7)
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
gawk '
|
3
|
+
function output ()
|
4
|
+
{
|
5
|
+
if (i > 0)
|
6
|
+
printf("%s", items[0]);
|
7
|
+
for (k = 0; ++k < i; )
|
8
|
+
printf(" %s", items[k]);
|
9
|
+
printf("\n");
|
10
|
+
}
|
11
|
+
|
12
|
+
BEGIN { tid = ""; i = 0; }
|
13
|
+
{
|
14
|
+
if ($1 == tid)
|
15
|
+
items[i++] = $2;
|
16
|
+
else {
|
17
|
+
if (tid != "") output();
|
18
|
+
tid = $1;
|
19
|
+
items[0] = $2; i = 1;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
END { output(); }
|
23
|
+
' $1 > $2
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
gawk -v app="$3" -v out="$4" '
|
3
|
+
BEGIN {
|
4
|
+
FS = " ";
|
5
|
+
if ((app != "") && (out != "")) {
|
6
|
+
getline dflt < app;
|
7
|
+
while ((getline < app) > 0)
|
8
|
+
base[$1] = $2;
|
9
|
+
}
|
10
|
+
FS = ",";
|
11
|
+
}
|
12
|
+
(NR == 1) {
|
13
|
+
for (i = 0; ++i <= NF; )
|
14
|
+
items[i] = $i;
|
15
|
+
}
|
16
|
+
(NR > 1) {
|
17
|
+
for (i = k = 0; ++i <= NF; ) {
|
18
|
+
if (k++ > 0) printf(" ");
|
19
|
+
item = (items[i] "=" $i);
|
20
|
+
printf("%s", item);
|
21
|
+
if (items[i] in base) apps[item] = base[items[i]];
|
22
|
+
else apps[item] = dflt;
|
23
|
+
}
|
24
|
+
printf("\n");
|
25
|
+
}
|
26
|
+
END {
|
27
|
+
if (out != "") {
|
28
|
+
print dflt > out;
|
29
|
+
for (t in apps)
|
30
|
+
if (apps[t] != dflt)
|
31
|
+
printf("%s %s\n", t, apps[t]) > out;
|
32
|
+
}
|
33
|
+
}' $1 > $2
|
@@ -0,0 +1,750 @@
|
|
1
|
+
/*----------------------------------------------------------------------
|
2
|
+
File : apriori.c
|
3
|
+
Contents: apriori algorithm for finding association rules
|
4
|
+
Author : Christian Borgelt
|
5
|
+
History : 1996.02.14 file created
|
6
|
+
1996.07.26 output precision reduced
|
7
|
+
1996.11.22 options -b, -f, and -r added
|
8
|
+
1996.11.24 option -e added (add. evaluation measures)
|
9
|
+
1997.08.18 normalized chi^2 measure added
|
10
|
+
option -m (minimal rule length) added
|
11
|
+
1997.10.13 quiet version (no output to stdout or stderr)
|
12
|
+
1998.01.27 adapted to changed ist_create() function
|
13
|
+
1998.08.08 optional input file (item appearances) added
|
14
|
+
1998.09.02 several assertions added
|
15
|
+
1998.09.07 hyperedge mode (option -h) added
|
16
|
+
1998.12.08 output of absolute support (option -a) added
|
17
|
+
float changed to double
|
18
|
+
1998.12.09 conversion of names to a scanable form added
|
19
|
+
1999.02.05 long int changed to int
|
20
|
+
1999.02.09 input from stdin, output to stdout added
|
21
|
+
1999.08.09 bug in check of support parameter (<= 0) fixed
|
22
|
+
1999.11.05 rule evaluation measure EM_AIMP added
|
23
|
+
1999.11.08 output of add. rule eval. measure value added
|
24
|
+
2000.03.16 optional use of original rule support definition
|
25
|
+
2001.04.01 option -h replaced by option -t (target type)
|
26
|
+
2001.05.26 extended support output added (option -x)
|
27
|
+
2001.06.09 extended support output for item sets added
|
28
|
+
2001.08.15 module scan used for output formatting
|
29
|
+
2001.11.18 item and transaction functions made a module
|
30
|
+
2001.11.19 options -C, -l changed, option -y removed
|
31
|
+
2001.12.28 adapted to module tract, some improvements
|
32
|
+
2002.01.11 evaluation measures codes changed to letters
|
33
|
+
2002.02.10 option -q extended by a direction parameter
|
34
|
+
2002.02.11 memory usage minimization option added
|
35
|
+
2002.06.09 arbitrary supp./conf. formats made possible
|
36
|
+
2003.01.09 option -k (item separator) added
|
37
|
+
2003.01.14 check for empty transaction set added
|
38
|
+
2003.03.12 output of lift value (conf/prior) added
|
39
|
+
2003.07.17 item filtering w.r.t. usage added (option -u)
|
40
|
+
2003.07.17 sorting w.r.t. transaction size sum added
|
41
|
+
2003.07.18 maximal itemset filter added
|
42
|
+
2003.08.11 closed itemset filter added
|
43
|
+
2003.08.15 item filtering for transaction tree added
|
44
|
+
2003.08.16 parameter for transaction filtering added
|
45
|
+
2003.08.18 dynamic filtering decision based on times added
|
46
|
+
2003.08.21 option -j (heap sort for transactions) added
|
47
|
+
2003.09.22 meaning of option -j reversed (heapsort default)
|
48
|
+
2004.03.25 option -S added (maximal support of a set/rule)
|
49
|
+
2004.05.09 additional selection measure for sets added
|
50
|
+
2004.10.28 two unnecessary assignments removed
|
51
|
+
2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
|
52
|
+
2004.11.23 absolute/relative support output changed
|
53
|
+
2004.12.09 semantics of option -p changed
|
54
|
+
2005.01.25 bug in output of absolute/relative support fixed
|
55
|
+
2005.01.31 another bug in this output fixed
|
56
|
+
2005.06.20 use of flag for "no item sorting" corrected
|
57
|
+
2007.02.13 adapted to modified module tabscan
|
58
|
+
2008.03.13 additional hyperedge evaluation added
|
59
|
+
2008.03.24 additional target added (association groups)
|
60
|
+
----------------------------------------------------------------------*/
|
61
|
+
#include <stdio.h>
|
62
|
+
#include <stdlib.h>
|
63
|
+
#include <stdarg.h>
|
64
|
+
#include <string.h>
|
65
|
+
#include <limits.h>
|
66
|
+
#include <math.h>
|
67
|
+
#include <time.h>
|
68
|
+
#include <assert.h>
|
69
|
+
#include "scan.h"
|
70
|
+
#include "tract.h"
|
71
|
+
#include "istree.h"
|
72
|
+
#ifdef STORAGE
|
73
|
+
#include "storage.h"
|
74
|
+
#endif
|
75
|
+
|
76
|
+
/*----------------------------------------------------------------------
|
77
|
+
Preprocessor Definitions
|
78
|
+
----------------------------------------------------------------------*/
|
79
|
+
#define PRGNAME "apriori"
|
80
|
+
#define DESCRIPTION "find association rules with the apriori algorithm"
|
81
|
+
#define VERSION "version 4.35 (2008.03.24) " \
|
82
|
+
"(c) 1996-2008 Christian Borgelt"
|
83
|
+
|
84
|
+
/* --- target types --- */
|
85
|
+
#define TT_SET 0 /* frequent item sets */
|
86
|
+
#define TT_CLSET 1 /* closed item sets */
|
87
|
+
#define TT_MFSET 2 /* maximal item sets */
|
88
|
+
#define TT_RULE 3 /* association rules */
|
89
|
+
#define TT_HEDGE 4 /* association hyperedges */
|
90
|
+
#define TT_GROUP 5 /* association groups */
|
91
|
+
|
92
|
+
/* --- error codes --- */
|
93
|
+
#define E_OPTION (-5) /* unknown option */
|
94
|
+
#define E_OPTARG (-6) /* missing option argument */
|
95
|
+
#define E_ARGCNT (-7) /* too few/many arguments */
|
96
|
+
#define E_STDIN (-8) /* double assignment of stdin */
|
97
|
+
#define E_TARGET (-9) /* invalid target type */
|
98
|
+
#define E_SUPP (-10) /* invalid support */
|
99
|
+
#define E_CONF (-11) /* invalid confidence */
|
100
|
+
#define E_MEASURE (-12) /* invalid evaluation measure */
|
101
|
+
#define E_RULELEN (-13) /* invalid rule length */
|
102
|
+
#define E_NOTAS (-14) /* no items or transactions */
|
103
|
+
#define E_NOFREQ (-15) /* no frequent items */
|
104
|
+
#define E_UNKNOWN (-21) /* unknown error */
|
105
|
+
|
106
|
+
#ifndef QUIET /* if not quiet version */
|
107
|
+
#ifdef FFLUSH
|
108
|
+
#define MSG(x) x /* print messages */
|
109
|
+
#else /* if to flush every output */
|
110
|
+
#define MSG(x) x, fflush(stderr)
|
111
|
+
#endif
|
112
|
+
#else /* if quiet version */
|
113
|
+
#define MSG(x) /* suppress messages */
|
114
|
+
#endif
|
115
|
+
|
116
|
+
#define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
|
117
|
+
#define RECCNT(s) (ts_reccnt(is_tabscan(s)) \
|
118
|
+
- ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0))
|
119
|
+
#define BUFFER(s) ts_buf(is_tabscan(s))
|
120
|
+
|
121
|
+
/*----------------------------------------------------------------------
|
122
|
+
Constants
|
123
|
+
----------------------------------------------------------------------*/
|
124
|
+
#ifndef QUIET /* if not quiet version */
|
125
|
+
/* --- target types --- */
|
126
|
+
static const char *ttypes[] = {
|
127
|
+
/* TT_SET 0 */ "set",
|
128
|
+
/* TT_CLSET 1 */ "set",
|
129
|
+
/* TT_MFSET 2 */ "set",
|
130
|
+
/* TT_RULE 3 */ "rule",
|
131
|
+
/* TT_HEDGE 4 */ "hyperedge",
|
132
|
+
/* TT_GROUP 5 */ "group",
|
133
|
+
};
|
134
|
+
|
135
|
+
/* --- error messages --- */
|
136
|
+
static const char *errmsgs[] = {
|
137
|
+
/* E_NONE 0 */ "no error\n",
|
138
|
+
/* E_NOMEM -1 */ "not enough memory\n",
|
139
|
+
/* E_FOPEN -2 */ "cannot open file %s\n",
|
140
|
+
/* E_FREAD -3 */ "read error on file %s\n",
|
141
|
+
/* E_FWRITE -4 */ "write error on file %s\n",
|
142
|
+
/* E_OPTION -5 */ "unknown option -%c\n",
|
143
|
+
/* E_OPTARG -6 */ "missing option argument\n",
|
144
|
+
/* E_ARGCNT -7 */ "wrong number of arguments\n",
|
145
|
+
/* E_STDIN -8 */ "double assignment of standard input\n",
|
146
|
+
/* E_TARGET -9 */ "invalid target type '%c'\n",
|
147
|
+
/* E_SUPP -10 */ "invalid minimal support %g%%\n",
|
148
|
+
/* E_CONF -11 */ "invalid minimal confidence %g%%\n",
|
149
|
+
/* E_MEASURE -12 */ "invalid additional evaluation measure %c\n",
|
150
|
+
/* E_RULELEN -13 */ "invalid set size/rule length %d\n",
|
151
|
+
/* E_NOTAS -14 */ "no items or transactions to work on\n",
|
152
|
+
/* E_NOFREQ -15 */ "no frequent items\n",
|
153
|
+
/* E_ITEMEXP -16 */ "file %s, record %d: item expected\n",
|
154
|
+
/* E_DUPITEM -17 */ "file %s, record %d: duplicate item %s\n",
|
155
|
+
/* E_APPEXP -18 */ "file %s, record %d: "
|
156
|
+
"appearance indicator expected\n",
|
157
|
+
/* E_UNKAPP -19 */ "file %s, record %d: "
|
158
|
+
"unknown appearance indicator %s\n",
|
159
|
+
/* E_FLDCNT -20 */ "file %s, record %d: too many fields\n",
|
160
|
+
/* E_UNKNOWN -21 */ "unknown error\n"
|
161
|
+
};
|
162
|
+
#endif
|
163
|
+
|
164
|
+
/*----------------------------------------------------------------------
|
165
|
+
Global Variables
|
166
|
+
----------------------------------------------------------------------*/
|
167
|
+
#ifndef QUIET
|
168
|
+
static char *prgname; /* program name for error messages */
|
169
|
+
#endif
|
170
|
+
static ITEMSET *itemset = NULL; /* item set */
|
171
|
+
static TASET *taset = NULL; /* transaction set */
|
172
|
+
static TATREE *tatree = NULL; /* transaction tree */
|
173
|
+
static ISTREE *istree = NULL; /* item set tree */
|
174
|
+
static FILE *in = NULL; /* input file */
|
175
|
+
static FILE *out = NULL; /* output file */
|
176
|
+
|
177
|
+
/*----------------------------------------------------------------------
|
178
|
+
Main Functions
|
179
|
+
----------------------------------------------------------------------*/
|
180
|
+
|
181
|
+
static void help (void)
|
182
|
+
{ /* --- print help on eval. measures */
|
183
|
+
#ifndef QUIET
|
184
|
+
fprintf(stderr, "\n"); /* terminate startup message */
|
185
|
+
printf("additional evaluation measures (option -e#)\n");
|
186
|
+
printf("frequent item sets:\n");
|
187
|
+
printf("d or 1: binary logarithm of support quotient\n");
|
188
|
+
printf("association rules:\n");
|
189
|
+
printf("d or 1: absolute confidence difference to prior\n");
|
190
|
+
printf("q or 2: absolute difference of confidence quotient to 1\n");
|
191
|
+
printf("a or 3: absolute difference of improvement value to 1\n");
|
192
|
+
printf("i or 4: information difference to prior\n");
|
193
|
+
printf("c or 5: normalized chi^2 measure\n");
|
194
|
+
printf("p or 6: p-value computed from chi^2 measure\n");
|
195
|
+
#endif
|
196
|
+
exit(0); /* abort the program */
|
197
|
+
} /* help() */
|
198
|
+
|
199
|
+
/*--------------------------------------------------------------------*/
|
200
|
+
|
201
|
+
static void error (int code, ...)
|
202
|
+
{ /* --- print an error message */
|
203
|
+
#ifndef QUIET /* if not quiet version */
|
204
|
+
va_list args; /* list of variable arguments */
|
205
|
+
const char *msg; /* error message */
|
206
|
+
|
207
|
+
assert(prgname); /* check the program name */
|
208
|
+
if (code < E_UNKNOWN) code = E_UNKNOWN;
|
209
|
+
if (code < 0) { /* if to report an error, */
|
210
|
+
msg = errmsgs[-code]; /* get the error message */
|
211
|
+
if (!msg) msg = errmsgs[-E_UNKNOWN];
|
212
|
+
fprintf(stderr, "\n%s: ", prgname);
|
213
|
+
va_start(args, code); /* get variable arguments */
|
214
|
+
vfprintf(stderr, msg, args);/* print error message */
|
215
|
+
va_end(args); /* end argument evaluation */
|
216
|
+
}
|
217
|
+
#endif
|
218
|
+
#ifndef NDEBUG /* if debug version */
|
219
|
+
if (istree) ist_delete(istree); /* clean up memory */
|
220
|
+
if (tatree) tat_delete(tatree); /* and close files */
|
221
|
+
if (taset) tas_delete(taset, 0);
|
222
|
+
if (itemset) is_delete(itemset);
|
223
|
+
if (in && (in != stdin)) fclose(in);
|
224
|
+
if (out && (out != stdout)) fclose(out);
|
225
|
+
#endif
|
226
|
+
#ifdef STORAGE /* if storage debugging */
|
227
|
+
showmem("at end of program"); /* check memory usage */
|
228
|
+
#endif
|
229
|
+
exit(code); /* abort the program */
|
230
|
+
} /* error() */
|
231
|
+
|
232
|
+
/*--------------------------------------------------------------------*/
|
233
|
+
|
234
|
+
int main (int argc, char *argv[])
|
235
|
+
{ /* --- main function */
|
236
|
+
int i, k = 0, n; /* loop variables, counters */
|
237
|
+
char *s; /* to traverse the options */
|
238
|
+
char **optarg = NULL; /* option argument */
|
239
|
+
char *fn_in = NULL; /* name of input file */
|
240
|
+
char *fn_out = NULL; /* name of output file */
|
241
|
+
char *fn_app = NULL; /* name of item appearances file */
|
242
|
+
char *blanks = NULL; /* blanks */
|
243
|
+
char *fldseps = NULL; /* field separators */
|
244
|
+
char *recseps = NULL; /* record separators */
|
245
|
+
char *comment = NULL; /* comment indicators */
|
246
|
+
char *used = NULL; /* item usage vector */
|
247
|
+
double supp = 0.1; /* minimal support (in percent) */
|
248
|
+
double smax = 1.0; /* maximal support (in percent) */
|
249
|
+
double conf = 0.8; /* minimal confidence (in percent) */
|
250
|
+
int mode = IST_BODY; /* search mode (rule support def.) */
|
251
|
+
int target = 'r'; /* target type (sets/rules/h.edges) */
|
252
|
+
int arem = 0; /* additional rule evaluation measure */
|
253
|
+
int lift = 0; /* flag for printing the lift */
|
254
|
+
double minval = 0.1; /* minimal evaluation measure value */
|
255
|
+
double lftval = 0; /* lift value (confidence/prior) */
|
256
|
+
int minlen = 1; /* minimal rule length */
|
257
|
+
int maxlen = INT_MAX; /* maximal rule length */
|
258
|
+
int load = 1; /* flag for loading transactions */
|
259
|
+
int sort = 2; /* flag for item sorting and recoding */
|
260
|
+
double filter = 0.1; /* item usage filtering parameter */
|
261
|
+
int tree = 1; /* flag for transaction tree */
|
262
|
+
int heap = 1; /* flag for heap sort vs. quick sort */
|
263
|
+
int c2scf = 0; /* flag for conv. to scanable form */
|
264
|
+
char *sep = " "; /* item separator for output */
|
265
|
+
char *fmt = "%.1f"; /* output format for support/conf. */
|
266
|
+
int sout = 1; /* flag for abs./rel. support output */
|
267
|
+
int ext = 0; /* flag for extended support output */
|
268
|
+
int aval = 0; /* flag for add. eval. measure value */
|
269
|
+
int maxcnt = 0; /* maximal number of items per set */
|
270
|
+
int tacnt; /* number of transactions */
|
271
|
+
int frq; /* frequency of an item set */
|
272
|
+
int *map, *set; /* identifier map, item set */
|
273
|
+
const char *name; /* buffer for item names */
|
274
|
+
static char buf[4*TS_SIZE+4]; /* buffer for formatting */
|
275
|
+
clock_t t, tt, tc, x; /* timer for measurements */
|
276
|
+
|
277
|
+
#ifndef QUIET /* if not quiet version */
|
278
|
+
prgname = argv[0]; /* get program name for error msgs. */
|
279
|
+
|
280
|
+
/* --- print usage message --- */
|
281
|
+
if (argc > 1) { /* if arguments are given */
|
282
|
+
fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
|
283
|
+
fprintf(stderr, VERSION); } /* print a startup message */
|
284
|
+
else { /* if no arguments given */
|
285
|
+
printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
|
286
|
+
printf("%s\n", DESCRIPTION);
|
287
|
+
printf("%s\n", VERSION);
|
288
|
+
printf("-t# target type (default: association rules)\n"
|
289
|
+
" (s: item sets, c: closed item sets,"
|
290
|
+
" m: maximal item sets,\n"
|
291
|
+
" r: association rules,"
|
292
|
+
" h: association hyperedges)\n");
|
293
|
+
printf("-m# minimal number of items per set/rule/hyperedge "
|
294
|
+
"(default: %d)\n", minlen);
|
295
|
+
printf("-n# maximal number of items per set/rule/hyperedge "
|
296
|
+
"(default: no limit)\n");
|
297
|
+
printf("-s# minimal support of a set/rule/hyperedge "
|
298
|
+
"(default: %g%%)\n", supp *100);
|
299
|
+
printf("-S# maximal support of a set/rule/hyperedge "
|
300
|
+
"(default: %g%%)\n", smax *100);
|
301
|
+
printf("-c# minimal confidence of a rule/hyperedge "
|
302
|
+
"(default: %g%%)\n", conf *100);
|
303
|
+
printf("-o use original definition of the support of a rule "
|
304
|
+
"(body & head)\n");
|
305
|
+
printf("-k# item separator for output "
|
306
|
+
"(default: \"%s\")\n", sep);
|
307
|
+
printf("-p# output format for support/confidence "
|
308
|
+
"(default: \"%s\")\n", fmt);
|
309
|
+
printf("-x extended support output "
|
310
|
+
"(print both rule support types)\n");
|
311
|
+
printf("-a print absolute support "
|
312
|
+
"(number of transactions)\n");
|
313
|
+
printf("-y print lift value (confidence divided by prior)\n");
|
314
|
+
printf("-e# additional evaluation measure (default: none)\n");
|
315
|
+
printf("-! print a list of additional evaluation measures\n");
|
316
|
+
printf("-d# minimal value of additional evaluation measure "
|
317
|
+
"(default: %g%%)\n", minval *100);
|
318
|
+
printf("-v print value of additional "
|
319
|
+
"rule evaluation measure\n");
|
320
|
+
printf("-g write output in scanable form "
|
321
|
+
"(quote certain characters)\n");
|
322
|
+
printf("-l do not load transactions into memory "
|
323
|
+
"(work on input file)\n");
|
324
|
+
printf("-q# sort items w.r.t. their frequency (default: %d)\n"
|
325
|
+
" (1: ascending, -1: descending, 0: do not sort,\n"
|
326
|
+
" 2: ascending, -2: descending w.r.t. "
|
327
|
+
"transaction size sum)\n", sort);
|
328
|
+
printf("-u# filter unused items from transactions "
|
329
|
+
"(default: %g)\n", filter);
|
330
|
+
printf(" (0: do not filter items w.r.t. usage in sets,\n"
|
331
|
+
" <0: fraction of removed items for filtering,\n"
|
332
|
+
" >0: take execution times ratio into account)\n");
|
333
|
+
printf("-h do not organize transactions as a prefix tree\n");
|
334
|
+
printf("-j use quicksort to sort the transactions "
|
335
|
+
"(default: heapsort)\n");
|
336
|
+
printf("-z minimize memory usage "
|
337
|
+
"(default: maximize speed)\n");
|
338
|
+
printf("-b/f/r# blank characters, field and record separators\n"
|
339
|
+
" (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
|
340
|
+
printf("-C# comment characters (default: \"#\")\n");
|
341
|
+
printf("infile file to read transactions from\n");
|
342
|
+
printf("outfile file to write item sets/association rules"
|
343
|
+
"/hyperedges to\n");
|
344
|
+
printf("appfile file stating item appearances (optional)\n");
|
345
|
+
return 0; /* print a usage message */
|
346
|
+
} /* and abort the program */
|
347
|
+
#endif /* #ifndef QUIET */
|
348
|
+
|
349
|
+
/* --- evaluate arguments --- */
|
350
|
+
for (i = 1; i < argc; i++) { /* traverse arguments */
|
351
|
+
s = argv[i]; /* get option argument */
|
352
|
+
if (optarg) { *optarg = s; optarg = NULL; continue; }
|
353
|
+
if ((*s == '-') && *++s) { /* -- if argument is an option */
|
354
|
+
while (*s) { /* traverse options */
|
355
|
+
switch (*s++) { /* evaluate switches */
|
356
|
+
case '!': help(); break;
|
357
|
+
case 't': target = (*s) ? *s++ : 'r'; break;
|
358
|
+
case 'm': minlen = (int)strtol(s, &s, 0); break;
|
359
|
+
case 'n': maxlen = (int)strtol(s, &s, 0); break;
|
360
|
+
case 's': supp = 0.01*strtod(s, &s); break;
|
361
|
+
case 'S': smax = 0.01*strtod(s, &s); break;
|
362
|
+
case 'c': conf = 0.01*strtod(s, &s); break;
|
363
|
+
case 'o': mode |= IST_BOTH; break;
|
364
|
+
case 'k': optarg = &sep; break;
|
365
|
+
case 'p': optarg = &fmt; break;
|
366
|
+
case 'x': ext = 1; break;
|
367
|
+
case 'a': sout |= 2; break;
|
368
|
+
case 'y': lift = 1; break;
|
369
|
+
case 'e': arem = (*s) ? *s++ : 0; break;
|
370
|
+
case 'd': minval = 0.01*strtod(s, &s); break;
|
371
|
+
case 'v': aval = 1; break;
|
372
|
+
case 'g': c2scf = 1; break;
|
373
|
+
case 'l': load = 0; break;
|
374
|
+
case 'q': sort = (int)strtol(s, &s, 0); break;
|
375
|
+
case 'u': filter = strtod(s, &s); break;
|
376
|
+
case 'h': tree = 0; break;
|
377
|
+
case 'j': heap = 0; break;
|
378
|
+
case 'z': mode |= IST_MEMOPT; break;
|
379
|
+
case 'b': optarg = &blanks; break;
|
380
|
+
case 'f': optarg = &fldseps; break;
|
381
|
+
case 'r': optarg = &recseps; break;
|
382
|
+
case 'C': optarg = &comment; break;
|
383
|
+
default : error(E_OPTION, *--s); break;
|
384
|
+
} /* set option variables */
|
385
|
+
if (optarg && *s) { *optarg = s; optarg = NULL; break; }
|
386
|
+
} } /* get option argument */
|
387
|
+
else { /* -- if argument is no option */
|
388
|
+
switch (k++) { /* evaluate non-options */
|
389
|
+
case 0: fn_in = s; break;
|
390
|
+
case 1: fn_out = s; break;
|
391
|
+
case 2: fn_app = s; break;
|
392
|
+
default: error(E_ARGCNT); break;
|
393
|
+
} /* note filenames */
|
394
|
+
}
|
395
|
+
}
|
396
|
+
if (optarg) error(E_OPTARG); /* check option argument */
|
397
|
+
if ((k < 2) || (k > 3)) /* and the number of arguments */
|
398
|
+
error(E_ARGCNT); /* (either in/out or in/out/app) */
|
399
|
+
if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
|
400
|
+
error(E_STDIN); /* stdin must not be used twice */
|
401
|
+
switch (target) { /* check and translate target type */
|
402
|
+
case 's': target = TT_SET; break;
|
403
|
+
case 'c': target = TT_CLSET; break;
|
404
|
+
case 'm': target = TT_MFSET; break;
|
405
|
+
case 'r': target = TT_RULE; break;
|
406
|
+
case 'h': target = TT_HEDGE; break;
|
407
|
+
case 'g': target = TT_GROUP; break;
|
408
|
+
default : error(E_TARGET, (char)target); break;
|
409
|
+
}
|
410
|
+
if (supp > 1) /* check the minimal support */
|
411
|
+
error(E_SUPP, supp); /* (< 0: absolute number) */
|
412
|
+
if ((conf < 0) || (conf > 1))
|
413
|
+
error(E_CONF, conf); /* check the minimal confidence */
|
414
|
+
if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
|
415
|
+
if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
|
416
|
+
switch (arem) { /* check and translate measure */
|
417
|
+
case 0 : case '0': arem = EM_NONE; break;
|
418
|
+
case 'd': case '1': arem = EM_DIFF; break;
|
419
|
+
case 'q': case '2': arem = EM_QUOT; break;
|
420
|
+
case 'a': case '3': arem = EM_AIMP; break;
|
421
|
+
case 'i': case '4': arem = EM_INFO; break;
|
422
|
+
case 'c': case '5': arem = EM_CHI2; break;
|
423
|
+
case 'p': case '6': arem = EM_PVAL; break;
|
424
|
+
default : error(E_MEASURE, (char)arem); break;
|
425
|
+
}
|
426
|
+
if (target <= TT_MFSET) { /* in item set mode neutralize */
|
427
|
+
mode |= IST_BOTH; conf = 1;}/* rule specific settings */
|
428
|
+
if (arem == EM_NONE) /* if no add. rule eval. measure, */
|
429
|
+
aval = 0; /* clear the corresp. output flag */
|
430
|
+
if ((filter <= -1) || (filter >= 1)) filter = 0;
|
431
|
+
|
432
|
+
/* --- create item set and transaction set --- */
|
433
|
+
itemset = is_create(-1); /* create an item set and */
|
434
|
+
if (!itemset) error(E_NOMEM); /* set the special characters */
|
435
|
+
is_chars(itemset, blanks, fldseps, recseps, comment);
|
436
|
+
if (load) { /* if to load the transactions */
|
437
|
+
taset = tas_create(itemset);
|
438
|
+
if (!taset) error(E_NOMEM); /* create a transaction set */
|
439
|
+
} /* to store the transactions */
|
440
|
+
MSG(fprintf(stderr, "\n")); /* terminate the startup message */
|
441
|
+
|
442
|
+
/* --- read item appearances --- */
|
443
|
+
if (fn_app) { /* if item appearances are given */
|
444
|
+
t = clock(); /* start the timer */
|
445
|
+
if (*fn_app) /* if an app. file name is given, */
|
446
|
+
in = fopen(fn_app, "r"); /* open the item appearances file */
|
447
|
+
else { /* if no app. file name is given, */
|
448
|
+
in = stdin; fn_app = "<stdin>"; } /* read from std. input */
|
449
|
+
MSG(fprintf(stderr, "reading %s ... ", fn_app));
|
450
|
+
if (!in) error(E_FOPEN, fn_app);
|
451
|
+
k = is_readapp(itemset,in); /* read the item appearances */
|
452
|
+
if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
|
453
|
+
if (in != stdin) /* if not read from standard input, */
|
454
|
+
fclose(in); /* close the input file */
|
455
|
+
MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
|
456
|
+
MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
|
457
|
+
} /* print a log message */
|
458
|
+
|
459
|
+
/* --- read transactions --- */
|
460
|
+
t = clock(); /* start the timer */
|
461
|
+
if (fn_in && *fn_in) /* if an input file name is given, */
|
462
|
+
in = fopen(fn_in, "r"); /* open input file for reading */
|
463
|
+
else { /* if no input file name is given, */
|
464
|
+
in = stdin; fn_in = "<stdin>"; } /* read from standard input */
|
465
|
+
MSG(fprintf(stderr, "reading %s ... ", fn_in));
|
466
|
+
if (!in) error(E_FOPEN, fn_in);
|
467
|
+
while (1) { /* transaction read loop */
|
468
|
+
k = is_read(itemset, in); /* read the next transaction */
|
469
|
+
if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
|
470
|
+
if (k > 0) break; /* check for error and end of file */
|
471
|
+
k = is_tsize(itemset); /* update the maximal */
|
472
|
+
if (k > maxcnt) maxcnt = k; /* transaction size */
|
473
|
+
if (taset && (tas_add(taset, NULL, 0) != 0))
|
474
|
+
error(E_NOMEM); /* add the loaded transaction */
|
475
|
+
} /* to the transaction set */
|
476
|
+
if (taset) { /* if transactions have been loaded */
|
477
|
+
if (in != stdin) fclose(in);/* if not read from standard input, */
|
478
|
+
in = NULL; /* close the input file */
|
479
|
+
} /* clear the file variable */
|
480
|
+
n = is_cnt(itemset); /* get the number of items */
|
481
|
+
tacnt = is_gettac(itemset); /* and the number of transactions */
|
482
|
+
MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
|
483
|
+
MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
|
484
|
+
if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
|
485
|
+
MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
|
486
|
+
if (supp >= 0) /* if relative support is given */
|
487
|
+
supp = ceil(tacnt *supp); /* compute absolute support */
|
488
|
+
else { /* if absolute support is given, */
|
489
|
+
supp = ceil(-100 *supp); /* make the support value positive */
|
490
|
+
if (!(sout & 2)) sout = 2; /* switch to absolute support output */
|
491
|
+
} /* do the same with the max. support */
|
492
|
+
smax = floor(((smax >= 0) ? tacnt : -100) *smax);
|
493
|
+
|
494
|
+
/* --- sort and recode items --- */
|
495
|
+
MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
|
496
|
+
t = clock(); /* start the timer */
|
497
|
+
map = (int*)malloc(is_cnt(itemset) *sizeof(int));
|
498
|
+
if (!map) error(E_NOMEM); /* create an item identifier map */
|
499
|
+
k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
|
500
|
+
n = is_recode(itemset, k, sort, map);
|
501
|
+
if (taset) { /* sort and recode the items and */
|
502
|
+
tas_recode(taset, map,n); /* recode the loaded transactions */
|
503
|
+
maxcnt = tas_max(taset); /* get the new maximal t.a. size */
|
504
|
+
} /* (may be smaller than before) */
|
505
|
+
free(map); /* delete the item identifier map */
|
506
|
+
MSG(fprintf(stderr, "[%d item(s)] ", n));
|
507
|
+
MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
|
508
|
+
if (n <= 0) error(E_NOFREQ); /* print a log message and */
|
509
|
+
MSG(fprintf(stderr, "\n")); /* check the number of items */
|
510
|
+
if (maxlen > maxcnt) /* clamp the set/rule length */
|
511
|
+
maxlen = maxcnt; /* to the maximum set size */
|
512
|
+
|
513
|
+
/* --- create a transaction tree --- */
|
514
|
+
tt = 0; /* init. the tree construction time */
|
515
|
+
if (tree && taset) { /* if transactions were loaded */
|
516
|
+
MSG(fprintf(stderr, "creating transaction tree ... "));
|
517
|
+
t = clock(); /* start the timer */
|
518
|
+
tatree = tat_create(taset, heap);
|
519
|
+
if (!tatree) error(E_NOMEM);/* create a transaction tree */
|
520
|
+
if (filter == 0) { /* if a tree rebuild is not needed, */
|
521
|
+
tas_delete(taset, 0); taset = NULL; } /* delete transactions */
|
522
|
+
tt = clock() -t; /* note the time for the construction */
|
523
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
524
|
+
} /* print a log message */
|
525
|
+
|
526
|
+
/* --- create an item set tree --- */
|
527
|
+
t = clock(); tc = 0; /* start the timer */
|
528
|
+
istree = ist_create(itemset, mode, (int)supp, conf);
|
529
|
+
if (!istree) error(E_NOMEM); /* create an item set tree */
|
530
|
+
|
531
|
+
/* --- check item subsets --- */
|
532
|
+
if (filter) { /* if to filter unused items */
|
533
|
+
used = (char*)malloc(is_cnt(itemset) *sizeof(char));
|
534
|
+
if (!used) error(E_NOMEM); /* create a flag vector */
|
535
|
+
} /* for the items */
|
536
|
+
MSG(fprintf(stderr, "checking subsets of size 1"));
|
537
|
+
while (ist_height(istree) < maxlen) {
|
538
|
+
if (filter != 0) { /* if to filter w.r.t. item usage, */
|
539
|
+
i = ist_check(istree, used); /* check current item usage */
|
540
|
+
if (i < maxlen) maxlen = i; /* update the maximum size */
|
541
|
+
if (ist_height(istree) >= i) break;
|
542
|
+
} /* check the tree height */
|
543
|
+
k = ist_addlvl(istree); /* while max. height is not reached, */
|
544
|
+
if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
|
545
|
+
if (k != 0) break; /* if no level was added, abort */
|
546
|
+
MSG(fprintf(stderr, " %d", ist_height(istree)));
|
547
|
+
if (tatree) { /* if a transaction tree was created */
|
548
|
+
if (((filter < 0) /* if to filter w.r.t. item usage */
|
549
|
+
&& (i < -filter *n)) /* and enough items were removed */
|
550
|
+
|| ((filter > 0) /* or counting time is long enough */
|
551
|
+
&& (i < n) && (i *(double)tt < filter *n *tc))) {
|
552
|
+
n = i; x = clock(); /* note the new number of items */
|
553
|
+
tas_filter(taset, used);/* and remove unnecessary items */
|
554
|
+
tat_delete(tatree); /* delete the transaction tree */
|
555
|
+
tatree = tat_create(taset, heap);
|
556
|
+
if (!tatree) error(E_NOMEM);
|
557
|
+
tt = clock() -x; /* rebuild the transaction tree and */
|
558
|
+
} /* note the new construction time */
|
559
|
+
x = clock(); /* count the transaction tree */
|
560
|
+
ist_countx(istree, tatree);
|
561
|
+
tc = clock() -x; } /* note the new count time */
|
562
|
+
else if (taset) { /* if transactions were loaded */
|
563
|
+
if (((filter < 0) /* if to filter w.r.t. item usage */
|
564
|
+
&& (i <= -filter *n)) /* and enough items were removed */
|
565
|
+
|| ((filter > 0) /* or counting time is long enough */
|
566
|
+
&& (i *(double)tt <= filter *n *tc))) {
|
567
|
+
n = i; x = clock(); /* note the new number of items */
|
568
|
+
tas_filter(taset, used);/* and remove unnecessary items */
|
569
|
+
tt = clock() -t; /* from the transactions */
|
570
|
+
} /* note the filtering time */
|
571
|
+
for (i = tacnt; --i >= 0;)/* traverse and count transactions */
|
572
|
+
ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
|
573
|
+
tc = clock() -t; } /* note the new count time */
|
574
|
+
else { /* if to work on the input file, */
|
575
|
+
rewind(in); /* reset the file position */
|
576
|
+
for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
|
577
|
+
if (filter != 0) /* (re)read the transactions and */
|
578
|
+
is_filter(itemset, used); /* remove unnecessary items */
|
579
|
+
k = is_tsize(itemset); /* update the maximum size */
|
580
|
+
if (k > maxcnt) maxcnt = k; /* of a transaction */
|
581
|
+
ist_count(istree, is_tract(itemset), k);
|
582
|
+
} /* count the transaction in the tree */
|
583
|
+
if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
|
584
|
+
if (maxcnt < maxlen) /* update the maximal rule length */
|
585
|
+
maxlen = maxcnt; /* according to the max. t.a. size */
|
586
|
+
} /* (may be smaller than before) */
|
587
|
+
}
|
588
|
+
if (!taset && !tatree) { /* if transactions were not loaded */
|
589
|
+
if (in != stdin) fclose(in);/* if not read from standard input, */
|
590
|
+
in = NULL; /* close the input file */
|
591
|
+
} /* clear the file variable */
|
592
|
+
MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
|
593
|
+
|
594
|
+
/* --- filter found item sets --- */
|
595
|
+
if ((target == TT_CLSET) || (target == TT_MFSET)) {
|
596
|
+
MSG(fprintf(stderr, "filtering %s item sets ... ",
|
597
|
+
(target == TT_MFSET) ? "maximal" : "closed"));
|
598
|
+
t = clock(); /* filter the item sets */
|
599
|
+
ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
|
600
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
601
|
+
} /* (filter takes longer than print) */
|
602
|
+
|
603
|
+
/* --- sort transactions --- */
|
604
|
+
if (target <= TT_MFSET) { /* if to find frequent item sets */
|
605
|
+
if (!taset) /* transactions must be loaded */
|
606
|
+
ext = 0; /* for extended support output */
|
607
|
+
else if (ext) { /* if extended output is requested */
|
608
|
+
MSG(fprintf(stderr, "sorting transactions ... "));
|
609
|
+
t = clock(); /* start the timer */
|
610
|
+
tas_sort(taset, heap); /* sort the transactions */
|
611
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
612
|
+
} /* (sorting is necessary to find the */
|
613
|
+
} /* number of identical transactions) */
|
614
|
+
|
615
|
+
/* --- print item sets/rules/hyperedges --- */
|
616
|
+
t = clock(); /* start the timer */
|
617
|
+
if (fn_out && *fn_out) /* if an output file name is given, */
|
618
|
+
out = fopen(fn_out, "w"); /* open the output file */
|
619
|
+
else { /* if no output file name is given, */
|
620
|
+
out = stdout; fn_out = "<stdout>"; } /* write to std. output */
|
621
|
+
MSG(fprintf(stderr, "writing %s ... ", fn_out));
|
622
|
+
if (!out) error(E_FOPEN, fn_out);
|
623
|
+
ist_init(istree, minlen, arem, minval);
|
624
|
+
set = is_tract(itemset); /* get the transaction buffer */
|
625
|
+
if (target <= TT_MFSET) { /* if to find frequent item sets */
|
626
|
+
for (n = 0; 1; ) { /* extract item sets from the tree */
|
627
|
+
k = ist_set(istree, set, &frq, &conf);
|
628
|
+
if (k <= 0) break; /* get the next frequent item set */
|
629
|
+
if (frq > smax) continue; /* check against maximal support */
|
630
|
+
for (i = 0; i < k; i++) { /* traverse the set's items */
|
631
|
+
name = is_name(itemset, set[i]);
|
632
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
633
|
+
fputs(name, out); /* print the name of the next item */
|
634
|
+
fputs((i < k-1) ? sep : " ", out);
|
635
|
+
} /* print a separator */
|
636
|
+
fputs(" (", out); /* print the item set's support */
|
637
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
638
|
+
if (sout & 2) fputc('/', out); }
|
639
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
640
|
+
if (ext) { /* if to print the extended support */
|
641
|
+
frq = tas_occur(taset, set, k);
|
642
|
+
fputs(", ", out); /* get the number of occurrences */
|
643
|
+
fprintf(out, fmt, (frq/(double)tacnt) *100);
|
644
|
+
if (sout & 2) fprintf(out, "/%d", frq);
|
645
|
+
} /* print the extended support data */
|
646
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
|
647
|
+
fputs(")\n", out); /* print the add. eval. measure, */
|
648
|
+
n++; /* terminate the support output, */
|
649
|
+
} } /* and count the item set */
|
650
|
+
else if (target == TT_RULE) { /* if to find association rules, */
|
651
|
+
for (n = 0; 1; ) { /* extract rules from tree */
|
652
|
+
k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
|
653
|
+
if (k <= 0) break; /* get the next association rule */
|
654
|
+
if (frq > smax) continue; /* check against maximal support */
|
655
|
+
for (i = 0; i < k; i++) { /* traverse the rule's items */
|
656
|
+
name = is_name(itemset, set[i]);
|
657
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
658
|
+
fputs(name, out); /* print the next item */
|
659
|
+
fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
|
660
|
+
} /* print a separator */
|
661
|
+
fputs(" (", out); /* print the rule evaluation */
|
662
|
+
if (sout & 1) supp = frq/(double)tacnt;
|
663
|
+
if (ext && !(mode & IST_HEAD)) {
|
664
|
+
if (sout & 1) { fprintf(out, fmt, supp *conf *100);
|
665
|
+
if (sout & 2) fputc('/', out); }
|
666
|
+
if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
|
667
|
+
fputs(", ", out); /* print the support of the rule */
|
668
|
+
} /* from the support of the body */
|
669
|
+
if (sout & 1) { fprintf(out, fmt, supp *100);
|
670
|
+
if (sout & 2) fputc('/', out); }
|
671
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
672
|
+
fputs(", ", out); /* print the rule support */
|
673
|
+
if (ext && (mode & IST_HEAD)) {
|
674
|
+
if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
|
675
|
+
if (sout & 2) fputc('/', out); }
|
676
|
+
if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
|
677
|
+
fputs(", ", out); /* print the support of the body */
|
678
|
+
} /* from the support of the rule */
|
679
|
+
fprintf(out, fmt, conf *100); /* print the rule confidence */
|
680
|
+
if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
|
681
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
682
|
+
fputs(")\n", out); /* print the value of the additional */
|
683
|
+
n++; /* rule evaluation measure and */
|
684
|
+
} } /* count the association rule */
|
685
|
+
else if (target == TT_HEDGE){ /* if to find association hyperedges */
|
686
|
+
for (n = 0; 1; ) { /* extract hyperedges from tree */
|
687
|
+
k = ist_hedge(istree, set, &frq, &conf, &minval);
|
688
|
+
if (k <= 0) break; /* get the next hyperedge */
|
689
|
+
if (frq > smax) continue; /* check against maximal support */
|
690
|
+
for (i = 0; i < k; i++) { /* traverse the edge's items */
|
691
|
+
name = is_name(itemset, set[i]);
|
692
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
693
|
+
fputs(name, out); /* print the name of the next item */
|
694
|
+
fputs((i < k-1) ? sep : " ", out);
|
695
|
+
} /* print a separator */
|
696
|
+
fputs(" (", out); /* print the hyperedge evaluation */
|
697
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
698
|
+
if (sout & 2) fputc('/', out); }
|
699
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
700
|
+
fputs(", ", out); fprintf(out, fmt, conf *100);
|
701
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
702
|
+
fputs(")\n", out); /* print support and confidence */
|
703
|
+
n++; /* of the hyperedge and */
|
704
|
+
} } /* count the hyperedge */
|
705
|
+
else { /* if to find association groups */
|
706
|
+
for (n = 0; 1; ) { /* extract groups from tree */
|
707
|
+
k = ist_group(istree, set, &frq, &minval);
|
708
|
+
if (k <= 0) break; /* get the next group */
|
709
|
+
if (frq > smax) continue; /* check against maximal support */
|
710
|
+
for (i = 0; i < k; i++) { /* traverse the group's items */
|
711
|
+
name = is_name(itemset, set[i]);
|
712
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
713
|
+
fputs(name, out); /* print the name of the next item */
|
714
|
+
fputs((i < k-1) ? sep : " ", out);
|
715
|
+
} /* print a separator */
|
716
|
+
fputs(" (", out); /* print the group evaluation */
|
717
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
718
|
+
if (sout & 2) fputc('/', out); }
|
719
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
720
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
721
|
+
fputs(")\n", out); /* print support and add. measure */
|
722
|
+
n++; /* and count the group */
|
723
|
+
}
|
724
|
+
} /* if (target <= TT_MFSET) .. else .. */
|
725
|
+
if (fflush(out) != 0) error(E_FWRITE, fn_out);
|
726
|
+
if (out != stdout) fclose(out);
|
727
|
+
out = NULL; /* close the output file */
|
728
|
+
MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
|
729
|
+
MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
|
730
|
+
#ifdef BENCH
|
731
|
+
printf("number of support counters: %d\n", istree->sccnt);
|
732
|
+
printf("necessary support counters: %d\n", istree->scnec);
|
733
|
+
printf("number of child pointers : %d\n", istree->cpcnt);
|
734
|
+
printf("necessary child pointers : %d\n", istree->cpnec);
|
735
|
+
printf("allocated memory (bytes) : %d\n", istree->bytes);
|
736
|
+
#endif
|
737
|
+
|
738
|
+
/* --- clean up --- */
|
739
|
+
#ifndef NDEBUG /* if this is a debug version */
|
740
|
+
free(used); /* delete the item app. vector */
|
741
|
+
ist_delete(istree); /* delete the item set tree, */
|
742
|
+
if (tatree) tat_delete(tatree); /* the transaction tree, */
|
743
|
+
if (taset) tas_delete(taset, 0); /* the transaction set, */
|
744
|
+
is_delete(itemset); /* and the item set */
|
745
|
+
#endif
|
746
|
+
#ifdef STORAGE /* if storage debugging */
|
747
|
+
showmem("at end of program"); /* check memory usage */
|
748
|
+
#endif
|
749
|
+
return 0; /* return 'ok' */
|
750
|
+
} /* main() */
|