apriori 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +16 -0
- data/License.txt +20 -0
- data/Manifest.txt +121 -0
- data/README.txt +149 -0
- data/Rakefile +15 -0
- data/TODO.txt +60 -0
- data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
- data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
- data/attic/c_ext_test1/mytest.rb +10 -0
- data/attic/test.c +12 -0
- data/config/hoe.rb +81 -0
- data/config/requirements.rb +29 -0
- data/examples/01_simple_example.rb +32 -0
- data/examples/02_small_file_example.rb +17 -0
- data/examples/03_large_file_example.rb +22 -0
- data/examples/test_data/market_basket_basic_test.dat +9 -0
- data/ext/Apriori.c +149 -0
- data/ext/Makefile +149 -0
- data/ext/apriori/doc/apriori.html +1301 -0
- data/ext/apriori/doc/arem.gp +68 -0
- data/ext/apriori/doc/c_rev.gp +89 -0
- data/ext/apriori/doc/chi2.tex +156 -0
- data/ext/apriori/doc/copying +504 -0
- data/ext/apriori/doc/line.gif +0 -0
- data/ext/apriori/doc/uparrow.gif +0 -0
- data/ext/apriori/ex/flg2set +15 -0
- data/ext/apriori/ex/hdr2set +13 -0
- data/ext/apriori/ex/readme +71 -0
- data/ext/apriori/ex/row2set +7 -0
- data/ext/apriori/ex/rulesort +24 -0
- data/ext/apriori/ex/tab2set +9 -0
- data/ext/apriori/ex/test.app +2 -0
- data/ext/apriori/ex/test.rul +9 -0
- data/ext/apriori/ex/test1.rul +43 -0
- data/ext/apriori/ex/test1.tab +10 -0
- data/ext/apriori/ex/test2.tab +10 -0
- data/ext/apriori/ex/test3.tab +30 -0
- data/ext/apriori/ex/test4.tab +11 -0
- data/ext/apriori/ex/test5.tab +39 -0
- data/ext/apriori/ex/tid2set +23 -0
- data/ext/apriori/ex/xhdr2set +33 -0
- data/ext/apriori/src/apriori.c +750 -0
- data/ext/apriori/src/apriori.dsp +120 -0
- data/ext/apriori/src/apriori.dsw +29 -0
- data/ext/apriori/src/apriori.mak +99 -0
- data/ext/apriori/src/istree.c +1411 -0
- data/ext/apriori/src/istree.h +160 -0
- data/ext/apriori/src/makefile +105 -0
- data/ext/apriori/src/tract.c +870 -0
- data/ext/apriori/src/tract.h +261 -0
- data/ext/apriori_wrapper.c +757 -0
- data/ext/apriori_wrapper.h +10 -0
- data/ext/extconf.rb +32 -0
- data/ext/math/doc/copying +504 -0
- data/ext/math/src/chi2.c +151 -0
- data/ext/math/src/chi2.h +27 -0
- data/ext/math/src/choose.c +71 -0
- data/ext/math/src/choose.h +16 -0
- data/ext/math/src/gamma.c +446 -0
- data/ext/math/src/gamma.h +39 -0
- data/ext/math/src/intexp.c +35 -0
- data/ext/math/src/intexp.h +15 -0
- data/ext/math/src/makefile +164 -0
- data/ext/math/src/math.mak +48 -0
- data/ext/math/src/normal.c +387 -0
- data/ext/math/src/normal.h +44 -0
- data/ext/math/src/radfn.c +113 -0
- data/ext/math/src/radfn.h +34 -0
- data/ext/math/src/zeta.c +49 -0
- data/ext/math/src/zeta.h +15 -0
- data/ext/pre-clean.rb +8 -0
- data/ext/pre-setup.rb +9 -0
- data/ext/util/doc/copying +504 -0
- data/ext/util/src/listops.c +76 -0
- data/ext/util/src/listops.h +26 -0
- data/ext/util/src/makefile +103 -0
- data/ext/util/src/memsys.c +84 -0
- data/ext/util/src/memsys.h +42 -0
- data/ext/util/src/nstats.c +288 -0
- data/ext/util/src/nstats.h +69 -0
- data/ext/util/src/params.c +86 -0
- data/ext/util/src/params.h +19 -0
- data/ext/util/src/parse.c +133 -0
- data/ext/util/src/parse.h +81 -0
- data/ext/util/src/scan.c +767 -0
- data/ext/util/src/scan.h +111 -0
- data/ext/util/src/symtab.c +443 -0
- data/ext/util/src/symtab.h +121 -0
- data/ext/util/src/tabscan.c +279 -0
- data/ext/util/src/tabscan.h +99 -0
- data/ext/util/src/util.mak +91 -0
- data/ext/util/src/vecops.c +317 -0
- data/ext/util/src/vecops.h +42 -0
- data/lib/apriori.rb +133 -0
- data/lib/apriori/adapter.rb +13 -0
- data/lib/apriori/association_rule.rb +89 -0
- data/lib/apriori/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/apriori.rake +20 -0
- data/tasks/attic.rake +28 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/install.rake +13 -0
- data/tasks/website.rake +17 -0
- data/test/apriori_test.rb +13 -0
- data/test/fixtures/market_basket_results_test.txt +5 -0
- data/test/fixtures/market_basket_string_test.txt +7 -0
- data/test/fixtures/results.txt +2 -0
- data/test/fixtures/sample.txt +7 -0
- data/test/test_helper.rb +5 -0
- data/test/unit/test_apriori.rb +68 -0
- data/test/unit/test_itemsets_and_parsing.rb +82 -0
- data/website/index.html +248 -0
- data/website/index.txt +152 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +142 -0
- data/website/template.html.erb +49 -0
- metadata +226 -0
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
The example files in this directory demonstrate how to use the
|
|
2
|
+
options -b, -f, and -r and the optional item appearances file.
|
|
3
|
+
This file also explains the conversion scripts, which can convert
|
|
4
|
+
different input formats into the format needed by the apriori program.
|
|
5
|
+
|
|
6
|
+
In the file test1.tab transactions are separated by newline characters
|
|
7
|
+
and the items of a transaction are separated by spaces. This is the
|
|
8
|
+
standard input format and hence the file can be processed directly:
|
|
9
|
+
apriori test1.tab test1.rul
|
|
10
|
+
|
|
11
|
+
In the file test2.tab the same transactions can be found, but several
|
|
12
|
+
different field separators are used. This file can be processed with:
|
|
13
|
+
apriori -f ",.;:" -l test2.tab test2.rul
|
|
14
|
+
|
|
15
|
+
The files test3.tab to test5.tab are in formats that cannot be
|
|
16
|
+
processed directly with the apriori program, but which may be common.
|
|
17
|
+
|
|
18
|
+
In the file test3.tab each line contains a transaction identifier and
|
|
19
|
+
an item, separated by a space. This file can be converted into the
|
|
20
|
+
standard input format with the script tid2set, i.e., with
|
|
21
|
+
tid2set test3.tab x.tab
|
|
22
|
+
Note, however, that the input file (here: test3.tab) must be sorted
|
|
23
|
+
w.r.t. the transaction identifier, so that items belonging to the
|
|
24
|
+
same transaction occupy consecutive lines/records.
|
|
25
|
+
|
|
26
|
+
In the file test4.tab the first line states the item names and the
|
|
27
|
+
following lines contain flags T (true) and F (false) depending on
|
|
28
|
+
whether the item is contained in the transaction represented by the
|
|
29
|
+
line or not. This format can be converted into the standard input
|
|
30
|
+
format with the script flg2set, i.e., with
|
|
31
|
+
flg2set test4.tab x.tab
|
|
32
|
+
|
|
33
|
+
In the file test5.tab there is one item per line and transactions
|
|
34
|
+
are separated by blank lines. This format can be converted into the
|
|
35
|
+
standard input format with the script row2set, i.e., with
|
|
36
|
+
row2set test5.tab x.tab
|
|
37
|
+
|
|
38
|
+
The additional scripts tab2set and hdr2set convert tables with column
|
|
39
|
+
numbers or column names into a format appropriate for the apriori
|
|
40
|
+
program. They are invoked in the same way as all other scripts
|
|
41
|
+
discussed above, i.e., with
|
|
42
|
+
tab2set a.tab b.tab
|
|
43
|
+
or
|
|
44
|
+
hdr2set a.tab b.tab
|
|
45
|
+
where a.tab is the name of the input file and b.tab the name of the
|
|
46
|
+
output file. The script tab2set replaces each table entry "x" of the
|
|
47
|
+
input file by "Xi=x", where i is the column number (starting with 1).
|
|
48
|
+
The script hdr2set reads the variable names from the first line of
|
|
49
|
+
the input file and then replaces each table entry "x" by "X=x", where
|
|
50
|
+
"X" is the variable name that was found in the corresponding column
|
|
51
|
+
of the first line. These scripts are handy if you want to process
|
|
52
|
+
tabular data by treating each table row as a transaction.
|
|
53
|
+
|
|
54
|
+
The file test.app demonstrates the use of item appearance indicators.
|
|
55
|
+
The first line of this file ('body') states that any item not explicitly
|
|
56
|
+
mentioned in this file may appear only in the body of a rule. The second
|
|
57
|
+
line says that item 2 may appear only in the head of a rule. Hence, by
|
|
58
|
+
processing the file test1.tab with
|
|
59
|
+
apriori test1.tab test.rul test.app
|
|
60
|
+
only rules with item 2 in the head are generated.
|
|
61
|
+
|
|
62
|
+
Note that any input may also be read from standard input and any output
|
|
63
|
+
may be sent to standard output, simply by specifying a '-' or an empty
|
|
64
|
+
string "" instead of a filename. For example
|
|
65
|
+
apriori test1.tab -
|
|
66
|
+
writes the rules directly to the terminal. They may be piped to any
|
|
67
|
+
other program, since all other messages of the apriori program are
|
|
68
|
+
written to standard error.
|
|
69
|
+
|
|
70
|
+
Enjoy,
|
|
71
|
+
Christian Borgelt
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/bin/bash
#-----------------------------------------------------------------------
# File    : rulesort
# Contents: sort output of apriori
# Author  : Christian Borgelt
# History : ??.??.1996 file created
#           27.02.1997 default settings moved to default case
#           26.03.2003 adapted to current apriori version
#-----------------------------------------------------------------------
# Usage: rulesort [-1|-2] infile outfile
#   -1  sort numerically, descending, on the number that directly
#       follows the opening parenthesis of a line
#   -2  sort numerically, descending, on the number that follows a
#       ", " or "/ " separator inside the parentheses
#   (no option) sort the lines in dictionary order
#
# For -1/-2 the first sed prefixes every matching line with "<key>#",
# sort orders the lines on that key, and the last sed strips everything
# up to and including the '#' again.
case $1 in
  '-1')
    cmd='s/\(.*(\)\([0-9]*\.[0-9]*\)\(%[,/].*\)/\2#\1\2\3/'
    sopt='-n -r'
    shift;;
  '-2')
    cmd='s/\(.*(.*[,/] \)\([0-9]*\.[0-9]*\)\(%.*\)/\2#\1\2\3/'
    sopt='-n -r'
    shift;;
  *)  # catch-all arm: a "default)" pattern only matches the literal
      # word "default", so the settings below were never applied
    cmd=''
    sopt='-d'
    ;;
esac
# $sopt stays unquoted on purpose: it can hold several sort options.
sed "$cmd" "$1" | sort $sopt | sed 's/^.*#//' > "$2"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
5 <- (100.0, 30.0)
|
|
2
|
+
1 <- (100.0, 60.0)
|
|
3
|
+
2 <- (100.0, 70.0)
|
|
4
|
+
3 <- (100.0, 70.0)
|
|
5
|
+
4 <- (100.0, 70.0)
|
|
6
|
+
1 <- 5 (30.0, 33.3)
|
|
7
|
+
3 <- 5 (30.0, 33.3)
|
|
8
|
+
4 <- 5 (30.0, 100.0)
|
|
9
|
+
5 <- 4 (70.0, 42.9)
|
|
10
|
+
2 <- 1 (60.0, 83.3)
|
|
11
|
+
1 <- 2 (70.0, 71.4)
|
|
12
|
+
3 <- 1 (60.0, 66.7)
|
|
13
|
+
1 <- 3 (70.0, 57.1)
|
|
14
|
+
4 <- 1 (60.0, 66.7)
|
|
15
|
+
1 <- 4 (70.0, 57.1)
|
|
16
|
+
3 <- 2 (70.0, 85.7)
|
|
17
|
+
2 <- 3 (70.0, 85.7)
|
|
18
|
+
4 <- 2 (70.0, 57.1)
|
|
19
|
+
2 <- 4 (70.0, 57.1)
|
|
20
|
+
4 <- 3 (70.0, 57.1)
|
|
21
|
+
3 <- 4 (70.0, 57.1)
|
|
22
|
+
4 <- 5 1 (10.0, 100.0)
|
|
23
|
+
1 <- 5 4 (30.0, 33.3)
|
|
24
|
+
5 <- 1 4 (40.0, 25.0)
|
|
25
|
+
4 <- 5 3 (10.0, 100.0)
|
|
26
|
+
3 <- 5 4 (30.0, 33.3)
|
|
27
|
+
5 <- 3 4 (40.0, 25.0)
|
|
28
|
+
3 <- 1 2 (50.0, 80.0)
|
|
29
|
+
2 <- 1 3 (40.0, 100.0)
|
|
30
|
+
1 <- 2 3 (60.0, 66.7)
|
|
31
|
+
4 <- 1 2 (50.0, 60.0)
|
|
32
|
+
2 <- 1 4 (40.0, 75.0)
|
|
33
|
+
1 <- 2 4 (40.0, 75.0)
|
|
34
|
+
4 <- 1 3 (40.0, 50.0)
|
|
35
|
+
3 <- 1 4 (40.0, 50.0)
|
|
36
|
+
1 <- 3 4 (40.0, 50.0)
|
|
37
|
+
4 <- 2 3 (60.0, 50.0)
|
|
38
|
+
3 <- 2 4 (40.0, 75.0)
|
|
39
|
+
2 <- 3 4 (40.0, 75.0)
|
|
40
|
+
4 <- 1 2 3 (40.0, 50.0)
|
|
41
|
+
3 <- 1 2 4 (30.0, 66.7)
|
|
42
|
+
2 <- 1 3 4 (20.0, 100.0)
|
|
43
|
+
1 <- 2 3 4 (30.0, 66.7)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/bin/sh
# tid2set: convert records of the form "<tid> <item>" (one per line,
# sorted so that equal tids are on consecutive lines) into one line per
# transaction holding its items separated by single spaces.
# Usage: tid2set infile outfile
gawk '
# print the items collected for the current transaction on one line
function output ()
{
  if (i > 0)
    printf("%s", items[0]);
  for (k = 0; ++k < i; )
    printf(" %s", items[k]);
  printf("\n");
}

BEGIN { tid = ""; i = 0; }
{
  if ($1 == tid)                # same transaction: append the item
    items[i++] = $2;
  else {                        # a new transaction identifier starts
    if (tid != "") output();    # flush the transaction collected so far
    tid = $1;
    items[0] = $2; i = 1;
  }
}
# flush the last transaction; print nothing if none was ever started
# (previously empty input produced a spurious blank line)
END { if (tid != "") output(); }
' "$1" > "$2"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#!/bin/sh
# xhdr2set: convert a comma separated table with a header line into
# transactions of "name=value" items, optionally translating an item
# appearances file keyed by column name into one keyed by item.
# Usage: xhdr2set infile outfile [appfile appout]
#   appfile  first line: default appearance indicator,
#            further lines: <column name> <indicator>
#   appout   receives the default indicator followed by one
#            "<item> <indicator>" line per item that deviates from it
# The file names are quoted so that paths containing blanks work.
gawk -v app="$3" -v out="$4" '
BEGIN {
  FS = " ";                     # the appearances file is blank separated
  if ((app != "") && (out != "")) {
    getline dflt < app;         # first line holds the default indicator
    while ((getline < app) > 0) # remaining lines: column name, indicator
      base[$1] = $2;
  }
  FS = ",";                     # the table itself is comma separated
}
(NR == 1) {                     # header line: remember the column names
  for (i = 0; ++i <= NF; )
    items[i] = $i;
}
(NR > 1) {                      # data line: one "name=value" item per field
  for (i = k = 0; ++i <= NF; ) {
    if (k++ > 0) printf(" ");
    item = (items[i] "=" $i);
    printf("%s", item);
    # an item inherits the indicator of its column, else the default
    if (items[i] in base) apps[item] = base[items[i]];
    else                  apps[item] = dflt;
  }
  printf("\n");
}
END {
  if (out != "") {              # write the item level appearances file
    print dflt > out;
    for (t in apps)
      if (apps[t] != dflt)
        printf("%s %s\n", t, apps[t]) > out;
  }
}' "$1" > "$2"
|
|
@@ -0,0 +1,750 @@
|
|
|
1
|
+
/*----------------------------------------------------------------------
|
|
2
|
+
File : apriori.c
|
|
3
|
+
Contents: apriori algorithm for finding association rules
|
|
4
|
+
Author : Christian Borgelt
|
|
5
|
+
History : 1996.02.14 file created
|
|
6
|
+
1996.07.26 output precision reduced
|
|
7
|
+
1996.11.22 options -b, -f, and -r added
|
|
8
|
+
1996.11.24 option -e added (add. evaluation measures)
|
|
9
|
+
1997.08.18 normalized chi^2 measure added
|
|
10
|
+
option -m (minimal rule length) added
|
|
11
|
+
1997.10.13 quiet version (no output to stdout or stderr)
|
|
12
|
+
1998.01.27 adapted to changed ist_create() function
|
|
13
|
+
1998.08.08 optional input file (item appearances) added
|
|
14
|
+
1998.09.02 several assertions added
|
|
15
|
+
1998.09.07 hyperedge mode (option -h) added
|
|
16
|
+
1998.12.08 output of absolute support (option -a) added
|
|
17
|
+
float changed to double
|
|
18
|
+
1998.12.09 conversion of names to a scanable form added
|
|
19
|
+
1999.02.05 long int changed to int
|
|
20
|
+
1999.02.09 input from stdin, output to stdout added
|
|
21
|
+
1999.08.09 bug in check of support parameter (<= 0) fixed
|
|
22
|
+
1999.11.05 rule evaluation measure EM_AIMP added
|
|
23
|
+
1999.11.08 output of add. rule eval. measure value added
|
|
24
|
+
2000.03.16 optional use of original rule support definition
|
|
25
|
+
2001.04.01 option -h replaced by option -t (target type)
|
|
26
|
+
2001.05.26 extended support output added (option -x)
|
|
27
|
+
2001.06.09 extended support output for item sets added
|
|
28
|
+
2001.08.15 module scan used for output formatting
|
|
29
|
+
2001.11.18 item and transaction functions made a module
|
|
30
|
+
2001.11.19 options -C, -l changed, option -y removed
|
|
31
|
+
2001.12.28 adapted to module tract, some improvements
|
|
32
|
+
2002.01.11 evaluation measures codes changed to letters
|
|
33
|
+
2002.02.10 option -q extended by a direction parameter
|
|
34
|
+
2002.02.11 memory usage minimization option added
|
|
35
|
+
2002.06.09 arbitrary supp./conf. formats made possible
|
|
36
|
+
2003.01.09 option -k (item separator) added
|
|
37
|
+
2003.01.14 check for empty transaction set added
|
|
38
|
+
2003.03.12 output of lift value (conf/prior) added
|
|
39
|
+
2003.07.17 item filtering w.r.t. usage added (option -u)
|
|
40
|
+
2003.07.17 sorting w.r.t. transaction size sum added
|
|
41
|
+
2003.07.18 maximal itemset filter added
|
|
42
|
+
2003.08.11 closed itemset filter added
|
|
43
|
+
2003.08.15 item filtering for transaction tree added
|
|
44
|
+
2003.08.16 parameter for transaction filtering added
|
|
45
|
+
2003.08.18 dynamic filtering decision based on times added
|
|
46
|
+
2003.08.21 option -j (heap sort for transactions) added
|
|
47
|
+
2003.09.22 meaning of option -j reversed (heapsort default)
|
|
48
|
+
2004.03.25 option -S added (maximal support of a set/rule)
|
|
49
|
+
2004.05.09 additional selection measure for sets added
|
|
50
|
+
2004.10.28 two unnecessary assignments removed
|
|
51
|
+
2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
|
|
52
|
+
2004.11.23 absolute/relative support output changed
|
|
53
|
+
2004.12.09 semantics of option -p changed
|
|
54
|
+
2005.01.25 bug in output of absolute/relative support fixed
|
|
55
|
+
2005.01.31 another bug in this output fixed
|
|
56
|
+
2005.06.20 use of flag for "no item sorting" corrected
|
|
57
|
+
2007.02.13 adapted to modified module tabscan
|
|
58
|
+
2008.03.13 additional hyperedge evaluation added
|
|
59
|
+
2008.03.24 additional target added (association groups)
|
|
60
|
+
----------------------------------------------------------------------*/
|
|
61
|
+
#include <stdio.h>
|
|
62
|
+
#include <stdlib.h>
|
|
63
|
+
#include <stdarg.h>
|
|
64
|
+
#include <string.h>
|
|
65
|
+
#include <limits.h>
|
|
66
|
+
#include <math.h>
|
|
67
|
+
#include <time.h>
|
|
68
|
+
#include <assert.h>
|
|
69
|
+
#include "scan.h"
|
|
70
|
+
#include "tract.h"
|
|
71
|
+
#include "istree.h"
|
|
72
|
+
#ifdef STORAGE
|
|
73
|
+
#include "storage.h"
|
|
74
|
+
#endif
|
|
75
|
+
|
|
76
|
+
/*----------------------------------------------------------------------
|
|
77
|
+
Preprocessor Definitions
|
|
78
|
+
----------------------------------------------------------------------*/
|
|
79
|
+
#define PRGNAME     "apriori"
#define DESCRIPTION "find association rules with the apriori algorithm"
#define VERSION     "version 4.35 (2008.03.24) (c) 1996-2008 Christian Borgelt"

/* --- target types --- */
#define TT_SET      0           /* frequent item sets */
#define TT_CLSET    1           /* closed item sets */
#define TT_MFSET    2           /* maximal item sets */
#define TT_RULE     3           /* association rules */
#define TT_HEDGE    4           /* association hyperedges */
#define TT_GROUP    5           /* association groups */

/* --- error codes --- */
/* (codes not listed here are presumably provided by the included
   module headers -- TODO confirm) */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* too few/many arguments */
#define E_STDIN     (-8)        /* double assignment of stdin */
#define E_TARGET    (-9)        /* invalid target type */
#define E_SUPP      (-10)       /* invalid support */
#define E_CONF      (-11)       /* invalid confidence */
#define E_MEASURE   (-12)       /* invalid evaluation measure */
#define E_RULELEN   (-13)       /* invalid rule length */
#define E_NOTAS     (-14)       /* no items or transactions */
#define E_NOFREQ    (-15)       /* no frequent items */
#define E_UNKNOWN   (-21)       /* unknown error */

/* --- message output --- */
/* NOTE(review): with FFLUSH defined the message is only printed, and
   without it stderr is flushed after each message; this looks inverted
   with respect to the macro name -- confirm the intended sense. */
#if defined(QUIET)              /* quiet version: suppress messages */
#define MSG(x)
#elif defined(FFLUSH)           /* FFLUSH defined: print the message */
#define MSG(x)      x
#else                           /* otherwise: print, then flush stderr */
#define MSG(x)      x, fflush(stderr)
#endif

/* seconds elapsed since the clock() value t */
#define SEC_SINCE(t)  ((clock()-(t)) /(double)CLOCKS_PER_SEC)
/* record count of the table scanner of item set s, reduced by one
   if the last delimiter read was a record separator */
#define RECCNT(s)     (ts_reccnt(is_tabscan(s)) \
                      - ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0))
/* buffer of the table scanner of item set s */
#define BUFFER(s)     ts_buf(is_tabscan(s))
|
|
120
|
+
|
|
121
|
+
/*----------------------------------------------------------------------
|
|
122
|
+
Constants
|
|
123
|
+
----------------------------------------------------------------------*/
|
|
124
|
+
#ifndef QUIET                   /* tables exist only if not quiet version */
/* --- names of the target types (one entry per TT_* code) --- */
static const char *ttypes[] = {
  "set",                        /* TT_SET     0 */
  "set",                        /* TT_CLSET   1 */
  "set",                        /* TT_MFSET   2 */
  "rule",                       /* TT_RULE    3 */
  "hyperedge",                  /* TT_HEDGE   4 */
  "group"                       /* TT_GROUP   5 */
};

/* --- error messages; error() selects errmsgs[-code], so the */
/* --- position of an entry is the negated error code         */
static const char *errmsgs[] = {
  "no error\n",                                           /* E_NONE      0 */
  "not enough memory\n",                                  /* E_NOMEM    -1 */
  "cannot open file %s\n",                                /* E_FOPEN    -2 */
  "read error on file %s\n",                              /* E_FREAD    -3 */
  "write error on file %s\n",                             /* E_FWRITE   -4 */
  "unknown option -%c\n",                                 /* E_OPTION   -5 */
  "missing option argument\n",                            /* E_OPTARG   -6 */
  "wrong number of arguments\n",                          /* E_ARGCNT   -7 */
  "double assignment of standard input\n",                /* E_STDIN    -8 */
  "invalid target type '%c'\n",                           /* E_TARGET   -9 */
  "invalid minimal support %g%%\n",                       /* E_SUPP    -10 */
  "invalid minimal confidence %g%%\n",                    /* E_CONF    -11 */
  "invalid additional evaluation measure %c\n",           /* E_MEASURE -12 */
  "invalid set size/rule length %d\n",                    /* E_RULELEN -13 */
  "no items or transactions to work on\n",                /* E_NOTAS   -14 */
  "no frequent items\n",                                  /* E_NOFREQ  -15 */
  "file %s, record %d: item expected\n",                  /* E_ITEMEXP -16 */
  "file %s, record %d: duplicate item %s\n",              /* E_DUPITEM -17 */
  "file %s, record %d: appearance indicator expected\n",  /* E_APPEXP  -18 */
  "file %s, record %d: unknown appearance indicator %s\n",/* E_UNKAPP  -19 */
  "file %s, record %d: too many fields\n",                /* E_FLDCNT  -20 */
  "unknown error\n"                                       /* E_UNKNOWN -21 */
};
#endif
|
|
163
|
+
|
|
164
|
+
/*----------------------------------------------------------------------
|
|
165
|
+
Global Variables
|
|
166
|
+
----------------------------------------------------------------------*/
|
|
167
|
+
#ifndef QUIET
|
|
168
|
+
static char *prgname; /* program name for error messages */
|
|
169
|
+
#endif
|
|
170
|
+
static ITEMSET *itemset = NULL; /* item set */
|
|
171
|
+
static TASET *taset = NULL; /* transaction set */
|
|
172
|
+
static TATREE *tatree = NULL; /* transaction tree */
|
|
173
|
+
static ISTREE *istree = NULL; /* item set tree */
|
|
174
|
+
static FILE *in = NULL; /* input file */
|
|
175
|
+
static FILE *out = NULL; /* output file */
|
|
176
|
+
|
|
177
|
+
/*----------------------------------------------------------------------
|
|
178
|
+
Main Functions
|
|
179
|
+
----------------------------------------------------------------------*/
|
|
180
|
+
|
|
181
|
+
static void help (void)
{                               /* --- list the evaluation measures */
  /* Unless compiled with QUIET, writes a newline to stderr (to end  */
  /* the startup message) and the list of additional evaluation      */
  /* measures to stdout. Always terminates the program with status 0 */
  /* and therefore never returns to the caller.                      */
  #ifndef QUIET
  static const char *const lines[] = {
    "additional evaluation measures (option -e#)\n",
    "frequent item sets:\n",
    "d or 1: binary logarithm of support quotient\n",
    "association rules:\n",
    "d or 1: absolute confidence difference to prior\n",
    "q or 2: absolute difference of confidence quotient to 1\n",
    "a or 3: absolute difference of improvement value to 1\n",
    "i or 4: information difference to prior\n",
    "c or 5: normalized chi^2 measure\n",
    "p or 6: p-value computed from chi^2 measure\n"
  };                            /* help text, one output line each */
  int idx;                      /* index of the current help line */
  int cnt = (int)(sizeof(lines) /sizeof(lines[0]));

  fprintf(stderr, "\n");        /* terminate startup message */
  for (idx = 0; idx < cnt; idx++)
    fputs(lines[idx], stdout);  /* lines hold no conversions */
  #endif
  exit(0);                      /* abort the program */
}  /* help() */
|
|
198
|
+
|
|
199
|
+
/*--------------------------------------------------------------------*/
|
|
200
|
+
|
|
201
|
+
static void error (int code, ...)
|
|
202
|
+
{ /* --- print an error message */
|
|
203
|
+
#ifndef QUIET /* if not quiet version */
|
|
204
|
+
va_list args; /* list of variable arguments */
|
|
205
|
+
const char *msg; /* error message */
|
|
206
|
+
|
|
207
|
+
assert(prgname); /* check the program name */
|
|
208
|
+
if (code < E_UNKNOWN) code = E_UNKNOWN;
|
|
209
|
+
if (code < 0) { /* if to report an error, */
|
|
210
|
+
msg = errmsgs[-code]; /* get the error message */
|
|
211
|
+
if (!msg) msg = errmsgs[-E_UNKNOWN];
|
|
212
|
+
fprintf(stderr, "\n%s: ", prgname);
|
|
213
|
+
va_start(args, code); /* get variable arguments */
|
|
214
|
+
vfprintf(stderr, msg, args);/* print error message */
|
|
215
|
+
va_end(args); /* end argument evaluation */
|
|
216
|
+
}
|
|
217
|
+
#endif
|
|
218
|
+
#ifndef NDEBUG /* if debug version */
|
|
219
|
+
if (istree) ist_delete(istree); /* clean up memory */
|
|
220
|
+
if (tatree) tat_delete(tatree); /* and close files */
|
|
221
|
+
if (taset) tas_delete(taset, 0);
|
|
222
|
+
if (itemset) is_delete(itemset);
|
|
223
|
+
if (in && (in != stdin)) fclose(in);
|
|
224
|
+
if (out && (out != stdout)) fclose(out);
|
|
225
|
+
#endif
|
|
226
|
+
#ifdef STORAGE /* if storage debugging */
|
|
227
|
+
showmem("at end of program"); /* check memory usage */
|
|
228
|
+
#endif
|
|
229
|
+
exit(code); /* abort the program */
|
|
230
|
+
} /* error() */
|
|
231
|
+
|
|
232
|
+
/*--------------------------------------------------------------------*/
|
|
233
|
+
|
|
234
|
+
int main (int argc, char *argv[])
|
|
235
|
+
{ /* --- main function */
|
|
236
|
+
int i, k = 0, n; /* loop variables, counters */
|
|
237
|
+
char *s; /* to traverse the options */
|
|
238
|
+
char **optarg = NULL; /* option argument */
|
|
239
|
+
char *fn_in = NULL; /* name of input file */
|
|
240
|
+
char *fn_out = NULL; /* name of output file */
|
|
241
|
+
char *fn_app = NULL; /* name of item appearances file */
|
|
242
|
+
char *blanks = NULL; /* blanks */
|
|
243
|
+
char *fldseps = NULL; /* field separators */
|
|
244
|
+
char *recseps = NULL; /* record separators */
|
|
245
|
+
char *comment = NULL; /* comment indicators */
|
|
246
|
+
char *used = NULL; /* item usage vector */
|
|
247
|
+
double supp = 0.1; /* minimal support (in percent) */
|
|
248
|
+
double smax = 1.0; /* maximal support (in percent) */
|
|
249
|
+
double conf = 0.8; /* minimal confidence (in percent) */
|
|
250
|
+
int mode = IST_BODY; /* search mode (rule support def.) */
|
|
251
|
+
int target = 'r'; /* target type (sets/rules/h.edges) */
|
|
252
|
+
int arem = 0; /* additional rule evaluation measure */
|
|
253
|
+
int lift = 0; /* flag for printing the lift */
|
|
254
|
+
double minval = 0.1; /* minimal evaluation measure value */
|
|
255
|
+
double lftval = 0; /* lift value (confidence/prior) */
|
|
256
|
+
int minlen = 1; /* minimal rule length */
|
|
257
|
+
int maxlen = INT_MAX; /* maximal rule length */
|
|
258
|
+
int load = 1; /* flag for loading transactions */
|
|
259
|
+
int sort = 2; /* flag for item sorting and recoding */
|
|
260
|
+
double filter = 0.1; /* item usage filtering parameter */
|
|
261
|
+
int tree = 1; /* flag for transaction tree */
|
|
262
|
+
int heap = 1; /* flag for heap sort vs. quick sort */
|
|
263
|
+
int c2scf = 0; /* flag for conv. to scanable form */
|
|
264
|
+
char *sep = " "; /* item separator for output */
|
|
265
|
+
char *fmt = "%.1f"; /* output format for support/conf. */
|
|
266
|
+
int sout = 1; /* flag for abs./rel. support output */
|
|
267
|
+
int ext = 0; /* flag for extended support output */
|
|
268
|
+
int aval = 0; /* flag for add. eval. measure value */
|
|
269
|
+
int maxcnt = 0; /* maximal number of items per set */
|
|
270
|
+
int tacnt; /* number of transactions */
|
|
271
|
+
int frq; /* frequency of an item set */
|
|
272
|
+
int *map, *set; /* identifier map, item set */
|
|
273
|
+
const char *name; /* buffer for item names */
|
|
274
|
+
static char buf[4*TS_SIZE+4]; /* buffer for formatting */
|
|
275
|
+
clock_t t, tt, tc, x; /* timer for measurements */
|
|
276
|
+
|
|
277
|
+
#ifndef QUIET /* if not quiet version */
|
|
278
|
+
prgname = argv[0]; /* get program name for error msgs. */
|
|
279
|
+
|
|
280
|
+
/* --- print usage message --- */
|
|
281
|
+
if (argc > 1) { /* if arguments are given */
|
|
282
|
+
fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
|
|
283
|
+
fprintf(stderr, VERSION); } /* print a startup message */
|
|
284
|
+
else { /* if no arguments given */
|
|
285
|
+
printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
|
|
286
|
+
printf("%s\n", DESCRIPTION);
|
|
287
|
+
printf("%s\n", VERSION);
|
|
288
|
+
printf("-t# target type (default: association rules)\n"
|
|
289
|
+
" (s: item sets, c: closed item sets,"
|
|
290
|
+
" m: maximal item sets,\n"
|
|
291
|
+
" r: association rules,"
|
|
292
|
+
" h: association hyperedges)\n");
|
|
293
|
+
printf("-m# minimal number of items per set/rule/hyperedge "
|
|
294
|
+
"(default: %d)\n", minlen);
|
|
295
|
+
printf("-n# maximal number of items per set/rule/hyperedge "
|
|
296
|
+
"(default: no limit)\n");
|
|
297
|
+
printf("-s# minimal support of a set/rule/hyperedge "
|
|
298
|
+
"(default: %g%%)\n", supp *100);
|
|
299
|
+
printf("-S# maximal support of a set/rule/hyperedge "
|
|
300
|
+
"(default: %g%%)\n", smax *100);
|
|
301
|
+
printf("-c# minimal confidence of a rule/hyperedge "
|
|
302
|
+
"(default: %g%%)\n", conf *100);
|
|
303
|
+
printf("-o use original definition of the support of a rule "
|
|
304
|
+
"(body & head)\n");
|
|
305
|
+
printf("-k# item separator for output "
|
|
306
|
+
"(default: \"%s\")\n", sep);
|
|
307
|
+
printf("-p# output format for support/confidence "
|
|
308
|
+
"(default: \"%s\")\n", fmt);
|
|
309
|
+
printf("-x extended support output "
|
|
310
|
+
"(print both rule support types)\n");
|
|
311
|
+
printf("-a print absolute support "
|
|
312
|
+
"(number of transactions)\n");
|
|
313
|
+
printf("-y print lift value (confidence divided by prior)\n");
|
|
314
|
+
printf("-e# additional evaluation measure (default: none)\n");
|
|
315
|
+
printf("-! print a list of additional evaluation measures\n");
|
|
316
|
+
printf("-d# minimal value of additional evaluation measure "
|
|
317
|
+
"(default: %g%%)\n", minval *100);
|
|
318
|
+
printf("-v print value of additional "
|
|
319
|
+
"rule evaluation measure\n");
|
|
320
|
+
printf("-g write output in scanable form "
|
|
321
|
+
"(quote certain characters)\n");
|
|
322
|
+
printf("-l do not load transactions into memory "
|
|
323
|
+
"(work on input file)\n");
|
|
324
|
+
printf("-q# sort items w.r.t. their frequency (default: %d)\n"
|
|
325
|
+
" (1: ascending, -1: descending, 0: do not sort,\n"
|
|
326
|
+
" 2: ascending, -2: descending w.r.t. "
|
|
327
|
+
"transaction size sum)\n", sort);
|
|
328
|
+
printf("-u# filter unused items from transactions "
|
|
329
|
+
"(default: %g)\n", filter);
|
|
330
|
+
printf(" (0: do not filter items w.r.t. usage in sets,\n"
|
|
331
|
+
" <0: fraction of removed items for filtering,\n"
|
|
332
|
+
" >0: take execution times ratio into account)\n");
|
|
333
|
+
printf("-h do not organize transactions as a prefix tree\n");
|
|
334
|
+
printf("-j use quicksort to sort the transactions "
|
|
335
|
+
"(default: heapsort)\n");
|
|
336
|
+
printf("-z minimize memory usage "
|
|
337
|
+
"(default: maximize speed)\n");
|
|
338
|
+
printf("-b/f/r# blank characters, field and record separators\n"
|
|
339
|
+
" (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
|
|
340
|
+
printf("-C# comment characters (default: \"#\")\n");
|
|
341
|
+
printf("infile file to read transactions from\n");
|
|
342
|
+
printf("outfile file to write item sets/association rules"
|
|
343
|
+
"/hyperedges to\n");
|
|
344
|
+
printf("appfile file stating item appearances (optional)\n");
|
|
345
|
+
return 0; /* print a usage message */
|
|
346
|
+
} /* and abort the program */
|
|
347
|
+
#endif /* #ifndef QUIET */
|
|
348
|
+
|
|
349
|
+
/* --- evaluate arguments --- */
|
|
350
|
+
for (i = 1; i < argc; i++) { /* traverse arguments */
|
|
351
|
+
s = argv[i]; /* get option argument */
|
|
352
|
+
if (optarg) { *optarg = s; optarg = NULL; continue; }
|
|
353
|
+
if ((*s == '-') && *++s) { /* -- if argument is an option */
|
|
354
|
+
while (*s) { /* traverse options */
|
|
355
|
+
switch (*s++) { /* evaluate switches */
|
|
356
|
+
case '!': help(); break;
|
|
357
|
+
case 't': target = (*s) ? *s++ : 'r'; break;
|
|
358
|
+
case 'm': minlen = (int)strtol(s, &s, 0); break;
|
|
359
|
+
case 'n': maxlen = (int)strtol(s, &s, 0); break;
|
|
360
|
+
case 's': supp = 0.01*strtod(s, &s); break;
|
|
361
|
+
case 'S': smax = 0.01*strtod(s, &s); break;
|
|
362
|
+
case 'c': conf = 0.01*strtod(s, &s); break;
|
|
363
|
+
case 'o': mode |= IST_BOTH; break;
|
|
364
|
+
case 'k': optarg = &sep; break;
|
|
365
|
+
case 'p': optarg = &fmt; break;
|
|
366
|
+
case 'x': ext = 1; break;
|
|
367
|
+
case 'a': sout |= 2; break;
|
|
368
|
+
case 'y': lift = 1; break;
|
|
369
|
+
case 'e': arem = (*s) ? *s++ : 0; break;
|
|
370
|
+
case 'd': minval = 0.01*strtod(s, &s); break;
|
|
371
|
+
case 'v': aval = 1; break;
|
|
372
|
+
case 'g': c2scf = 1; break;
|
|
373
|
+
case 'l': load = 0; break;
|
|
374
|
+
case 'q': sort = (int)strtol(s, &s, 0); break;
|
|
375
|
+
case 'u': filter = strtod(s, &s); break;
|
|
376
|
+
case 'h': tree = 0; break;
|
|
377
|
+
case 'j': heap = 0; break;
|
|
378
|
+
case 'z': mode |= IST_MEMOPT; break;
|
|
379
|
+
case 'b': optarg = &blanks; break;
|
|
380
|
+
case 'f': optarg = &fldseps; break;
|
|
381
|
+
case 'r': optarg = &recseps; break;
|
|
382
|
+
case 'C': optarg = &comment; break;
|
|
383
|
+
default : error(E_OPTION, *--s); break;
|
|
384
|
+
} /* set option variables */
|
|
385
|
+
if (optarg && *s) { *optarg = s; optarg = NULL; break; }
|
|
386
|
+
} } /* get option argument */
|
|
387
|
+
else { /* -- if argument is no option */
|
|
388
|
+
switch (k++) { /* evaluate non-options */
|
|
389
|
+
case 0: fn_in = s; break;
|
|
390
|
+
case 1: fn_out = s; break;
|
|
391
|
+
case 2: fn_app = s; break;
|
|
392
|
+
default: error(E_ARGCNT); break;
|
|
393
|
+
} /* note filenames */
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
if (optarg) error(E_OPTARG); /* check option argument */
|
|
397
|
+
if ((k < 2) || (k > 3)) /* and the number of arguments */
|
|
398
|
+
error(E_ARGCNT); /* (either in/out or in/out/app) */
|
|
399
|
+
if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
|
|
400
|
+
error(E_STDIN); /* stdin must not be used twice */
|
|
401
|
+
switch (target) { /* check and translate target type */
|
|
402
|
+
case 's': target = TT_SET; break;
|
|
403
|
+
case 'c': target = TT_CLSET; break;
|
|
404
|
+
case 'm': target = TT_MFSET; break;
|
|
405
|
+
case 'r': target = TT_RULE; break;
|
|
406
|
+
case 'h': target = TT_HEDGE; break;
|
|
407
|
+
case 'g': target = TT_GROUP; break;
|
|
408
|
+
default : error(E_TARGET, (char)target); break;
|
|
409
|
+
}
|
|
410
|
+
if (supp > 1) /* check the minimal support */
|
|
411
|
+
error(E_SUPP, supp); /* (< 0: absolute number) */
|
|
412
|
+
if ((conf < 0) || (conf > 1))
|
|
413
|
+
error(E_CONF, conf); /* check the minimal confidence */
|
|
414
|
+
if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
|
|
415
|
+
if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
|
|
416
|
+
switch (arem) { /* check and translate measure */
|
|
417
|
+
case 0 : case '0': arem = EM_NONE; break;
|
|
418
|
+
case 'd': case '1': arem = EM_DIFF; break;
|
|
419
|
+
case 'q': case '2': arem = EM_QUOT; break;
|
|
420
|
+
case 'a': case '3': arem = EM_AIMP; break;
|
|
421
|
+
case 'i': case '4': arem = EM_INFO; break;
|
|
422
|
+
case 'c': case '5': arem = EM_CHI2; break;
|
|
423
|
+
case 'p': case '6': arem = EM_PVAL; break;
|
|
424
|
+
default : error(E_MEASURE, (char)arem); break;
|
|
425
|
+
}
|
|
426
|
+
if (target <= TT_MFSET) { /* in item set mode neutralize */
|
|
427
|
+
mode |= IST_BOTH; conf = 1;}/* rule specific settings */
|
|
428
|
+
if (arem == EM_NONE) /* if no add. rule eval. measure, */
|
|
429
|
+
aval = 0; /* clear the corresp. output flag */
|
|
430
|
+
if ((filter <= -1) || (filter >= 1)) filter = 0;
|
|
431
|
+
|
|
432
|
+
/* --- create item set and transaction set --- */
|
|
433
|
+
itemset = is_create(-1); /* create an item set and */
|
|
434
|
+
if (!itemset) error(E_NOMEM); /* set the special characters */
|
|
435
|
+
is_chars(itemset, blanks, fldseps, recseps, comment);
|
|
436
|
+
if (load) { /* if to load the transactions */
|
|
437
|
+
taset = tas_create(itemset);
|
|
438
|
+
if (!taset) error(E_NOMEM); /* create a transaction set */
|
|
439
|
+
} /* to store the transactions */
|
|
440
|
+
MSG(fprintf(stderr, "\n")); /* terminate the startup message */
|
|
441
|
+
|
|
442
|
+
/* --- read item appearances --- */
|
|
443
|
+
if (fn_app) { /* if item appearances are given */
|
|
444
|
+
t = clock(); /* start the timer */
|
|
445
|
+
if (*fn_app) /* if an app. file name is given, */
|
|
446
|
+
in = fopen(fn_app, "r"); /* open the item appearances file */
|
|
447
|
+
else { /* if no app. file name is given, */
|
|
448
|
+
in = stdin; fn_app = "<stdin>"; } /* read from std. input */
|
|
449
|
+
MSG(fprintf(stderr, "reading %s ... ", fn_app));
|
|
450
|
+
if (!in) error(E_FOPEN, fn_app);
|
|
451
|
+
k = is_readapp(itemset,in); /* read the item appearances */
|
|
452
|
+
if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
|
|
453
|
+
if (in != stdin) /* if not read from standard input, */
|
|
454
|
+
fclose(in); /* close the input file */
|
|
455
|
+
MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
|
|
456
|
+
MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
|
|
457
|
+
} /* print a log message */
|
|
458
|
+
|
|
459
|
+
/* --- read transactions --- */
|
|
460
|
+
t = clock(); /* start the timer */
|
|
461
|
+
if (fn_in && *fn_in) /* if an input file name is given, */
|
|
462
|
+
in = fopen(fn_in, "r"); /* open input file for reading */
|
|
463
|
+
else { /* if no input file name is given, */
|
|
464
|
+
in = stdin; fn_in = "<stdin>"; } /* read from standard input */
|
|
465
|
+
MSG(fprintf(stderr, "reading %s ... ", fn_in));
|
|
466
|
+
if (!in) error(E_FOPEN, fn_in);
|
|
467
|
+
while (1) { /* transaction read loop */
|
|
468
|
+
k = is_read(itemset, in); /* read the next transaction */
|
|
469
|
+
if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
|
|
470
|
+
if (k > 0) break; /* check for error and end of file */
|
|
471
|
+
k = is_tsize(itemset); /* update the maximal */
|
|
472
|
+
if (k > maxcnt) maxcnt = k; /* transaction size */
|
|
473
|
+
if (taset && (tas_add(taset, NULL, 0) != 0))
|
|
474
|
+
error(E_NOMEM); /* add the loaded transaction */
|
|
475
|
+
} /* to the transaction set */
|
|
476
|
+
if (taset) { /* if transactions have been loaded */
|
|
477
|
+
if (in != stdin) fclose(in);/* if not read from standard input, */
|
|
478
|
+
in = NULL; /* close the input file */
|
|
479
|
+
} /* clear the file variable */
|
|
480
|
+
n = is_cnt(itemset); /* get the number of items */
|
|
481
|
+
tacnt = is_gettac(itemset); /* and the number of transactions */
|
|
482
|
+
MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
|
|
483
|
+
MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
|
|
484
|
+
if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
|
|
485
|
+
MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
|
|
486
|
+
if (supp >= 0) /* if relative support is given */
|
|
487
|
+
supp = ceil(tacnt *supp); /* compute absolute support */
|
|
488
|
+
else { /* if absolute support is given, */
|
|
489
|
+
supp = ceil(-100 *supp); /* make the support value positive */
|
|
490
|
+
if (!(sout & 2)) sout = 2; /* switch to absolute support output */
|
|
491
|
+
} /* do the same with the max. support */
|
|
492
|
+
smax = floor(((smax >= 0) ? tacnt : -100) *smax);
|
|
493
|
+
|
|
494
|
+
/* --- sort and recode items --- */
|
|
495
|
+
MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
|
|
496
|
+
t = clock(); /* start the timer */
|
|
497
|
+
map = (int*)malloc(is_cnt(itemset) *sizeof(int));
|
|
498
|
+
if (!map) error(E_NOMEM); /* create an item identifier map */
|
|
499
|
+
k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
|
|
500
|
+
n = is_recode(itemset, k, sort, map);
|
|
501
|
+
if (taset) { /* sort and recode the items and */
|
|
502
|
+
tas_recode(taset, map,n); /* recode the loaded transactions */
|
|
503
|
+
maxcnt = tas_max(taset); /* get the new maximal t.a. size */
|
|
504
|
+
} /* (may be smaller than before) */
|
|
505
|
+
free(map); /* delete the item identifier map */
|
|
506
|
+
MSG(fprintf(stderr, "[%d item(s)] ", n));
|
|
507
|
+
MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
|
|
508
|
+
if (n <= 0) error(E_NOFREQ); /* print a log message and */
|
|
509
|
+
MSG(fprintf(stderr, "\n")); /* check the number of items */
|
|
510
|
+
if (maxlen > maxcnt) /* clamp the set/rule length */
|
|
511
|
+
maxlen = maxcnt; /* to the maximum set size */
|
|
512
|
+
|
|
513
|
+
/* --- create a transaction tree --- */
|
|
514
|
+
tt = 0; /* init. the tree construction time */
|
|
515
|
+
if (tree && taset) { /* if transactions were loaded */
|
|
516
|
+
MSG(fprintf(stderr, "creating transaction tree ... "));
|
|
517
|
+
t = clock(); /* start the timer */
|
|
518
|
+
tatree = tat_create(taset, heap);
|
|
519
|
+
if (!tatree) error(E_NOMEM);/* create a transaction tree */
|
|
520
|
+
if (filter == 0) { /* if a tree rebuild is not needed, */
|
|
521
|
+
tas_delete(taset, 0); taset = NULL; } /* delete transactions */
|
|
522
|
+
tt = clock() -t; /* note the time for the construction */
|
|
523
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
|
524
|
+
} /* print a log message */
|
|
525
|
+
|
|
526
|
+
/* --- create an item set tree --- */
|
|
527
|
+
t = clock(); tc = 0; /* start the timer */
|
|
528
|
+
istree = ist_create(itemset, mode, (int)supp, conf);
|
|
529
|
+
if (!istree) error(E_NOMEM); /* create an item set tree */
|
|
530
|
+
|
|
531
|
+
/* --- check item subsets --- */
|
|
532
|
+
if (filter) { /* if to filter unused items */
|
|
533
|
+
used = (char*)malloc(is_cnt(itemset) *sizeof(char));
|
|
534
|
+
if (!used) error(E_NOMEM); /* create a flag vector */
|
|
535
|
+
} /* for the items */
|
|
536
|
+
MSG(fprintf(stderr, "checking subsets of size 1"));
|
|
537
|
+
while (ist_height(istree) < maxlen) {
|
|
538
|
+
if (filter != 0) { /* if to filter w.r.t. item usage, */
|
|
539
|
+
i = ist_check(istree, used); /* check current item usage */
|
|
540
|
+
if (i < maxlen) maxlen = i; /* update the maximum size */
|
|
541
|
+
if (ist_height(istree) >= i) break;
|
|
542
|
+
} /* check the tree height */
|
|
543
|
+
k = ist_addlvl(istree); /* while max. height is not reached, */
|
|
544
|
+
if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
|
|
545
|
+
if (k != 0) break; /* if no level was added, abort */
|
|
546
|
+
MSG(fprintf(stderr, " %d", ist_height(istree)));
|
|
547
|
+
if (tatree) { /* if a transaction tree was created */
|
|
548
|
+
if (((filter < 0) /* if to filter w.r.t. item usage */
|
|
549
|
+
&& (i < -filter *n)) /* and enough items were removed */
|
|
550
|
+
|| ((filter > 0) /* or counting time is long enough */
|
|
551
|
+
&& (i < n) && (i *(double)tt < filter *n *tc))) {
|
|
552
|
+
n = i; x = clock(); /* note the new number of items */
|
|
553
|
+
tas_filter(taset, used);/* and remove unnecessary items */
|
|
554
|
+
tat_delete(tatree); /* delete the transaction tree */
|
|
555
|
+
tatree = tat_create(taset, heap);
|
|
556
|
+
if (!tatree) error(E_NOMEM);
|
|
557
|
+
tt = clock() -x; /* rebuild the transaction tree and */
|
|
558
|
+
} /* note the new construction time */
|
|
559
|
+
x = clock(); /* count the transaction tree */
|
|
560
|
+
ist_countx(istree, tatree);
|
|
561
|
+
tc = clock() -x; } /* note the new count time */
|
|
562
|
+
else if (taset) { /* if transactions were loaded */
|
|
563
|
+
if (((filter < 0) /* if to filter w.r.t. item usage */
|
|
564
|
+
&& (i <= -filter *n)) /* and enough items were removed */
|
|
565
|
+
|| ((filter > 0) /* or counting time is long enough */
|
|
566
|
+
&& (i *(double)tt <= filter *n *tc))) {
|
|
567
|
+
n = i; x = clock(); /* note the new number of items */
|
|
568
|
+
tas_filter(taset, used);/* and remove unnecessary items */
|
|
569
|
+
tt = clock() -t; /* from the transactions */
|
|
570
|
+
} /* note the filtering time */
|
|
571
|
+
for (i = tacnt; --i >= 0;)/* traverse and count transactions */
|
|
572
|
+
ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
|
|
573
|
+
tc = clock() -t; } /* note the new count time */
|
|
574
|
+
else { /* if to work on the input file, */
|
|
575
|
+
rewind(in); /* reset the file position */
|
|
576
|
+
for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
|
|
577
|
+
if (filter != 0) /* (re)read the transactions and */
|
|
578
|
+
is_filter(itemset, used); /* remove unnecessary items */
|
|
579
|
+
k = is_tsize(itemset); /* update the maximum size */
|
|
580
|
+
if (k > maxcnt) maxcnt = k; /* of a transaction */
|
|
581
|
+
ist_count(istree, is_tract(itemset), k);
|
|
582
|
+
} /* count the transaction in the tree */
|
|
583
|
+
if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
|
|
584
|
+
if (maxcnt < maxlen) /* update the maximal rule length */
|
|
585
|
+
maxlen = maxcnt; /* according to the max. t.a. size */
|
|
586
|
+
} /* (may be smaller than before) */
|
|
587
|
+
}
|
|
588
|
+
if (!taset && !tatree) { /* if transactions were not loaded */
|
|
589
|
+
if (in != stdin) fclose(in);/* if not read from standard input, */
|
|
590
|
+
in = NULL; /* close the input file */
|
|
591
|
+
} /* clear the file variable */
|
|
592
|
+
MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
|
|
593
|
+
|
|
594
|
+
/* --- filter found item sets --- */
|
|
595
|
+
if ((target == TT_CLSET) || (target == TT_MFSET)) {
|
|
596
|
+
MSG(fprintf(stderr, "filtering %s item sets ... ",
|
|
597
|
+
(target == TT_MFSET) ? "maximal" : "closed"));
|
|
598
|
+
t = clock(); /* filter the item sets */
|
|
599
|
+
ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
|
|
600
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
|
601
|
+
} /* (filter takes longer than print) */
|
|
602
|
+
|
|
603
|
+
/* --- sort transactions --- */
|
|
604
|
+
if (target <= TT_MFSET) { /* if to find frequent item sets */
|
|
605
|
+
if (!taset) /* transactions must be loaded */
|
|
606
|
+
ext = 0; /* for extended support output */
|
|
607
|
+
else if (ext) { /* if extended output is requested */
|
|
608
|
+
MSG(fprintf(stderr, "sorting transactions ... "));
|
|
609
|
+
t = clock(); /* start the timer */
|
|
610
|
+
tas_sort(taset, heap); /* sort the transactions */
|
|
611
|
+
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
|
|
612
|
+
} /* (sorting is necessary to find the */
|
|
613
|
+
} /* number of identical transactions) */
|
|
614
|
+
|
|
615
|
+
/* --- print item sets/rules/hyperedges --- */
|
|
616
|
+
t = clock(); /* start the timer */
|
|
617
|
+
if (fn_out && *fn_out) /* if an output file name is given, */
|
|
618
|
+
out = fopen(fn_out, "w"); /* open the output file */
|
|
619
|
+
else { /* if no output file name is given, */
|
|
620
|
+
out = stdout; fn_out = "<stdout>"; } /* write to std. output */
|
|
621
|
+
MSG(fprintf(stderr, "writing %s ... ", fn_out));
|
|
622
|
+
if (!out) error(E_FOPEN, fn_out);
|
|
623
|
+
ist_init(istree, minlen, arem, minval);
|
|
624
|
+
set = is_tract(itemset); /* get the transaction buffer */
|
|
625
|
+
if (target <= TT_MFSET) { /* if to find frequent item sets */
|
|
626
|
+
for (n = 0; 1; ) { /* extract item sets from the tree */
|
|
627
|
+
k = ist_set(istree, set, &frq, &conf);
|
|
628
|
+
if (k <= 0) break; /* get the next frequent item set */
|
|
629
|
+
if (frq > smax) continue; /* check against maximal support */
|
|
630
|
+
for (i = 0; i < k; i++) { /* traverse the set's items */
|
|
631
|
+
name = is_name(itemset, set[i]);
|
|
632
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
|
633
|
+
fputs(name, out); /* print the name of the next item */
|
|
634
|
+
fputs((i < k-1) ? sep : " ", out);
|
|
635
|
+
} /* print a separator */
|
|
636
|
+
fputs(" (", out); /* print the item set's support */
|
|
637
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
|
638
|
+
if (sout & 2) fputc('/', out); }
|
|
639
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
|
640
|
+
if (ext) { /* if to print the extended support */
|
|
641
|
+
frq = tas_occur(taset, set, k);
|
|
642
|
+
fputs(", ", out); /* get the number of occurrences */
|
|
643
|
+
fprintf(out, fmt, (frq/(double)tacnt) *100);
|
|
644
|
+
if (sout & 2) fprintf(out, "/%d", frq);
|
|
645
|
+
} /* print the extended support data */
|
|
646
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
|
|
647
|
+
fputs(")\n", out); /* print the add. eval. measure, */
|
|
648
|
+
n++; /* terminate the support output, */
|
|
649
|
+
} } /* and count the item set */
|
|
650
|
+
else if (target == TT_RULE) { /* if to find association rules, */
|
|
651
|
+
for (n = 0; 1; ) { /* extract rules from tree */
|
|
652
|
+
k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
|
|
653
|
+
if (k <= 0) break; /* get the next association rule */
|
|
654
|
+
if (frq > smax) continue; /* check against maximal support */
|
|
655
|
+
for (i = 0; i < k; i++) { /* traverse the rule's items */
|
|
656
|
+
name = is_name(itemset, set[i]);
|
|
657
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
|
658
|
+
fputs(name, out); /* print the next item */
|
|
659
|
+
fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
|
|
660
|
+
} /* print a separator */
|
|
661
|
+
fputs(" (", out); /* print the rule evaluation */
|
|
662
|
+
if (sout & 1) supp = frq/(double)tacnt;
|
|
663
|
+
if (ext && !(mode & IST_HEAD)) {
|
|
664
|
+
if (sout & 1) { fprintf(out, fmt, supp *conf *100);
|
|
665
|
+
if (sout & 2) fputc('/', out); }
|
|
666
|
+
if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
|
|
667
|
+
fputs(", ", out); /* print the support of the rule */
|
|
668
|
+
} /* from the support of the body */
|
|
669
|
+
if (sout & 1) { fprintf(out, fmt, supp *100);
|
|
670
|
+
if (sout & 2) fputc('/', out); }
|
|
671
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
|
672
|
+
fputs(", ", out); /* print the rule support */
|
|
673
|
+
if (ext && (mode & IST_HEAD)) {
|
|
674
|
+
if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
|
|
675
|
+
if (sout & 2) fputc('/', out); }
|
|
676
|
+
if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
|
|
677
|
+
fputs(", ", out); /* print the support of the body */
|
|
678
|
+
} /* from the support of the rule */
|
|
679
|
+
fprintf(out, fmt, conf *100); /* print the rule confidence */
|
|
680
|
+
if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
|
|
681
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
|
682
|
+
fputs(")\n", out); /* print the value of the additional */
|
|
683
|
+
n++; /* rule evaluation measure and */
|
|
684
|
+
} } /* count the association rule */
|
|
685
|
+
else if (target == TT_HEDGE){ /* if to find association hyperedges */
|
|
686
|
+
for (n = 0; 1; ) { /* extract hyperedges from tree */
|
|
687
|
+
k = ist_hedge(istree, set, &frq, &conf, &minval);
|
|
688
|
+
if (k <= 0) break; /* get the next hyperedge */
|
|
689
|
+
if (frq > smax) continue; /* check against maximal support */
|
|
690
|
+
for (i = 0; i < k; i++) { /* traverse the edge's items */
|
|
691
|
+
name = is_name(itemset, set[i]);
|
|
692
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
|
693
|
+
fputs(name, out); /* print the name of the next item */
|
|
694
|
+
fputs((i < k-1) ? sep : " ", out);
|
|
695
|
+
} /* print a separator */
|
|
696
|
+
fputs(" (", out); /* print the hyperedge evaluation */
|
|
697
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
|
698
|
+
if (sout & 2) fputc('/', out); }
|
|
699
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
|
700
|
+
fputs(", ", out); fprintf(out, fmt, conf *100);
|
|
701
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
|
702
|
+
fputs(")\n", out); /* print support and confidence */
|
|
703
|
+
n++; /* of the hyperedge and */
|
|
704
|
+
} } /* count the hyperedge */
|
|
705
|
+
else { /* if to find association groups */
|
|
706
|
+
for (n = 0; 1; ) { /* extract groups from tree */
|
|
707
|
+
k = ist_group(istree, set, &frq, &minval);
|
|
708
|
+
if (k <= 0) break; /* get the next group */
|
|
709
|
+
if (frq > smax) continue; /* check against maximal support */
|
|
710
|
+
for (i = 0; i < k; i++) { /* traverse the group's items */
|
|
711
|
+
name = is_name(itemset, set[i]);
|
|
712
|
+
if (c2scf) { sc_format(buf, name, 0); name = buf; }
|
|
713
|
+
fputs(name, out); /* print the name of the next item */
|
|
714
|
+
fputs((i < k-1) ? sep : " ", out);
|
|
715
|
+
} /* print a separator */
|
|
716
|
+
fputs(" (", out); /* print the group evaluation */
|
|
717
|
+
if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
|
|
718
|
+
if (sout & 2) fputc('/', out); }
|
|
719
|
+
if (sout & 2) { fprintf(out, "%d", frq); }
|
|
720
|
+
if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
|
|
721
|
+
fputs(")\n", out); /* print support and add. measure */
|
|
722
|
+
n++; /* and count the group */
|
|
723
|
+
}
|
|
724
|
+
} /* if (target <= TT_MFSET) .. else .. */
|
|
725
|
+
if (fflush(out) != 0) error(E_FWRITE, fn_out);
|
|
726
|
+
if (out != stdout) fclose(out);
|
|
727
|
+
out = NULL; /* close the output file */
|
|
728
|
+
MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
|
|
729
|
+
MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
|
|
730
|
+
#ifdef BENCH
|
|
731
|
+
printf("number of support counters: %d\n", istree->sccnt);
|
|
732
|
+
printf("necessary support counters: %d\n", istree->scnec);
|
|
733
|
+
printf("number of child pointers : %d\n", istree->cpcnt);
|
|
734
|
+
printf("necessary child pointers : %d\n", istree->cpnec);
|
|
735
|
+
printf("allocated memory (bytes) : %d\n", istree->bytes);
|
|
736
|
+
#endif
|
|
737
|
+
|
|
738
|
+
/* --- clean up --- */
|
|
739
|
+
#ifndef NDEBUG /* if this is a debug version */
|
|
740
|
+
free(used); /* delete the item app. vector */
|
|
741
|
+
ist_delete(istree); /* delete the item set tree, */
|
|
742
|
+
if (tatree) tat_delete(tatree); /* the transaction tree, */
|
|
743
|
+
if (taset) tas_delete(taset, 0); /* the transaction set, */
|
|
744
|
+
is_delete(itemset); /* and the item set */
|
|
745
|
+
#endif
|
|
746
|
+
#ifdef STORAGE /* if storage debugging */
|
|
747
|
+
showmem("at end of program"); /* check memory usage */
|
|
748
|
+
#endif
|
|
749
|
+
return 0; /* return 'ok' */
|
|
750
|
+
} /* main() */
|