apriori 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/History.txt +16 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +149 -0
  5. data/Rakefile +15 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +81 -0
  12. data/config/requirements.rb +29 -0
  13. data/examples/01_simple_example.rb +32 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori.rb +133 -0
  95. data/lib/apriori/adapter.rb +13 -0
  96. data/lib/apriori/association_rule.rb +89 -0
  97. data/lib/apriori/version.rb +9 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +13 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +248 -0
  118. data/website/index.txt +152 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +226 -0
Binary file
@@ -0,0 +1,15 @@
1
#!/bin/sh
# flg2set: convert a flag table into the transaction format of apriori.
# usage: flg2set infile outfile
# The first line of infile lists the item names (one per column). Every
# further line holds one flag per column; each column whose flag is "T"
# contributes its item name to the output line. Items on an output line
# are separated by single blanks.
# The file arguments are quoted so that names containing blanks or glob
# characters are passed on unchanged.
gawk '
(NR == 1) {                     # header line: remember the item names
  for (col = 1; col <= NF; col++)
    items[col] = $col;
}
(NR > 1) {                      # flag line: print names of set flags
  sep = "";                     # no separator before the first item
  for (col = 1; col <= NF; col++) {
    if ($col != "T") continue;
    printf("%s%s", sep, items[col]);
    sep = " ";
  }
  printf("\n");
}' "$1" > "$2"
@@ -0,0 +1,13 @@
1
#!/bin/sh
# hdr2set: convert a table with a header line into apriori transactions.
# usage: hdr2set infile outfile
# The first line of infile lists the column names. Every further line is
# written as one transaction in which each table entry x of a column
# named X becomes the item "X=x"; items are separated by single blanks.
# The file arguments are quoted so that names containing blanks or glob
# characters are passed on unchanged.
gawk '
(NR == 1) {                     # header line: remember the column names
  for (col = 1; col <= NF; col++)
    names[col] = $col;
}
(NR > 1) {                      # data line: emit name=value pairs
  for (col = 1; col <= NF; col++) {
    if (col > 1) printf(" ");
    printf("%s=%s", names[col], $col);
  }
  printf("\n");
}' "$1" > "$2"
@@ -0,0 +1,71 @@
1
+ The example files in this directory demonstrate how to use the
2
+ options -b, -f, and -r and the optional item appearances file.
3
+ This file also explains the conversion scripts, which can convert
4
+ different input formats into the format needed by the apriori program.
5
+
6
+ In the file test1.tab transactions are separated by newline characters
7
+ and the items of a transaction are separated by spaces. This is the
8
+ standard input format and hence the file can be processed directly:
9
+ apriori test1.tab test1.rul
10
+
11
+ In the file test2.tab the same transactions can be found, but several
12
+ different field separators are used. This file can be processed with:
13
+ apriori -f ",.;:" -l test2.tab test2.rul
14
+
15
+ The files test3.tab to test5.tab are in formats that cannot be
16
+ processed directly with the apriori program, but which may be common.
17
+
18
+ In the file test3.tab each line contains a transaction identifier and
19
+ an item, separated by a space. This file can be converted into the
20
+ standard input format with the script tid2set, i.e., with
21
+ tid2set test3.tab x.tab
22
+ Note, however, that the input file (here: test3.tab) must be sorted
23
+ w.r.t. the transaction identifier, so that items belonging to the
24
+ same transaction occupy consecutive lines/records.
25
+
26
+ In the file test4.tab the first line states the item names and the
27
+ following lines contain flags T (true) and F (false) depending on
28
+ whether the item is contained in the transaction represented by the
29
+ line or not. This format can be converted into the standard input
30
+ format with the script flg2set, i.e., with
31
+ flg2set test4.tab x.tab
32
+
33
+ In the file test5.tab there is one item per line and transactions
34
+ are separated by blank lines. This format can be converted into the
35
+ standard input format with the script row2set, i.e., with
36
+ row2set test5.tab x.tab
37
+
38
+ The additional scripts tab2set and hdr2set convert tables with column
39
+ numbers or column names into a format appropriate for the apriori
40
+ program. They are invoked in the same way as all other scripts
41
+ discussed above, i.e., with
42
+ tab2set a.tab b.tab
43
+ or
44
+ hdr2set a.tab b.tab
45
+ where a.tab is the name of the input file and b.tab the name of the
46
+ output file. The script tab2set replaces each table entry "x" of the
47
+ input file by "Xi=x", where i is the column number (starting with 1).
48
+ The script hdr2set reads the variable names from the first line of
49
+ the input file and then replaces each table entry "x" by "X=x", where
50
+ "X" is the variable name that was found in the corresponding column
51
+ of the first line. These scripts are handy if you want to process
52
+ tabular data by treating each table row as a transaction.
53
+
54
+ The file test.app demonstrates the use of item appearance indicators.
55
+ The first line of this file ('body') states that any item not explicitly
56
+ mentioned in this file may appear only in the body of a rule. The second
57
+ line says that item 2 may appear only in the head of a rule. Hence, by
58
+ processing the file test1.tab with
59
+ apriori test1.tab test.rul test.app
60
+ only rules with item 2 in the head are generated.
61
+
62
+ Note that any input may also be read from standard input and any output
63
+ may be sent to standard output, simply by specifying a '-' or an empty
64
+ string "" instead of a filename. For example
65
+ apriori test1.tab -
66
+ writes the rules directly to the terminal. They may be piped to any
67
+ other program, since all other messages of the apriori program are
68
+ written to standard error.
69
+
70
+ Enjoy,
71
+ Christian Borgelt
@@ -0,0 +1,7 @@
1
#!/bin/sh
# row2set: convert a one-item-per-line file into apriori transactions.
# usage: row2set infile outfile
# Every non-empty input line contributes its first field as an item of
# the current transaction; an empty line terminates the transaction.
# A final newline is always written at the end of the input.
# The item is printed through an explicit "%s" format: passing the field
# itself as the format string would misinterpret any "%" in item names.
# The file arguments are quoted so that names containing blanks or glob
# characters are passed on unchanged.
gawk '
BEGIN      { n = 0; }
($0 != "") { if (n++ > 0) printf(" "); printf("%s", $1); }
($0 == "") { printf("\n"); n = 0; }
END        { printf("\n"); }
' "$1" > "$2"
@@ -0,0 +1,24 @@
1
#!/bin/bash
#-----------------------------------------------------------------------
# File    : rulesort
# Contents: sort output of apriori
# Author  : Christian Borgelt
# History : ??.??.1996 file created
#           27.02.1997 default settings moved to default case
#           26.03.2003 adapted to current apriori version
#-----------------------------------------------------------------------
# usage: rulesort [-1|-2] infile outfile
#   -1  sort numerically, descending, by the number that directly
#       follows the opening parenthesis and is followed by "%," or "%/"
#   -2  sort numerically, descending, by the number that follows
#       ", " or "/ " inside the parentheses and is followed by "%"
#   otherwise the lines are sorted in dictionary order (sort -d)
# For -1/-2 the selected number is copied to the front of each line as
# "<number>#" to serve as the sort key; the last sed call removes
# everything up to and including the last "#" of every line again.
case $1 in
  '-1')
    cmd='s/\(.*(\)\([0-9]*\.[0-9]*\)\(%[,/].*\)/\2#\1\2\3/'
    sopt='-n -r'
    shift;;
  '-2')
    cmd='s/\(.*(.*[,/] \)\([0-9]*\.[0-9]*\)\(%.*\)/\2#\1\2\3/'
    sopt='-n -r'
    shift;;
  *)
    # catch-all pattern: a label spelled "default" would only match the
    # literal word and the default settings would never be applied
    cmd=''
    sopt='-d'
    ;;
esac
# $sopt is left unquoted on purpose: it may hold several sort options.
sed "$cmd" "$1" | sort $sopt | sed 's/^.*#//' > "$2"
@@ -0,0 +1,9 @@
1
#!/bin/sh
# tab2set: convert a plain table into apriori transactions.
# usage: tab2set infile outfile
# Every input line is written as one transaction in which the entry x of
# column i (counted from 1) becomes the item "Xi=x"; items are separated
# by single blanks.
# The file arguments are quoted so that names containing blanks or glob
# characters are passed on unchanged.
gawk '
{
  for (col = 1; col <= NF; col++) {
    if (col > 1) printf(" ");
    printf("X%d=%s", col, $col);
  }
  printf("\n");
}' "$1" > "$2"
@@ -0,0 +1,2 @@
1
+ body
2
+ 2 head
@@ -0,0 +1,9 @@
1
+ 4 <- 5 (30.0, 100.0)
2
+ 2 <- 1 (60.0, 83.3)
3
+ 3 <- 2 (70.0, 85.7)
4
+ 2 <- 3 (70.0, 85.7)
5
+ 4 <- 5 1 (10.0, 100.0)
6
+ 4 <- 5 3 (10.0, 100.0)
7
+ 3 <- 1 2 (50.0, 80.0)
8
+ 2 <- 1 3 (40.0, 100.0)
9
+ 2 <- 1 3 4 (20.0, 100.0)
@@ -0,0 +1,43 @@
1
+ 5 <- (100.0, 30.0)
2
+ 1 <- (100.0, 60.0)
3
+ 2 <- (100.0, 70.0)
4
+ 3 <- (100.0, 70.0)
5
+ 4 <- (100.0, 70.0)
6
+ 1 <- 5 (30.0, 33.3)
7
+ 3 <- 5 (30.0, 33.3)
8
+ 4 <- 5 (30.0, 100.0)
9
+ 5 <- 4 (70.0, 42.9)
10
+ 2 <- 1 (60.0, 83.3)
11
+ 1 <- 2 (70.0, 71.4)
12
+ 3 <- 1 (60.0, 66.7)
13
+ 1 <- 3 (70.0, 57.1)
14
+ 4 <- 1 (60.0, 66.7)
15
+ 1 <- 4 (70.0, 57.1)
16
+ 3 <- 2 (70.0, 85.7)
17
+ 2 <- 3 (70.0, 85.7)
18
+ 4 <- 2 (70.0, 57.1)
19
+ 2 <- 4 (70.0, 57.1)
20
+ 4 <- 3 (70.0, 57.1)
21
+ 3 <- 4 (70.0, 57.1)
22
+ 4 <- 5 1 (10.0, 100.0)
23
+ 1 <- 5 4 (30.0, 33.3)
24
+ 5 <- 1 4 (40.0, 25.0)
25
+ 4 <- 5 3 (10.0, 100.0)
26
+ 3 <- 5 4 (30.0, 33.3)
27
+ 5 <- 3 4 (40.0, 25.0)
28
+ 3 <- 1 2 (50.0, 80.0)
29
+ 2 <- 1 3 (40.0, 100.0)
30
+ 1 <- 2 3 (60.0, 66.7)
31
+ 4 <- 1 2 (50.0, 60.0)
32
+ 2 <- 1 4 (40.0, 75.0)
33
+ 1 <- 2 4 (40.0, 75.0)
34
+ 4 <- 1 3 (40.0, 50.0)
35
+ 3 <- 1 4 (40.0, 50.0)
36
+ 1 <- 3 4 (40.0, 50.0)
37
+ 4 <- 2 3 (60.0, 50.0)
38
+ 3 <- 2 4 (40.0, 75.0)
39
+ 2 <- 3 4 (40.0, 75.0)
40
+ 4 <- 1 2 3 (40.0, 50.0)
41
+ 3 <- 1 2 4 (30.0, 66.7)
42
+ 2 <- 1 3 4 (20.0, 100.0)
43
+ 1 <- 2 3 4 (30.0, 66.7)
@@ -0,0 +1,10 @@
1
+ 1 2 3
2
+ 1 4 5
3
+ 2 3 4
4
+ 1 2 3 4
5
+ 2 3
6
+ 1 2 4
7
+ 4 5
8
+ 1 2 3 4
9
+ 3 4 5
10
+ 1 2 3
@@ -0,0 +1,10 @@
1
+ 1,2,3
2
+ 1,4,5
3
+ 2.3.4
4
+ 1,2,3,4
5
+ 2:3
6
+ 1,2,4
7
+ 4,5
8
+ 1,2,3,4
9
+ 3;4;5
10
+ 1,2,3
@@ -0,0 +1,30 @@
1
+ 0 1
2
+ 0 2
3
+ 0 3
4
+ 1 1
5
+ 1 4
6
+ 1 5
7
+ 2 2
8
+ 2 3
9
+ 2 4
10
+ 3 1
11
+ 3 2
12
+ 3 3
13
+ 3 4
14
+ 4 2
15
+ 4 3
16
+ 5 1
17
+ 5 2
18
+ 5 4
19
+ 6 4
20
+ 6 5
21
+ 7 1
22
+ 7 2
23
+ 7 3
24
+ 7 4
25
+ 8 3
26
+ 8 4
27
+ 8 5
28
+ 9 1
29
+ 9 2
30
+ 9 3
@@ -0,0 +1,11 @@
1
+ 1 2 3 4 5
2
+ T T T F F
3
+ T F F T T
4
+ F T T T F
5
+ T T T T F
6
+ F T T F F
7
+ T T F T F
8
+ F F F T T
9
+ T T T T F
10
+ F F T T T
11
+ T T T F F
@@ -0,0 +1,39 @@
1
+ 1
2
+ 2
3
+ 3
4
+
5
+ 1
6
+ 4
7
+ 5
8
+
9
+ 2
10
+ 3
11
+ 4
12
+
13
+ 1
14
+ 2
15
+ 3
16
+ 4
17
+
18
+ 2
19
+ 3
20
+
21
+ 1
22
+ 2
23
+ 4
24
+
25
+ 4
26
+ 5
27
+
28
+ 1
29
+ 2
30
+ 3
31
+ 4
32
+
33
+ 3
34
+ 4
35
+ 5
36
+
37
+ 1
38
+ 2
39
+ 3
@@ -0,0 +1,23 @@
1
#!/bin/sh
# tid2set: convert (transaction id, item) pairs into apriori transactions.
# usage: tid2set infile outfile
# Each input line holds a transaction identifier in field 1 and an item
# in field 2. Consecutive lines with the same identifier are collected
# into one output line (items separated by single blanks), so the input
# must be grouped by identifier.
# The file arguments are quoted so that names containing blanks or glob
# characters are passed on unchanged.
gawk '
# print the collected items of the current transaction on one line;
# j is declared as an extra parameter to keep it local to the function
function output (    j)
{
  for (j = 0; j < cnt; j++) {
    if (j > 0) printf(" ");
    printf("%s", items[j]);
  }
  printf("\n");
}

BEGIN { tid = ""; cnt = 0; }
{
  if ($1 != tid) {              # a new transaction starts:
    if (tid != "") output();    # write the previous one (if any)
    tid = $1; cnt = 0;          # and start collecting afresh
  }
  items[cnt++] = $2;
}
END { output(); }               # write the last transaction
' "$1" > "$2"
@@ -0,0 +1,33 @@
1
#!/bin/sh
# xhdr2set: convert a comma separated table with a header line into
# apriori transactions and optionally derive an item appearances file.
# usage: xhdr2set infile outfile [appfile appout]
# The first line of infile lists the column names; each table entry x of
# a column named X becomes the item "X=x" (items separated by blanks).
# If both appfile and appout are given, appfile is read as follows: its
# first line is the default appearance indicator, every further line
# holds a column name and the indicator for that column. appout then
# receives the default indicator followed by one line "item indicator"
# for every generated item whose indicator differs from the default.
# All shell parameters are quoted so that values containing blanks or
# glob characters are passed on unchanged.
gawk -v app="$3" -v out="$4" '
BEGIN {
  FS = " ";                     # appearances file is blank separated
  if ((app != "") && (out != "")) {
    getline dflt < app;         # first line: default indicator
    while ((getline < app) > 0)
      base[$1] = $2;            # column name -> indicator
  }
  FS = ",";                     # the table itself is comma separated
}
(NR == 1) {                     # header line: remember the column names
  for (col = 1; col <= NF; col++)
    names[col] = $col;
}
(NR > 1) {                      # data line: emit name=value pairs
  for (col = 1; col <= NF; col++) {
    if (col > 1) printf(" ");
    item = (names[col] "=" $col);
    printf("%s", item);
    if (names[col] in base) apps[item] = base[names[col]];
    else                    apps[item] = dflt;
  }
  printf("\n");
}
END {
  if (out != "") {              # write the derived appearances file
    print dflt > out;
    for (t in apps)
      if (apps[t] != dflt)
        printf("%s %s\n", t, apps[t]) > out;
  }
}' "$1" > "$2"
@@ -0,0 +1,750 @@
1
+ /*----------------------------------------------------------------------
2
+ File : apriori.c
3
+ Contents: apriori algorithm for finding association rules
4
+ Author : Christian Borgelt
5
+ History : 1996.02.14 file created
6
+ 1996.07.26 output precision reduced
7
+ 1996.11.22 options -b, -f, and -r added
8
+ 1996.11.24 option -e added (add. evaluation measures)
9
+ 1997.08.18 normalized chi^2 measure added
10
+ option -m (minimal rule length) added
11
+ 1997.10.13 quiet version (no output to stdout or stderr)
12
+ 1998.01.27 adapted to changed ist_create() function
13
+ 1998.08.08 optional input file (item appearances) added
14
+ 1998.09.02 several assertions added
15
+ 1998.09.07 hyperedge mode (option -h) added
16
+ 1998.12.08 output of absolute support (option -a) added
17
+ float changed to double
18
+ 1998.12.09 conversion of names to a scanable form added
19
+ 1999.02.05 long int changed to int
20
+ 1999.02.09 input from stdin, output to stdout added
21
+ 1999.08.09 bug in check of support parameter (<= 0) fixed
22
+ 1999.11.05 rule evaluation measure EM_AIMP added
23
+ 1999.11.08 output of add. rule eval. measure value added
24
+ 2000.03.16 optional use of original rule support definition
25
+ 2001.04.01 option -h replaced by option -t (target type)
26
+ 2001.05.26 extended support output added (option -x)
27
+ 2001.06.09 extended support output for item sets added
28
+ 2001.08.15 module scan used for output formatting
29
+ 2001.11.18 item and transaction functions made a module
30
+ 2001.11.19 options -C, -l changed, option -y removed
31
+ 2001.12.28 adapted to module tract, some improvements
32
+ 2002.01.11 evaluation measures codes changed to letters
33
+ 2002.02.10 option -q extended by a direction parameter
34
+ 2002.02.11 memory usage minimization option added
35
+ 2002.06.09 arbitrary supp./conf. formats made possible
36
+ 2003.01.09 option -k (item separator) added
37
+ 2003.01.14 check for empty transaction set added
38
+ 2003.03.12 output of lift value (conf/prior) added
39
+ 2003.07.17 item filtering w.r.t. usage added (option -u)
40
+ 2003.07.17 sorting w.r.t. transaction size sum added
41
+ 2003.07.18 maximal itemset filter added
42
+ 2003.08.11 closed itemset filter added
43
+ 2003.08.15 item filtering for transaction tree added
44
+ 2003.08.16 parameter for transaction filtering added
45
+ 2003.08.18 dynamic filtering decision based on times added
46
+ 2003.08.21 option -j (heap sort for transactions) added
47
+ 2003.09.22 meaning of option -j reversed (heapsort default)
48
+ 2004.03.25 option -S added (maximal support of a set/rule)
49
+ 2004.05.09 additional selection measure for sets added
50
+ 2004.10.28 two unnecessary assignments removed
51
+ 2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
52
+ 2004.11.23 absolute/relative support output changed
53
+ 2004.12.09 semantics of option -p changed
54
+ 2005.01.25 bug in output of absolute/relative support fixed
55
+ 2005.01.31 another bug in this output fixed
56
+ 2005.06.20 use of flag for "no item sorting" corrected
57
+ 2007.02.13 adapted to modified module tabscan
58
+ 2008.03.13 additional hyperedge evaluation added
59
+ 2008.03.24 additional target added (association groups)
60
+ ----------------------------------------------------------------------*/
61
+ #include <stdio.h>
62
+ #include <stdlib.h>
63
+ #include <stdarg.h>
64
+ #include <string.h>
65
+ #include <limits.h>
66
+ #include <math.h>
67
+ #include <time.h>
68
+ #include <assert.h>
69
+ #include "scan.h"
70
+ #include "tract.h"
71
+ #include "istree.h"
72
+ #ifdef STORAGE
73
+ #include "storage.h"
74
+ #endif
75
+
76
+ /*----------------------------------------------------------------------
77
+ Preprocessor Definitions
78
+ ----------------------------------------------------------------------*/
79
/* program identification used in the startup and usage messages */
#define PRGNAME     "apriori"
#define DESCRIPTION "find association rules with the apriori algorithm"
#define VERSION     "version 4.35 (2008.03.24) " \
                    "(c) 1996-2008 Christian Borgelt"

/* internal target type codes (indices into the ttypes table) */
#define TT_SET      0           /* frequent item sets */
#define TT_CLSET    1           /* closed item sets */
#define TT_MFSET    2           /* maximal item sets */
#define TT_RULE     3           /* association rules */
#define TT_HEDGE    4           /* association hyperedges */
#define TT_GROUP    5           /* association groups */

/* error codes of this program (negated: indices into errmsgs) */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* too few/many arguments */
#define E_STDIN     (-8)        /* double assignment of stdin */
#define E_TARGET    (-9)        /* invalid target type */
#define E_SUPP      (-10)       /* invalid support */
#define E_CONF      (-11)       /* invalid confidence */
#define E_MEASURE   (-12)       /* invalid evaluation measure */
#define E_RULELEN   (-13)       /* invalid rule length */
#define E_NOTAS     (-14)       /* no items or transactions */
#define E_NOFREQ    (-15)       /* no frequent items */
#define E_UNKNOWN   (-21)       /* unknown error */

/* MSG(x): expands to x in the normal version (followed by a flush of */
/* stderr unless FFLUSH is defined) and to nothing in the quiet version */
#ifdef QUIET
#define MSG(x)
#else
#ifdef FFLUSH
#define MSG(x)      x
#else
#define MSG(x)      x, fflush(stderr)
#endif
#endif

/* seconds elapsed since the clock() value t */
#define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
/* record count of the table scanner of item set s, reduced by one */
/* if the last delimiter read was a record separator */
#define RECCNT(s)   (ts_reccnt(is_tabscan(s)) \
                    - ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0))
/* buffer of the table scanner of item set s */
#define BUFFER(s)   ts_buf(is_tabscan(s))
121
+ /*----------------------------------------------------------------------
122
+ Constants
123
+ ----------------------------------------------------------------------*/
124
#ifndef QUIET                   /* tables are only needed for messages */
/* names of the target types, indexed by the TT_... codes */
static const char *ttypes[] = {
  "set",                        /* TT_SET    0 */
  "set",                        /* TT_CLSET  1 */
  "set",                        /* TT_MFSET  2 */
  "rule",                       /* TT_RULE   3 */
  "hyperedge",                  /* TT_HEDGE  4 */
  "group",                      /* TT_GROUP  5 */
};

/* error message formats, indexed by the negated error code */
static const char *errmsgs[] = {
  "no error\n",                                   /* E_NONE      0 */
  "not enough memory\n",                          /* E_NOMEM    -1 */
  "cannot open file %s\n",                        /* E_FOPEN    -2 */
  "read error on file %s\n",                      /* E_FREAD    -3 */
  "write error on file %s\n",                     /* E_FWRITE   -4 */
  "unknown option -%c\n",                         /* E_OPTION   -5 */
  "missing option argument\n",                    /* E_OPTARG   -6 */
  "wrong number of arguments\n",                  /* E_ARGCNT   -7 */
  "double assignment of standard input\n",        /* E_STDIN    -8 */
  "invalid target type '%c'\n",                   /* E_TARGET   -9 */
  "invalid minimal support %g%%\n",               /* E_SUPP    -10 */
  "invalid minimal confidence %g%%\n",            /* E_CONF    -11 */
  "invalid additional evaluation measure %c\n",   /* E_MEASURE -12 */
  "invalid set size/rule length %d\n",            /* E_RULELEN -13 */
  "no items or transactions to work on\n",        /* E_NOTAS   -14 */
  "no frequent items\n",                          /* E_NOFREQ  -15 */
  "file %s, record %d: item expected\n",          /* E_ITEMEXP -16 */
  "file %s, record %d: duplicate item %s\n",      /* E_DUPITEM -17 */
  "file %s, record %d: appearance indicator expected\n",
                                                  /* E_APPEXP  -18 */
  "file %s, record %d: unknown appearance indicator %s\n",
                                                  /* E_UNKAPP  -19 */
  "file %s, record %d: too many fields\n",        /* E_FLDCNT  -20 */
  "unknown error\n"                               /* E_UNKNOWN -21 */
};
#endif
163
+
164
+ /*----------------------------------------------------------------------
165
+ Global Variables
166
+ ----------------------------------------------------------------------*/
167
+ #ifndef QUIET
168
+ static char *prgname; /* program name for error messages */
169
+ #endif
170
+ static ITEMSET *itemset = NULL; /* item set */
171
+ static TASET *taset = NULL; /* transaction set */
172
+ static TATREE *tatree = NULL; /* transaction tree */
173
+ static ISTREE *istree = NULL; /* item set tree */
174
+ static FILE *in = NULL; /* input file */
175
+ static FILE *out = NULL; /* output file */
176
+
177
+ /*----------------------------------------------------------------------
178
+ Main Functions
179
+ ----------------------------------------------------------------------*/
180
+
181
/* Print the list of additional evaluation measures (option -!) to
   standard output and terminate the program with exit status 0.
   In the quiet version nothing is printed. */
static void help (void)
{
#ifndef QUIET
  static const char *const text[] = {
    "additional evaluation measures (option -e#)\n",
    "frequent item sets:\n",
    "d or 1: binary logarithm of support quotient\n",
    "association rules:\n",
    "d or 1: absolute confidence difference to prior\n",
    "q or 2: absolute difference of confidence quotient to 1\n",
    "a or 3: absolute difference of improvement value to 1\n",
    "i or 4: information difference to prior\n",
    "c or 5: normalized chi^2 measure\n",
    "p or 6: p-value computed from chi^2 measure\n"
  };
  size_t idx;                   /* index of the current text line */

  fprintf(stderr, "\n");        /* terminate the startup message */
  for (idx = 0; idx < sizeof(text)/sizeof(text[0]); idx++)
    fputs(text[idx], stdout);   /* lines contain no conversions */
#endif
  exit(0);                      /* the program ends here */
}  /* help() */
198
+
199
+ /*--------------------------------------------------------------------*/
200
+
201
+ static void error (int code, ...)
202
+ { /* --- print an error message */
203
+ #ifndef QUIET /* if not quiet version */
204
+ va_list args; /* list of variable arguments */
205
+ const char *msg; /* error message */
206
+
207
+ assert(prgname); /* check the program name */
208
+ if (code < E_UNKNOWN) code = E_UNKNOWN;
209
+ if (code < 0) { /* if to report an error, */
210
+ msg = errmsgs[-code]; /* get the error message */
211
+ if (!msg) msg = errmsgs[-E_UNKNOWN];
212
+ fprintf(stderr, "\n%s: ", prgname);
213
+ va_start(args, code); /* get variable arguments */
214
+ vfprintf(stderr, msg, args);/* print error message */
215
+ va_end(args); /* end argument evaluation */
216
+ }
217
+ #endif
218
+ #ifndef NDEBUG /* if debug version */
219
+ if (istree) ist_delete(istree); /* clean up memory */
220
+ if (tatree) tat_delete(tatree); /* and close files */
221
+ if (taset) tas_delete(taset, 0);
222
+ if (itemset) is_delete(itemset);
223
+ if (in && (in != stdin)) fclose(in);
224
+ if (out && (out != stdout)) fclose(out);
225
+ #endif
226
+ #ifdef STORAGE /* if storage debugging */
227
+ showmem("at end of program"); /* check memory usage */
228
+ #endif
229
+ exit(code); /* abort the program */
230
+ } /* error() */
231
+
232
+ /*--------------------------------------------------------------------*/
233
+
234
+ int main (int argc, char *argv[])
235
+ { /* --- main function */
236
+ int i, k = 0, n; /* loop variables, counters */
237
+ char *s; /* to traverse the options */
238
+ char **optarg = NULL; /* option argument */
239
+ char *fn_in = NULL; /* name of input file */
240
+ char *fn_out = NULL; /* name of output file */
241
+ char *fn_app = NULL; /* name of item appearances file */
242
+ char *blanks = NULL; /* blanks */
243
+ char *fldseps = NULL; /* field separators */
244
+ char *recseps = NULL; /* record separators */
245
+ char *comment = NULL; /* comment indicators */
246
+ char *used = NULL; /* item usage vector */
247
+ double supp = 0.1; /* minimal support (in percent) */
248
+ double smax = 1.0; /* maximal support (in percent) */
249
+ double conf = 0.8; /* minimal confidence (in percent) */
250
+ int mode = IST_BODY; /* search mode (rule support def.) */
251
+ int target = 'r'; /* target type (sets/rules/h.edges) */
252
+ int arem = 0; /* additional rule evaluation measure */
253
+ int lift = 0; /* flag for printing the lift */
254
+ double minval = 0.1; /* minimal evaluation measure value */
255
+ double lftval = 0; /* lift value (confidence/prior) */
256
+ int minlen = 1; /* minimal rule length */
257
+ int maxlen = INT_MAX; /* maximal rule length */
258
+ int load = 1; /* flag for loading transactions */
259
+ int sort = 2; /* flag for item sorting and recoding */
260
+ double filter = 0.1; /* item usage filtering parameter */
261
+ int tree = 1; /* flag for transaction tree */
262
+ int heap = 1; /* flag for heap sort vs. quick sort */
263
+ int c2scf = 0; /* flag for conv. to scanable form */
264
+ char *sep = " "; /* item separator for output */
265
+ char *fmt = "%.1f"; /* output format for support/conf. */
266
+ int sout = 1; /* flag for abs./rel. support output */
267
+ int ext = 0; /* flag for extended support output */
268
+ int aval = 0; /* flag for add. eval. measure value */
269
+ int maxcnt = 0; /* maximal number of items per set */
270
+ int tacnt; /* number of transactions */
271
+ int frq; /* frequency of an item set */
272
+ int *map, *set; /* identifier map, item set */
273
+ const char *name; /* buffer for item names */
274
+ static char buf[4*TS_SIZE+4]; /* buffer for formatting */
275
+ clock_t t, tt, tc, x; /* timer for measurements */
276
+
277
+ #ifndef QUIET /* if not quiet version */
278
+ prgname = argv[0]; /* get program name for error msgs. */
279
+
280
+ /* --- print usage message --- */
281
+ if (argc > 1) { /* if arguments are given */
282
+ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
283
+ fprintf(stderr, VERSION); } /* print a startup message */
284
+ else { /* if no arguments given */
285
+ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
286
+ printf("%s\n", DESCRIPTION);
287
+ printf("%s\n", VERSION);
288
+ printf("-t# target type (default: association rules)\n"
289
+ " (s: item sets, c: closed item sets,"
290
+ " m: maximal item sets,\n"
291
+ " r: association rules,"
292
+ " h: association hyperedges)\n");
293
+ printf("-m# minimal number of items per set/rule/hyperedge "
294
+ "(default: %d)\n", minlen);
295
+ printf("-n# maximal number of items per set/rule/hyperedge "
296
+ "(default: no limit)\n");
297
+ printf("-s# minimal support of a set/rule/hyperedge "
298
+ "(default: %g%%)\n", supp *100);
299
+ printf("-S# maximal support of a set/rule/hyperedge "
300
+ "(default: %g%%)\n", smax *100);
301
+ printf("-c# minimal confidence of a rule/hyperedge "
302
+ "(default: %g%%)\n", conf *100);
303
+ printf("-o use original definition of the support of a rule "
304
+ "(body & head)\n");
305
+ printf("-k# item separator for output "
306
+ "(default: \"%s\")\n", sep);
307
+ printf("-p# output format for support/confidence "
308
+ "(default: \"%s\")\n", fmt);
309
+ printf("-x extended support output "
310
+ "(print both rule support types)\n");
311
+ printf("-a print absolute support "
312
+ "(number of transactions)\n");
313
+ printf("-y print lift value (confidence divided by prior)\n");
314
+ printf("-e# additional evaluation measure (default: none)\n");
315
+ printf("-! print a list of additional evaluation measures\n");
316
+ printf("-d# minimal value of additional evaluation measure "
317
+ "(default: %g%%)\n", minval *100);
318
+ printf("-v print value of additional "
319
+ "rule evaluation measure\n");
320
+ printf("-g write output in scanable form "
321
+ "(quote certain characters)\n");
322
+ printf("-l do not load transactions into memory "
323
+ "(work on input file)\n");
324
+ printf("-q# sort items w.r.t. their frequency (default: %d)\n"
325
+ " (1: ascending, -1: descending, 0: do not sort,\n"
326
+ " 2: ascending, -2: descending w.r.t. "
327
+ "transaction size sum)\n", sort);
328
+ printf("-u# filter unused items from transactions "
329
+ "(default: %g)\n", filter);
330
+ printf(" (0: do not filter items w.r.t. usage in sets,\n"
331
+ " <0: fraction of removed items for filtering,\n"
332
+ " >0: take execution times ratio into account)\n");
333
+ printf("-h do not organize transactions as a prefix tree\n");
334
+ printf("-j use quicksort to sort the transactions "
335
+ "(default: heapsort)\n");
336
+ printf("-z minimize memory usage "
337
+ "(default: maximize speed)\n");
338
+ printf("-b/f/r# blank characters, field and record separators\n"
339
+ " (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
340
+ printf("-C# comment characters (default: \"#\")\n");
341
+ printf("infile file to read transactions from\n");
342
+ printf("outfile file to write item sets/association rules"
343
+ "/hyperedges to\n");
344
+ printf("appfile file stating item appearances (optional)\n");
345
+ return 0; /* print a usage message */
346
+ } /* and abort the program */
347
+ #endif /* #ifndef QUIET */
348
+
349
+ /* --- evaluate arguments --- */
350
+ for (i = 1; i < argc; i++) { /* traverse arguments */
351
+ s = argv[i]; /* get option argument */
352
+ if (optarg) { *optarg = s; optarg = NULL; continue; }
353
+ if ((*s == '-') && *++s) { /* -- if argument is an option */
354
+ while (*s) { /* traverse options */
355
+ switch (*s++) { /* evaluate switches */
356
+ case '!': help(); break;
357
+ case 't': target = (*s) ? *s++ : 'r'; break;
358
+ case 'm': minlen = (int)strtol(s, &s, 0); break;
359
+ case 'n': maxlen = (int)strtol(s, &s, 0); break;
360
+ case 's': supp = 0.01*strtod(s, &s); break;
361
+ case 'S': smax = 0.01*strtod(s, &s); break;
362
+ case 'c': conf = 0.01*strtod(s, &s); break;
363
+ case 'o': mode |= IST_BOTH; break;
364
+ case 'k': optarg = &sep; break;
365
+ case 'p': optarg = &fmt; break;
366
+ case 'x': ext = 1; break;
367
+ case 'a': sout |= 2; break;
368
+ case 'y': lift = 1; break;
369
+ case 'e': arem = (*s) ? *s++ : 0; break;
370
+ case 'd': minval = 0.01*strtod(s, &s); break;
371
+ case 'v': aval = 1; break;
372
+ case 'g': c2scf = 1; break;
373
+ case 'l': load = 0; break;
374
+ case 'q': sort = (int)strtol(s, &s, 0); break;
375
+ case 'u': filter = strtod(s, &s); break;
376
+ case 'h': tree = 0; break;
377
+ case 'j': heap = 0; break;
378
+ case 'z': mode |= IST_MEMOPT; break;
379
+ case 'b': optarg = &blanks; break;
380
+ case 'f': optarg = &fldseps; break;
381
+ case 'r': optarg = &recseps; break;
382
+ case 'C': optarg = &comment; break;
383
+ default : error(E_OPTION, *--s); break;
384
+ } /* set option variables */
385
+ if (optarg && *s) { *optarg = s; optarg = NULL; break; }
386
+ } } /* get option argument */
387
+ else { /* -- if argument is no option */
388
+ switch (k++) { /* evaluate non-options */
389
+ case 0: fn_in = s; break;
390
+ case 1: fn_out = s; break;
391
+ case 2: fn_app = s; break;
392
+ default: error(E_ARGCNT); break;
393
+ } /* note filenames */
394
+ }
395
+ }
396
+ if (optarg) error(E_OPTARG); /* check option argument */
397
+ if ((k < 2) || (k > 3)) /* and the number of arguments */
398
+ error(E_ARGCNT); /* (either in/out or in/out/app) */
399
+ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
400
+ error(E_STDIN); /* stdin must not be used twice */
401
+ switch (target) { /* check and translate target type */
402
+ case 's': target = TT_SET; break;
403
+ case 'c': target = TT_CLSET; break;
404
+ case 'm': target = TT_MFSET; break;
405
+ case 'r': target = TT_RULE; break;
406
+ case 'h': target = TT_HEDGE; break;
407
+ case 'g': target = TT_GROUP; break;
408
+ default : error(E_TARGET, (char)target); break;
409
+ }
410
+ if (supp > 1) /* check the minimal support */
411
+ error(E_SUPP, supp); /* (< 0: absolute number) */
412
+ if ((conf < 0) || (conf > 1))
413
+ error(E_CONF, conf); /* check the minimal confidence */
414
+ if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
415
+ if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
416
+ switch (arem) { /* check and translate measure */
417
+ case 0 : case '0': arem = EM_NONE; break;
418
+ case 'd': case '1': arem = EM_DIFF; break;
419
+ case 'q': case '2': arem = EM_QUOT; break;
420
+ case 'a': case '3': arem = EM_AIMP; break;
421
+ case 'i': case '4': arem = EM_INFO; break;
422
+ case 'c': case '5': arem = EM_CHI2; break;
423
+ case 'p': case '6': arem = EM_PVAL; break;
424
+ default : error(E_MEASURE, (char)arem); break;
425
+ }
426
+ if (target <= TT_MFSET) { /* in item set mode neutralize */
427
+ mode |= IST_BOTH; conf = 1;}/* rule specific settings */
428
+ if (arem == EM_NONE) /* if no add. rule eval. measure, */
429
+ aval = 0; /* clear the corresp. output flag */
430
+ if ((filter <= -1) || (filter >= 1)) filter = 0;
431
+
432
+ /* --- create item set and transaction set --- */
433
+ itemset = is_create(-1); /* create an item set and */
434
+ if (!itemset) error(E_NOMEM); /* set the special characters */
435
+ is_chars(itemset, blanks, fldseps, recseps, comment);
436
+ if (load) { /* if to load the transactions */
437
+ taset = tas_create(itemset);
438
+ if (!taset) error(E_NOMEM); /* create a transaction set */
439
+ } /* to store the transactions */
440
+ MSG(fprintf(stderr, "\n")); /* terminate the startup message */
441
+
442
+ /* --- read item appearances --- */
443
+ if (fn_app) { /* if item appearances are given */
444
+ t = clock(); /* start the timer */
445
+ if (*fn_app) /* if an app. file name is given, */
446
+ in = fopen(fn_app, "r"); /* open the item appearances file */
447
+ else { /* if no app. file name is given, */
448
+ in = stdin; fn_app = "<stdin>"; } /* read from std. input */
449
+ MSG(fprintf(stderr, "reading %s ... ", fn_app));
450
+ if (!in) error(E_FOPEN, fn_app);
451
+ k = is_readapp(itemset,in); /* read the item appearances */
452
+ if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
453
+ if (in != stdin) /* if not read from standard input, */
454
+ fclose(in); /* close the input file */
455
+ MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
456
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
457
+ } /* print a log message */
458
+
459
+ /* --- read transactions --- */
460
+ t = clock(); /* start the timer */
461
+ if (fn_in && *fn_in) /* if an input file name is given, */
462
+ in = fopen(fn_in, "r"); /* open input file for reading */
463
+ else { /* if no input file name is given, */
464
+ in = stdin; fn_in = "<stdin>"; } /* read from standard input */
465
+ MSG(fprintf(stderr, "reading %s ... ", fn_in));
466
+ if (!in) error(E_FOPEN, fn_in);
467
+ while (1) { /* transaction read loop */
468
+ k = is_read(itemset, in); /* read the next transaction */
469
+ if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
470
+ if (k > 0) break; /* check for error and end of file */
471
+ k = is_tsize(itemset); /* update the maximal */
472
+ if (k > maxcnt) maxcnt = k; /* transaction size */
473
+ if (taset && (tas_add(taset, NULL, 0) != 0))
474
+ error(E_NOMEM); /* add the loaded transaction */
475
+ } /* to the transaction set */
476
+ if (taset) { /* if transactions have been loaded */
477
+ if (in != stdin) fclose(in);/* if not read from standard input, */
478
+ in = NULL; /* close the input file */
479
+ } /* clear the file variable */
480
+ n = is_cnt(itemset); /* get the number of items */
481
+ tacnt = is_gettac(itemset); /* and the number of transactions */
482
+ MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
483
+ MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
484
+ if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
485
+ MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
486
+ if (supp >= 0) /* if relative support is given */
487
+ supp = ceil(tacnt *supp); /* compute absolute support */
488
+ else { /* if absolute support is given, */
489
+ supp = ceil(-100 *supp); /* make the support value positive */
490
+ if (!(sout & 2)) sout = 2; /* switch to absolute support output */
491
+ } /* do the same with the max. support */
492
+ smax = floor(((smax >= 0) ? tacnt : -100) *smax);
493
+
494
+ /* --- sort and recode items --- */
495
+ MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
496
+ t = clock(); /* start the timer */
497
+ map = (int*)malloc(is_cnt(itemset) *sizeof(int));
498
+ if (!map) error(E_NOMEM); /* create an item identifier map */
499
+ k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
500
+ n = is_recode(itemset, k, sort, map);
501
+ if (taset) { /* sort and recode the items and */
502
+ tas_recode(taset, map,n); /* recode the loaded transactions */
503
+ maxcnt = tas_max(taset); /* get the new maximal t.a. size */
504
+ } /* (may be smaller than before) */
505
+ free(map); /* delete the item identifier map */
506
+ MSG(fprintf(stderr, "[%d item(s)] ", n));
507
+ MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
508
+ if (n <= 0) error(E_NOFREQ); /* print a log message and */
509
+ MSG(fprintf(stderr, "\n")); /* check the number of items */
510
+ if (maxlen > maxcnt) /* clamp the set/rule length */
511
+ maxlen = maxcnt; /* to the maximum set size */
512
+
513
+ /* --- create a transaction tree --- */
514
+ tt = 0; /* init. the tree construction time */
515
+ if (tree && taset) { /* if transactions were loaded */
516
+ MSG(fprintf(stderr, "creating transaction tree ... "));
517
+ t = clock(); /* start the timer */
518
+ tatree = tat_create(taset, heap);
519
+ if (!tatree) error(E_NOMEM);/* create a transaction tree */
520
+ if (filter == 0) { /* if a tree rebuild is not needed, */
521
+ tas_delete(taset, 0); taset = NULL; } /* delete transactions */
522
+ tt = clock() -t; /* note the time for the construction */
523
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
524
+ } /* print a log message */
525
+
526
+ /* --- create an item set tree --- */
527
+ t = clock(); tc = 0; /* start the timer */
528
+ istree = ist_create(itemset, mode, (int)supp, conf);
529
+ if (!istree) error(E_NOMEM); /* create an item set tree */
530
+
531
+ /* --- check item subsets --- */
532
+ if (filter) { /* if to filter unused items */
533
+ used = (char*)malloc(is_cnt(itemset) *sizeof(char));
534
+ if (!used) error(E_NOMEM); /* create a flag vector */
535
+ } /* for the items */
536
+ MSG(fprintf(stderr, "checking subsets of size 1"));
537
+ while (ist_height(istree) < maxlen) {
538
+ if (filter != 0) { /* if to filter w.r.t. item usage, */
539
+ i = ist_check(istree, used); /* check current item usage */
540
+ if (i < maxlen) maxlen = i; /* update the maximum size */
541
+ if (ist_height(istree) >= i) break;
542
+ } /* check the tree height */
543
+ k = ist_addlvl(istree); /* while max. height is not reached, */
544
+ if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
545
+ if (k != 0) break; /* if no level was added, abort */
546
+ MSG(fprintf(stderr, " %d", ist_height(istree)));
547
+ if (tatree) { /* if a transaction tree was created */
548
+ if (((filter < 0) /* if to filter w.r.t. item usage */
549
+ && (i < -filter *n)) /* and enough items were removed */
550
+ || ((filter > 0) /* or counting time is long enough */
551
+ && (i < n) && (i *(double)tt < filter *n *tc))) {
552
+ n = i; x = clock(); /* note the new number of items */
553
+ tas_filter(taset, used);/* and remove unnecessary items */
554
+ tat_delete(tatree); /* delete the transaction tree */
555
+ tatree = tat_create(taset, heap);
556
+ if (!tatree) error(E_NOMEM);
557
+ tt = clock() -x; /* rebuild the transaction tree and */
558
+ } /* note the new construction time */
559
+ x = clock(); /* count the transaction tree */
560
+ ist_countx(istree, tatree);
561
+ tc = clock() -x; } /* note the new count time */
562
+ else if (taset) { /* if transactions were loaded */
563
+ if (((filter < 0) /* if to filter w.r.t. item usage */
564
+ && (i <= -filter *n)) /* and enough items were removed */
565
+ || ((filter > 0) /* or counting time is long enough */
566
+ && (i *(double)tt <= filter *n *tc))) {
567
+ n = i; x = clock(); /* note the new number of items */
568
+ tas_filter(taset, used);/* and remove unnecessary items */
569
+ tt = clock() -t; /* from the transactions */
570
+ } /* note the filtering time */
571
+ for (i = tacnt; --i >= 0;)/* traverse and count transactions */
572
+ ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
573
+ tc = clock() -t; } /* note the new count time */
574
+ else { /* if to work on the input file, */
575
+ rewind(in); /* reset the file position */
576
+ for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
577
+ if (filter != 0) /* (re)read the transactions and */
578
+ is_filter(itemset, used); /* remove unnecessary items */
579
+ k = is_tsize(itemset); /* update the maximum size */
580
+ if (k > maxcnt) maxcnt = k; /* of a transaction */
581
+ ist_count(istree, is_tract(itemset), k);
582
+ } /* count the transaction in the tree */
583
+ if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
584
+ if (maxcnt < maxlen) /* update the maximal rule length */
585
+ maxlen = maxcnt; /* according to the max. t.a. size */
586
+ } /* (may be smaller than before) */
587
+ }
588
+ if (!taset && !tatree) { /* if transactions were not loaded */
589
+ if (in != stdin) fclose(in);/* if not read from standard input, */
590
+ in = NULL; /* close the input file */
591
+ } /* clear the file variable */
592
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
593
+
594
+ /* --- filter found item sets --- */
595
+ if ((target == TT_CLSET) || (target == TT_MFSET)) {
596
+ MSG(fprintf(stderr, "filtering %s item sets ... ",
597
+ (target == TT_MFSET) ? "maximal" : "closed"));
598
+ t = clock(); /* filter the item sets */
599
+ ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
600
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
601
+ } /* (filter takes longer than print) */
602
+
603
+ /* --- sort transactions --- */
604
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
605
+ if (!taset) /* transactions must be loaded */
606
+ ext = 0; /* for extended support output */
607
+ else if (ext) { /* if extended output is requested */
608
+ MSG(fprintf(stderr, "sorting transactions ... "));
609
+ t = clock(); /* start the timer */
610
+ tas_sort(taset, heap); /* sort the transactions */
611
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
612
+ } /* (sorting is necessary to find the */
613
+ } /* number of identical transactions) */
614
+
615
+ /* --- print item sets/rules/hyperedges --- */
616
+ t = clock(); /* start the timer */
617
+ if (fn_out && *fn_out) /* if an output file name is given, */
618
+ out = fopen(fn_out, "w"); /* open the output file */
619
+ else { /* if no output file name is given, */
620
+ out = stdout; fn_out = "<stdout>"; } /* write to std. output */
621
+ MSG(fprintf(stderr, "writing %s ... ", fn_out));
622
+ if (!out) error(E_FOPEN, fn_out);
623
+ ist_init(istree, minlen, arem, minval);
624
+ set = is_tract(itemset); /* get the transaction buffer */
625
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
626
+ for (n = 0; 1; ) { /* extract item sets from the tree */
627
+ k = ist_set(istree, set, &frq, &conf);
628
+ if (k <= 0) break; /* get the next frequent item set */
629
+ if (frq > smax) continue; /* check against maximal support */
630
+ for (i = 0; i < k; i++) { /* traverse the set's items */
631
+ name = is_name(itemset, set[i]);
632
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
633
+ fputs(name, out); /* print the name of the next item */
634
+ fputs((i < k-1) ? sep : " ", out);
635
+ } /* print a separator */
636
+ fputs(" (", out); /* print the item set's support */
637
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
638
+ if (sout & 2) fputc('/', out); }
639
+ if (sout & 2) { fprintf(out, "%d", frq); }
640
+ if (ext) { /* if to print the extended support */
641
+ frq = tas_occur(taset, set, k);
642
+ fputs(", ", out); /* get the number of occurrences */
643
+ fprintf(out, fmt, (frq/(double)tacnt) *100);
644
+ if (sout & 2) fprintf(out, "/%d", frq);
645
+ } /* print the extended support data */
646
+ if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
647
+ fputs(")\n", out); /* print the add. eval. measure, */
648
+ n++; /* terminate the support output, */
649
+ } } /* and count the item set */
650
+ else if (target == TT_RULE) { /* if to find association rules, */
651
+ for (n = 0; 1; ) { /* extract rules from tree */
652
+ k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
653
+ if (k <= 0) break; /* get the next association rule */
654
+ if (frq > smax) continue; /* check against maximal support */
655
+ for (i = 0; i < k; i++) { /* traverse the rule's items */
656
+ name = is_name(itemset, set[i]);
657
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
658
+ fputs(name, out); /* print the next item */
659
+ fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
660
+ } /* print a separator */
661
+ fputs(" (", out); /* print the rule evaluation */
662
+ if (sout & 1) supp = frq/(double)tacnt;
663
+ if (ext && !(mode & IST_HEAD)) {
664
+ if (sout & 1) { fprintf(out, fmt, supp *conf *100);
665
+ if (sout & 2) fputc('/', out); }
666
+ if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
667
+ fputs(", ", out); /* print the support of the rule */
668
+ } /* from the support of the body */
669
+ if (sout & 1) { fprintf(out, fmt, supp *100);
670
+ if (sout & 2) fputc('/', out); }
671
+ if (sout & 2) { fprintf(out, "%d", frq); }
672
+ fputs(", ", out); /* print the rule support */
673
+ if (ext && (mode & IST_HEAD)) {
674
+ if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
675
+ if (sout & 2) fputc('/', out); }
676
+ if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
677
+ fputs(", ", out); /* print the support of the body */
678
+ } /* from the support of the rule */
679
+ fprintf(out, fmt, conf *100); /* print the rule confidence */
680
+ if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
681
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
682
+ fputs(")\n", out); /* print the value of the additional */
683
+ n++; /* rule evaluation measure and */
684
+ } } /* count the association rule */
685
+ else if (target == TT_HEDGE){ /* if to find association hyperedges */
686
+ for (n = 0; 1; ) { /* extract hyperedges from tree */
687
+ k = ist_hedge(istree, set, &frq, &conf, &minval);
688
+ if (k <= 0) break; /* get the next hyperedge */
689
+ if (frq > smax) continue; /* check against maximal support */
690
+ for (i = 0; i < k; i++) { /* traverse the edge's items */
691
+ name = is_name(itemset, set[i]);
692
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
693
+ fputs(name, out); /* print the name of the next item */
694
+ fputs((i < k-1) ? sep : " ", out);
695
+ } /* print a separator */
696
+ fputs(" (", out); /* print the hyperedge evaluation */
697
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
698
+ if (sout & 2) fputc('/', out); }
699
+ if (sout & 2) { fprintf(out, "%d", frq); }
700
+ fputs(", ", out); fprintf(out, fmt, conf *100);
701
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
702
+ fputs(")\n", out); /* print support and confidence */
703
+ n++; /* of the hyperedge and */
704
+ } } /* count the hyperedge */
705
+ else { /* if to find association groups */
706
+ for (n = 0; 1; ) { /* extract groups from tree */
707
+ k = ist_group(istree, set, &frq, &minval);
708
+ if (k <= 0) break; /* get the next group */
709
+ if (frq > smax) continue; /* check against maximal support */
710
+ for (i = 0; i < k; i++) { /* traverse the group's items */
711
+ name = is_name(itemset, set[i]);
712
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
713
+ fputs(name, out); /* print the name of the next item */
714
+ fputs((i < k-1) ? sep : " ", out);
715
+ } /* print a separator */
716
+ fputs(" (", out); /* print the group evaluation */
717
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
718
+ if (sout & 2) fputc('/', out); }
719
+ if (sout & 2) { fprintf(out, "%d", frq); }
720
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
721
+ fputs(")\n", out); /* print support and add. measure */
722
+ n++; /* and count the group */
723
+ }
724
+ } /* if (target <= TT_MFSET) .. else .. */
725
+ if (fflush(out) != 0) error(E_FWRITE, fn_out);
726
+ if (out != stdout) fclose(out);
727
+ out = NULL; /* close the output file */
728
+ MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
729
+ MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
730
+ #ifdef BENCH
731
+ printf("number of support counters: %d\n", istree->sccnt);
732
+ printf("necessary support counters: %d\n", istree->scnec);
733
+ printf("number of child pointers : %d\n", istree->cpcnt);
734
+ printf("necessary child pointers : %d\n", istree->cpnec);
735
+ printf("allocated memory (bytes) : %d\n", istree->bytes);
736
+ #endif
737
+
738
+ /* --- clean up --- */
739
+ #ifndef NDEBUG /* if this is a debug version */
740
+ free(used); /* delete the item app. vector */
741
+ ist_delete(istree); /* delete the item set tree, */
742
+ if (tatree) tat_delete(tatree); /* the transaction tree, */
743
+ if (taset) tas_delete(taset, 0); /* the transaction set, */
744
+ is_delete(itemset); /* and the item set */
745
+ #endif
746
+ #ifdef STORAGE /* if storage debugging */
747
+ showmem("at end of program"); /* check memory usage */
748
+ #endif
749
+ return 0; /* return 'ok' */
750
+ } /* main() */