apriori-rails 0.2.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (122) hide show
  1. data/History.txt +22 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +149 -0
  5. data/Rakefile +17 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +88 -0
  12. data/config/requirements.rb +29 -0
  13. data/examples/01_simple_example.rb +39 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori.rb +133 -0
  95. data/lib/apriori/adapter.rb +13 -0
  96. data/lib/apriori/association_rule.rb +97 -0
  97. data/lib/apriori/version.rb +3 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +13 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +251 -0
  118. data/website/index.txt +154 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +267 -0
Binary file
@@ -0,0 +1,15 @@
#!/bin/sh
#-----------------------------------------------------------------------
# flg2set - convert a flag table into a transaction list
#
# usage: flg2set infile outfile
#
# The first line of infile holds one item name per column. Each
# following line holds one flag per column; for every column whose
# flag is "T" the item name of that column is written. The items of a
# line are separated by single blanks and each input line yields
# exactly one output line.
#-----------------------------------------------------------------------
gawk '
(NR == 1) {                       # header line: remember the item names
  for (i = 0; ++i <= NF; )
    items[i] = $i;
}
(NR > 1) {                        # data line: list the items flagged "T"
  for (i = k = 0; ++i <= NF; ) {
    if ($i == "T") {
      if (k++ > 0) printf(" ");   # blank between items, none in front
      printf("%s", items[i]);
    }
  }
  printf("\n");
}' "$1" > "$2"                    # quoted so paths with blanks stay intact
@@ -0,0 +1,13 @@
#!/bin/sh
#-----------------------------------------------------------------------
# hdr2set - convert a table with a header line into a transaction list
#
# usage: hdr2set infile outfile
#
# The first line of infile holds one column name per field. Every
# following line is written as one output line in which each field "x"
# is replaced by "NAME=x", NAME being the entry of the same column in
# the first line. Fields are separated by single blanks in the output.
#-----------------------------------------------------------------------
gawk '
(NR == 1) {                       # header line: remember the column names
  for (i = 0; ++i <= NF; )
    items[i] = $i;
}
(NR > 1) {                        # data line: emit name=value pairs
  for (i = k = 0; ++i <= NF; ) {
    if (k++ > 0) printf(" ");     # blank between pairs, none in front
    printf("%s=%s", items[i], $i);
  }
  printf("\n");
}' "$1" > "$2"                    # quoted so paths with blanks stay intact
@@ -0,0 +1,71 @@
1
+ The example files in this directory demonstrate how to use the
2
+ options -b, -f, and -r and the optional item appearances file.
3
+ This file also explains the conversion scripts, which can convert
4
+ different input formats into the format needed by the apriori program.
5
+
6
+ In the file test1.tab transactions are separated by newline characters
7
+ and the items of a transaction are separated by spaces. This is the
8
+ standard input format and hence the file can be processed directly:
9
+ apriori test1.tab test1.rul
10
+
11
+ In the file test2.tab the same transactions can be found, but several
12
+ different field separators are used. This file can be processed with:
13
+ apriori -f ",.;:" -l test2.tab test2.rul
14
+
15
+ The files test3.tab to test5.tab are in formats that cannot be
16
+ processed directly with the apriori program, but which may be common.
17
+
18
+ In the file test3.tab each line contains a transaction identifier and
19
+ an item, separated by a space. This file can be converted into the
20
+ standard input format with the script tid2set, i.e., with
21
+ tid2set test3.tab x.tab
22
+ Note, however, that the input file (here: test3.tab) must be sorted
23
+ w.r.t. the transaction identifier, so that items belonging to the
24
+ same transaction occupy consecutive lines/records.
25
+
26
+ In the file test4.tab the first line states the item names and the
27
+ following lines contain flags T (true) and F (false) depending on
28
+ whether the item is contained in the transaction represented by the
29
+ line or not. This format can be converted into the standard input
30
+ format with the script flg2set, i.e., with
31
+ flg2set test4.tab x.tab
32
+
33
+ In the file test5.tab there is one item per line and transactions
34
+ are separated by blank lines. This format can be converted into the
35
+ standard input format with the script row2set, i.e., with
36
+ row2set test5.tab x.tab
37
+
38
+ The additional scripts tab2set and hdr2set convert tables with column
39
+ numbers or column names into a format appropriate for the apriori
40
+ program. They are invoked in the same way as all other scripts
41
+ discussed above, i.e., with
42
+ tab2set a.tab b.tab
43
+ or
44
+ hdr2set a.tab b.tab
45
+ where a.tab is the name of the input file and b.tab the name of the
46
+ output file. The script tab2set replaces each table entry "x" of the
47
+ input file by "Xi=x", where i is the column number (starting with 1).
48
+ The script hdr2set reads the variable names from the first line of
49
+ the input file and then replaces each table entry "x" by "X=x", where
50
+ "X" is the variable name that was found in the corresponding column
51
+ of the first line. These scripts are handy if you want to process
52
+ tabular data by treating each table row as a transaction.
53
+
54
+ The file test.app demonstrates the use of item appearance indicators.
55
+ The first line of this file ('body') states that any item not explicitly
56
+ mentioned in this file may appear only in the body of a rule. The second
57
+ line says that item 2 may appear only in the head of a rule. Hence, by
58
+ processing the file test1.tab with
59
+ apriori test1.tab test.rul test.app
60
+ only rules with item 2 in the head are generated.
61
+
62
+ Note that any input may also be read from standard input and any output
63
+ may be sent to standard output, simply by specifying a '-' or an empty
64
+ string "" instead of a filename. For example
65
+ apriori test1.tab -
66
+ writes the rules directly to the terminal. They may be piped to any
67
+ other program, since all other messages of the apriori program are
68
+ written to standard error.
69
+
70
+ Enjoy,
71
+ Christian Borgelt
@@ -0,0 +1,7 @@
#!/bin/sh
#-----------------------------------------------------------------------
# row2set - convert a one-item-per-line file into a transaction list
#
# usage: row2set infile outfile
#
# Every non-empty input line contributes its first field as an item;
# an empty line terminates the current transaction. The items of a
# transaction are written on one line, separated by single blanks.
#-----------------------------------------------------------------------
gawk '
BEGIN      { n = 0; }             # number of items in the current line
($0 != "") { if (n++ > 0) printf(" ");
             printf("%s", $1); }  # fixed format: the item is data and
                                  # must never be used as format string
($0 == "") { printf("\n"); n = 0; }
END        { printf("\n"); }      # terminate the last transaction
' "$1" > "$2"                     # quoted so paths with blanks stay intact
@@ -0,0 +1,24 @@
#!/bin/bash
#-----------------------------------------------------------------------
# File    : rulesort
# Contents: sort output of apriori
# Author  : Christian Borgelt
# History : ??.??.1996 file created
#           27.02.1997 default settings moved to default case
#           26.03.2003 adapted to current apriori version
#-----------------------------------------------------------------------
# usage: rulesort [-1|-2] infile outfile
#
#   -1  sort numerically, in descending order, by the first percentage
#       inside the parentheses (the number followed by "%," or "%/")
#   -2  sort numerically, in descending order, by the percentage that
#       follows ", " or "/ " inside the parentheses
#   otherwise sort the lines in dictionary order
#
# For -1/-2 the first sed call prefixes each matching line with
# "<key>#", sort orders the lines by that key and the last sed call
# strips everything up to and including the last "#" again.
#-----------------------------------------------------------------------
case $1 in
  '-1')
    cmd='s/\(.*(\)\([0-9]*\.[0-9]*\)\(%[,/].*\)/\2#\1\2\3/'
    sopt='-n -r'
    shift;;
  '-2')
    cmd='s/\(.*(.*[,/] \)\([0-9]*\.[0-9]*\)\(%.*\)/\2#\1\2\3/'
    sopt='-n -r'
    shift;;
  *)                            # "default)" would only match the literal
    cmd=''                      # word "default"; "*" is the catch-all
    sopt='-d'
    ;;
esac
# $sopt is left unquoted on purpose: it may hold several sort options.
sed "$cmd" "$1" | sort $sopt | sed 's/^.*#//' > "$2"
@@ -0,0 +1,9 @@
#!/bin/sh
#-----------------------------------------------------------------------
# tab2set - convert a table without header into a transaction list
#
# usage: tab2set infile outfile
#
# Each input line is written as one output line in which every field
# "x" is replaced by "Xi=x", where i is the column number (starting
# with 1). Fields are separated by single blanks in the output.
#-----------------------------------------------------------------------
gawk '
{
  for (i = k = 0; ++i <= NF; ) {
    if (k++ > 0) printf(" ");     # blank between pairs, none in front
    printf("X%d=%s", i, $i);
  }
  printf("\n");
}' "$1" > "$2"                    # quoted so paths with blanks stay intact
@@ -0,0 +1,2 @@
1
+ body
2
+ 2 head
@@ -0,0 +1,9 @@
1
+ 4 <- 5 (30.0, 100.0)
2
+ 2 <- 1 (60.0, 83.3)
3
+ 3 <- 2 (70.0, 85.7)
4
+ 2 <- 3 (70.0, 85.7)
5
+ 4 <- 5 1 (10.0, 100.0)
6
+ 4 <- 5 3 (10.0, 100.0)
7
+ 3 <- 1 2 (50.0, 80.0)
8
+ 2 <- 1 3 (40.0, 100.0)
9
+ 2 <- 1 3 4 (20.0, 100.0)
@@ -0,0 +1,43 @@
1
+ 5 <- (100.0, 30.0)
2
+ 1 <- (100.0, 60.0)
3
+ 2 <- (100.0, 70.0)
4
+ 3 <- (100.0, 70.0)
5
+ 4 <- (100.0, 70.0)
6
+ 1 <- 5 (30.0, 33.3)
7
+ 3 <- 5 (30.0, 33.3)
8
+ 4 <- 5 (30.0, 100.0)
9
+ 5 <- 4 (70.0, 42.9)
10
+ 2 <- 1 (60.0, 83.3)
11
+ 1 <- 2 (70.0, 71.4)
12
+ 3 <- 1 (60.0, 66.7)
13
+ 1 <- 3 (70.0, 57.1)
14
+ 4 <- 1 (60.0, 66.7)
15
+ 1 <- 4 (70.0, 57.1)
16
+ 3 <- 2 (70.0, 85.7)
17
+ 2 <- 3 (70.0, 85.7)
18
+ 4 <- 2 (70.0, 57.1)
19
+ 2 <- 4 (70.0, 57.1)
20
+ 4 <- 3 (70.0, 57.1)
21
+ 3 <- 4 (70.0, 57.1)
22
+ 4 <- 5 1 (10.0, 100.0)
23
+ 1 <- 5 4 (30.0, 33.3)
24
+ 5 <- 1 4 (40.0, 25.0)
25
+ 4 <- 5 3 (10.0, 100.0)
26
+ 3 <- 5 4 (30.0, 33.3)
27
+ 5 <- 3 4 (40.0, 25.0)
28
+ 3 <- 1 2 (50.0, 80.0)
29
+ 2 <- 1 3 (40.0, 100.0)
30
+ 1 <- 2 3 (60.0, 66.7)
31
+ 4 <- 1 2 (50.0, 60.0)
32
+ 2 <- 1 4 (40.0, 75.0)
33
+ 1 <- 2 4 (40.0, 75.0)
34
+ 4 <- 1 3 (40.0, 50.0)
35
+ 3 <- 1 4 (40.0, 50.0)
36
+ 1 <- 3 4 (40.0, 50.0)
37
+ 4 <- 2 3 (60.0, 50.0)
38
+ 3 <- 2 4 (40.0, 75.0)
39
+ 2 <- 3 4 (40.0, 75.0)
40
+ 4 <- 1 2 3 (40.0, 50.0)
41
+ 3 <- 1 2 4 (30.0, 66.7)
42
+ 2 <- 1 3 4 (20.0, 100.0)
43
+ 1 <- 2 3 4 (30.0, 66.7)
@@ -0,0 +1,10 @@
1
+ 1 2 3
2
+ 1 4 5
3
+ 2 3 4
4
+ 1 2 3 4
5
+ 2 3
6
+ 1 2 4
7
+ 4 5
8
+ 1 2 3 4
9
+ 3 4 5
10
+ 1 2 3
@@ -0,0 +1,10 @@
1
+ 1,2,3
2
+ 1,4,5
3
+ 2.3.4
4
+ 1,2,3,4
5
+ 2:3
6
+ 1,2,4
7
+ 4,5
8
+ 1,2,3,4
9
+ 3;4;5
10
+ 1,2,3
@@ -0,0 +1,30 @@
1
+ 0 1
2
+ 0 2
3
+ 0 3
4
+ 1 1
5
+ 1 4
6
+ 1 5
7
+ 2 2
8
+ 2 3
9
+ 2 4
10
+ 3 1
11
+ 3 2
12
+ 3 3
13
+ 3 4
14
+ 4 2
15
+ 4 3
16
+ 5 1
17
+ 5 2
18
+ 5 4
19
+ 6 4
20
+ 6 5
21
+ 7 1
22
+ 7 2
23
+ 7 3
24
+ 7 4
25
+ 8 3
26
+ 8 4
27
+ 8 5
28
+ 9 1
29
+ 9 2
30
+ 9 3
@@ -0,0 +1,11 @@
1
+ 1 2 3 4 5
2
+ T T T F F
3
+ T F F T T
4
+ F T T T F
5
+ T T T T F
6
+ F T T F F
7
+ T T F T F
8
+ F F F T T
9
+ T T T T F
10
+ F F T T T
11
+ T T T F F
@@ -0,0 +1,39 @@
1
+ 1
2
+ 2
3
+ 3
4
+
5
+ 1
6
+ 4
7
+ 5
8
+
9
+ 2
10
+ 3
11
+ 4
12
+
13
+ 1
14
+ 2
15
+ 3
16
+ 4
17
+
18
+ 2
19
+ 3
20
+
21
+ 1
22
+ 2
23
+ 4
24
+
25
+ 4
26
+ 5
27
+
28
+ 1
29
+ 2
30
+ 3
31
+ 4
32
+
33
+ 3
34
+ 4
35
+ 5
36
+
37
+ 1
38
+ 2
39
+ 3
@@ -0,0 +1,23 @@
#!/bin/sh
#-----------------------------------------------------------------------
# tid2set - convert (transaction id, item) pairs into a transaction list
#
# usage: tid2set infile outfile
#
# Each input line holds a transaction identifier in its first field
# and an item in its second field. Consecutive lines with the same
# identifier are collected and written as one output line, the items
# separated by single blanks. Only consecutive lines are grouped, so
# the input must be sorted w.r.t. the transaction identifier.
#-----------------------------------------------------------------------
gawk '
function output ()
{                                 # --- print the collected items
  if (i > 0)
    printf("%s", items[0]);
  for (k = 0; ++k < i; )
    printf(" %s", items[k]);
  printf("\n");
}

BEGIN { tid = ""; i = 0; }        # no transaction started yet
{
  if ($1 == tid)                  # same identifier: append the item
    items[i++] = $2;
  else {                          # new identifier: flush and restart
    if (tid != "") output();
    tid = $1;
    items[0] = $2; i = 1;
  }
}
END { output(); }                 # flush the last transaction
' "$1" > "$2"                     # quoted so paths with blanks stay intact
@@ -0,0 +1,33 @@
#!/bin/sh
#-----------------------------------------------------------------------
# xhdr2set - convert a comma separated table with a header line into a
#            transaction list and derive an item appearances file
#
# usage: xhdr2set infile outfile [appfile appout]
#
# infile   comma separated table; the first line holds the column names
# outfile  receives one line per data line with "NAME=value" items
# appfile  optional: first line is the default appearance indicator,
#          each further line is "NAME indicator" (blank separated)
# appout   optional: receives the default indicator followed by one
#          line "NAME=value indicator" for every generated item whose
#          indicator differs from the default
#
# appfile is only read if both appfile and appout are given.
#-----------------------------------------------------------------------
gawk -v app="$3" -v out="$4" '
BEGIN {
  FS = " ";                       # the appearances file is blank separated
  if ((app != "") && (out != "")) {
    getline dflt < app;           # first line: default indicator
    while ((getline < app) > 0)   # further lines: column name, indicator
      base[$1] = $2;
  }
  FS = ",";                       # the table itself is comma separated
}
(NR == 1) {                       # header line: remember the column names
  for (i = 0; ++i <= NF; )
    items[i] = $i;
}
(NR > 1) {                        # data line: emit name=value pairs
  for (i = k = 0; ++i <= NF; ) {
    if (k++ > 0) printf(" ");
    item = (items[i] "=" $i);
    printf("%s", item);
    # each generated item inherits the indicator of its column
    if (items[i] in base) apps[item] = base[items[i]];
    else                  apps[item] = dflt;
  }
  printf("\n");
}
END {
  if (out != "") {                # write the derived appearances file
    print dflt > out;
    for (t in apps)
      if (apps[t] != dflt)
        printf("%s %s\n", t, apps[t]) > out;
  }
}' "$1" > "$2"                    # quoted so paths with blanks stay intact
@@ -0,0 +1,750 @@
1
+ /*----------------------------------------------------------------------
2
+ File : apriori.c
3
+ Contents: apriori algorithm for finding association rules
4
+ Author : Christian Borgelt
5
+ History : 1996.02.14 file created
6
+ 1996.07.26 output precision reduced
7
+ 1996.11.22 options -b, -f, and -r added
8
+ 1996.11.24 option -e added (add. evaluation measures)
9
+ 1997.08.18 normalized chi^2 measure added
10
+ option -m (minimal rule length) added
11
+ 1997.10.13 quiet version (no output to stdout or stderr)
12
+ 1998.01.27 adapted to changed ist_create() function
13
+ 1998.08.08 optional input file (item appearances) added
14
+ 1998.09.02 several assertions added
15
+ 1998.09.07 hyperedge mode (option -h) added
16
+ 1998.12.08 output of absolute support (option -a) added
17
+ float changed to double
18
+ 1998.12.09 conversion of names to a scanable form added
19
+ 1999.02.05 long int changed to int
20
+ 1999.02.09 input from stdin, output to stdout added
21
+ 1999.08.09 bug in check of support parameter (<= 0) fixed
22
+ 1999.11.05 rule evaluation measure EM_AIMP added
23
+ 1999.11.08 output of add. rule eval. measure value added
24
+ 2000.03.16 optional use of original rule support definition
25
+ 2001.04.01 option -h replaced by option -t (target type)
26
+ 2001.05.26 extended support output added (option -x)
27
+ 2001.06.09 extended support output for item sets added
28
+ 2001.08.15 module scan used for output formatting
29
+ 2001.11.18 item and transaction functions made a module
30
+ 2001.11.19 options -C, -l changed, option -y removed
31
+ 2001.12.28 adapted to module tract, some improvements
32
+ 2002.01.11 evaluation measures codes changed to letters
33
+ 2002.02.10 option -q extended by a direction parameter
34
+ 2002.02.11 memory usage minimization option added
35
+ 2002.06.09 arbitrary supp./conf. formats made possible
36
+ 2003.01.09 option -k (item separator) added
37
+ 2003.01.14 check for empty transaction set added
38
+ 2003.03.12 output of lift value (conf/prior) added
39
+ 2003.07.17 item filtering w.r.t. usage added (option -u)
40
+ 2003.07.17 sorting w.r.t. transaction size sum added
41
+ 2003.07.18 maximal itemset filter added
42
+ 2003.08.11 closed itemset filter added
43
+ 2003.08.15 item filtering for transaction tree added
44
+ 2003.08.16 parameter for transaction filtering added
45
+ 2003.08.18 dynamic filtering decision based on times added
46
+ 2003.08.21 option -j (heap sort for transactions) added
47
+ 2003.09.22 meaning of option -j reversed (heapsort default)
48
+ 2004.03.25 option -S added (maximal support of a set/rule)
49
+ 2004.05.09 additional selection measure for sets added
50
+ 2004.10.28 two unnecessary assignments removed
51
+ 2004.11.20 bug in evaluation of -j (heap/quicksort) fixed
52
+ 2004.11.23 absolute/relative support output changed
53
+ 2004.12.09 semantics of option -p changed
54
+ 2005.01.25 bug in output of absolute/relative support fixed
55
+ 2005.01.31 another bug in this output fixed
56
+ 2005.06.20 use of flag for "no item sorting" corrected
57
+ 2007.02.13 adapted to modified module tabscan
58
+ 2008.03.13 additional hyperedge evaluation added
59
+ 2008.03.24 additional target added (association groups)
60
+ ----------------------------------------------------------------------*/
61
+ #include <stdio.h>
62
+ #include <stdlib.h>
63
+ #include <stdarg.h>
64
+ #include <string.h>
65
+ #include <limits.h>
66
+ #include <math.h>
67
+ #include <time.h>
68
+ #include <assert.h>
69
+ #include "scan.h"
70
+ #include "tract.h"
71
+ #include "istree.h"
72
+ #ifdef STORAGE
73
+ #include "storage.h"
74
+ #endif
75
+
76
+ /*----------------------------------------------------------------------
77
+ Preprocessor Definitions
78
+ ----------------------------------------------------------------------*/
/* program identification; DESCRIPTION and VERSION are printed in the */
/* startup and usage messages of main() */
#define PRGNAME "apriori"
#define DESCRIPTION "find association rules with the apriori algorithm"
#define VERSION "version 4.35 (2008.03.24) " \
"(c) 1996-2008 Christian Borgelt"

/* --- target types --- */
/* main() translates the character given with option -t into one of */
/* these codes; the codes are also the indices of the table ttypes[] */
#define TT_SET 0 /* frequent item sets */
#define TT_CLSET 1 /* closed item sets */
#define TT_MFSET 2 /* maximal item sets */
#define TT_RULE 3 /* association rules */
#define TT_HEDGE 4 /* association hyperedges */
#define TT_GROUP 5 /* association groups */

/* --- error codes --- */
/* error() uses the negated code as an index into errmsgs[]. */
/* The codes -1 to -4 and -16 to -20 that errmsgs[] lists are not */
/* defined here; presumably they come from the included headers */
/* (TODO confirm). */
#define E_OPTION (-5) /* unknown option */
#define E_OPTARG (-6) /* missing option argument */
#define E_ARGCNT (-7) /* too few/many arguments */
#define E_STDIN (-8) /* double assignment of stdin */
#define E_TARGET (-9) /* invalid target type */
#define E_SUPP (-10) /* invalid support */
#define E_CONF (-11) /* invalid confidence */
#define E_MEASURE (-12) /* invalid evaluation measure */
#define E_RULELEN (-13) /* invalid rule length */
#define E_NOTAS (-14) /* no items or transactions */
#define E_NOFREQ (-15) /* no frequent items */
#define E_UNKNOWN (-21) /* unknown error */

/* MSG(x) wraps a message output call so that it disappears in the */
/* quiet version. The non-quiet expansion without FFLUSH is the comma */
/* expression "x, fflush(stderr)", which is not parenthesized. */
/* NOTE(review): the flush is added when FFLUSH is NOT defined, which */
/* reads inverted w.r.t. the macro name - confirm this is intended. */
#ifndef QUIET /* if not quiet version */
#ifdef FFLUSH
#define MSG(x) x /* print messages */
#else /* if to flush every output */
#define MSG(x) x, fflush(stderr)
#endif
#else /* if quiet version */
#define MSG(x) /* suppress messages */
#endif

/* SEC_SINCE(t): clock() difference to t, converted to seconds */
#define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
/* RECCNT(s): record count of the table scanner of item set s, */
/* reduced by one if the last delimiter read was a record separator */
#define RECCNT(s) (ts_reccnt(is_tabscan(s)) \
- ((ts_delim(is_tabscan(s)) == TS_REC) ? 1 : 0))
/* BUFFER(s): buffer of the table scanner of item set s */
#define BUFFER(s) ts_buf(is_tabscan(s))
121
+ /*----------------------------------------------------------------------
122
+ Constants
123
+ ----------------------------------------------------------------------*/
#ifndef QUIET /* if not quiet version */
/* --- target types --- */
/* names of the target types, in the order of the TT_ codes given in */
/* the comments (the three item set targets share the name "set") */
static const char *ttypes[] = {
/* TT_SET 0 */ "set",
/* TT_CLSET 1 */ "set",
/* TT_MFSET 2 */ "set",
/* TT_RULE 3 */ "rule",
/* TT_HEDGE 4 */ "hyperedge",
/* TT_GROUP 5 */ "group",
};

/* --- error messages --- */
/* Indexed with the negated error code (error() reads errmsgs[-code]). */
/* Each entry is handed to vfprintf() as the format string, so its */
/* conversion specifications must match the additional arguments */
/* that are passed to error() for that code. */
static const char *errmsgs[] = {
/* E_NONE 0 */ "no error\n",
/* E_NOMEM -1 */ "not enough memory\n",
/* E_FOPEN -2 */ "cannot open file %s\n",
/* E_FREAD -3 */ "read error on file %s\n",
/* E_FWRITE -4 */ "write error on file %s\n",
/* E_OPTION -5 */ "unknown option -%c\n",
/* E_OPTARG -6 */ "missing option argument\n",
/* E_ARGCNT -7 */ "wrong number of arguments\n",
/* E_STDIN -8 */ "double assignment of standard input\n",
/* E_TARGET -9 */ "invalid target type '%c'\n",
/* E_SUPP -10 */ "invalid minimal support %g%%\n",
/* E_CONF -11 */ "invalid minimal confidence %g%%\n",
/* E_MEASURE -12 */ "invalid additional evaluation measure %c\n",
/* E_RULELEN -13 */ "invalid set size/rule length %d\n",
/* E_NOTAS -14 */ "no items or transactions to work on\n",
/* E_NOFREQ -15 */ "no frequent items\n",
/* E_ITEMEXP -16 */ "file %s, record %d: item expected\n",
/* E_DUPITEM -17 */ "file %s, record %d: duplicate item %s\n",
/* E_APPEXP -18 */ "file %s, record %d: "
"appearance indicator expected\n",
/* E_UNKAPP -19 */ "file %s, record %d: "
"unknown appearance indicator %s\n",
/* E_FLDCNT -20 */ "file %s, record %d: too many fields\n",
/* E_UNKNOWN -21 */ "unknown error\n"
};
#endif
163
+
164
+ /*----------------------------------------------------------------------
165
+ Global Variables
166
+ ----------------------------------------------------------------------*/
/* The objects and files below are kept at file scope so that error() */
/* can delete/close them before it terminates the program (it does */
/* so only if NDEBUG is not defined). A NULL pointer means that the */
/* object does not exist (yet) and is skipped by error(). */
#ifndef QUIET
static char *prgname; /* program name for error messages */
#endif
static ITEMSET *itemset = NULL; /* item set */
static TASET *taset = NULL; /* transaction set */
static TATREE *tatree = NULL; /* transaction tree */
static ISTREE *istree = NULL; /* item set tree */
static FILE *in = NULL; /* input file */
static FILE *out = NULL; /* output file */
176
+
177
+ /*----------------------------------------------------------------------
178
+ Main Functions
179
+ ----------------------------------------------------------------------*/
180
+
/*----------------------------------------------------------------------
  help: list the additional evaluation measures that can be selected
  with option -e and terminate the program with exit status 0.
  In the quiet version (QUIET defined) nothing is printed.
----------------------------------------------------------------------*/
static void help (void)
{                               /* --- print help on eval. measures */
#ifndef QUIET
  static const char *const text[] = {
    "additional evaluation measures (option -e#)\n",
    "frequent item sets:\n",
    "d or 1: binary logarithm of support quotient\n",
    "association rules:\n",
    "d or 1: absolute confidence difference to prior\n",
    "q or 2: absolute difference of confidence quotient to 1\n",
    "a or 3: absolute difference of improvement value to 1\n",
    "i or 4: information difference to prior\n",
    "c or 5: normalized chi^2 measure\n",
    "p or 6: p-value computed from chi^2 measure\n",
  };
  size_t row;                   /* index of the current text line */

  fputs("\n", stderr);          /* terminate the startup message */
  for (row = 0; row < sizeof(text) /sizeof(text[0]); row++)
    fputs(text[row], stdout);   /* the list itself goes to stdout */
#endif
  exit(0);                      /* help() never returns */
}  /* help() */
198
+
199
+ /*--------------------------------------------------------------------*/
200
+
201
+ static void error (int code, ...)
202
+ { /* --- print an error message */
203
+ #ifndef QUIET /* if not quiet version */
204
+ va_list args; /* list of variable arguments */
205
+ const char *msg; /* error message */
206
+
207
+ assert(prgname); /* check the program name */
208
+ if (code < E_UNKNOWN) code = E_UNKNOWN;
209
+ if (code < 0) { /* if to report an error, */
210
+ msg = errmsgs[-code]; /* get the error message */
211
+ if (!msg) msg = errmsgs[-E_UNKNOWN];
212
+ fprintf(stderr, "\n%s: ", prgname);
213
+ va_start(args, code); /* get variable arguments */
214
+ vfprintf(stderr, msg, args);/* print error message */
215
+ va_end(args); /* end argument evaluation */
216
+ }
217
+ #endif
218
+ #ifndef NDEBUG /* if debug version */
219
+ if (istree) ist_delete(istree); /* clean up memory */
220
+ if (tatree) tat_delete(tatree); /* and close files */
221
+ if (taset) tas_delete(taset, 0);
222
+ if (itemset) is_delete(itemset);
223
+ if (in && (in != stdin)) fclose(in);
224
+ if (out && (out != stdout)) fclose(out);
225
+ #endif
226
+ #ifdef STORAGE /* if storage debugging */
227
+ showmem("at end of program"); /* check memory usage */
228
+ #endif
229
+ exit(code); /* abort the program */
230
+ } /* error() */
231
+
232
+ /*--------------------------------------------------------------------*/
233
+
234
+ int main (int argc, char *argv[])
235
+ { /* --- main function */
236
+ int i, k = 0, n; /* loop variables, counters */
237
+ char *s; /* to traverse the options */
238
+ char **optarg = NULL; /* option argument */
239
+ char *fn_in = NULL; /* name of input file */
240
+ char *fn_out = NULL; /* name of output file */
241
+ char *fn_app = NULL; /* name of item appearances file */
242
+ char *blanks = NULL; /* blanks */
243
+ char *fldseps = NULL; /* field separators */
244
+ char *recseps = NULL; /* record separators */
245
+ char *comment = NULL; /* comment indicators */
246
+ char *used = NULL; /* item usage vector */
247
+ double supp = 0.1; /* minimal support (in percent) */
248
+ double smax = 1.0; /* maximal support (in percent) */
249
+ double conf = 0.8; /* minimal confidence (in percent) */
250
+ int mode = IST_BODY; /* search mode (rule support def.) */
251
+ int target = 'r'; /* target type (sets/rules/h.edges) */
252
+ int arem = 0; /* additional rule evaluation measure */
253
+ int lift = 0; /* flag for printing the lift */
254
+ double minval = 0.1; /* minimal evaluation measure value */
255
+ double lftval = 0; /* lift value (confidence/prior) */
256
+ int minlen = 1; /* minimal rule length */
257
+ int maxlen = INT_MAX; /* maximal rule length */
258
+ int load = 1; /* flag for loading transactions */
259
+ int sort = 2; /* flag for item sorting and recoding */
260
+ double filter = 0.1; /* item usage filtering parameter */
261
+ int tree = 1; /* flag for transaction tree */
262
+ int heap = 1; /* flag for heap sort vs. quick sort */
263
+ int c2scf = 0; /* flag for conv. to scanable form */
264
+ char *sep = " "; /* item separator for output */
265
+ char *fmt = "%.1f"; /* output format for support/conf. */
266
+ int sout = 1; /* flag for abs./rel. support output */
267
+ int ext = 0; /* flag for extended support output */
268
+ int aval = 0; /* flag for add. eval. measure value */
269
+ int maxcnt = 0; /* maximal number of items per set */
270
+ int tacnt; /* number of transactions */
271
+ int frq; /* frequency of an item set */
272
+ int *map, *set; /* identifier map, item set */
273
+ const char *name; /* buffer for item names */
274
+ static char buf[4*TS_SIZE+4]; /* buffer for formatting */
275
+ clock_t t, tt, tc, x; /* timer for measurements */
276
+
277
+ #ifndef QUIET /* if not quiet version */
278
+ prgname = argv[0]; /* get program name for error msgs. */
279
+
280
+ /* --- print usage message --- */
281
+ if (argc > 1) { /* if arguments are given */
282
+ fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
283
+ fprintf(stderr, VERSION); } /* print a startup message */
284
+ else { /* if no arguments given */
285
+ printf("usage: %s [options] infile outfile [appfile]\n", argv[0]);
286
+ printf("%s\n", DESCRIPTION);
287
+ printf("%s\n", VERSION);
288
+ printf("-t# target type (default: association rules)\n"
289
+ " (s: item sets, c: closed item sets,"
290
+ " m: maximal item sets,\n"
291
+ " r: association rules,"
292
+ " h: association hyperedges)\n");
293
+ printf("-m# minimal number of items per set/rule/hyperedge "
294
+ "(default: %d)\n", minlen);
295
+ printf("-n# maximal number of items per set/rule/hyperedge "
296
+ "(default: no limit)\n");
297
+ printf("-s# minimal support of a set/rule/hyperedge "
298
+ "(default: %g%%)\n", supp *100);
299
+ printf("-S# maximal support of a set/rule/hyperedge "
300
+ "(default: %g%%)\n", smax *100);
301
+ printf("-c# minimal confidence of a rule/hyperedge "
302
+ "(default: %g%%)\n", conf *100);
303
+ printf("-o use original definition of the support of a rule "
304
+ "(body & head)\n");
305
+ printf("-k# item separator for output "
306
+ "(default: \"%s\")\n", sep);
307
+ printf("-p# output format for support/confidence "
308
+ "(default: \"%s\")\n", fmt);
309
+ printf("-x extended support output "
310
+ "(print both rule support types)\n");
311
+ printf("-a print absolute support "
312
+ "(number of transactions)\n");
313
+ printf("-y print lift value (confidence divided by prior)\n");
314
+ printf("-e# additional evaluation measure (default: none)\n");
315
+ printf("-! print a list of additional evaluation measures\n");
316
+ printf("-d# minimal value of additional evaluation measure "
317
+ "(default: %g%%)\n", minval *100);
318
+ printf("-v print value of additional "
319
+ "rule evaluation measure\n");
320
+ printf("-g write output in scanable form "
321
+ "(quote certain characters)\n");
322
+ printf("-l do not load transactions into memory "
323
+ "(work on input file)\n");
324
+ printf("-q# sort items w.r.t. their frequency (default: %d)\n"
325
+ " (1: ascending, -1: descending, 0: do not sort,\n"
326
+ " 2: ascending, -2: descending w.r.t. "
327
+ "transaction size sum)\n", sort);
328
+ printf("-u# filter unused items from transactions "
329
+ "(default: %g)\n", filter);
330
+ printf(" (0: do not filter items w.r.t. usage in sets,\n"
331
+ " <0: fraction of removed items for filtering,\n"
332
+ " >0: take execution times ratio into account)\n");
333
+ printf("-h do not organize transactions as a prefix tree\n");
334
+ printf("-j use quicksort to sort the transactions "
335
+ "(default: heapsort)\n");
336
+ printf("-z minimize memory usage "
337
+ "(default: maximize speed)\n");
338
+ printf("-b/f/r# blank characters, field and record separators\n"
339
+ " (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
340
+ printf("-C# comment characters (default: \"#\")\n");
341
+ printf("infile file to read transactions from\n");
342
+ printf("outfile file to write item sets/association rules"
343
+ "/hyperedges to\n");
344
+ printf("appfile file stating item appearances (optional)\n");
345
+ return 0; /* print a usage message */
346
+ } /* and abort the program */
347
+ #endif /* #ifndef QUIET */
348
+
349
+ /* --- evaluate arguments --- */
350
+ for (i = 1; i < argc; i++) { /* traverse arguments */
351
+ s = argv[i]; /* get option argument */
352
+ if (optarg) { *optarg = s; optarg = NULL; continue; }
353
+ if ((*s == '-') && *++s) { /* -- if argument is an option */
354
+ while (*s) { /* traverse options */
355
+ switch (*s++) { /* evaluate switches */
356
+ case '!': help(); break;
357
+ case 't': target = (*s) ? *s++ : 'r'; break;
358
+ case 'm': minlen = (int)strtol(s, &s, 0); break;
359
+ case 'n': maxlen = (int)strtol(s, &s, 0); break;
360
+ case 's': supp = 0.01*strtod(s, &s); break;
361
+ case 'S': smax = 0.01*strtod(s, &s); break;
362
+ case 'c': conf = 0.01*strtod(s, &s); break;
363
+ case 'o': mode |= IST_BOTH; break;
364
+ case 'k': optarg = &sep; break;
365
+ case 'p': optarg = &fmt; break;
366
+ case 'x': ext = 1; break;
367
+ case 'a': sout |= 2; break;
368
+ case 'y': lift = 1; break;
369
+ case 'e': arem = (*s) ? *s++ : 0; break;
370
+ case 'd': minval = 0.01*strtod(s, &s); break;
371
+ case 'v': aval = 1; break;
372
+ case 'g': c2scf = 1; break;
373
+ case 'l': load = 0; break;
374
+ case 'q': sort = (int)strtol(s, &s, 0); break;
375
+ case 'u': filter = strtod(s, &s); break;
376
+ case 'h': tree = 0; break;
377
+ case 'j': heap = 0; break;
378
+ case 'z': mode |= IST_MEMOPT; break;
379
+ case 'b': optarg = &blanks; break;
380
+ case 'f': optarg = &fldseps; break;
381
+ case 'r': optarg = &recseps; break;
382
+ case 'C': optarg = &comment; break;
383
+ default : error(E_OPTION, *--s); break;
384
+ } /* set option variables */
385
+ if (optarg && *s) { *optarg = s; optarg = NULL; break; }
386
+ } } /* get option argument */
387
+ else { /* -- if argument is no option */
388
+ switch (k++) { /* evaluate non-options */
389
+ case 0: fn_in = s; break;
390
+ case 1: fn_out = s; break;
391
+ case 2: fn_app = s; break;
392
+ default: error(E_ARGCNT); break;
393
+ } /* note filenames */
394
+ }
395
+ }
396
+ if (optarg) error(E_OPTARG); /* check option argument */
397
+ if ((k < 2) || (k > 3)) /* and the number of arguments */
398
+ error(E_ARGCNT); /* (either in/out or in/out/app) */
399
+ if ((!fn_in || !*fn_in) && (fn_app && !*fn_app))
400
+ error(E_STDIN); /* stdin must not be used twice */
401
+ switch (target) { /* check and translate target type */
402
+ case 's': target = TT_SET; break;
403
+ case 'c': target = TT_CLSET; break;
404
+ case 'm': target = TT_MFSET; break;
405
+ case 'r': target = TT_RULE; break;
406
+ case 'h': target = TT_HEDGE; break;
407
+ case 'g': target = TT_GROUP; break;
408
+ default : error(E_TARGET, (char)target); break;
409
+ }
410
+ if (supp > 1) /* check the minimal support */
411
+ error(E_SUPP, supp); /* (< 0: absolute number) */
412
+ if ((conf < 0) || (conf > 1))
413
+ error(E_CONF, conf); /* check the minimal confidence */
414
+ if (minlen <= 0) error(E_RULELEN, minlen); /* check the limits */
415
+ if (maxlen <= 0) error(E_RULELEN, maxlen); /* for the rule length */
416
+ switch (arem) { /* check and translate measure */
417
+ case 0 : case '0': arem = EM_NONE; break;
418
+ case 'd': case '1': arem = EM_DIFF; break;
419
+ case 'q': case '2': arem = EM_QUOT; break;
420
+ case 'a': case '3': arem = EM_AIMP; break;
421
+ case 'i': case '4': arem = EM_INFO; break;
422
+ case 'c': case '5': arem = EM_CHI2; break;
423
+ case 'p': case '6': arem = EM_PVAL; break;
424
+ default : error(E_MEASURE, (char)arem); break;
425
+ }
426
+ if (target <= TT_MFSET) { /* in item set mode neutralize */
427
+ mode |= IST_BOTH; conf = 1;}/* rule specific settings */
428
+ if (arem == EM_NONE) /* if no add. rule eval. measure, */
429
+ aval = 0; /* clear the corresp. output flag */
430
+ if ((filter <= -1) || (filter >= 1)) filter = 0;
431
+
432
+ /* --- create item set and transaction set --- */
433
+ itemset = is_create(-1); /* create an item set and */
434
+ if (!itemset) error(E_NOMEM); /* set the special characters */
435
+ is_chars(itemset, blanks, fldseps, recseps, comment);
436
+ if (load) { /* if to load the transactions */
437
+ taset = tas_create(itemset);
438
+ if (!taset) error(E_NOMEM); /* create a transaction set */
439
+ } /* to store the transactions */
440
+ MSG(fprintf(stderr, "\n")); /* terminate the startup message */
441
+
442
+ /* --- read item appearances --- */
443
+ if (fn_app) { /* if item appearances are given */
444
+ t = clock(); /* start the timer */
445
+ if (*fn_app) /* if an app. file name is given, */
446
+ in = fopen(fn_app, "r"); /* open the item appearances file */
447
+ else { /* if no app. file name is given, */
448
+ in = stdin; fn_app = "<stdin>"; } /* read from std. input */
449
+ MSG(fprintf(stderr, "reading %s ... ", fn_app));
450
+ if (!in) error(E_FOPEN, fn_app);
451
+ k = is_readapp(itemset,in); /* read the item appearances */
452
+ if (k != 0) error(k, fn_app, RECCNT(itemset), BUFFER(itemset));
453
+ if (in != stdin) /* if not read from standard input, */
454
+ fclose(in); /* close the input file */
455
+ MSG(fprintf(stderr, "[%d item(s)]", is_cnt(itemset)));
456
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
457
+ } /* print a log message */
458
+
459
+ /* --- read transactions --- */
460
+ t = clock(); /* start the timer */
461
+ if (fn_in && *fn_in) /* if an input file name is given, */
462
+ in = fopen(fn_in, "r"); /* open input file for reading */
463
+ else { /* if no input file name is given, */
464
+ in = stdin; fn_in = "<stdin>"; } /* read from standard input */
465
+ MSG(fprintf(stderr, "reading %s ... ", fn_in));
466
+ if (!in) error(E_FOPEN, fn_in);
467
+ while (1) { /* transaction read loop */
468
+ k = is_read(itemset, in); /* read the next transaction */
469
+ if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
470
+ if (k > 0) break; /* check for error and end of file */
471
+ k = is_tsize(itemset); /* update the maximal */
472
+ if (k > maxcnt) maxcnt = k; /* transaction size */
473
+ if (taset && (tas_add(taset, NULL, 0) != 0))
474
+ error(E_NOMEM); /* add the loaded transaction */
475
+ } /* to the transaction set */
476
+ if (taset) { /* if transactions have been loaded */
477
+ if (in != stdin) fclose(in);/* if not read from standard input, */
478
+ in = NULL; /* close the input file */
479
+ } /* clear the file variable */
480
+ n = is_cnt(itemset); /* get the number of items */
481
+ tacnt = is_gettac(itemset); /* and the number of transactions */
482
+ MSG(fprintf(stderr, "[%d item(s), %d transaction(s)]", n, tacnt));
483
+ MSG(fprintf(stderr, " done [%.2fs].", SEC_SINCE(t)));
484
+ if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
485
+ MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
486
+ if (supp >= 0) /* if relative support is given */
487
+ supp = ceil(tacnt *supp); /* compute absolute support */
488
+ else { /* if absolute support is given, */
489
+ supp = ceil(-100 *supp); /* make the support value positive */
490
+ if (!(sout & 2)) sout = 2; /* switch to absolute support output */
491
+ } /* do the same with the max. support */
492
+ smax = floor(((smax >= 0) ? tacnt : -100) *smax);
493
+
494
+ /* --- sort and recode items --- */
495
+ MSG(fprintf(stderr, "filtering, sorting and recoding items ... "));
496
+ t = clock(); /* start the timer */
497
+ map = (int*)malloc(is_cnt(itemset) *sizeof(int));
498
+ if (!map) error(E_NOMEM); /* create an item identifier map */
499
+ k = (int)((mode & IST_HEAD) ? supp : ceil(supp *conf));
500
+ n = is_recode(itemset, k, sort, map);
501
+ if (taset) { /* sort and recode the items and */
502
+ tas_recode(taset, map,n); /* recode the loaded transactions */
503
+ maxcnt = tas_max(taset); /* get the new maximal t.a. size */
504
+ } /* (may be smaller than before) */
505
+ free(map); /* delete the item identifier map */
506
+ MSG(fprintf(stderr, "[%d item(s)] ", n));
507
+ MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
508
+ if (n <= 0) error(E_NOFREQ); /* print a log message and */
509
+ MSG(fprintf(stderr, "\n")); /* check the number of items */
510
+ if (maxlen > maxcnt) /* clamp the set/rule length */
511
+ maxlen = maxcnt; /* to the maximum set size */
512
+
513
+ /* --- create a transaction tree --- */
514
+ tt = 0; /* init. the tree construction time */
515
+ if (tree && taset) { /* if transactions were loaded */
516
+ MSG(fprintf(stderr, "creating transaction tree ... "));
517
+ t = clock(); /* start the timer */
518
+ tatree = tat_create(taset, heap);
519
+ if (!tatree) error(E_NOMEM);/* create a transaction tree */
520
+ if (filter == 0) { /* if a tree rebuild is not needed, */
521
+ tas_delete(taset, 0); taset = NULL; } /* delete transactions */
522
+ tt = clock() -t; /* note the time for the construction */
523
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
524
+ } /* print a log message */
525
+
526
+ /* --- create an item set tree --- */
527
+ t = clock(); tc = 0; /* start the timer */
528
+ istree = ist_create(itemset, mode, (int)supp, conf);
529
+ if (!istree) error(E_NOMEM); /* create an item set tree */
530
+
531
+ /* --- check item subsets --- */
532
+ if (filter) { /* if to filter unused items */
533
+ used = (char*)malloc(is_cnt(itemset) *sizeof(char));
534
+ if (!used) error(E_NOMEM); /* create a flag vector */
535
+ } /* for the items */
536
+ MSG(fprintf(stderr, "checking subsets of size 1"));
537
+ while (ist_height(istree) < maxlen) {
538
+ if (filter != 0) { /* if to filter w.r.t. item usage, */
539
+ i = ist_check(istree, used); /* check current item usage */
540
+ if (i < maxlen) maxlen = i; /* update the maximum size */
541
+ if (ist_height(istree) >= i) break;
542
+ } /* check the tree height */
543
+ k = ist_addlvl(istree); /* while max. height is not reached, */
544
+ if (k < 0) error(E_NOMEM); /* add a level to the item set tree */
545
+ if (k != 0) break; /* if no level was added, abort */
546
+ MSG(fprintf(stderr, " %d", ist_height(istree)));
547
+ if (tatree) { /* if a transaction tree was created */
548
+ if (((filter < 0) /* if to filter w.r.t. item usage */
549
+ && (i < -filter *n)) /* and enough items were removed */
550
+ || ((filter > 0) /* or counting time is long enough */
551
+ && (i < n) && (i *(double)tt < filter *n *tc))) {
552
+ n = i; x = clock(); /* note the new number of items */
553
+ tas_filter(taset, used);/* and remove unnecessary items */
554
+ tat_delete(tatree); /* delete the transaction tree */
555
+ tatree = tat_create(taset, heap);
556
+ if (!tatree) error(E_NOMEM);
557
+ tt = clock() -x; /* rebuild the transaction tree and */
558
+ } /* note the new construction time */
559
+ x = clock(); /* count the transaction tree */
560
+ ist_countx(istree, tatree);
561
+ tc = clock() -x; } /* note the new count time */
562
+ else if (taset) { /* if transactions were loaded */
563
+ if (((filter < 0) /* if to filter w.r.t. item usage */
564
+ && (i <= -filter *n)) /* and enough items were removed */
565
+ || ((filter > 0) /* or counting time is long enough */
566
+ && (i *(double)tt <= filter *n *tc))) {
567
+ n = i; x = clock(); /* note the new number of items */
568
+ tas_filter(taset, used);/* and remove unnecessary items */
569
+ tt = clock() -t; /* from the transactions */
570
+ } /* note the filtering time */
571
+ for (i = tacnt; --i >= 0;)/* traverse and count transactions */
572
+ ist_count(istree, tas_tract(taset, i), tas_tsize(taset, i));
573
+ tc = clock() -t; } /* note the new count time */
574
+ else { /* if to work on the input file, */
575
+ rewind(in); /* reset the file position */
576
+ for (maxcnt = 0; (i = is_read(itemset, in)) == 0; ) {
577
+ if (filter != 0) /* (re)read the transactions and */
578
+ is_filter(itemset, used); /* remove unnecessary items */
579
+ k = is_tsize(itemset); /* update the maximum size */
580
+ if (k > maxcnt) maxcnt = k; /* of a transaction */
581
+ ist_count(istree, is_tract(itemset), k);
582
+ } /* count the transaction in the tree */
583
+ if (i < 0) error(i, fn_in, RECCNT(itemset), BUFFER(itemset));
584
+ if (maxcnt < maxlen) /* update the maximal rule length */
585
+ maxlen = maxcnt; /* according to the max. t.a. size */
586
+ } /* (may be smaller than before) */
587
+ }
588
+ if (!taset && !tatree) { /* if transactions were not loaded */
589
+ if (in != stdin) fclose(in);/* if not read from standard input, */
590
+ in = NULL; /* close the input file */
591
+ } /* clear the file variable */
592
+ MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)));
593
+
594
+ /* --- filter found item sets --- */
595
+ if ((target == TT_CLSET) || (target == TT_MFSET)) {
596
+ MSG(fprintf(stderr, "filtering %s item sets ... ",
597
+ (target == TT_MFSET) ? "maximal" : "closed"));
598
+ t = clock(); /* filter the item sets */
599
+ ist_filter(istree, (target == TT_MFSET) ? IST_MAXFRQ : IST_CLOSED);
600
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
601
+ } /* (filter takes longer than print) */
602
+
603
+ /* --- sort transactions --- */
604
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
605
+ if (!taset) /* transactions must be loaded */
606
+ ext = 0; /* for extended support output */
607
+ else if (ext) { /* if extended output is requested */
608
+ MSG(fprintf(stderr, "sorting transactions ... "));
609
+ t = clock(); /* start the timer */
610
+ tas_sort(taset, heap); /* sort the transactions */
611
+ MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
612
+ } /* (sorting is necessary to find the */
613
+ } /* number of identical transactions) */
614
+
615
+ /* --- print item sets/rules/hyperedges --- */
616
+ t = clock(); /* start the timer */
617
+ if (fn_out && *fn_out) /* if an output file name is given, */
618
+ out = fopen(fn_out, "w"); /* open the output file */
619
+ else { /* if no output file name is given, */
620
+ out = stdout; fn_out = "<stdout>"; } /* write to std. output */
621
+ MSG(fprintf(stderr, "writing %s ... ", fn_out));
622
+ if (!out) error(E_FOPEN, fn_out);
623
+ ist_init(istree, minlen, arem, minval);
624
+ set = is_tract(itemset); /* get the transaction buffer */
625
+ if (target <= TT_MFSET) { /* if to find frequent item sets */
626
+ for (n = 0; 1; ) { /* extract item sets from the tree */
627
+ k = ist_set(istree, set, &frq, &conf);
628
+ if (k <= 0) break; /* get the next frequent item set */
629
+ if (frq > smax) continue; /* check against maximal support */
630
+ for (i = 0; i < k; i++) { /* traverse the set's items */
631
+ name = is_name(itemset, set[i]);
632
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
633
+ fputs(name, out); /* print the name of the next item */
634
+ fputs((i < k-1) ? sep : " ", out);
635
+ } /* print a separator */
636
+ fputs(" (", out); /* print the item set's support */
637
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
638
+ if (sout & 2) fputc('/', out); }
639
+ if (sout & 2) { fprintf(out, "%d", frq); }
640
+ if (ext) { /* if to print the extended support */
641
+ frq = tas_occur(taset, set, k);
642
+ fputs(", ", out); /* get the number of occurrences */
643
+ fprintf(out, fmt, (frq/(double)tacnt) *100);
644
+ if (sout & 2) fprintf(out, "/%d", frq);
645
+ } /* print the extended support data */
646
+ if (aval) { fputs(", ", out); fprintf(out, fmt, conf *100); }
647
+ fputs(")\n", out); /* print the add. eval. measure, */
648
+ n++; /* terminate the support output, */
649
+ } } /* and count the item set */
650
+ else if (target == TT_RULE) { /* if to find association rules, */
651
+ for (n = 0; 1; ) { /* extract rules from tree */
652
+ k = ist_rule(istree, set, &frq, &conf, &lftval, &minval);
653
+ if (k <= 0) break; /* get the next association rule */
654
+ if (frq > smax) continue; /* check against maximal support */
655
+ for (i = 0; i < k; i++) { /* traverse the rule's items */
656
+ name = is_name(itemset, set[i]);
657
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
658
+ fputs(name, out); /* print the next item */
659
+ fputs((i <= 0) ? " <- " : ((i < k-1) ? sep : " "), out);
660
+ } /* print a separator */
661
+ fputs(" (", out); /* print the rule evaluation */
662
+ if (sout & 1) supp = frq/(double)tacnt;
663
+ if (ext && !(mode & IST_HEAD)) {
664
+ if (sout & 1) { fprintf(out, fmt, supp *conf *100);
665
+ if (sout & 2) fputc('/', out); }
666
+ if (sout & 2) { fprintf(out, "%d", (int)(frq *conf +0.5));}
667
+ fputs(", ", out); /* print the support of the rule */
668
+ } /* from the support of the body */
669
+ if (sout & 1) { fprintf(out, fmt, supp *100);
670
+ if (sout & 2) fputc('/', out); }
671
+ if (sout & 2) { fprintf(out, "%d", frq); }
672
+ fputs(", ", out); /* print the rule support */
673
+ if (ext && (mode & IST_HEAD)) {
674
+ if (sout & 1) { fprintf(out, fmt, (supp/conf) *100);
675
+ if (sout & 2) fputc('/', out); }
676
+ if (sout & 2) { fprintf(out, "%d", (int)(frq /conf +0.5));}
677
+ fputs(", ", out); /* print the support of the body */
678
+ } /* from the support of the rule */
679
+ fprintf(out, fmt, conf *100); /* print the rule confidence */
680
+ if (lift) { fputs(", ", out); fprintf(out, fmt, lftval *100); }
681
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
682
+ fputs(")\n", out); /* print the value of the additional */
683
+ n++; /* rule evaluation measure and */
684
+ } } /* count the association rule */
685
+ else if (target == TT_HEDGE){ /* if to find association hyperedges */
686
+ for (n = 0; 1; ) { /* extract hyperedges from tree */
687
+ k = ist_hedge(istree, set, &frq, &conf, &minval);
688
+ if (k <= 0) break; /* get the next hyperedge */
689
+ if (frq > smax) continue; /* check against maximal support */
690
+ for (i = 0; i < k; i++) { /* traverse the edge's items */
691
+ name = is_name(itemset, set[i]);
692
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
693
+ fputs(name, out); /* print the name of the next item */
694
+ fputs((i < k-1) ? sep : " ", out);
695
+ } /* print a separator */
696
+ fputs(" (", out); /* print the hyperedge evaluation */
697
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
698
+ if (sout & 2) fputc('/', out); }
699
+ if (sout & 2) { fprintf(out, "%d", frq); }
700
+ fputs(", ", out); fprintf(out, fmt, conf *100);
701
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
702
+ fputs(")\n", out); /* print support and confidence */
703
+ n++; /* of the hyperedge and */
704
+ } } /* count the hyperedge */
705
+ else { /* if to find association groups */
706
+ for (n = 0; 1; ) { /* extract groups from tree */
707
+ k = ist_group(istree, set, &frq, &minval);
708
+ if (k <= 0) break; /* get the next group */
709
+ if (frq > smax) continue; /* check against maximal support */
710
+ for (i = 0; i < k; i++) { /* traverse the group's items */
711
+ name = is_name(itemset, set[i]);
712
+ if (c2scf) { sc_format(buf, name, 0); name = buf; }
713
+ fputs(name, out); /* print the name of the next item */
714
+ fputs((i < k-1) ? sep : " ", out);
715
+ } /* print a separator */
716
+ fputs(" (", out); /* print the group evaluation */
717
+ if (sout & 1) { fprintf(out, fmt, (frq/(double)tacnt) *100);
718
+ if (sout & 2) fputc('/', out); }
719
+ if (sout & 2) { fprintf(out, "%d", frq); }
720
+ if (aval) { fputs(", ", out); fprintf(out, fmt, minval *100); }
721
+ fputs(")\n", out); /* print support and add. measure */
722
+ n++; /* and count the group */
723
+ }
724
+ } /* if (target <= TT_MFSET) .. else .. */
725
+ if (fflush(out) != 0) error(E_FWRITE, fn_out);
726
+ if (out != stdout) fclose(out);
727
+ out = NULL; /* close the output file */
728
+ MSG(fprintf(stderr, "[%d %s(s)] done ", n, ttypes[target]));
729
+ MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
730
+ #ifdef BENCH
731
+ printf("number of support counters: %d\n", istree->sccnt);
732
+ printf("necessary support counters: %d\n", istree->scnec);
733
+ printf("number of child pointers : %d\n", istree->cpcnt);
734
+ printf("necessary child pointers : %d\n", istree->cpnec);
735
+ printf("allocated memory (bytes) : %d\n", istree->bytes);
736
+ #endif
737
+
738
+ /* --- clean up --- */
739
+ #ifndef NDEBUG /* if this is a debug version */
740
+ free(used); /* delete the item app. vector */
741
+ ist_delete(istree); /* delete the item set tree, */
742
+ if (tatree) tat_delete(tatree); /* the transaction tree, */
743
+ if (taset) tas_delete(taset, 0); /* the transaction set, */
744
+ is_delete(itemset); /* and the item set */
745
+ #endif
746
+ #ifdef STORAGE /* if storage debugging */
747
+ showmem("at end of program"); /* check memory usage */
748
+ #endif
749
+ return 0; /* return 'ok' */
750
+ } /* main() */