jashmenn-apriori 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (122) hide show
  1. data/History.txt +4 -0
  2. data/License.txt +20 -0
  3. data/Manifest.txt +121 -0
  4. data/README.txt +139 -0
  5. data/Rakefile +4 -0
  6. data/TODO.txt +60 -0
  7. data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
  8. data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
  9. data/attic/c_ext_test1/mytest.rb +10 -0
  10. data/attic/test.c +12 -0
  11. data/config/hoe.rb +77 -0
  12. data/config/requirements.rb +15 -0
  13. data/examples/01_simple_example.rb +23 -0
  14. data/examples/02_small_file_example.rb +17 -0
  15. data/examples/03_large_file_example.rb +22 -0
  16. data/examples/test_data/market_basket_basic_test.dat +9 -0
  17. data/ext/Apriori.c +149 -0
  18. data/ext/Makefile +149 -0
  19. data/ext/apriori/doc/apriori.html +1301 -0
  20. data/ext/apriori/doc/arem.gp +68 -0
  21. data/ext/apriori/doc/c_rev.gp +89 -0
  22. data/ext/apriori/doc/chi2.tex +156 -0
  23. data/ext/apriori/doc/copying +504 -0
  24. data/ext/apriori/doc/line.gif +0 -0
  25. data/ext/apriori/doc/uparrow.gif +0 -0
  26. data/ext/apriori/ex/flg2set +15 -0
  27. data/ext/apriori/ex/hdr2set +13 -0
  28. data/ext/apriori/ex/readme +71 -0
  29. data/ext/apriori/ex/row2set +7 -0
  30. data/ext/apriori/ex/rulesort +24 -0
  31. data/ext/apriori/ex/tab2set +9 -0
  32. data/ext/apriori/ex/test.app +2 -0
  33. data/ext/apriori/ex/test.rul +9 -0
  34. data/ext/apriori/ex/test1.rul +43 -0
  35. data/ext/apriori/ex/test1.tab +10 -0
  36. data/ext/apriori/ex/test2.tab +10 -0
  37. data/ext/apriori/ex/test3.tab +30 -0
  38. data/ext/apriori/ex/test4.tab +11 -0
  39. data/ext/apriori/ex/test5.tab +39 -0
  40. data/ext/apriori/ex/tid2set +23 -0
  41. data/ext/apriori/ex/xhdr2set +33 -0
  42. data/ext/apriori/src/apriori.c +750 -0
  43. data/ext/apriori/src/apriori.dsp +120 -0
  44. data/ext/apriori/src/apriori.dsw +29 -0
  45. data/ext/apriori/src/apriori.mak +99 -0
  46. data/ext/apriori/src/istree.c +1411 -0
  47. data/ext/apriori/src/istree.h +160 -0
  48. data/ext/apriori/src/makefile +105 -0
  49. data/ext/apriori/src/tract.c +870 -0
  50. data/ext/apriori/src/tract.h +261 -0
  51. data/ext/apriori_wrapper.c +757 -0
  52. data/ext/apriori_wrapper.h +10 -0
  53. data/ext/extconf.rb +32 -0
  54. data/ext/math/doc/copying +504 -0
  55. data/ext/math/src/chi2.c +151 -0
  56. data/ext/math/src/chi2.h +27 -0
  57. data/ext/math/src/choose.c +71 -0
  58. data/ext/math/src/choose.h +16 -0
  59. data/ext/math/src/gamma.c +446 -0
  60. data/ext/math/src/gamma.h +39 -0
  61. data/ext/math/src/intexp.c +35 -0
  62. data/ext/math/src/intexp.h +15 -0
  63. data/ext/math/src/makefile +164 -0
  64. data/ext/math/src/math.mak +48 -0
  65. data/ext/math/src/normal.c +387 -0
  66. data/ext/math/src/normal.h +44 -0
  67. data/ext/math/src/radfn.c +113 -0
  68. data/ext/math/src/radfn.h +34 -0
  69. data/ext/math/src/zeta.c +49 -0
  70. data/ext/math/src/zeta.h +15 -0
  71. data/ext/pre-clean.rb +8 -0
  72. data/ext/pre-setup.rb +9 -0
  73. data/ext/util/doc/copying +504 -0
  74. data/ext/util/src/listops.c +76 -0
  75. data/ext/util/src/listops.h +26 -0
  76. data/ext/util/src/makefile +103 -0
  77. data/ext/util/src/memsys.c +84 -0
  78. data/ext/util/src/memsys.h +42 -0
  79. data/ext/util/src/nstats.c +288 -0
  80. data/ext/util/src/nstats.h +69 -0
  81. data/ext/util/src/params.c +86 -0
  82. data/ext/util/src/params.h +19 -0
  83. data/ext/util/src/parse.c +133 -0
  84. data/ext/util/src/parse.h +81 -0
  85. data/ext/util/src/scan.c +767 -0
  86. data/ext/util/src/scan.h +111 -0
  87. data/ext/util/src/symtab.c +443 -0
  88. data/ext/util/src/symtab.h +121 -0
  89. data/ext/util/src/tabscan.c +279 -0
  90. data/ext/util/src/tabscan.h +99 -0
  91. data/ext/util/src/util.mak +91 -0
  92. data/ext/util/src/vecops.c +317 -0
  93. data/ext/util/src/vecops.h +42 -0
  94. data/lib/apriori/adapter.rb +13 -0
  95. data/lib/apriori/association_rule.rb +85 -0
  96. data/lib/apriori/version.rb +9 -0
  97. data/lib/apriori.rb +133 -0
  98. data/script/console +10 -0
  99. data/script/destroy +14 -0
  100. data/script/generate +14 -0
  101. data/script/txt2html +82 -0
  102. data/setup.rb +1585 -0
  103. data/tasks/apriori.rake +20 -0
  104. data/tasks/attic.rake +28 -0
  105. data/tasks/deployment.rake +34 -0
  106. data/tasks/environment.rake +7 -0
  107. data/tasks/install.rake +6 -0
  108. data/tasks/website.rake +17 -0
  109. data/test/apriori_test.rb +13 -0
  110. data/test/fixtures/market_basket_results_test.txt +5 -0
  111. data/test/fixtures/market_basket_string_test.txt +7 -0
  112. data/test/fixtures/results.txt +2 -0
  113. data/test/fixtures/sample.txt +7 -0
  114. data/test/test_helper.rb +5 -0
  115. data/test/unit/test_apriori.rb +68 -0
  116. data/test/unit/test_itemsets_and_parsing.rb +82 -0
  117. data/website/index.html +233 -0
  118. data/website/index.txt +142 -0
  119. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  120. data/website/stylesheets/screen.css +142 -0
  121. data/website/template.html.erb +49 -0
  122. metadata +188 -0
@@ -0,0 +1,68 @@
1
+ #-----------------------------------------------------------------------
2
+ # File : arem.gp
3
+ # Contents: visualization of the selection behaviour
4
+ # of the additional rule evaluation measures
5
+ # Author : Christian Borgelt
6
+ # History : 22.09.1998 file created
7
+ #-----------------------------------------------------------------------
8
+ # set terminal postscript
9
+
10
+ set view 50,325,1
11
+ set hidden3d
12
+ set isosamples 20,20
13
+
14
+ set xrange [0.001:0.999]
15
+ set yrange [0.001:0.999]
16
+ set zrange [0:1]
17
+
18
+ min(a,b) = a < b ? a : b
19
+
20
+ diff(x,y) = abs(x -y)
21
+
22
+ quot(x,y) = 1 -min(x/y, y/x)
23
+
24
+ info(x,y,s) = (x >= y*s) && (1-x >= s *(1-y)) \
25
+ ? (s*y*log(y/x) +(x-s*y)*log((x-s*y)/(x*(1-s))) \
26
+ + s*(1-y)*log((1-y)/(1-x)) \
27
+ + (1-x-s*(1-y))*log((1-x-s*(1-y))/((1-x)*(1-s)))) /log(2) : 0
28
+
29
+ chi2(x,y,s) = (x >= y*s) && (1-x >= s *(1-y)) \
30
+ ? (x*s -y*s)**2 /(x*(1-x)*s*(1-s)) : 0
31
+
32
+ set title "d_diff (independent of antecedent support)"
33
+ set xlabel "c_prior"
34
+ set ylabel "c_post"
35
+ set zlabel "d_diff"
36
+ splot diff(x,y)
37
+ pause -1 "Hit return to continue"
38
+
39
+ set title "d_quot (independent of antecedent support)"
40
+ set zlabel "d_quot"
41
+ splot quot(x,y)
42
+ pause -1 "Hit return to continue"
43
+
44
+ set zlabel "d_info"
45
+ set title "d_info with antecedent support 0.2"
46
+ splot info(x,y,0.2)
47
+ pause -1 "Hit return to continue"
48
+
49
+ set title "d_info with antecedent support 0.3"
50
+ splot info(x,y,0.3)
51
+ pause -1 "Hit return to continue"
52
+
53
+ set title "d_info with antecedent support 0.4"
54
+ splot info(x,y,0.4)
55
+ pause -1 "Hit return to continue"
56
+
57
+ set zlabel "d_chi^2"
58
+ set title "d_chi^2 with antecedent support 0.2"
59
+ splot chi2(x,y,0.2)
60
+ pause -1 "Hit return to continue"
61
+
62
+ set title "d_chi^2 with antecedent support 0.3"
63
+ splot chi2(x,y,0.3)
64
+ pause -1 "Hit return to continue"
65
+
66
+ set title "d_chi^2 with antecedent support 0.4"
67
+ splot chi2(x,y,0.4)
68
+ pause -1 "Hit return to continue"
@@ -0,0 +1,89 @@
1
+ #-----------------------------------------------------------------------
2
+ # File : c_rev.gp
3
+ # Contents: visualization of the dependence between posterior confidence
4
+ # and reversed confidence, i.e. the confidence of the reversed
5
+ # rule antecedent <- consequent
6
+ # Author : Christian Borgelt
7
+ # History : 23.09.1998 file created
8
+ #-----------------------------------------------------------------------
9
+ # set terminal postscript
10
+
11
+ set view 50,330,1
12
+ set xrange [0.001:0.999]
13
+ set yrange [0.001:0.999]
14
+ set zrange [0:1]
15
+ set isosamples 20
16
+ set hidden3d
17
+
18
+ set xlabel "c_post"
19
+ set ylabel "c_rev"
20
+
21
+ min(a,b) = a < b ? a : b
22
+
23
+ diff(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) ? abs(x -p) : 0
24
+
25
+ quot(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) ? 1 -min(x/p,p/x) : 0
26
+
27
+ i(x,y,p) = (y*p*log((y*p)/(p*(y*p)/x)) \
28
+ + (p-y*p)*log((p-y*p)/(p*(1-(y*p)/x))) \
29
+ + ((y*p)/x-y*p)*log(((y*p)/x-y*p)/((1-p)*(y*p)/x)) \
30
+ + (1-p-(y*p)/x+y*p)*log((1-p-(y*p)/x+y*p) \
31
+ /((1-p)*(1-(y*p)/x)))) /log(2)
32
+
33
+ info(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) ? i(x,y,p) : 0
34
+
35
+ chi2(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) \
36
+ ? (p*((y*p)/x)-y*p)**2 /(p*(1-p)*((y*p)/x)*(1-(y*p)/x)) : 0
37
+
38
+ set zlabel "d_diff"
39
+ set title "prior confidence 0.2"
40
+ splot diff(x,y,0.2)
41
+ pause -1 "Hit return to continue"
42
+
43
+ set title "prior confidence 0.3"
44
+ splot diff(x,y,0.3)
45
+ pause -1 "Hit return to continue"
46
+
47
+ set title "prior confidence 0.4"
48
+ splot diff(x,y,0.4)
49
+ pause -1 "Hit return to continue"
50
+
51
+ set zlabel "d_quot"
52
+ set title "prior confidence 0.2"
53
+ splot quot(x,y,0.2)
54
+ pause -1 "Hit return to continue"
55
+
56
+ set title "prior confidence 0.3"
57
+ splot quot(x,y,0.3)
58
+ pause -1 "Hit return to continue"
59
+
60
+ set title "prior confidence 0.4"
61
+ splot quot(x,y,0.4)
62
+ pause -1 "Hit return to continue"
63
+
64
+ set zlabel "d_info"
65
+ set title "prior confidence 0.2"
66
+ splot info(x,y,0.2)
67
+ pause -1 "Hit return to continue"
68
+
69
+ set title "prior confidence 0.3"
70
+ splot info(x,y,0.3)
71
+ pause -1 "Hit return to continue"
72
+
73
+ set title "prior confidence 0.4"
74
+ splot info(x,y,0.4)
75
+ pause -1 "Hit return to continue"
76
+
77
+ set zlabel "d_chi^2"
78
+ set title "prior confidence 0.2"
79
+ splot chi2(x,y,0.2)
80
+ pause -1 "Hit return to continue"
81
+
82
+ set title "prior confidence 0.3"
83
+ splot chi2(x,y,0.3)
84
+ pause -1 "Hit return to continue"
85
+
86
+ set title "prior confidence 0.4"
87
+ splot chi2(x,y,0.4)
88
+ pause -1 "Hit return to continue"
89
+
@@ -0,0 +1,156 @@
1
+ \documentclass[a4paper]{article}
2
+ \oddsidemargin 2.1mm
3
+ \textwidth 155mm
4
+ \topmargin -12mm
5
+ \textheight 230mm
6
+
7
+ \def\tabstrut{\rule{0pt}{2.4ex}}
8
+ \def\eq{\!\!\!=\!\!\!}
9
+
10
+ \begin{document}
11
+
12
+ \subsection*{The Normalized $\chi^2$ Measure
13
+ for Association Rule Evaluation}
14
+
15
+ Let $C$ and $A$ be two attributes with domains
16
+ $\mbox{dom}(A) = \{ a_1, \ldots, a_{n_A} \}$ and
17
+ $\mbox{dom}(C) = \{ c_1, \ldots, c_{n_C} \}$, respectively,
18
+ and let $\cal X$ be a dataset over $C$ and $A$.
19
+ Let $N_{ij}$, $1 \le i \le n_C$, $1 \le j \le n_A$, be the number of
20
+ sample cases in $\cal X$ that contain both the attribute values~$c_i$
21
+ and $a_j$. Furthermore, let
22
+ \[ N_{i.} = \sum_{j=1}^{n_A} N_{ij}, \qquad
23
+ N_{.j} = \sum_{i=1}^{n_C} N_{ij}, \qquad\mbox{and}\qquad
24
+ N_{..} = \sum_{i=1}^{n_C} \sum_{j=1}^{n_A} N_{ij} = |{\cal X}|. \]
25
+ Finally, let
26
+ \[ p_{i.} = \frac{N_{i.}}{N_{..}}, \qquad
27
+ p_{.j} = \frac{N_{.j}}{N_{..}}, \qquad\mbox{and}\qquad
28
+ p_{ij} = \frac{N_{ij}}{N_{..}} \]
29
+ be the probabilities of the attribute values and their combinations,
30
+ as they can be estimated from these numbers. Then the well-known
31
+ $\chi^2$ measure is usually defined as
32
+ \begin{eqnarray*}
33
+ \chi^2(C,A)
34
+ & = & \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
35
+ \frac{(E_{ij} -N_{ij})^2}{E_{ij}}
36
+ \qquad\mbox{where}\quad E_{ij} = \frac{N_{i.}N_{.j}}{N_{..}} \\
37
+ & = & \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
38
+ \frac{\left(\frac{N_{i.}N_{.j}}{N_{..}} -N_{ij}\right)^2}
39
+ {\frac{N_{i.}N_{.j}}{N_{..}}}
40
+ ~~=~~ \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
41
+ \frac{N_{..}^2 \left(\frac{N_{i.\phantom{j}}}{N_{..}}
42
+ \frac{N_{.j}}{N_{..}}
43
+ - \frac{N_{ij}}{N_{..}}\right)^2}
44
+ {N_{..}\; \frac{N_{i.\phantom{j}}}{N_{..}}
45
+ \frac{N_{.j}}{N_{..}}} \\
46
+ & = & N_{..} \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
47
+ \frac{(p_{i.}\;p_{.j} - p_{ij})^2}{p_{i.}\;p_{.j}}
48
+ ~~=~~ \frac{1}{N_{..}} \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
49
+ \frac{(N_{i.}\;N_{.j} - N_{..}N_{ij})^2}{N_{i.}\;N_{.j}}.
50
+ \end{eqnarray*}
51
+ This measure is often normalized by dividing it by the
52
+ size~$N_{..} = |{\cal X}|$ of the dataset to remove the
53
+ dependence on the number of sample cases.
54
+
55
+ For association rule evaluation, $C$ refers to the consequent and $A$ to
56
+ the antecedent of the rule. Both have two values, which we denote by
57
+ $c_0$, $c_1$ and $a_0$, $a_1$, respectively. $c_0$ means that the
58
+ consequent of the rule is not satisfied, $c_1$ that it is satisfied;
59
+ likewise for $A$. Then we have to compute the $\chi^2$ measure from
60
+ the $2 \times 2$ contingency table
61
+ \begin{center}
62
+ \begin{tabular}{|l|c|c|l|} \cline{2-3}
63
+ \multicolumn{1}{l|}{}
64
+ & $a_0$ & $a_1$ \\ \hline
65
+ $c_0$ & $N_{00}$ & $N_{01}$ & $N_{0.}$\tabstrut \\ \hline
66
+ $c_1$ & $N_{10}$ & $N_{11}$ & $N_{1.}$\tabstrut \\ \hline
67
+ \multicolumn{1}{l|}{}
68
+ & $N_{.0}$ & $N_{.1}$ & $N_{..}$\tabstrut \\ \cline{2-4}
69
+ \end{tabular}
70
+ \end{center}
71
+ or the estimated probability table
72
+ \begin{center}
73
+ \begin{tabular}{|l|c|c|l|} \cline{2-3}
74
+ \multicolumn{1}{l|}{}
75
+ & $a_0$ & $a_1$ \\ \hline
76
+ $c_0$ & $p_{00}$ & $p_{01}$ & $p_{0.}$\tabstrut \\ \hline
77
+ $c_1$ & $p_{10}$ & $p_{11}$ & $p_{1.}$\tabstrut \\ \hline
78
+ \multicolumn{1}{l|}{}
79
+ & $p_{.0}$ & $p_{.1}$ & $1$\tabstrut \\ \cline{2-4}
80
+ \end{tabular}
81
+ \end{center}
82
+ That is, we have
83
+ \begin{eqnarray*}
84
+ \frac{\chi^2(C,A)}{N_{..}}
85
+ & = & \sum_{i=0}^1 \sum_{j=0}^1
86
+ \frac{(p_{i.}\;p_{.j} - p_{ij})^2}{p_{i.}\;p_{.j}} \\
87
+ & = & \frac{(p_{0.}\;p_{.0} -p_{00})^2}{p_{0.}\;p_{.0}}
88
+ + \frac{(p_{0.}\;p_{.1} -p_{01})^2}{p_{0.}\;p_{.1}}
89
+ + \frac{(p_{1.}\;p_{.0} -p_{10})^2}{p_{1.}\;p_{.0}}
90
+ + \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}\;p_{.1}}.
91
+ \end{eqnarray*}
92
+ Now we can exploit
93
+ \[ p_{00} + p_{01} = p_{0.}, \quad
94
+ p_{10} + p_{11} = p_{1.}, \quad
95
+ p_{00} + p_{10} = p_{.0}, \quad
96
+ p_{01} + p_{11} = p_{.1}, \quad
97
+ p_{0.} + p_{1.} = 1, \quad
98
+ p_{.0} + p_{.1} = 1, \]
99
+ which leads to
100
+ \begin{eqnarray*}
101
+ p_{0.}\;p_{.0} -p_{00}
102
+ & = & (1 -p_{1.})(1 -p_{.1}) -(1 -p_{1.} -p_{.1} +p_{11})
103
+ ~~=~~ p_{1.}\;p_{.1} -p_{11}, \\
104
+ p_{0.}\;p_{.1} -p_{01}
105
+ & = & (1 -p_{1.})p_{.1} -(p_{.1} -p_{11})
106
+ ~~=~~ p_{11} -p_{1.}\;p_{.1}, \\
107
+ p_{1.}\;p_{.0} -p_{10}
108
+ & = & p_{1.}(1 -p_{.1}) -(p_{1.} -p_{11})
109
+ ~~=~~ p_{11} -p_{1.}\;p_{.1}.
110
+ \end{eqnarray*}
111
+ Therefore we obtain
112
+ \begin{eqnarray*}
113
+ \frac{\chi^2(C,A)}{N_{..}}
114
+ & = & \frac{(p_{1.}\;p_{.1} -p_{11})^2}{(1 -p_{1.})(1 -p_{.1})}
115
+ + \frac{(p_{1.}\;p_{.1} -p_{11})^2}{(1 -p_{1.})\;p_{.1}}
116
+ + \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}(1 -p_{.1})}
117
+ + \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}\;p_{.1}} \\
118
+ & = & \frac{(p_{1.}\;p_{.1} -p_{11})^2
119
+ (p_{1.}\;p_{.1}
120
+ +p_{1.}(1 -p_{.1})
121
+ +(1 -p_{1.})p_{.1}
122
+ +(1 -p_{1.})(1 -p_{.1}))}
123
+ {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})} \\
124
+ & = & \frac{(p_{1.}\;p_{.1} -p_{11})^2
125
+ (p_{1.}\;p_{.1}
126
+ +p_{1.} -p_{1.}\;p_{.1}
127
+ +p_{.1} -p_{1.}\;p_{.1}
128
+ +1 -p_{1.} -p_{.1} +p_{1.}\;p_{.1})}
129
+ {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})} \\
130
+ & = & \frac{(p_{1.}\;p_{.1} -p_{11})^2}
131
+ {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
132
+ \end{eqnarray*}
133
+ In the program, $p_{1.}$ (argument {\tt head}), $p_{.1}$
134
+ (argument {\tt body}) and $p_{1|1} = \frac{p_{11}}{p_{.1}}$
135
+ (argument {\tt post}, rule confidence) are passed to the routine
136
+ that computes the measure, so the actual computation is
137
+ \begin{eqnarray*}
138
+ \frac{\chi^2(C,A)}{N_{..}}
139
+ & = & \frac{(p_{1.}\;p_{.1} -p_{1|1}\;p_{.1})^2}
140
+ {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}
141
+ ~~=~~ \frac{((p_{1.} -p_{1|1})p_{.1})^2}
142
+ {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
143
+ \end{eqnarray*}
144
+ In an analogous way the measure can also be computed from the absolute
145
+ frequencies $N_{ij}$, $N_{i.}$, $N_{.j}$ and $N_{..}$, namely as
146
+ \begin{eqnarray*}
147
+ \frac{\chi^2(C,A)}{N_{..}}
148
+ & = & \frac{(N_{1.}N_{.1} -N_{..}N_{11})^2}
149
+ {N_{1.}(N_{..} -N_{1.})N_{.1}(N_{..} -N_{.1})}.
150
+ \end{eqnarray*}
151
+ \end{document}
152
+
153
+ %%% Local Variables:
154
+ %%% mode: latex
155
+ %%% TeX-master: t
156
+ %%% End: