RubyGems - jashmenn-apriori - Versions diffs - 0.1.0 - Mend

jashmenn-apriori 0.1.0

Files changed (122) hide show

data/History.txt +4 -0
data/License.txt +20 -0
data/Manifest.txt +121 -0
data/README.txt +139 -0
data/Rakefile +4 -0
data/TODO.txt +60 -0
data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
data/attic/c_ext_test1/mytest.rb +10 -0
data/attic/test.c +12 -0
data/config/hoe.rb +77 -0
data/config/requirements.rb +15 -0
data/examples/01_simple_example.rb +23 -0
data/examples/02_small_file_example.rb +17 -0
data/examples/03_large_file_example.rb +22 -0
data/examples/test_data/market_basket_basic_test.dat +9 -0
data/ext/Apriori.c +149 -0
data/ext/Makefile +149 -0
data/ext/apriori/doc/apriori.html +1301 -0
data/ext/apriori/doc/arem.gp +68 -0
data/ext/apriori/doc/c_rev.gp +89 -0
data/ext/apriori/doc/chi2.tex +156 -0
data/ext/apriori/doc/copying +504 -0
data/ext/apriori/doc/line.gif +0 -0
data/ext/apriori/doc/uparrow.gif +0 -0
data/ext/apriori/ex/flg2set +15 -0
data/ext/apriori/ex/hdr2set +13 -0
data/ext/apriori/ex/readme +71 -0
data/ext/apriori/ex/row2set +7 -0
data/ext/apriori/ex/rulesort +24 -0
data/ext/apriori/ex/tab2set +9 -0
data/ext/apriori/ex/test.app +2 -0
data/ext/apriori/ex/test.rul +9 -0
data/ext/apriori/ex/test1.rul +43 -0
data/ext/apriori/ex/test1.tab +10 -0
data/ext/apriori/ex/test2.tab +10 -0
data/ext/apriori/ex/test3.tab +30 -0
data/ext/apriori/ex/test4.tab +11 -0
data/ext/apriori/ex/test5.tab +39 -0
data/ext/apriori/ex/tid2set +23 -0
data/ext/apriori/ex/xhdr2set +33 -0
data/ext/apriori/src/apriori.c +750 -0
data/ext/apriori/src/apriori.dsp +120 -0
data/ext/apriori/src/apriori.dsw +29 -0
data/ext/apriori/src/apriori.mak +99 -0
data/ext/apriori/src/istree.c +1411 -0
data/ext/apriori/src/istree.h +160 -0
data/ext/apriori/src/makefile +105 -0
data/ext/apriori/src/tract.c +870 -0
data/ext/apriori/src/tract.h +261 -0
data/ext/apriori_wrapper.c +757 -0
data/ext/apriori_wrapper.h +10 -0
data/ext/extconf.rb +32 -0
data/ext/math/doc/copying +504 -0
data/ext/math/src/chi2.c +151 -0
data/ext/math/src/chi2.h +27 -0
data/ext/math/src/choose.c +71 -0
data/ext/math/src/choose.h +16 -0
data/ext/math/src/gamma.c +446 -0
data/ext/math/src/gamma.h +39 -0
data/ext/math/src/intexp.c +35 -0
data/ext/math/src/intexp.h +15 -0
data/ext/math/src/makefile +164 -0
data/ext/math/src/math.mak +48 -0
data/ext/math/src/normal.c +387 -0
data/ext/math/src/normal.h +44 -0
data/ext/math/src/radfn.c +113 -0
data/ext/math/src/radfn.h +34 -0
data/ext/math/src/zeta.c +49 -0
data/ext/math/src/zeta.h +15 -0
data/ext/pre-clean.rb +8 -0
data/ext/pre-setup.rb +9 -0
data/ext/util/doc/copying +504 -0
data/ext/util/src/listops.c +76 -0
data/ext/util/src/listops.h +26 -0
data/ext/util/src/makefile +103 -0
data/ext/util/src/memsys.c +84 -0
data/ext/util/src/memsys.h +42 -0
data/ext/util/src/nstats.c +288 -0
data/ext/util/src/nstats.h +69 -0
data/ext/util/src/params.c +86 -0
data/ext/util/src/params.h +19 -0
data/ext/util/src/parse.c +133 -0
data/ext/util/src/parse.h +81 -0
data/ext/util/src/scan.c +767 -0
data/ext/util/src/scan.h +111 -0
data/ext/util/src/symtab.c +443 -0
data/ext/util/src/symtab.h +121 -0
data/ext/util/src/tabscan.c +279 -0
data/ext/util/src/tabscan.h +99 -0
data/ext/util/src/util.mak +91 -0
data/ext/util/src/vecops.c +317 -0
data/ext/util/src/vecops.h +42 -0
data/lib/apriori/adapter.rb +13 -0
data/lib/apriori/association_rule.rb +85 -0
data/lib/apriori/version.rb +9 -0
data/lib/apriori.rb +133 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/script/txt2html +82 -0
data/setup.rb +1585 -0
data/tasks/apriori.rake +20 -0
data/tasks/attic.rake +28 -0
data/tasks/deployment.rake +34 -0
data/tasks/environment.rake +7 -0
data/tasks/install.rake +6 -0
data/tasks/website.rake +17 -0
data/test/apriori_test.rb +13 -0
data/test/fixtures/market_basket_results_test.txt +5 -0
data/test/fixtures/market_basket_string_test.txt +7 -0
data/test/fixtures/results.txt +2 -0
data/test/fixtures/sample.txt +7 -0
data/test/test_helper.rb +5 -0
data/test/unit/test_apriori.rb +68 -0
data/test/unit/test_itemsets_and_parsing.rb +82 -0
data/website/index.html +233 -0
data/website/index.txt +142 -0
data/website/javascripts/rounded_corners_lite.inc.js +285 -0
data/website/stylesheets/screen.css +142 -0
data/website/template.html.erb +49 -0
metadata +188 -0

data/ext/apriori/doc/arem.gp ADDED Viewed

@@ -0,0 +1,68 @@
+#-----------------------------------------------------------------------
+# File    : arem.gp
+# Contents: visualization of the selection behaviour
+#           of the additional rule evaluation measures
+# Author  : Christian Borgelt
+# History : 22.09.1998 file created
+#-----------------------------------------------------------------------
+# set terminal postscript
+set view 50,325,1
+set hidden3d
+set isosamples 20,20
+set xrange [0.001:0.999]
+set yrange [0.001:0.999]
+set zrange [0:1]
+min(a,b) = a < b ? a : b
+diff(x,y) = abs(x -y)
+quot(x,y) = 1 -min(x/y, y/x)
+info(x,y,s) = (x >= y*s) && (1-x >= s *(1-y)) \
+            ? (s*y*log(y/x) +(x-s*y)*log((x-s*y)/(x*(1-s))) \
+            +  s*(1-y)*log((1-y)/(1-x)) \
+            + (1-x-s*(1-y))*log((1-x-s*(1-y))/((1-x)*(1-s)))) /log(2) : 0
+chi2(x,y,s) = (x >= y*s) && (1-x >= s *(1-y)) \
+            ? (x*s -y*s)**2 /(x*(1-x)*s*(1-s)) : 0
+set title "d_diff (independent of antecedent support)"
+set xlabel "c_prior"
+set ylabel "c_post"
+set zlabel "d_diff"
+splot diff(x,y)
+pause -1 "Hit return to continue"
+set title "d_quot (independent of antecedent support)"
+set zlabel "d_quot"
+splot quot(x,y)
+pause -1 "Hit return to continue"
+set zlabel "d_info"
+set title "d_info with antecedent support 0.2"
+splot info(x,y,0.2)
+pause -1 "Hit return to continue"
+set title "d_info with antecedent support 0.3"
+splot info(x,y,0.3)
+pause -1 "Hit return to continue"
+set title "d_info with antecedent support 0.4"
+splot info(x,y,0.4)
+pause -1 "Hit return to continue"
+set zlabel "d_chi^2"
+set title "d_chi^2 with antecedent support 0.2"
+splot chi2(x,y,0.2)
+pause -1 "Hit return to continue"
+set title "d_chi^2 with antecedent support 0.3"
+splot chi2(x,y,0.3)
+pause -1 "Hit return to continue"
+set title "d_chi^2 with antecedent support 0.4"
+splot chi2(x,y,0.4)
+pause -1 "Hit return to continue"

data/ext/apriori/doc/c_rev.gp ADDED Viewed

@@ -0,0 +1,89 @@
+#-----------------------------------------------------------------------
+# File    : c_rev.gp
+# Contents: visualization of the dependence between posterior confidence
+#           and reversed confidence, i.e. the confidence of the reversed
+#           rule antecedent <- consequent
+# Author  : Christian Borgelt
+# History : 23.09.1998 file created
+#-----------------------------------------------------------------------
+# set terminal postscript
+set view 50,330,1
+set xrange [0.001:0.999]
+set yrange [0.001:0.999]
+set zrange [0:1]
+set isosamples 20
+set hidden3d
+set xlabel "c_post"
+set ylabel "c_rev"
+min(a,b) = a < b ? a : b
+diff(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) ? abs(x -p) : 0
+quot(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) ? 1 -min(x/p,p/x) : 0
+i(x,y,p)    = (y*p*log((y*p)/(p*(y*p)/x)) \
+            + (p-y*p)*log((p-y*p)/(p*(1-(y*p)/x))) \
+            + ((y*p)/x-y*p)*log(((y*p)/x-y*p)/((1-p)*(y*p)/x)) \
+            + (1-p-(y*p)/x+y*p)*log((1-p-(y*p)/x+y*p) \
+                                  /((1-p)*(1-(y*p)/x)))) /log(2)
+info(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) ? i(x,y,p) : 0
+chi2(x,y,p) = (1 -p -(y*p)/x +y*p >= 0) \
+            ? (p*((y*p)/x)-y*p)**2 /(p*(1-p)*((y*p)/x)*(1-(y*p)/x)) : 0
+set zlabel "d_diff"
+set title "prior confidence 0.2"
+splot diff(x,y,0.2)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.3"
+splot diff(x,y,0.3)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.4"
+splot diff(x,y,0.4)
+pause -1 "Hit return to continue"
+set zlabel "d_quot"
+set title "prior confidence 0.2"
+splot quot(x,y,0.2)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.3"
+splot quot(x,y,0.3)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.4"
+splot quot(x,y,0.4)
+pause -1 "Hit return to continue"
+set zlabel "d_info"
+set title "prior confidence 0.2"
+splot info(x,y,0.2)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.3"
+splot info(x,y,0.3)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.4"
+splot info(x,y,0.4)
+pause -1 "Hit return to continue"
+set zlabel "d_chi^2"
+set title "prior confidence 0.2"
+splot chi2(x,y,0.2)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.3"
+splot chi2(x,y,0.3)
+pause -1 "Hit return to continue"
+set title "prior confidence 0.4"
+splot chi2(x,y,0.4)
+pause -1 "Hit return to continue"

data/ext/apriori/doc/chi2.tex ADDED Viewed

@@ -0,0 +1,156 @@
+\documentclass[a4paper]{article}
+\oddsidemargin 2.1mm
+\textwidth     155mm
+\topmargin     -12mm
+\textheight    230mm
+\def\tabstrut{\rule{0pt}{2.4ex}}
+\def\eq{\!\!\!=\!\!\!}
+\begin{document}
+\subsection*{The Normalized $\chi^2$ Measure
+             for Association Rule Evaluation}
+Let $C$ and $A$ be two attributes with domains
+$\mbox{dom}(C) = \{ c_1, \ldots, c_{n_C} \}$ and
+$\mbox{dom}(A) = \{ a_1, \ldots, a_{n_A} \}$, respectively,
+and let $\cal X$ be a dataset over $C$ and $A$.
+Let $N_{ij}$, $1 \le i \le n_C$, $1 \le j \le n_A$, be the number of
+sample cases in $\cal X$, which contain both the attribute values~$c_i$
+and $a_j$. Furthermore, let
+\[ N_{i.} = \sum_{j=1}^{n_A} N_{ij}, \qquad
+   N_{.j} = \sum_{i=1}^{n_C} N_{ij}, \qquad\mbox{and}\qquad
+   N_{..} = \sum_{i=1}^{n_C} \sum_{j=1}^{n_A} N_{ij} = |{\cal X}|. \]
+Finally, let
+\[ p_{i.} = \frac{N_{i.}}{N_{..}}, \qquad
+   p_{.j} = \frac{N_{.j}}{N_{..}}, \qquad\mbox{and}\qquad
+   p_{ij} = \frac{N_{ij}}{N_{..}} \]
+be the probabilities of the attribute values and their combinations,
+as they can be estimated from these numbers. Then the well-known
+$\chi^2$ measure is usually defined as
+\begin{eqnarray*}
+\chi^2(C,A)
+& = & \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
+      \frac{(E_{ij} -N_{ij})^2}{E_{ij}}
+      \qquad\mbox{where}\quad E_{ij} = \frac{N_{i.}N_{.j}}{N_{..}} \\
+& = & \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
+      \frac{\left(\frac{N_{i.}N_{.j}}{N_{..}} -N_{ij}\right)^2}
+           {\frac{N_{i.}N_{.j}}{N_{..}}}
+~~=~~ \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
+      \frac{N_{..}^2 \left(\frac{N_{i.\phantom{j}}}{N_{..}}
+                           \frac{N_{.j}}{N_{..}}
+                         - \frac{N_{ij}}{N_{..}}\right)^2}
+           {N_{..}\;       \frac{N_{i.\phantom{j}}}{N_{..}}
+                           \frac{N_{.j}}{N_{..}}} \\
+& = & N_{..} \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
+      \frac{(p_{i.}\;p_{.j} - p_{ij})^2}{p_{i.}\;p_{.j}}
+~~=~~ \frac{1}{N_{..}} \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
+      \frac{(N_{i.}\;N_{.j} - N_{..}N_{ij})^2}{N_{i.}\;N_{.j}}.
+\end{eqnarray*}
+This measure is often normalized by dividing it by the
+size~$N_{..} = |{\cal X}|$ of the dataset to remove the
+dependence on the number of sample cases.
+For association rule evaluation, $C$ refers to the consequent and $A$ to
+the antecedent of the rule. Both have two values, which we denote by
+$c_0$, $c_1$ and $a_0$, $a_1$, respectively. $c_0$ means that the
+consequent of the rule is not satisfied, $c_1$ that it is satisfied;
+likewise for $A$. Then we have to compute the $\chi^2$ measure from
+the $2 \times 2$ contingency table
+\begin{center}
+\begin{tabular}{|l|c|c|l|} \cline{2-3}
+\multicolumn{1}{l|}{}
+      & $a_0$    & $a_1$    \\ \hline
+$c_0$ & $N_{00}$ & $N_{01}$ & $N_{0.}$\tabstrut \\ \hline
+$c_1$ & $N_{10}$ & $N_{11}$ & $N_{1.}$\tabstrut \\ \hline
+\multicolumn{1}{l|}{}
+      & $N_{.0}$ & $N_{.1}$ & $N_{..}$\tabstrut \\ \cline{2-4}
+\end{tabular}
+\end{center}
+or the estimated probability table
+\begin{center}
+\begin{tabular}{|l|c|c|l|} \cline{2-3}
+\multicolumn{1}{l|}{}
+      & $a_0$    & $a_1$    \\ \hline
+$c_0$ & $p_{00}$ & $p_{01}$ & $p_{0.}$\tabstrut \\ \hline
+$c_1$ & $p_{10}$ & $p_{11}$ & $p_{1.}$\tabstrut \\ \hline
+\multicolumn{1}{l|}{}
+      & $p_{.0}$ & $p_{.1}$ & $1$\tabstrut \\ \cline{2-4}
+\end{tabular}
+\end{center}
+That is, we have
+\begin{eqnarray*}
+\frac{\chi^2(C,A)}{N_{..}}
+& = & \sum_{i=0}^1 \sum_{j=0}^1
+      \frac{(p_{i.}\;p_{.j} - p_{ij})^2}{p_{i.}\;p_{.j}} \\
+& = & \frac{(p_{0.}\;p_{.0} -p_{00})^2}{p_{0.}\;p_{.0}}
+  +   \frac{(p_{0.}\;p_{.1} -p_{01})^2}{p_{0.}\;p_{.1}}
+  +   \frac{(p_{1.}\;p_{.0} -p_{10})^2}{p_{1.}\;p_{.0}}
+  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}\;p_{.1}}.
+\end{eqnarray*}
+Now we can exploit
+\[ p_{00} + p_{01} = p_{0.}, \quad
+   p_{10} + p_{11} = p_{1.}, \quad
+   p_{00} + p_{10} = p_{.0}, \quad
+   p_{01} + p_{11} = p_{.1}, \quad
+   p_{0.} + p_{1.} = 1, \quad
+   p_{.0} + p_{.1} = 1, \]
+which leads to
+\begin{eqnarray*}
+p_{0.}\;p_{.0} -p_{00}
+& = & (1 -p_{1.})(1 -p_{.1}) -(1 -p_{1.} -p_{.1} +p_{11})
+~~=~~ p_{1.}\;p_{.1} -p_{11}, \\
+p_{0.}\;p_{.1} -p_{01}
+& = & (1 -p_{1.})p_{.1} -(p_{.1} -p_{11})
+~~=~~ p_{11} -p_{1.}\;p_{.1}, \\
+p_{1.}\;p_{.0} -p_{10}
+& = & p_{1.}(1 -p_{.1}) -(p_{1.} -p_{11})
+~~=~~ p_{11} -p_{1.}\;p_{.1}.
+\end{eqnarray*}
+Therefore we obtain
+\begin{eqnarray*}
+\frac{\chi^2(C,A)}{N_{..}}
+& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2}{(1 -p_{1.})(1 -p_{.1})}
+  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{(1 -p_{1.})\;p_{.1}}
+  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}(1 -p_{.1})}
+  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}\;p_{.1}} \\
+& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2
+            (p_{1.}\;p_{.1}
+            +p_{1.}(1 -p_{.1})
+            +(1 -p_{1.})p_{.1}
+            +(1 -p_{1.})(1 -p_{.1}))}
+           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})} \\
+& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2
+            (p_{1.}\;p_{.1}
+            +p_{1.} -p_{1.}\;p_{.1}
+            +p_{.1} -p_{1.}\;p_{.1}
+            +1 -p_{1.} -p_{.1} +p_{1.}\;p_{.1})}
+           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})} \\
+& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2}
+           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
+\end{eqnarray*}
+In the program, $p_{1.}$ (argument {\tt head}), $p_{.1}$
+(argument {\tt body}) and $p_{1|1} = \frac{p_{11}}{p_{.1}}$
+(argument {\tt post}, rule confidence) are passed to the routine
+that computes the measure, so the actual computation is
+\begin{eqnarray*}
+\frac{\chi^2(C,A)}{N_{..}}
+& = & \frac{(p_{1.}\;p_{.1} -p_{1|1}\;p_{.1})^2}
+           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}
+~~=~~ \frac{((p_{1.} -p_{1|1})p_{.1})^2}
+           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
+\end{eqnarray*}
+In an analogous way the measure can also be computed from the absolute
+frequencies $N_{ij}$, $N_{i.}$, $N_{.j}$ and $N_{..}$, namely as
+\begin{eqnarray*}
+\frac{\chi^2(C,A)}{N_{..}}
+& = & \frac{(N_{1.}N_{.1} -N_{..}N_{11})^2}
+           {N_{1.}(N_{..} -N_{1.})N_{.1}(N_{..} -N_{.1})}.
+\end{eqnarray*}
+\end{document}
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End: