apriori 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +16 -0
- data/License.txt +20 -0
- data/Manifest.txt +121 -0
- data/README.txt +149 -0
- data/Rakefile +15 -0
- data/TODO.txt +60 -0
- data/attic/c_ext_test1/MyTest/MyTest.c +23 -0
- data/attic/c_ext_test1/MyTest/extconf.rb +11 -0
- data/attic/c_ext_test1/mytest.rb +10 -0
- data/attic/test.c +12 -0
- data/config/hoe.rb +81 -0
- data/config/requirements.rb +29 -0
- data/examples/01_simple_example.rb +32 -0
- data/examples/02_small_file_example.rb +17 -0
- data/examples/03_large_file_example.rb +22 -0
- data/examples/test_data/market_basket_basic_test.dat +9 -0
- data/ext/Apriori.c +149 -0
- data/ext/Makefile +149 -0
- data/ext/apriori/doc/apriori.html +1301 -0
- data/ext/apriori/doc/arem.gp +68 -0
- data/ext/apriori/doc/c_rev.gp +89 -0
- data/ext/apriori/doc/chi2.tex +156 -0
- data/ext/apriori/doc/copying +504 -0
- data/ext/apriori/doc/line.gif +0 -0
- data/ext/apriori/doc/uparrow.gif +0 -0
- data/ext/apriori/ex/flg2set +15 -0
- data/ext/apriori/ex/hdr2set +13 -0
- data/ext/apriori/ex/readme +71 -0
- data/ext/apriori/ex/row2set +7 -0
- data/ext/apriori/ex/rulesort +24 -0
- data/ext/apriori/ex/tab2set +9 -0
- data/ext/apriori/ex/test.app +2 -0
- data/ext/apriori/ex/test.rul +9 -0
- data/ext/apriori/ex/test1.rul +43 -0
- data/ext/apriori/ex/test1.tab +10 -0
- data/ext/apriori/ex/test2.tab +10 -0
- data/ext/apriori/ex/test3.tab +30 -0
- data/ext/apriori/ex/test4.tab +11 -0
- data/ext/apriori/ex/test5.tab +39 -0
- data/ext/apriori/ex/tid2set +23 -0
- data/ext/apriori/ex/xhdr2set +33 -0
- data/ext/apriori/src/apriori.c +750 -0
- data/ext/apriori/src/apriori.dsp +120 -0
- data/ext/apriori/src/apriori.dsw +29 -0
- data/ext/apriori/src/apriori.mak +99 -0
- data/ext/apriori/src/istree.c +1411 -0
- data/ext/apriori/src/istree.h +160 -0
- data/ext/apriori/src/makefile +105 -0
- data/ext/apriori/src/tract.c +870 -0
- data/ext/apriori/src/tract.h +261 -0
- data/ext/apriori_wrapper.c +757 -0
- data/ext/apriori_wrapper.h +10 -0
- data/ext/extconf.rb +32 -0
- data/ext/math/doc/copying +504 -0
- data/ext/math/src/chi2.c +151 -0
- data/ext/math/src/chi2.h +27 -0
- data/ext/math/src/choose.c +71 -0
- data/ext/math/src/choose.h +16 -0
- data/ext/math/src/gamma.c +446 -0
- data/ext/math/src/gamma.h +39 -0
- data/ext/math/src/intexp.c +35 -0
- data/ext/math/src/intexp.h +15 -0
- data/ext/math/src/makefile +164 -0
- data/ext/math/src/math.mak +48 -0
- data/ext/math/src/normal.c +387 -0
- data/ext/math/src/normal.h +44 -0
- data/ext/math/src/radfn.c +113 -0
- data/ext/math/src/radfn.h +34 -0
- data/ext/math/src/zeta.c +49 -0
- data/ext/math/src/zeta.h +15 -0
- data/ext/pre-clean.rb +8 -0
- data/ext/pre-setup.rb +9 -0
- data/ext/util/doc/copying +504 -0
- data/ext/util/src/listops.c +76 -0
- data/ext/util/src/listops.h +26 -0
- data/ext/util/src/makefile +103 -0
- data/ext/util/src/memsys.c +84 -0
- data/ext/util/src/memsys.h +42 -0
- data/ext/util/src/nstats.c +288 -0
- data/ext/util/src/nstats.h +69 -0
- data/ext/util/src/params.c +86 -0
- data/ext/util/src/params.h +19 -0
- data/ext/util/src/parse.c +133 -0
- data/ext/util/src/parse.h +81 -0
- data/ext/util/src/scan.c +767 -0
- data/ext/util/src/scan.h +111 -0
- data/ext/util/src/symtab.c +443 -0
- data/ext/util/src/symtab.h +121 -0
- data/ext/util/src/tabscan.c +279 -0
- data/ext/util/src/tabscan.h +99 -0
- data/ext/util/src/util.mak +91 -0
- data/ext/util/src/vecops.c +317 -0
- data/ext/util/src/vecops.h +42 -0
- data/lib/apriori.rb +133 -0
- data/lib/apriori/adapter.rb +13 -0
- data/lib/apriori/association_rule.rb +89 -0
- data/lib/apriori/version.rb +9 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/apriori.rake +20 -0
- data/tasks/attic.rake +28 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/install.rake +13 -0
- data/tasks/website.rake +17 -0
- data/test/apriori_test.rb +13 -0
- data/test/fixtures/market_basket_results_test.txt +5 -0
- data/test/fixtures/market_basket_string_test.txt +7 -0
- data/test/fixtures/results.txt +2 -0
- data/test/fixtures/sample.txt +7 -0
- data/test/test_helper.rb +5 -0
- data/test/unit/test_apriori.rb +68 -0
- data/test/unit/test_itemsets_and_parsing.rb +82 -0
- data/website/index.html +248 -0
- data/website/index.txt +152 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +142 -0
- data/website/template.html.erb +49 -0
- metadata +226 -0
@@ -0,0 +1,1301 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
|
2
|
+
<!-- ===================================================================
|
3
|
+
File : apriori.html
|
4
|
+
Contents: Description of apriori program
|
5
|
+
Author : Christian Borgelt
|
6
|
+
==================================================================== -->
|
7
|
+
<html>
|
8
|
+
<head>
|
9
|
+
<title>Apriori Documentation</title>
|
10
|
+
</head>
|
11
|
+
|
12
|
+
<!-- =============================================================== -->
|
13
|
+
|
14
|
+
<body bgcolor=white>
|
15
|
+
<h1><a name="top">Apriori</a></h1>
|
16
|
+
<h3>Finding Association Rules/Hyperedges with the Apriori Algorithm</h3>
|
17
|
+
|
18
|
+
<!-- =============================================================== -->
|
19
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
20
|
+
|
21
|
+
<h3>Contents</h3>
|
22
|
+
<ul type=disc>
|
23
|
+
<li><a href="#intro">Introduction</a></li>
|
24
|
+
<li><a href="#terms">Support and Confidence</a>
|
25
|
+
<ul type=circle>
|
26
|
+
<li><a href="#suppset">Support of an Item Set</a></li>
|
27
|
+
<li><a href="#confrule">Confidence of an Association Rule</a></li>
|
28
|
+
<li><a href="#supprule">Support of an Association Rule</a></li>
|
29
|
+
</ul></li>
|
30
|
+
<li><a href="#target">Target Types</a>
|
31
|
+
<ul type=circle>
|
32
|
+
<li><a href="#assrules">Association Rules</a></li>
|
33
|
+
<li><a href="#itemsets">Frequent Item Sets</a></li>
|
34
|
+
<li><a href="#closed">Closed Item Sets</a></li>
|
35
|
+
<li><a href="#maximal">Maximal Item Sets</a></li>
|
36
|
+
<li><a href="#hyperedges">Association Hyperedges</a></li>
|
37
|
+
</ul></li>
|
38
|
+
<li><a href="#select">Extended Rule Selection</a>
|
39
|
+
<ul type=circle>
|
40
|
+
<li><a href="#diff">
|
41
|
+
Absolute Confidence Difference to Prior</a></li>
|
42
|
+
<li><a href="#quotient">
|
43
|
+
Difference of Confidence Quotient to 1</a></li>
|
44
|
+
<li><a href="#improve">
|
45
|
+
Absolute Difference of Improvement Value to 1</a></li>
|
46
|
+
<li><a href="#info">
|
47
|
+
Information Difference to Prior</a></li>
|
48
|
+
<li><a href="#chi2">
|
49
|
+
Normalized chi<sup>2</sup> Measure</a></li>
|
50
|
+
<li><a href="#behavior">
|
51
|
+
Selection Behavior of the Measures</a></li>
|
52
|
+
<li><a href="#appear">Item Appearances</a></li>
|
53
|
+
</ul></li>
|
54
|
+
<li><a href="#select">Extended Item Set Selection</a>
|
55
|
+
<ul type=circle>
|
56
|
+
<li><a href="#logquot">
|
57
|
+
Binary Logarithm of Support Quotient</a></li>
|
58
|
+
<li><a href="#suppquot">
|
59
|
+
Difference of Support Quotient to 1</a></li>
|
60
|
+
</ul></li>
|
61
|
+
<li><a href="#tatree">Transaction Prefix Tree</a></li>
|
62
|
+
<li><a href="#options">Program Invocation and Options</a></li>
|
63
|
+
<li><a href="#input">Input Format</a>
|
64
|
+
<ul type=circle>
|
65
|
+
<li><a href="#transin">Format of the Transactions File</a></li>
|
66
|
+
<li><a href="#appearin">Format of the Item Appearances File</a></li>
|
67
|
+
</ul></li>
|
68
|
+
<li><a href="#output">Output Format</a>
|
69
|
+
<ul type=circle>
|
70
|
+
<li><a href="#ruleout">Output Format for Association Rules</a></li>
|
71
|
+
<li><a href="#setout">Output Format for Frequent Item Sets</a></li>
|
72
|
+
<li><a href="#edgeout">Output Format for Association Hyperedges</a>
|
73
|
+
</li>
|
74
|
+
</ul></li>
|
75
|
+
<li><a href="#compopt">Compilation Options</a></li>
|
76
|
+
<li><a href="#copying">Copying</a></li>
|
77
|
+
<li><a href="#download">Download</a></li>
|
78
|
+
<li><a href="#contact">Contact</a></li>
|
79
|
+
</ul>
|
80
|
+
|
81
|
+
<!-- =============================================================== -->
|
82
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
83
|
+
|
84
|
+
<h3><a name="intro">Introduction</a></h3>
|
85
|
+
|
86
|
+
<p>Association rule induction [Agrawal et al. 1993] is a powerful method
|
87
|
+
for so-called <i>market basket analysis</i>, which aims at finding
|
88
|
+
regularities in the shopping behavior of customers of supermarkets,
|
89
|
+
mail-order companies and the like. With the induction of association
|
90
|
+
rules one tries to find sets of products that are frequently bought
|
91
|
+
together, so that from the presence of certain products in a shopping
|
92
|
+
cart one can infer (with a high probability) that certain other products
|
93
|
+
are present. Such information, expressed in the form of rules, can
|
94
|
+
often be used to increase the number of items sold, for instance, by
|
95
|
+
appropriately arranging the products on the shelves of a supermarket
|
96
|
+
(they may, for example, be placed adjacent to each other in order to
|
97
|
+
invite even more customers to buy them together) or by directly
|
98
|
+
suggesting items to a customer, which may be of interest for him/her.
|
99
|
+
</p>
|
100
|
+
|
101
|
+
<p>An <i>association rule</i> is a rule like "If a customer buys wine
|
102
|
+
and bread, he often buys cheese, too." It expresses an association
|
103
|
+
between (sets of) <i>items</i>, which may be products of a supermarket
|
104
|
+
or a mail-order company, special equipment options of a car, optional
|
105
|
+
services offered by telecommunication companies etc. An association
|
106
|
+
rule states that if we pick a customer at random and find out that
|
107
|
+
he selected certain items (bought certain products, chose certain
|
108
|
+
options etc.), we can be confident, quantified by a percentage, that
|
109
|
+
he also selected certain other items (bought certain other products,
|
110
|
+
chose certain other options etc.).</p>
|
111
|
+
|
112
|
+
<p>Of course, we do not want just any association rules, we want
|
113
|
+
"good" rules, rules that are "expressive" and "reliable". The standard
|
114
|
+
measures to assess association rules are the <i>support</i> and the
|
115
|
+
<i>confidence</i> of a rule, both of which are computed from the
|
116
|
+
<i>support</i> of certain item sets. These notions are discussed
|
117
|
+
<a href="#terms">here</a> in more detail. However, these standard
|
118
|
+
criteria are often not sufficient to restrict the set of rules to
|
119
|
+
the interesting ones. Therefore some additional rule evaluation
|
120
|
+
measures are considered <a href="#select">here</a>.</p>
|
121
|
+
|
122
|
+
<p>The main problem of association rule induction is that there are
|
123
|
+
so many possible rules. For example, for the product range of a
|
124
|
+
supermarket, which may consist of several thousand different products,
|
125
|
+
there are billions of possible association rules. It is obvious that
|
126
|
+
such a vast number of rules cannot be processed by inspecting each
|
127
|
+
one in turn. Therefore efficient algorithms are needed that restrict
|
128
|
+
the search space and check only a subset of all rules, but, if possible,
|
129
|
+
without missing important rules. One such algorithm is the apriori
|
130
|
+
algorithm, which was developed by [Agrawal et al. 1994] and which
|
131
|
+
is implemented in a specific way in my apriori program. A brief
|
132
|
+
description of some implementation aspects can be found in these
|
133
|
+
papers:</p>
|
134
|
+
<ul type=disc>
|
135
|
+
<li><b>Induction of Association Rules: Apriori Implementation</b><br>
|
136
|
+
Christian Borgelt and Rudolf Kruse<br>
|
137
|
+
<i>15th Conference on Computational Statistics</i>
|
138
|
+
(Compstat 2002, Berlin, Germany)<br>
|
139
|
+
Physica Verlag, Heidelberg, Germany 2002<br>
|
140
|
+
(6 pages)
|
141
|
+
<a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/papers/cstat_02.pdf">
|
142
|
+
cstat_02.pdf</a> (105 kb)
|
143
|
+
<a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/papers/cstat_02.ps.gz">
|
144
|
+
cstat_02.ps.gz</a> (91 kb)</li>
|
145
|
+
<li><b>Efficient Implementations of Apriori and Eclat</b><br>
|
146
|
+
Christian Borgelt.<br>
|
147
|
+
<i>Workshop of Frequent Item Set Mining Implementations</i>
|
148
|
+
(FIMI 2003, Melbourne, FL, USA).<br>
|
149
|
+
(9 pages)
|
150
|
+
<a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/papers/fimi_03.pdf">
|
151
|
+
fimi_03.pdf</a> (304 kb)
|
152
|
+
<a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/papers/fimi_03.ps.gz">
|
153
|
+
fimi_03.ps.gz</a> (197 kb)</li>
|
154
|
+
</ul>
|
155
|
+
|
156
|
+
<p>By the way: Earlier versions of my apriori program
|
157
|
+
are incorporated in the well-known data mining tool
|
158
|
+
<a href="http://www.spss.com/Clementine/">Clementine</a>
|
159
|
+
(apriori version 1.8 in Clementine version 5.0,
|
160
|
+
apriori version 2.7 in Clementine version 7.0), available from
|
161
|
+
<a href="http://www.spss.com">SPSS</a>. Newer versions of Clementine
|
162
|
+
still use my program, but I am not completely sure about the version
|
163
|
+
number of the underlying apriori program.</p>
|
164
|
+
|
165
|
+
<p>Enjoy,<br>
|
166
|
+
<a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/">
|
167
|
+
Christian Borgelt</a></p>
|
168
|
+
|
169
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
170
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
171
|
+
<td width=5></td>
|
172
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
173
|
+
</table>
|
174
|
+
|
175
|
+
<!-- =============================================================== -->
|
176
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
177
|
+
|
178
|
+
<h3><a name="terms">Support and Confidence</a></h3>
|
179
|
+
|
180
|
+
<h4><a name="suppset">Support of an Item Set</a></h4>
|
181
|
+
|
182
|
+
<p>Let T be the set of all transactions under consideration, e.g.,
|
183
|
+
let T be the set of all "baskets" or "carts" of products bought by the
|
184
|
+
customers of a supermarket - on a given day if you like. The support
|
185
|
+
of an item set S is the percentage of those transactions in T which
|
186
|
+
contain S. In the supermarket example this is the percentage of "baskets"
|
187
|
+
that contain a given set S of products, for example S = { bread, wine,
|
188
|
+
cheese }. If U is the set of all transactions that contain all items
|
189
|
+
in S, then</p>
|
190
|
+
<p>support(S) = (|U| / |T|) *100%,</p>
|
191
|
+
<p>where |U| and |T| are the number of elements in U and T,
|
192
|
+
respectively. For example, if a customer buys the set
|
193
|
+
X = { milk, bread, apples, wine, sausages, cheese, onions, potatoes }
|
194
|
+
then S is obviously a subset of X, hence X is in U. If there are 318
|
195
|
+
customers and 242 of them buy such a set X or a similar one that
|
196
|
+
contains S, then support(S) = 76.1%.</p>
|
197
|
+
|
198
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
199
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
200
|
+
<td width=5></td>
|
201
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
202
|
+
</table>
|
203
|
+
|
204
|
+
<!-- =============================================================== -->
|
205
|
+
|
206
|
+
<h4><a name="confrule">Confidence of an Association Rule</a></h4>
|
207
|
+
|
208
|
+
<p>This is the measure used by [Agrawal et al. 1993], the inventors of
|
209
|
+
the apriori algorithm, to evaluate association rules. The confidence
|
210
|
+
of a rule R = "A and B -> C" is the support of the set of all items
|
211
|
+
that appear in the rule divided by the support of the antecedent of
|
212
|
+
the rule, i.e.</p>
|
213
|
+
<p>confidence(R) = (support({A, B, C}) / support({A, B})) *100%.</p>
|
214
|
+
<p>More intuitively, the confidence of a rule is the number of cases in
|
215
|
+
which the rule is correct relative to the number of cases in which it
|
216
|
+
is applicable. For example, let R = "wine and bread -> cheese". If a
|
217
|
+
customer buys wine and bread, then the rule is applicable and it says
|
218
|
+
that he/she can be expected to buy cheese. If he/she does not buy wine
|
219
|
+
or does not buy bread or buys neither, then the rule is not applicable
|
220
|
+
and thus (obviously) does not say anything about this customer.</p>
|
221
|
+
|
222
|
+
<p>If the rule is applicable, it says that the customer can be expected
|
223
|
+
to buy cheese. But he/she may or may not buy cheese, that is, the rule
|
224
|
+
may or may not be correct. Of course, we are interested in how good the
|
225
|
+
rule is, i.e., how often its prediction that the customer buys cheese
|
226
|
+
is correct. The rule confidence measures this: It states the percentage
|
227
|
+
of cases in which the rule is correct. It computes the percentage
|
228
|
+
relative to the number of cases in which the antecedent holds, since
|
229
|
+
these are the cases in which the rule makes a prediction that can be
|
230
|
+
true or false. If the antecedent does not hold, then the rule does not
|
231
|
+
make a prediction, so these cases are excluded.</p>
|
232
|
+
|
233
|
+
<p>With this measure a rule is selected if its confidence exceeds or
|
234
|
+
is equal to a given lower limit. That is, we look for rules that have
|
235
|
+
a high probability of being true, i.e., we look for "good" rules, which
|
236
|
+
make correct (or very often correct) predictions. My apriori program
|
237
|
+
always uses this measure to select association rules. The default value
|
238
|
+
for the confidence limit is 80%. It can be changed with the option
|
239
|
+
<tt>-c</tt>.</p>
|
240
|
+
|
241
|
+
<p>In addition to the rule confidence my apriori program lets you
|
242
|
+
select several other rule evaluation measures, which are explained
|
243
|
+
below, but it will also use rule confidence. If you want to rely
|
244
|
+
entirely on some other measure, you can do so by setting the minimal
|
245
|
+
rule confidence to zero. (Attention: If you have a large number of
|
246
|
+
items, setting the minimal rule confidence to zero can result in
|
247
|
+
<i>very</i> high memory consumption.)</p>
|
248
|
+
|
249
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
250
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
251
|
+
<td width=5></td>
|
252
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
253
|
+
</table>
|
254
|
+
|
255
|
+
<!-- =============================================================== -->
|
256
|
+
|
257
|
+
<h4><a name="supprule">Support of an Association Rule</a></h4>
|
258
|
+
|
259
|
+
<p>The support of rules may cause some confusion, because I use this
|
260
|
+
term in a different way than [Agrawal et al. 1993] do. For them, the
|
261
|
+
support of a rule "A and B -> C" is the support of the set {A, B, C}.
|
262
|
+
This is fine if rule confidence is the only rule evaluation measure,
|
263
|
+
but it causes problems if some other measure is used. For these other
|
264
|
+
measures it is often much more appropriate to call the support of the
|
265
|
+
antecedent of the rule, i.e. the support of {A, B} in the example above,
|
266
|
+
the support of the rule.</p>
|
267
|
+
|
268
|
+
<p>The difference can also be stated in the following way: For [Agrawal
|
269
|
+
et al. 1993], the support of the rule is the (relative) number of cases
|
270
|
+
in which the rule is correct (i.e., in which the presence of the item C
|
271
|
+
follows from the presence of the items A and B), whereas for me (and
|
272
|
+
thus my apriori program) the support of a rule is the (relative) number
|
273
|
+
of cases in which it is applicable (i.e., in which the antecedent of the
|
274
|
+
rule holds), although in some of these cases it may be false (because
|
275
|
+
only the items A and B are present, but the item C is missing).</p>
|
276
|
+
|
277
|
+
<p>One reason for this, as already mentioned, is that the definition
|
278
|
+
of [Agrawal et al. 1993] does not work well for evaluation measures
|
279
|
+
other than rule confidence. This is explained in more detail below.
|
280
|
+
Another reason is that I prefer the support of a rule to say something
|
281
|
+
about the "statistical" support of a rule and its confidence, i.e.,
|
282
|
+
from how many cases the confidence is computed in order to express
|
283
|
+
how well founded the assertion about the confidence is.</p>
|
284
|
+
|
285
|
+
<p>Maybe an example will make this clearer. Suppose you have a die which
|
286
|
+
you suspect to be biased. To test this hypothesis, you throw the die,
|
287
|
+
say, a thousand times. 307 times the 6 turns up. Hence you assume that
|
288
|
+
the die is actually biased, since the relative frequency is about 30%
|
289
|
+
although for an unbiased die it should be around 17%. Now, what is the
|
290
|
+
"statistical" support of this assertion, i.e., on how many experiments
|
291
|
+
does it rest? Obviously it rests on all 1000 experiments and not only
|
292
|
+
on the 307 experiments in which the 6 turned up. This is so, simply
|
293
|
+
because you had to do 1000 experiments to find out that the relative
|
294
|
+
frequency is around 30% and not only the 307 in which a 6 turned up.</p>
|
295
|
+
|
296
|
+
<p>Or suppose you are doing an opinion poll to find out about the
|
297
|
+
acceptance of a certain political party, maybe with the usual question
|
298
|
+
"If an election were held next Sunday ...?" You ask 2000 persons, of
|
299
|
+
which 857 say that they would vote for the party you are interested in.
|
300
|
+
What is the support of the assertion that this party would get around
|
301
|
+
43% of all votes? It is the size of your sample, i.e., all 2000 persons,
|
302
|
+
and not only the 857 that answered in the positive. Again you had to ask
|
303
|
+
all 2000 people to find out about the percentage of 43%. Of course, you
|
304
|
+
could have asked fewer people, say, 100, of which, say, 43 said that
|
305
|
+
they would vote for the party, but then your assertion would be less
|
306
|
+
reliable, because it is less "supported". The number of votes for the
|
307
|
+
party could also be 40% or 50%, because of some random influences. Such
|
308
|
+
deviations are much less likely, if you asked 2000 persons, since then
|
309
|
+
the random influences can be expected to cancel out.</p>
|
310
|
+
|
311
|
+
<p>The rule support can be used to select association rules by stating
|
312
|
+
a lower bound for the support of a rule. This is equivalent to saying
|
313
|
+
that you are interested only in such rules that have a large enough
|
314
|
+
statistical basis (since my apriori program uses the term "support"
|
315
|
+
in my interpretation and not in the one used by [Agrawal et al. 1993]).
|
316
|
+
The default value for the support limit is 10%. It can be changed
|
317
|
+
with the option <tt>-s</tt>. If the number given is negative, it is
|
318
|
+
interpreted as an absolute number (number of transactions) rather than
|
319
|
+
a percentage. (Note that in this case the option <tt>-a</tt> reverses
|
320
|
+
its meaning: if it is not given only the absolute support is printed,
|
321
|
+
if it is added, the relative support is printed, too.) The lower bound
|
322
|
+
for the rule support is combined with the lower bound for the rule
|
323
|
+
confidence, i.e., my apriori program generates only rules the confidence
|
324
|
+
of which is greater than or equal to the confidence limit <i>and</i> the
|
325
|
+
support of which is greater than or equal to the support limit.</p>
|
326
|
+
|
327
|
+
<p>Despite the above arguments in favor of my definition of the support
|
328
|
+
of an association rule, a rule support compatibility mode is available.
|
329
|
+
With the option <tt>-o</tt> the original rule support definition can be
|
330
|
+
selected. In this case the support of an association rule is the support
|
331
|
+
of the set with the items in the antecedent and the consequent of the
|
332
|
+
rule, i.e. the support of a rule as defined in [Agrawal et al. 1993].
|
333
|
+
</p>
|
334
|
+
|
335
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
336
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
337
|
+
<td width=5></td>
|
338
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
339
|
+
</table>
|
340
|
+
|
341
|
+
<!-- =============================================================== -->
|
342
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
343
|
+
|
344
|
+
<h3><a name="target">Target Types</a></h3>
|
345
|
+
|
346
|
+
<p>The target type, which can be selected via the option <tt>-t</tt>,
|
347
|
+
is either association rules (option <tt>-tr</tt>, default), frequent
|
348
|
+
item sets (option <tt>-ts</tt>), closed item sets (option <tt>-tc</tt>),
|
349
|
+
maximal item sets (option <tt>-tm</tt>), or association hyperedges
|
350
|
+
(option <tt>-th</tt>).</p>
|
351
|
+
|
352
|
+
<!-- =============================================================== -->
|
353
|
+
|
354
|
+
<h4><a name="assrules">Association Rules (default, option -tr)</a></h4>
|
355
|
+
|
356
|
+
<p>By default my apriori program produces association rules with
|
357
|
+
a single item in the consequent. The restriction to single item
|
358
|
+
consequents is due to the following considerations: In the first place,
|
359
|
+
association rule mining usually produces too many rules even if one
|
360
|
+
confines oneself to rules with only one item in the consequent. So why
|
361
|
+
should one make the situation worse by allowing more than one item in
|
362
|
+
the consequent? (It merely blows up the output size.)</p>
|
363
|
+
|
364
|
+
<p>Secondly, I do not know any application in which rules with more
|
365
|
+
than one item in the consequent are of any real use. The reason, in
|
366
|
+
my opinion, is that such more complex rules add almost nothing to the
|
367
|
+
insights about the data set. To understand this, consider the simpler
|
368
|
+
rules that correspond to a rule with multiple items in the consequent,
|
369
|
+
that is, rules having the same antecedent and consequents with only
|
370
|
+
single items from the consequent of the complex rule. All of these
|
371
|
+
rules must necessarily be in the output, because neither their support
|
372
|
+
nor their confidence can be less than that of the more complex rule.
|
373
|
+
That is, if you have a rule c d &lt;- a b, you will necessarily also
|
374
|
+
have the rules c &lt;- a b and d &lt;- a b in the output. Of course,
|
375
|
+
these latter two rules together do <i>not</i> say the same as the more
|
376
|
+
complex rule. However, what do you gain from the additional information
|
377
|
+
the more complex rule gives you? How can you use it? And is this little
|
378
|
+
extra information worth having to analyze a much bigger rule set?</p>
|
379
|
+
|
380
|
+
<!-- =============================================================== -->
|
381
|
+
|
382
|
+
<h4><a name="itemsets">Frequent Item Sets (option -ts)</a></h4>
|
383
|
+
|
384
|
+
<p>Sometimes one may not want to find association rules, but only the
|
385
|
+
frequent item sets underlying them. That is, one wants to find all
|
386
|
+
item sets with a support exceeding a certain threshold. My apriori
|
387
|
+
program supports this search, too: If the option <tt>-ts</tt> is
|
388
|
+
given, only frequent item sets are determined.</p>
|
389
|
+
|
390
|
+
<!-- =============================================================== -->
|
391
|
+
|
392
|
+
<h4><a name="closed">Closed Item Sets (option -tc)</a></h4>
|
393
|
+
|
394
|
+
<p>A frequent item set is called <i>closed</i> if no superset has the
|
395
|
+
same support. If the option <tt>-tc</tt> is given, the found frequent
|
396
|
+
item sets are subsequently filtered and only the closed item sets
|
397
|
+
are kept.</p>
|
398
|
+
|
399
|
+
<!-- =============================================================== -->
|
400
|
+
|
401
|
+
<h4><a name="maximal">Maximal Item Sets (option -tm)</a></h4>
|
402
|
+
|
403
|
+
<p>A frequent item set is called <i>maximal</i> if no superset is
|
404
|
+
frequent, i.e., has a support exceeding the minimal support. If the
|
405
|
+
option <tt>-tm</tt> is given, the found frequent item sets are
|
406
|
+
subsequently filtered and only the maximal item sets are kept.</p>
|
407
|
+
|
408
|
+
<!-- =============================================================== -->
|
409
|
+
|
410
|
+
<h4><a name="hyperedges">Association Hyperedges (option -th)</a></h4>
|
411
|
+
|
412
|
+
<p>My apriori program can also find association hyperedges, i.e., sets
|
413
|
+
of items that are strongly predictive w.r.t. each other. In this mode
|
414
|
+
no rules are generated, only item sets are selected. The selection
|
415
|
+
criterion is as follows: Given an item set with enough support (option
|
416
|
+
<tt>-s</tt>), all rules are checked which can be formed using this set
|
417
|
+
with all items appearing in the rule. For example, for the item set
|
418
|
+
{a b c}, the rules c &lt;- a b, b &lt;- a c, a &lt;- b c would be
|
419
|
+
considered. The confidences of these rules are computed and averaged.
|
420
|
+
If the resulting average confidence is greater than the minimal
|
421
|
+
confidence (option <tt>-c</tt>), the item set is selected. (I am
|
422
|
+
grateful to Bastien Duclaux for requesting the possibility to generate
|
423
|
+
association hyperedges.)</p>
|
424
|
+
|
425
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
426
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
427
|
+
<td width=5></td>
|
428
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
429
|
+
</table>
|
430
|
+
|
431
|
+
<!-- =============================================================== -->
|
432
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
433
|
+
|
434
|
+
<h3><a name="select">Extended Rule Selection</a></h3>
|
435
|
+
|
436
|
+
<p>If rules are selected using the rule confidence, the following
|
437
|
+
problem arises: "Good" rules (rules that are often true) are not
|
438
|
+
always "interesting" rules (rules that reveal something about the
|
439
|
+
interdependence of the items). You certainly know the examples that
|
440
|
+
are usually given to illustrate this fact. For instance, it is easy
|
441
|
+
to find out in a medical database that the rule "pregnant -> female"
|
442
|
+
is true with a confidence of 100%. Hence it is a perfect rule, it
|
443
|
+
never fails, but, of course, this is not very surprising. Although
|
444
|
+
the measures explained below cannot deal with this problem (which is
|
445
|
+
semantical), they may be able to improve on the results in a related
|
446
|
+
case.</p>
|
447
|
+
|
448
|
+
<p>Let us look at the supermarket example again and let us assume
|
449
|
+
that 60% of all customers buy some kind of bread. Consider the rule
|
450
|
+
"cheese -> bread", which holds with a confidence of, say, 62%.
|
451
|
+
Is this an important rule? Obviously not, since the fact that the
|
452
|
+
customer buys cheese does not have a significant influence on him/her
|
453
|
+
buying bread: The percentages are almost the same. But if you had set
|
454
|
+
a confidence limit of 60%, you would get both rules "-> bread"
|
455
|
+
(confidence 60%) and "cheese -> bread" (confidence 62%), although
|
456
|
+
the first would suffice (the first, since it is the simpler of the
|
457
|
+
two). The idea of all measures that can be used in addition to or instead
|
458
|
+
of rule confidence is to handle such situations and to suppress the
|
459
|
+
second rule.</p>
|
460
|
+
|
461
|
+
<p>In addition, consider the following case: Assume that the confidence
|
462
|
+
of the rule "cheese -> bread" is not 62% but 35%. With a confidence
|
463
|
+
limit of 60% it would not be selected, but it may be very important to
|
464
|
+
know about this rule! Together with cheese bread is bought much less
|
465
|
+
frequently than it is bought at all. Is cheese some kind of substitute
|
466
|
+
for bread, so that one does not need any bread, if one has cheese? Ok,
|
467
|
+
maybe this is not a very good example. However, what can be seen is
|
468
|
+
that a rule with low confidence can be very interesting, since it may
|
469
|
+
capture an important influence. Furthermore, this is a way to express
|
470
|
+
negation (though only in the consequent of a rule), since
|
471
|
+
"cheese -> bread" with confidence 35% is obviously equivalent to
|
472
|
+
"cheese -> no bread" with confidence 65%. This also makes clear
|
473
|
+
why the support of the item set that contains all items in the body
|
474
|
+
<i>and</i> the head of the rule is not appropriate for this measure.
|
475
|
+
An important rule may have confidence 0 and thus a support (in the
|
476
|
+
interpretation of [Agrawal et al. 1993]) of 0. Hence it is not
|
477
|
+
reasonable to set a lower bound for this kind of support.</p>
|
478
|
+
|
479
|
+
<p>I hope that the intention underlying all this is already clear:
|
480
|
+
Potentially interesting rules differ significantly in their confidence
|
481
|
+
from the confidence of rules with the same consequent, but a simpler
|
482
|
+
antecedent. Adding an item to the antecedent is informative only if it
|
483
|
+
significantly changes the confidence of the rule. Otherwise the simpler
|
484
|
+
rule suffices.</p>
|
485
|
+
|
486
|
+
<p>Unfortunately the measures other than rule confidence do not solve
|
487
|
+
the rule selection problem in the very general form in which it was
|
488
|
+
stated above. It is not that easy to deal with all rules that have a
|
489
|
+
simpler antecedent, to keep track of which of these rules were selected
|
490
|
+
(this obviously influences the selection of more complicated rules),
|
491
|
+
to deal with the special type of Poincare paradox that can occur, etc.
|
492
|
+
Hence the measures always compare the confidence of a rule with the
|
493
|
+
confidence of the rule with empty antecedent, i.e. with the relative
|
494
|
+
frequency of the consequent.</p>
|
495
|
+
|
496
|
+
<p>I call the confidence of a rule with empty antecedent the prior
|
497
|
+
confidence, since it is the confidence that the item in the consequent
|
498
|
+
of the rule will be present in an item set prior to any information
|
499
|
+
about other items that are present. The confidence of a rule with
|
500
|
+
non-empty antecedent (and the same consequent) I call the posterior
|
501
|
+
confidence, since it is the confidence that the item in the consequent
|
502
|
+
of the rule will be present after it gets known that the items in the
|
503
|
+
antecedent of the rule are present.</p>
|
504
|
+
|
505
|
+
<p>All measures that can be used in addition to rule confidence are
|
506
|
+
computed from these two values: the prior confidence and the posterior
|
507
|
+
confidence. Only those rules are selected for which the value of the
|
508
|
+
chosen additional evaluation measure exceeds or is equal to a certain
|
509
|
+
limit. The measures are chosen with the option <tt>-e</tt>, the limit
|
510
|
+
is passed to the program via the option <tt>-d</tt>. The default value
|
511
|
+
for the limit is 10%.</p>
|
512
|
+
|
513
|
+
<p>All additional rule evaluation measures are combined with the limits
|
514
|
+
for rule confidence and rule support. I.e., my apriori program selects
|
515
|
+
only those rules the confidence of which is greater than or equal to
|
516
|
+
the confidence limit, the support of which is greater than or equal to
|
517
|
+
the support limit, <i>and</i> for which the additional evaluation value
|
518
|
+
is greater than or equal to the limit for this measure. The default is
|
519
|
+
to use no additional evaluation measure, i.e., to rely only on rule
|
520
|
+
confidence and rule support. Of course you can remove the restriction
|
521
|
+
that the rule confidence must exceed a certain limit by simply setting
|
522
|
+
this limit to zero. In this case rules are selected using only the
|
523
|
+
limits for the rule support and the additional evaluation measure.
|
524
|
+
(Attention: If you have a large number of items, setting the minimal
|
525
|
+
rule confidence to zero can result in <i>very</i> high memory
|
526
|
+
consumption.)</p>
|
527
|
+
|
528
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
529
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
530
|
+
<td width=5></td>
|
531
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
532
|
+
</table>
|
533
|
+
|
534
|
+
<!-- =============================================================== -->
|
535
|
+
|
536
|
+
<h4><a name="diff">Absolute Confidence Difference to Prior
|
537
|
+
(option <tt>-ed</tt> or <tt>-e1</tt>)</a></h4>
|
538
|
+
|
539
|
+
<p>The simplest way to compare the two confidences is to compute the
|
540
|
+
absolute value of their difference. I.e., if "-> bread" has a
|
541
|
+
confidence of 60% and "cheese -> bread" has a confidence of 62%,
|
542
|
+
then the value of this measure is 2%. The parameter given via the
|
543
|
+
option <tt>-d</tt> to the program states a lower bound for this
|
544
|
+
difference. It follows that this measure selects rules the confidence
|
545
|
+
of which differs by more than a given threshold from the corresponding
|
546
|
+
prior confidence. For example, with the option <tt>-d20</tt> (and, of
|
547
|
+
course, the option <tt>-ed</tt> to select the measure) for the item
|
548
|
+
"bread" only rules with a confidence less than 40% or greater than 80%
|
549
|
+
would be selected. Of course, for other items, with a different prior
|
550
|
+
confidence, the upper and lower bounds are different, too.</p>
|
551
|
+
|
552
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
553
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
554
|
+
<td width=5></td>
|
555
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
556
|
+
</table>
|
557
|
+
|
558
|
+
<!-- =============================================================== -->
|
559
|
+
|
560
|
+
<h4><a name="quotient">Difference of Confidence Quotient to 1
|
561
|
+
(option <tt>-eq</tt> or <tt>-e2</tt>)</a></h4>
|
562
|
+
|
563
|
+
<p>An equally simple way to compare the two confidences is to compute
|
564
|
+
their quotient. Since either the prior or the posterior confidence
|
565
|
+
can be greater (which was handled by computing the absolute value
|
566
|
+
for the previous measure), this quotient or its reciprocal, whichever
|
567
|
+
is smaller, is then compared to one. A quotient of one says that the
|
568
|
+
rule is not interesting, since the prior and the posterior confidence
|
569
|
+
are identical. The more the quotient differs from one, the more
|
570
|
+
"interesting" the rule is. Hence, just as above, a lower bound for
|
571
|
+
this difference is given via the option <tt>-d</tt>. For the bread
|
572
|
+
example, with the option <tt>-d20</tt> rules with a confidence less
|
573
|
+
than or equal to (1 -20%) *60% = 0.8 *60% = 48% or a confidence greater
|
574
|
+
than or equal to 60% / (1 -20%) = 60% / 0.8 = 75% are selected. The
|
575
|
+
difference between this measure and the absolute confidence difference
|
576
|
+
to the prior is that the deviation that is considered to be significant
|
577
|
+
depends on the prior confidence. If it is high, then the deviation of
|
578
|
+
the posterior confidence must also be high, and if it is low, then
|
579
|
+
the deviation need only be low. For example, if "-> bread" had a
|
580
|
+
confidence of only 30%, then the option <tt>-d20</tt> (just as above)
|
581
|
+
would select rules the confidence of which is less than 0.8 *30% = 24%
|
582
|
+
or greater than 30% /0.8 = 37.5%. As you can see, for a prior confidence
|
583
|
+
of 60% the deviation has to be at least 12%/15%, for a prior confidence
|
584
|
+
of 30% it has to be only 6%/7.5% in order to make a rule eligible.
|
585
|
+
The idea is that an increment of the confidence from 30% to 40% is more
|
586
|
+
important than an increment from 60% to 70%, since the relative change
|
587
|
+
is greater.</p>
|
588
|
+
|
589
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
590
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
591
|
+
<td width=5></td>
|
592
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
593
|
+
</table>
|
594
|
+
|
595
|
+
<!-- =============================================================== -->
|
596
|
+
|
597
|
+
<h4><a name="improve">Absolute Difference of Improvement Value to 1
|
598
|
+
(option <tt>-ea</tt> or <tt>-e3</tt>)</a></h4>
|
599
|
+
|
600
|
+
<p>This measure is very similar to the preceding one. Actually, if
|
601
|
+
the confidence of a rule is smaller than the prior confidence, then
|
602
|
+
it coincides with it. The improvement value is simply the posterior
|
603
|
+
confidence divided by the prior confidence. It is greater than
|
604
|
+
one if the confidence increases due to the antecedent, and it is
|
605
|
+
smaller than one if the confidence decreases due to the antecedent.
|
606
|
+
By computing the absolute value of the difference to one, the
|
607
|
+
improvement value can easily be made a rule selection measure.
|
608
|
+
The advantage of this measure over the preceding one is that it is
|
609
|
+
symmetric w.r.t. changes of the confidence due to the antecedent of
|
610
|
+
a rule. For the bread example, with the option <tt>-d20</tt> rules with
|
611
|
+
a confidence less than or equal to (1 -20%) *60% = 0.8 *60% = 48% or a
|
612
|
+
confidence greater than or equal to (1 +20%) *60% = 1.2 * 60% = 72%
|
613
|
+
are selected. (Note the difference of 72% compared to 75% for the
|
614
|
+
preceding measure.) Similarly, for the second bread example
|
615
|
+
discussed above, the numbers are 0.8 *30% = 24% and 1.2 *30% = 36%.
|
616
|
+
Note that this is the only measure for which a value greater than 100
|
617
|
+
may be specified with the <tt>-d</tt> option, since it can exceed
|
618
|
+
100% if the posterior confidence of a rule exceeds twice the prior
|
619
|
+
confidence. (I am grateful to Roland Jonscher, who pointed out this
|
620
|
+
measure to me.)</p>
|
621
|
+
|
622
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
623
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
624
|
+
<td width=5></td>
|
625
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
626
|
+
</table>
|
627
|
+
|
628
|
+
<!-- =============================================================== -->
|
629
|
+
|
630
|
+
<h4><a name="info">Information Difference to Prior
|
631
|
+
(option <tt>-ei</tt> or <tt>-e4</tt>)</a></h4>
|
632
|
+
|
633
|
+
<p>This measure is simply the information gain criterion that can be
|
634
|
+
used in decision tree learners like C4.5 to select the split attributes.
|
635
|
+
Its idea is as follows: Without any further information about other
|
636
|
+
items in the set, we have a certain probability (or, to be exact, a
|
637
|
+
relative frequency) distribution for, say "bread" and "no bread".
|
638
|
+
Let us assume it is 60% : 40% (prior confidence of the item "bread",
|
639
|
+
just as above). This distribution has a certain entropy</p>
|
640
|
+
<p>H = - P(bread) log<sub>2</sub> P(bread)
|
641
|
+
- P(no bread) log<sub>2</sub> P(no bread),</p>
|
642
|
+
<p>where P(bread) is equivalent to the support of "bread", which in
|
643
|
+
turn is equivalent to the prior confidence of "bread". The entropy of a
|
644
|
+
probability distribution is, intuitively, a lower bound on the number
|
645
|
+
of yes-no-questions you have to ask in order to determine the actual
|
646
|
+
value. This cannot be understood very well with only two possible
|
647
|
+
values, but it can be made to work for this case too. I skip the
|
648
|
+
details here, they are not that important.</p>
|
649
|
+
|
650
|
+
<p>After we get the information that the items in the antecedent of
|
651
|
+
the rule are present (say, cheese), we have a different probability
|
652
|
+
distribution, say 35% : 65%. I.e., P(bread|cheese) = 0.35 and
|
653
|
+
P(no bread|cheese) = 0.65. If we also know the support of the item
|
654
|
+
"cheese" (let it be P(cheese) = 0.4 and P(no cheese) = 0.6), then
|
655
|
+
we can also compute the probabilities P(bread|no cheese) = 0.77 and
|
656
|
+
P(no bread|no cheese) = 0.23. Hence we have two posterior probability
|
657
|
+
distributions. The question now is: How much information do we receive
|
658
|
+
from observing the antecedent of the rule? Information is measured
|
659
|
+
as a reduction of entropy. Hence the entropies of the two conditional
|
660
|
+
probability distributions (for "cheese" and "no cheese") are computed
|
661
|
+
and summed weighted with the probability of their occurrence (i.e.,
|
662
|
+
the relative frequency of "cheese" and "no cheese", respectively).
|
663
|
+
This gives the expected value of the posterior or conditional entropy.
|
664
|
+
The difference of this value to the prior entropy (see above) is the
|
665
|
+
gain in information from the antecedent of the rule or, as I called
|
666
|
+
it, the information difference to the prior.</p>
|
667
|
+
|
668
|
+
<p>The value that can be given via the <tt>-d</tt> option is a lower
|
669
|
+
bound for the information gain, measured in hundredths of a bit. Since
|
670
|
+
all items can only be present or absent, the information gain can be
|
671
|
+
at most one bit. Therefore a percent value is still reasonable.</p>
|
672
|
+
|
673
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
674
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
675
|
+
<td width=5></td>
|
676
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
677
|
+
</table>
|
678
|
+
|
679
|
+
<!-- =============================================================== -->
|
680
|
+
|
681
|
+
<h4><a name="chi2">Normalized</a> chi<sup>2</sup> Measure
|
682
|
+
(option <tt>-ec</tt> or <tt>-e5</tt>)</h4>
|
683
|
+
|
684
|
+
<p>The chi<sup>2</sup> measure is well known from statistics. It is
|
685
|
+
often used to measure the difference between a supposed independent
|
686
|
+
distribution of two discrete variables and the actual joint distribution
|
687
|
+
in order to determine how strongly two variables depend on each other.
|
688
|
+
This measure (as it is defined in statistics) contains the number of
|
689
|
+
cases it is computed from as a factor. This is not very appropriate
|
690
|
+
if one wants to evaluate rules that can have varying support. Hence
|
691
|
+
this factor is removed by simply dividing the measure by the number
|
692
|
+
of item sets (the total number, i.e. with the names used above, the
|
693
|
+
number of sets in X). With this normalization, the chi<sup>2</sup>
|
694
|
+
measure can assume values between 0 (no dependence) and 1 (very strong
|
695
|
+
dependence). The value that can be given via the <tt>-d</tt> option is
|
696
|
+
a lower bound for the strength of the dependence of the head on the
|
697
|
+
body in percent (0 - no dependence, 100 - perfect dependence). Only
|
698
|
+
those rules are selected, in which the head depends on the body with
|
699
|
+
a higher degree of dependence.</p>
|
700
|
+
|
701
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
702
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
703
|
+
<td width=5></td>
|
704
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
705
|
+
</table>
|
706
|
+
|
707
|
+
<!-- =============================================================== -->
|
708
|
+
|
709
|
+
<h4><a name="behavior">Selection Behavior of the Measures</a></h4>
|
710
|
+
|
711
|
+
<p>In the directory <tt>apriori/doc</tt> you can find a Gnuplot script
|
712
|
+
named <tt>arem.gp</tt> (<tt>arem</tt> stands for additional rule
|
713
|
+
evaluation measures) which visualizes the behavior of the additional
|
714
|
+
rule evaluation measures. This script draws eight 3d graphs, one for
|
715
|
+
the absolute confidence difference, one for the difference of the
|
716
|
+
confidence quotient to one, three for the information difference to
|
717
|
+
the prior confidence and three for the normalized chi<sup>2</sup>
|
718
|
+
measure. All graphs show the value of an additional rule evaluation
|
719
|
+
measure over a plane defined by the prior and the posterior confidence
|
720
|
+
of a rule. The latter two measures need three graphs, since they depend
|
721
|
+
on the antecedent support of a rule as a third parameter. Setting a
|
722
|
+
minimal value for an additional rule evaluation measure is like
|
723
|
+
flooding the corresponding graph landscape up to a certain level
|
724
|
+
(given as a percentage, since all considered measures assume values
|
725
|
+
between 0 and 1). Only those rules are selected that sit on dry land.
|
726
|
+
</p>
|
727
|
+
|
728
|
+
<p>The first graph shows the behavior of the absolute confidence
|
729
|
+
difference. For the diagonal, i.e. the line where the prior and the
|
730
|
+
posterior confidence are identical, its value is zero (as expected).
|
731
|
+
The more the two confidences differ, the higher the value of this
|
732
|
+
measure gets, but in a linear way.</p>
|
733
|
+
|
734
|
+
<p>The second graph shows the behavior of the difference of the confidence quotient
|
735
|
+
to one. Again its value is zero for the diagonal (as the value of
|
736
|
+
all measures is) and becomes greater the more the prior and the
|
737
|
+
posterior confidence differ. But it is much steeper for a small
|
738
|
+
prior confidence than for a large one and it is non-linear.</p>
|
739
|
+
|
740
|
+
<p>The third to fifth graphs show the information difference to the
|
741
|
+
prior confidence for an antecedent support (which is identical to the
|
742
|
+
rule support in my interpretation, see above) of 0.2 (20%), 0.3 (30%)
|
743
|
+
and 0.4 (40%). The regions at the margins, where the measure is zero,
|
744
|
+
correspond to impossible combinations of prior and posterior confidence
|
745
|
+
and antecedent support. As you can see, the valley gets narrower with
|
746
|
+
increasing antecedent support. I.e., with the same minimal value for
|
747
|
+
this measure, rules with low antecedent support need a higher confidence
|
748
|
+
difference to be selected than rules with a high antecedent support.
|
749
|
+
This nicely models the statistical significance of confidence changes.
|
750
|
+
If you only have a few cases to support your rule, even a large
|
751
|
+
deviation from the prior confidence can be explained by random
|
752
|
+
fluctuations, since only a few transactions need to be different to
|
753
|
+
produce a considerable change. However, if the antecedent support
|
754
|
+
is large, even a small deviation (in percent) has to be considered
|
755
|
+
significant (non random), since it takes a lot of changes to
|
756
|
+
transactions to produce even a small change in the percentage.
|
757
|
+
This dependence on the antecedent support of the rule and the fact that the
|
758
|
+
valley is not pointed at the diagonal (which means that even a low
|
759
|
+
minimal value can exclude a lot of rules) is the main difference
|
760
|
+
between the information gain and the normalized chi<sup>2</sup>
|
761
|
+
measure on the one hand and the absolute confidence difference and
|
762
|
+
difference of the confidence quotient to one on the other.</p>
|
763
|
+
|
764
|
+
<p>The sixth to eighth graphs show the normalized chi<sup>2</sup> measure
|
765
|
+
for an antecedent support of 0.2, 0.3, and 0.4. The valleys are very
|
766
|
+
similar to those for the information difference to the prior confidence,
|
767
|
+
they only have slightly steeper flanks, especially for low antecedent
|
768
|
+
support. So in practice there is no big difference between the
|
769
|
+
information difference and the normalized chi<sup>2</sup> measure.</p>
|
770
|
+
|
771
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
772
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
773
|
+
<td width=5></td>
|
774
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
775
|
+
</table>
|
776
|
+
|
777
|
+
<!-- =============================================================== -->
|
778
|
+
|
779
|
+
<h4><a name="appear">Item Appearances</a></h4>
|
780
|
+
|
781
|
+
<p>My apriori program provides a simple way to restrict the rules to
|
782
|
+
generate w.r.t. the items that shall appear in them. It accepts a third
|
783
|
+
optional input file, in which item appearances can be given. For each
|
784
|
+
item it can be stated whether it may appear in the body (antecedent)
|
785
|
+
of a rule, in the head (consequent), or in both. A description of the
|
786
|
+
format of this additional input file, including examples, can be found
|
787
|
+
<a href="#appearin">here</a>. If no item appearances file is given, the
|
788
|
+
rule selection is not restricted. (I am grateful to the people at
|
789
|
+
Integral Solutions Ltd., who developed the well-known data mining tool
|
790
|
+
<a href="http://www.spss.com/Clementine/">Clementine</a>, but are now
|
791
|
+
part of <a href="http://www.spss.com">SPSS</a>, for requesting the
|
792
|
+
possibility to restrict item appearances.)</p>
|
793
|
+
|
794
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
795
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
796
|
+
<td width=5></td>
|
797
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
798
|
+
</table>
|
799
|
+
|
800
|
+
<!-- =============================================================== -->
|
801
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
802
|
+
|
803
|
+
<h3><a name="select">Extended Item Set Selection</a></h3>
|
804
|
+
|
805
|
+
<p>Since version 4.20 there are extended selection possibilities for
|
806
|
+
frequent item sets, too. (These were added due to a cooperation with
|
807
|
+
Sonja Gruen, FU Berlin.)</p>
|
808
|
+
|
809
|
+
<!-- =============================================================== -->
|
810
|
+
|
811
|
+
<h4><a name="logquot">Binary Logarithm of Support Quotient</a></h4>
|
812
|
+
|
813
|
+
<p>An expected value for the support of an item set is computed from
|
814
|
+
the support values of the individual items, assuming independence.
|
815
|
+
Then the binary logarithm of the quotient of actual support and
|
816
|
+
expected support is computed. A minimum value for this measure can
|
817
|
+
be set with the option <tt>-d</tt>. In this case only frequent item
|
818
|
+
sets for which this measure exceeds the given threshold are kept.</p>
|
819
|
+
|
820
|
+
<!-- =============================================================== -->
|
821
|
+
|
822
|
+
<h4><a name="suppquot">Difference of Support Quotient to 1</a></h4>
|
823
|
+
|
824
|
+
<p>As with the preceding measure the quotient of actual and expected
|
825
|
+
support of an item set is computed and compared to 1 (a value of 1
|
826
|
+
signifies independence of the items). A minimum value for this measure
|
827
|
+
can be set with the option <tt>-d</tt>. In this case only frequent item
|
828
|
+
sets for which this measure exceeds the given threshold are kept.</p>
|
829
|
+
|
830
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
831
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
832
|
+
<td width=5></td>
|
833
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
834
|
+
</table>
|
835
|
+
|
836
|
+
<!-- =============================================================== -->
|
837
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
838
|
+
|
839
|
+
<h3><a name="tatree">Transaction Prefix Tree</a></h3>
|
840
|
+
|
841
|
+
<p>The counting process can be sped up by organizing the transactions
|
842
|
+
into a prefix tree. That is, the items in each transaction are sorted
|
843
|
+
and then transactions with the same prefix are grouped together and
|
844
|
+
are counted, as one may say, in parallel. This way of organizing the
|
845
|
+
transactions was added in version 4.03 and is the default behavior now.
|
846
|
+
If you prefer that the transactions are treated individually (i.e., the
|
847
|
+
transactions are stored in a simple list and only one transaction is
|
848
|
+
counted at a time), use the option <tt>-h</tt>.</p>
|
849
|
+
|
850
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
851
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
852
|
+
<td width=5></td>
|
853
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
854
|
+
</table>
|
855
|
+
|
856
|
+
<!-- =============================================================== -->
|
857
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
858
|
+
|
859
|
+
<h3><a name="options">Program Invocation and Options</a></h3>
|
860
|
+
|
861
|
+
<p>My apriori program is invoked as follows:</p>
|
862
|
+
<p><tt>apriori [options] infile outfile [appfile]</tt></p>
|
863
|
+
<p>The normal arguments are:</p>
|
864
|
+
<table border=0 cellpadding=0 cellspacing=0>
|
865
|
+
<tr><td>infile</td><td width=10></td>
|
866
|
+
<td>file to read transactions from</td></tr>
|
867
|
+
<tr><td>outfile</td><td></td>
|
868
|
+
<td>file to write association rules / hyperedges to</td></tr>
|
869
|
+
<tr><td>appfile</td><td></td>
|
870
|
+
<td>file stating item appearances (optional)</td></tr>
|
871
|
+
</table>
|
872
|
+
<p>The possible options are:</p>
|
873
|
+
<table border=0 cellpadding=0 cellspacing=0>
|
874
|
+
<tr><td><tt>-t#</tt></td><td width=10></td>
|
875
|
+
<td>target type (default: association rules)</td></tr>
|
876
|
+
<tr><td><tt></tt></td><td width=10></td>
|
877
|
+
<td>(s: itemsets, c: closed itemsets, m: maximal itemsets,<br>
|
878
|
+
<font color="white">(</font>r: association rules,
|
879
|
+
h: association hyperedges)</td></tr>
|
880
|
+
<tr><td><tt>-m#</tt></td><td></td>
|
881
|
+
<td>minimal number of items per set/rule/hyperedge
|
882
|
+
(default: 1)</td></tr>
|
883
|
+
<tr><td><tt>-n#</tt></td><td></td>
|
884
|
+
<td>maximal number of items per set/rule/hyperedge
|
885
|
+
(default: 5)</td></tr>
|
886
|
+
<tr><td><tt>-s#</tt></td><td></td>
|
887
|
+
<td>minimal support of a set/rule/hyperedge
|
888
|
+
(default: 10%)</td></tr>
|
889
|
+
<tr><td><tt>-S#</tt></td><td></td>
|
890
|
+
<td>maximal support of a set/rule/hyperedge
|
891
|
+
(default: 100%)</td></tr>
|
892
|
+
<tr><td><tt>-c#</tt></td><td></td>
|
893
|
+
<td>minimal confidence of a rule/hyperedge
|
894
|
+
(default: 80%)</td></tr>
|
895
|
+
<tr><td><tt>-o</tt></td><td></td>
|
896
|
+
<td>use original definition of the support of a rule
|
897
|
+
(body & head)</td></tr>
|
898
|
+
<tr><td><tt>-k#</tt></td><td></td>
|
899
|
+
<td>item separator for output (default: " ")</td></tr>
|
900
|
+
<tr><td><tt>-p#</tt></td><td></td>
|
901
|
+
<td>output format for support/confidence (default: "%.1f%%")</td></tr>
|
902
|
+
<tr><td><tt>-x</tt></td><td></td>
|
903
|
+
<td>extended support output (print both rule support types)
|
904
|
+
</td></tr>
|
905
|
+
<tr><td><tt>-a</tt></td><td></td>
|
906
|
+
<td>print absolute support (number of transactions)</td></tr>
|
907
|
+
<tr><td><tt>-y</tt></td><td></td>
|
908
|
+
<td>print lift value (confidence divided by prior)</td></tr>
|
909
|
+
<tr><td><tt>-e#</tt></td><td></td>
|
910
|
+
<td>additional rule evaluation measure (default: none)</td></tr>
|
911
|
+
<tr><td><tt>-!</tt></td><td></td>
|
912
|
+
<td>print a list of additional rule evaluation measures</td></tr>
|
913
|
+
<tr><td><tt>-d#</tt></td><td></td>
|
914
|
+
<td>minimal value of additional evaluation measure
|
915
|
+
(default: 10%)</td></tr>
|
916
|
+
<tr><td><tt>-v</tt></td><td></td>
|
917
|
+
<td>print value of additional rule evaluation measure</td></tr>
|
918
|
+
<tr><td><tt>-g</tt></td><td></td>
|
919
|
+
<td>write output in scannable form
|
920
|
+
(quote certain characters)</td></tr>
|
921
|
+
<tr><td><tt>-l</tt></td><td></td>
|
922
|
+
<td>do not load transactions into memory
|
923
|
+
(work on input file)</td></tr>
|
924
|
+
<tr><td><tt>-q#</tt></td><td></td>
|
925
|
+
<td>sort items w.r.t. their frequency (default: 1)</td></tr>
|
926
|
+
<tr><td><tt></tt></td><td></td>
|
927
|
+
<td>(1: ascending, -1: descending, 0: do not sort,</td></tr>
|
928
|
+
<tr><td><tt></tt></td><td></td>
|
929
|
+
<td><font color="white">(</font>2: ascending, -2: descending
|
930
|
+
w.r.t. transaction size sum)</td></tr>
|
931
|
+
<tr><td><tt>-u#</tt></td><td></td>
|
932
|
+
<td>filter unused items from transactions (default: 0.5)</td></tr>
|
933
|
+
<tr><td><tt></tt></td><td></td>
|
934
|
+
<td>(0: do not filter items w.r.t. usage in item sets,<br>
|
935
|
+
<0: fraction of removed items for filtering,<br>
|
936
|
+
>0: take execution times ratio into account)</td></tr>
|
937
|
+
<tr><td><tt>-h</tt></td><td></td>
|
938
|
+
<td>do not organize transactions as a prefix tree</td></tr>
|
939
|
+
<tr><td><tt>-j</tt></td><td></td>
|
940
|
+
<td>use quicksort to sort the transactions (default: heapsort)
|
941
|
+
</td></tr>
|
942
|
+
<tr><td><tt>-z</tt></td><td></td>
|
943
|
+
<td>minimize memory usage (default: maximize speed)</td></tr>
|
944
|
+
<tr><td><tt>-i#</tt></td><td></td>
|
945
|
+
<td>ignore records starting with characters in the given
|
946
|
+
string</td></tr>
|
947
|
+
<tr><td valign="top"><tt>-b/f/r#</tt></td><td></td>
|
948
|
+
<td>blank characters, field and record separators</td></tr>
|
949
|
+
<tr><td><tt></tt></td><td></td>
|
950
|
+
<td>(default: "<tt> \t\r</tt>", "<tt> \t</tt>", "<tt>\n</tt>")
|
951
|
+
</td></tr>
|
952
|
+
</table>
|
953
|
+
<p>(<tt>#</tt> always means a number, a letter, or a string that
|
954
|
+
specifies the parameter of the option.)</p>
|
955
|
+
<p>Note that the effect of the option <tt>-z</tt> can depend heavily
|
956
|
+
on how the items are sorted (option <tt>-q</tt>). Highest savings
|
957
|
+
in memory usually result if items are sorted with descending
|
958
|
+
frequency (<tt>-q-1</tt>). However, this often worsens the
|
959
|
+
processing time considerably.</p>
|
960
|
+
<p>A note on the option <tt>-j</tt>: Constructing the prefix tree for
|
961
|
+
the transactions requires sorting the transactions. Since version
|
962
|
+
4.17 heap sort is the default sorting method for the transactions,
|
963
|
+
because it turned out that in conjunction with the item sorting
|
964
|
+
(and especially for artificial datasets like T10I4D100K) quicksort
|
965
|
+
can lead to very bad processing times (almost worst case behavior,
|
966
|
+
i.e., O(n<sup>2</sup>) run time for the sorting). However, sometimes
|
967
|
+
this is not a problem and then quicksort is slightly faster, which
|
968
|
+
can be activated with the option <tt>-j</tt>.</p>
|
969
|
+
|
970
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
971
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
972
|
+
<td width=5></td>
|
973
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
974
|
+
</table>
|
975
|
+
|
976
|
+
<!-- =============================================================== -->
|
977
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
978
|
+
|
979
|
+
<h3><a name="input">Input Format</a></h3>
|
980
|
+
|
981
|
+
<h4><a name="transin">Format of the Transactions File</a></h4>
|
982
|
+
|
983
|
+
<p>A text file structured by field and record separators and blanks.
|
984
|
+
Record separators, not surprisingly, separate records, usually lines,
|
985
|
+
field separators fields (or columns), usually words. Blanks are used
|
986
|
+
to fill fields (columns), e.g. to align them. In the transactions
|
987
|
+
file each record must contain one transaction, i.e. a list of item
|
988
|
+
identifiers, which are separated by field separators. An empty record
|
989
|
+
is interpreted as an empty transaction.</p>
|
990
|
+
|
991
|
+
<p>Examples can be found in the directory <tt>apriori/ex</tt> in the
|
992
|
+
source package. Refer to the file <tt>apriori/ex/readme</tt>, which
|
993
|
+
explains how to process the different example files in the directory
|
994
|
+
<tt>apriori/ex</tt> in the source package.</p>
|
995
|
+
|
996
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
997
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
998
|
+
<td width=5></td>
|
999
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1000
|
+
</table>
|
1001
|
+
|
1002
|
+
<!-- =============================================================== -->
|
1003
|
+
|
1004
|
+
<h4><a name="appearin">Format of the Item Appearances File</a></h4>
|
1005
|
+
|
1006
|
+
<p>A text file structured by field and record separators and blanks.
|
1007
|
+
(Note: For this file the same field and record separators and blanks
|
1008
|
+
are used as for the transactions file.)</p>
|
1009
|
+
|
1010
|
+
<p>The first record, which must have one field, contains the default
|
1011
|
+
appearance to be used with all items not mentioned in the appearances
|
1012
|
+
file. Other records state the appearance of specific items. The first
|
1013
|
+
field states the item, the second the appearance indicator. If no
|
1014
|
+
appearance indicator is given, the item will be ignored (i.e. may
|
1015
|
+
appear neither in the body (antecedent) nor in the head (consequent)
|
1016
|
+
of a rule). Empty records are ignored.</p>
|
1017
|
+
|
1018
|
+
<p>The following appearance indicators are recognized:</p>
|
1019
|
+
<ul type=circle>
|
1020
|
+
<li>item may appear only in rule bodies (antecedents):<br>
|
1021
|
+
<tt>i in b body a ante antecedent</tt></li>
|
1022
|
+
<li>item may appear only in rule heads (consequents):<br>
|
1023
|
+
<tt>o out h head c cons consequent</tt></li>
|
1024
|
+
<li>item may appear in rule bodies (antecedents)
|
1025
|
+
or in rule heads (consequents):<br>
|
1026
|
+
<tt>io inout bh b&amp;h ac a&amp;c both</tt></li>
|
1027
|
+
<li>item may appear neither in rule bodies (antecedents)
|
1028
|
+
nor in rule heads (consequents):<br>
|
1029
|
+
<tt>n neither none ign ignore -</tt></li>
|
1030
|
+
</ul>
|
1031
|
+
|
1032
|
+
<p><b>Example 1:</b>
|
1033
|
+
Generate only rules with item "x" in the consequent.</p>
|
1034
|
+
<p><tt>in<br>
|
1035
|
+
x out</tt></p>
|
1036
|
+
|
1037
|
+
<p><b>Example 2:</b>
|
1038
|
+
Item "x" may appear only in a rule head (consequent),
|
1039
|
+
item "y" only in a rule body (antecedent);
|
1040
|
+
appearance of all other items is not restricted.</p>
|
1041
|
+
<p><tt>both<br>
|
1042
|
+
x head<br>
|
1043
|
+
y body</tt></p>
|
1044
|
+
|
1045
|
+
<p>Providing no item appearances file is equivalent to an item
|
1046
|
+
appearances file containing only an indicator like "both", which
|
1047
|
+
does not restrict the appearance of any items.</p>
|
1048
|
+
|
1049
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1050
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1051
|
+
<td width=5></td>
|
1052
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1053
|
+
</table>
|
1054
|
+
|
1055
|
+
<!-- =============================================================== -->
|
1056
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
1057
|
+
|
1058
|
+
<h3><a name="output">Output Format</a></h3>
|
1059
|
+
|
1060
|
+
<h4><a name="ruleout">Output Format for Association Rules</a></h4>
|
1061
|
+
|
1062
|
+
<p>Each line of the output file contains one association rule in the
|
1063
|
+
format</p>
|
1064
|
+
<p><tt>c &lt;- a b ... (x%, y%)</tt></p>
|
1065
|
+
<p>where a, b, and c are item identifiers, and</p>
|
1066
|
+
|
1067
|
+
<table border=0 cellpadding=0 cellspacing=0>
|
1068
|
+
<tr><td valign=top>x</td><td width=10></td>
|
1069
|
+
<td>the percentage of transactions that contain all items appearing
|
1070
|
+
in the rule body (antecedent), that is, in the example above,
|
1071
|
+
a and b. (support of the rule, i.e., the support in my
|
1072
|
+
interpretation)</td></tr>
|
1073
|
+
<tr><td valign=top>y</td><td></td>
|
1074
|
+
<td>the confidence of the rule, which is computed as the quotient of
|
1075
|
+
the percentage of transactions that contain all items appearing in
|
1076
|
+
the rule body (antecedent) and the rule head (consequent) - that is,
|
1077
|
+
in the example above, a, b, and c - and the above percentage x.</td>
|
1078
|
+
</tr>
|
1079
|
+
</table>
|
1080
|
+
|
1081
|
+
<p>If the option -o is used, x is replaced by the rule support in the
|
1082
|
+
original definition (i.e., the one used by [Agrawal et al. 1993]),
|
1083
|
+
namely the percentage of transactions that contain all items appearing
|
1084
|
+
in the rule body (antecedent) and the rule head (consequent), that is, in
|
1085
|
+
the example above, a, b, and c. The value of y, however, is still
|
1086
|
+
computed from the value of x as described above.</p>
|
1087
|
+
|
1088
|
+
<p>If the option -x is given, both types of rule support (support of
|
1089
|
+
all items in the rule and support of the items in the body/antecedent
|
1090
|
+
of the rule) will be printed. The confidence of a rule (see above) is
|
1091
|
+
the quotient of the two support values (* 100%), i.e., a rule will
|
1092
|
+
be printed as</p>
|
1093
|
+
<p><tt>c &lt;- a b ... (x<sub>1</sub>%, x<sub>2</sub>%, y%)</tt></p>
|
1094
|
+
<p>where x<sub>1</sub> is the support of the set of all items in the
|
1095
|
+
rule, x<sub>2</sub> is the support of the set of items in the body
|
1096
|
+
(antecedent) of the rule, and y = x<sub>1</sub>/x<sub>2</sub> * 100%
|
1097
|
+
is the confidence of the rule.</p>
|
1098
|
+
|
1099
|
+
<p>If the option -a is given, the support percentage x is supplemented
|
1100
|
+
by the absolute number of transactions underlying it:</p>
|
1101
|
+
<p><tt>c &lt;- a b ... (x%/s, y%)</tt></p>
|
1102
|
+
<p>where s is the absolute number of transactions. If the option -x is
|
1103
|
+
given, the absolute support is printed for both types of rule support.
|
1104
|
+
</p>
|
1105
|
+
|
1106
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1107
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1108
|
+
<td width=5></td>
|
1109
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1110
|
+
</table>
|
1111
|
+
|
1112
|
+
<!-- =============================================================== -->
|
1113
|
+
|
1114
|
+
<h4><a name="setout">Output Format for Frequent Item Sets</a></h4>
|
1115
|
+
|
1116
|
+
<p>Each line of the output file contains one item set in the format</p>
|
1117
|
+
<p><tt>a b c ... (x%)</tt></p>
|
1118
|
+
<p>where a, b, and c are item identifiers and x is the percentage of
|
1119
|
+
transactions that contain this item set (item set support).</p>
|
1120
|
+
|
1121
|
+
<p>If the option -a is given, this percentage is supplemented by the
|
1122
|
+
absolute number of transactions underlying it:</p>
|
1123
|
+
<p><tt>a b c ... (x%/s)</tt></p>
|
1124
|
+
<p>where s is the absolute number of transactions.</p>
|
1125
|
+
|
1126
|
+
<p>If the option -x is given, the percentage of transactions that are
|
1127
|
+
identical to the item set is printed, too (whereas the normal support
|
1128
|
+
is the percentage of transactions that are a superset of the item set):
|
1129
|
+
</p>
|
1130
|
+
<p><tt>a b c ... (x%, y%)</tt></p>
|
1131
|
+
<p>where x is the normal item set support and y is the percentage of
|
1132
|
+
transactions identical to the item set. (This output option was added
|
1133
|
+
in response to a request by Laura Maruster.) If the option -a is also
|
1134
|
+
given, both percentages are supplemented by the absolute number of
|
1135
|
+
transactions underlying these percentages.</p>
|
1136
|
+
|
1137
|
+
<p>Note that for frequent item sets the option -x cannot be combined
|
1138
|
+
with the option -y. That is, in order to compute the second support
|
1139
|
+
measure for item sets, the transactions have to be loaded into memory.
|
1140
|
+
</p>
|
1141
|
+
|
1142
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1143
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1144
|
+
<td width=5></td>
|
1145
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1146
|
+
</table>
|
1147
|
+
|
1148
|
+
<!-- =============================================================== -->
|
1149
|
+
|
1150
|
+
<h4><a name="edgeout">Output Format for Association Hyperedges</a></h4>
|
1151
|
+
|
1152
|
+
<p>Each line of the output file contains one hyperedge in the format</p>
|
1153
|
+
<p><tt>a b c ... (x%, y%)</tt></p>
|
1154
|
+
<p>where a, b, and c are item identifiers, and</p>
|
1155
|
+
|
1156
|
+
<table border=0 cellpadding=0 cellspacing=0>
|
1157
|
+
<tr><td valign=top>x</td><td width=10></td>
|
1158
|
+
<td>the percentage of transactions that contain all items appearing
|
1159
|
+
in the hyperedge, that is, in the example above, a, b, and c.</td>
|
1160
|
+
</tr>
|
1161
|
+
<tr><td valign=top>y</td><td></td>
|
1162
|
+
<td>the average confidence of all rules that can be formed using
|
1163
|
+
the items in the hyperedge with all items appearing in the rule
|
1164
|
+
(see above), i.e., for the example above, the average confidence
|
1165
|
+
of the rules c &lt;- a b, b &lt;- a c, and a &lt;- b c.</td></tr>
|
1166
|
+
</table>
|
1167
|
+
|
1168
|
+
<p>If the option -a is given, the support percentage x is supplemented
|
1169
|
+
by the absolute number of transactions underlying it:</p>
|
1170
|
+
<p><tt>a b c ... (x%/s, y%)</tt></p>
|
1171
|
+
<p>where s is the absolute number of transactions.</p>
|
1172
|
+
|
1173
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1174
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1175
|
+
<td width=5></td>
|
1176
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1177
|
+
</table>
|
1178
|
+
|
1179
|
+
<!-- =============================================================== -->
|
1180
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
1181
|
+
|
1182
|
+
<h3><a name="compopt">Compilation Options</a></h3>
|
1183
|
+
|
1184
|
+
<p>The program can be compiled with two additional compilation options
|
1185
|
+
(see <tt>makefile</tt>), namely <tt>-DBENCH</tt> and <tt>-DARCH64</tt>.
|
1186
|
+
</p>
|
1187
|
+
|
1188
|
+
<p>Compiling the program with <tt>-DBENCH</tt> produces a version that
|
1189
|
+
prints some benchmark information on termination, in particular about
|
1190
|
+
the memory used during the item set tree construction (number of nodes,
|
1191
|
+
counters, necessary counters, child pointers, necessary child pointers).
|
1192
|
+
Collecting the memory usage information slightly, but negligibly
|
1193
|
+
increases the execution time.</p>
|
1194
|
+
|
1195
|
+
<p>Compiling the program with <tt>-DARCH64</tt> produces a version for
|
1196
|
+
64 bit machines (architecture model: pointers are 64 bits, integers are
|
1197
|
+
32 bits wide), by removing some alignment issues in the transaction and
|
1198
|
+
item set tree representations, which would otherwise lead to bus errors.
|
1199
|
+
These adaptations slightly, but negligibly increase memory consumption.
|
1200
|
+
(I am grateful to Anthony Casaletto, SPSS Inc., for helping me a lot to
|
1201
|
+
identify these alignment problems, by compiling and testing the program
|
1202
|
+
on a 64 bit machine, since I do not have access to one.)</p>
|
1203
|
+
|
1204
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1205
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1206
|
+
<td width=5></td>
|
1207
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1208
|
+
</table>
|
1209
|
+
|
1210
|
+
<!-- =============================================================== -->
|
1211
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
1212
|
+
|
1213
|
+
<h3><a name="copying">Copying</a></h3>
|
1214
|
+
|
1215
|
+
<p>apriori -
|
1216
|
+
find association rules/hyperedges with apriori algorithm<br>
|
1217
|
+
copyright © 1996-2003 Christian Borgelt</p>
|
1218
|
+
|
1219
|
+
<p>This program is free software; you can redistribute it and/or
|
1220
|
+
modify it under the terms of the
|
1221
|
+
<a href="http://www.fsf.org/copyleft/lesser.html">
|
1222
|
+
GNU Lesser (Library) General Public License</a> as published by the
|
1223
|
+
<a href="http://www.fsf.org">Free Software Foundation</a>.</p>
|
1224
|
+
|
1225
|
+
<p>This program is distributed in the hope that it will be useful,
|
1226
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
1227
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
1228
|
+
<a href="http://www.fsf.org/copyleft/lesser.html">
|
1229
|
+
GNU Lesser (Library) General Public License</a> for more details.</p>
|
1230
|
+
|
1231
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1232
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1233
|
+
<td width=5></td>
|
1234
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1235
|
+
</table>
|
1236
|
+
|
1237
|
+
<!-- =============================================================== -->
|
1238
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
1239
|
+
|
1240
|
+
<h3><a name="download">Download</a></h3>
|
1241
|
+
|
1242
|
+
<p><a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/apriori.html">
|
1243
|
+
Download page</a> with most recent version.</p>
|
1244
|
+
|
1245
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1246
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1247
|
+
<td width=5></td>
|
1248
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1249
|
+
</table>
|
1250
|
+
|
1251
|
+
<!-- =============================================================== -->
|
1252
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
1253
|
+
|
1254
|
+
<h3><a name="contact">Contact</a></h3>
|
1255
|
+
|
1256
|
+
<table border=0 cellpadding=0 cellspacing=0>
|
1257
|
+
<tr><td valign=top>Snail mail:</td><td width=10></td>
|
1258
|
+
<td><a href="http://fuzzy.cs.uni-magdeburg.de/~borgelt/index.html">
|
1259
|
+
Christian Borgelt</a><br>
|
1260
|
+
<a href="http://fuzzy.cs.uni-magdeburg.de/index.html">
|
1261
|
+
Working Group Neural Networks and Fuzzy Systems</a><br>
|
1262
|
+
<a href="http://www-iws.cs.uni-magdeburg.de/iws.html">
|
1263
|
+
Department of Knowledge Processing and Language Engineering</a><br>
|
1264
|
+
<a href="http://www.cs.uni-magdeburg.de/">
|
1265
|
+
School of Computer Science</a><br>
|
1266
|
+
<a href="http://www.uni-magdeburg.de/">
|
1267
|
+
Otto-von-Guericke-University of Magdeburg</a><br>
|
1268
|
+
Universitätsplatz 2<br>
|
1269
|
+
D-39106 Magdeburg<br>
|
1270
|
+
Germany</td></tr>
|
1271
|
+
<tr><td valign=top>E-mail:</td><td></td>
|
1272
|
+
<td><a href="mailto:christian.borgelt@cs.uni-magdeburg.de">
|
1273
|
+
christian.borgelt@cs.uni-magdeburg.de</a><br>
|
1274
|
+
<a href="mailto:borgelt@iws.cs.uni-magdeburg.de">
|
1275
|
+
borgelt@iws.cs.uni-magdeburg.de</a></td></tr>
|
1276
|
+
<tr><td>Phone:</td><td></td>
|
1277
|
+
<td>+49 391 67 12700</td></tr>
|
1278
|
+
<tr><td>Fax:</td><td></td>
|
1279
|
+
<td>+49 391 67 12018</td></tr>
|
1280
|
+
<tr><td>Office:</td><td></td>
|
1281
|
+
<td>29.015</td></tr>
|
1282
|
+
</table>
|
1283
|
+
|
1284
|
+
<table width="100%" border=0 cellpadding=0 cellspacing=0>
|
1285
|
+
<tr><td width="95%" align=right><a href="#top">back to the top</a></td>
|
1286
|
+
<td width=5></td>
|
1287
|
+
<td><a href="#top"><img src="uparrow.gif" border=0></a></td></tr>
|
1288
|
+
</table>
|
1289
|
+
|
1290
|
+
<!-- =============================================================== -->
|
1291
|
+
<p><img src="line.gif" alt="" height=7 width=704></p>
|
1292
|
+
|
1293
|
+
<address>© 2002-2004
|
1294
|
+
<a href="mailto:borgelt@iws.cs.uni-magdeburg.de">Christian Borgelt</a>
|
1295
|
+
</address>
|
1296
|
+
<!-- Created: Thu May 24 12:28:05 CEST 2001 -->
|
1297
|
+
<!-- hhmts start -->
|
1298
|
+
Last modified: Tue Nov 23 13:49:10 CET 2004
|
1299
|
+
<!-- hhmts end -->
|
1300
|
+
</body>
|
1301
|
+
</html>
|