fp-growth 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +5 -0
- data/examples/cao.rb +207 -0
- data/examples/small_ones.rb +56 -0
- data/lib/fp_growth.rb +9 -0
- data/lib/fp_growth/fp_tree.rb +220 -0
- data/lib/fp_growth/fp_tree_node.rb +81 -0
- data/lib/fp_growth/header_list.rb +51 -0
- data/lib/fp_growth/header_list_node.rb +44 -0
- data/lib/fp_growth/inflections.rb +83 -0
- data/lib/fp_growth/prefix_path.rb +41 -0
- data/tests/profile_fp_tree.rb +24 -0
- data/tests/tc_fp_tree.rb +55 -0
- data/tests/ts_all.rb +14 -0
- metadata +76 -0
data/README
ADDED
data/examples/cao.rb
ADDED
@@ -0,0 +1,207 @@
|
|
1
|
+
begin
|
2
|
+
require 'rubygems'
|
3
|
+
require 'fp_growth'
|
4
|
+
rescue LoadError
|
5
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
require 'fp_growth'
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'mysql'
|
10
|
+
require 'singleton'
|
11
|
+
|
12
|
+
HOST =
|
13
|
+
USER =
|
14
|
+
PASSWD =
|
15
|
+
DB =
|
16
|
+
|
17
|
+
# This example shows the usage of fp-growth with a database from an
# inventory management system called cao_faktura (http://www.cao-faktura.de/)
#
# CaoDb is a Singleton wrapping one MySQL connection (opened with the
# HOST/USER/PASSWD/DB constants). It turns the JOURNALPOS table into the
# item lists and transactions an FpGrowth::FpTree is built from.
class CaoDb

  include Singleton

  def initialize
    connect
  end

  # Returns an FpTree over article ids; the item counts come from SQL.
  def article_tree(min_support)
    FpGrowth::FpTree.new(min_support, article_ids(min_support), article_transactions)
  end

  # Returns an FpTree over category ids; the item counts are derived from
  # the transactions themselves.
  def category_tree(min_support)
    transactions = category_transactions
    FpGrowth::FpTree.new(min_support, FpGrowth::FpTree.get_items(transactions), transactions)
  end

  # Returns KURZNAME of the article with the given REC_ID, or nil if no row
  # matches.
  # NOTE(review): id is interpolated into the SQL unescaped; callers in this
  # file only pass ids that were read from the database.
  def article_name(id)
    first_value(%Q!
      SELECT KURZNAME
      FROM ARTIKEL
      WHERE REC_ID = #{id}
    !)
  end

  # Returns NAME of the category with the given ID, or nil if no row matches.
  def category_name(id)
    first_value(%Q!
      SELECT NAME
      FROM WARENGRUPPEN
      WHERE ID = #{id}
    !)
  end

  # Returns [[article_id, count], ...] for articles that appear in at least
  # min_support journals, ordered by count descending.
  # The SQL comments (German) say: a receipt may hold several positions of
  # the same article but must be counted once; ordering by two columns makes
  # the sort order unambiguous.
  def article_ids(min_support)
    item_counts(%Q!
      SELECT ARTIKEL_ID,
      # ein Bon kann mehrere journalpos mit ident. Artikel enthalten
      # und soll trotzdem nur einmal gezählt werden
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY ARTIKEL_ID
      HAVING anzahl >= #{min_support}
      # mit zwei spalten ist Sortierung eindeutig
      ORDER BY anzahl DESC, ARTIKEL_ID DESC
    !)
  end

  # Returns [[category_id, count], ...] for categories that appear in at
  # least min_support journals, ordered by count descending.
  def category_ids(min_support)
    # Grouping has to use the selected column; grouping by ARTIKEL_ID
    # produced per-article counts labelled with a category id.
    item_counts(%Q!
      SELECT WARENGRUPPE,
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY WARENGRUPPE
      HAVING anzahl >= #{min_support}
      ORDER BY anzahl DESC, WARENGRUPPE DESC
    !)
  end

  # Returns one array of article ids per journal.
  def article_transactions
    item_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(ARTIKEL_ID SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns one array of category ids per journal.
  def category_transactions
    item_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(WARENGRUPPE SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns the number of rows in JOURNAL.
  def count_transactions
    first_value(%q!
      SELECT COUNT(*) FROM JOURNAL
    !).to_i
  end

  # Mines the article tree and prints the resulting association rules.
  # FpTree has no create_assoziation_rules method, so the rules are built
  # through FpGrowth::Helper from the mined frequent patterns.
  def do_articles(min_support, min_confidence)
    frequent_patterns = article_tree(min_support).fp_growth
    print_article_assoziation_rules(
      FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
    )
  end

  private

  def connect
    @connection = Mysql::new(HOST, USER, PASSWD, DB)
    @connection.query('SET NAMES "UTF8"')
  end

  # Runs sql and returns the first column of the first row (nil if there is
  # no row). The result set is always freed.
  def first_value(sql)
    data = @connection.query(sql)
    row = data.fetch_row
    row && row[0]
  ensure
    data.free if data
  end

  # Runs sql and returns every row as [first column as int, second column as int].
  def item_counts(sql)
    data = @connection.query(sql)
    result = []
    data.each do |row|
      result << [ row[0].to_i, row[1].to_i ]
    end
    result
  ensure
    data.free if data
  end

  # Runs sql and splits the comma separated second column of every row into
  # an array of integers.
  def item_lists(sql)
    data = @connection.query(sql)
    result = []
    data.each do |row|
      result << row[1].split(',').collect { |id| id.to_i }
    end
    result
  ensure
    data.free if data
  end

end
|
141
|
+
|
142
|
+
# A rule is unnecessary when some rule in `all` has the same right side, a
# left side for which is_superset_of?(wanted's left side) holds, and a
# confidence within +/- beta of the wanted rule's confidence.
# Returns true or false.
def is_unnecessary_rule?(all, wanted, beta)
  lower = wanted[:confidence] - beta
  upper = wanted[:confidence] + beta
  all.any? do |candidate|
    candidate[:right] == wanted[:right] &&
      candidate[:left].is_superset_of?(wanted[:left]) &&
      candidate[:confidence].between?(lower, upper)
  end
end
|
151
|
+
|
152
|
+
# Returns the rules of `all` for which is_unnecessary_rule? (checked against
# the complete list, with tolerance beta) is not true.
def remove_unnecessary_rules(all, beta)
  all.reject { |rule| is_unnecessary_rule?(all, rule, beta) }
end
|
155
|
+
|
156
|
+
# Prints every rule of the given array via print_article_assoziation_rule.
def print_article_assoziation_rules(array)
  array.each(&method(:print_article_assoziation_rule))
end
|
159
|
+
|
160
|
+
# Prints one rule as "[(id:name)...] => [(id:name)...] = s<support> c<confidence>",
# looking up every article name through CaoDb.
def print_article_assoziation_rule(rule)
  label = lambda do |ids|
    text = ''.dup
    ids.each { |id| text << "(#{id}:#{CaoDb.instance.article_name(id)})" }
    text
  end
  left = label.call(rule[:left])
  right = label.call(rule[:right])
  stats = sprintf('s%i c%.2f', rule[:support], rule[:confidence])
  puts "[#{left}] => [#{right}] = #{stats}"
end
|
172
|
+
|
173
|
+
# Prints every rule as "[(id:name)...] => [(id:name)...] = s<support> c<confidence>",
# looking up every category name through CaoDb.
def print_category_assoziation_rules(array)
  label = lambda do |ids|
    text = ''.dup
    ids.each { |id| text << "(#{id}:#{CaoDb.instance.category_name(id)})" }
    text
  end
  array.each do |rule|
    left = label.call(rule[:left])
    right = label.call(rule[:right])
    stats = sprintf('s%i c%.2f', rule[:support], rule[:confidence])
    puts "[#{left}] => [#{right}] = #{stats}"
  end
end
|
187
|
+
|
188
|
+
# Mines the category tree and prints the association rules that reach
# min_confidence.
def do_categories(min_support, min_confidence)
  tree = CaoDb.instance.category_tree(min_support)
  rules = FpGrowth::Helper.create_assoziation_rules(tree.fp_growth, min_confidence)
  print_category_assoziation_rules(rules)
end
|
192
|
+
|
193
|
+
# Mines the article tree, drops rules that remove_unnecessary_rules considers
# redundant and prints the remaining ones.
#
# min_support    - absolute minimum support passed to the tree
# min_confidence - minimum confidence a rule needs to be created
# beta           - confidence tolerance handed to remove_unnecessary_rules
#                  (was hard-coded to 0.2, which stays the default)
def do_articles(min_support, min_confidence, beta = 0.2)
  frequent_patterns = CaoDb.instance.article_tree(min_support).fp_growth
  rules = FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
  print_article_assoziation_rules(remove_unnecessary_rules(rules, beta))
end
|
202
|
+
|
203
|
+
min_support = 9 # abs = (rel * database.count_transactions).round
|
204
|
+
min_confidence = 0.5
|
205
|
+
|
206
|
+
do_articles(min_support, min_confidence)
|
207
|
+
#CaoDb.instance.article_tree(min_support).save
|
@@ -0,0 +1,56 @@
|
|
1
|
+
begin
|
2
|
+
require 'rubygems'
|
3
|
+
require 'fp_growth'
|
4
|
+
rescue LoadError
|
5
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
require 'fp_growth'
|
7
|
+
end
|
8
|
+
|
9
|
+
# paper
# Builds a tree with min_support 3 from five transactions of letter items.
def example_1
  transactions = [
    ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
    ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
    ['b', 'f', 'h', 'j', 'o'],
    ['b', 'c', 'k', 's', 'p'],
    ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
  ]
  # The item list was requested from an undefined local `v`; like the other
  # examples it has to come from FpGrowth::FpTree.get_items.
  FpGrowth::FpTree.new(3, FpGrowth::FpTree.get_items(transactions), transactions)
end
|
19
|
+
|
20
|
+
# book, p. 123
# Builds a tree with min_support 3 from five transactions of integer items.
def example_2
  baskets = []
  baskets << [1, 3, 6, 8, 9]
  baskets << [1, 2, 3, 6, 7, 8]
  baskets << [2, 4, 6]
  baskets << [2, 3, 4, 9]
  baskets << [1, 3, 5, 6, 7, 8, 9]
  items = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(3, items, baskets)
end
|
30
|
+
|
31
|
+
# Builds a tree with min_support 2 from eight transactions of integer items.
def example_3
  baskets = []
  baskets << [1, 3, 4]
  baskets << [4]
  baskets << [1, 2, 4, 5]
  baskets << [1, 6]
  baskets << [1, 2]
  baskets << [1, 6]
  baskets << [1, 4]
  baskets << [1, 2, 4]
  items = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(2, items, baskets)
end
|
44
|
+
|
45
|
+
# thesis adaptive-fp
# Builds a tree with min_support 2 from five transactions of letter items.
def example_4
  baskets = []
  baskets << ['a', 'c', 'd', 'e', 'f']
  baskets << ['a', 'b', 'e']
  baskets << ['c', 'e', 'f']
  baskets << ['a', 'c', 'd', 'f']
  baskets << ['c', 'e', 'f']
  items = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(2, items, baskets)
end
|
55
|
+
|
56
|
+
puts example_4
|
data/lib/fp_growth.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require 'fp_growth/inflections'
|
2
|
+
|
3
|
+
# Namespace of the library. All classes are loaded lazily on first use.
module FpGrowth
  autoload :FpTreeNode, 'fp_growth/fp_tree_node'
  autoload :FpTree, 'fp_growth/fp_tree'
  # Helper is defined in fp_tree.rb; without its own entry
  # FpGrowth::Helper raised NameError until FpTree had been referenced.
  autoload :Helper, 'fp_growth/fp_tree'
  autoload :HeaderListNode, 'fp_growth/header_list_node'
  autoload :HeaderList, 'fp_growth/header_list'
  autoload :PrefixPath, 'fp_growth/prefix_path'
end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
# Stateless helpers used by FpTree#fp_growth and for building association
# rules from mined frequent patterns.
module FpGrowth::Helper

  # Takes an array of tree nodes (objects answering key and count) and
  # returns one PrefixPath per non-empty subset. The support of a subset is
  # the smallest count among its nodes.
  def self.subsets_as_prefix_path(array)
    array.subsets.collect do |combination|
      min_support = combination.collect { |node| node.count }.min
      elements = combination.collect { |node| node.key }
      FpGrowth::PrefixPath.new(min_support, elements)
    end
  end

  # takes arrays of prefix_paths
  # Returns a_el + b_el for every pair, iterating a in the outer loop.
  def self.cross_product(a, b)
    result = []
    a.each do |a_el|
      b.each { |b_el| result << (a_el + b_el) }
    end
    result
  end

  # TODO put handling of assoziation rules in an extra namespace
  # and copy the unnecessary rules code from examples/cao.rb in it too
  #
  # Splits every pattern with at least two items into all (left, right)
  # parts and returns a hash with :left, :right, :support and :confidence
  # for every split whose confidence reaches min_confidence. The confidence
  # is support(pattern) / support(left).
  def self.create_assoziation_rules(frequent_patterns, min_confidence)
    result = []
    frequent_patterns.each do |pattern|
      next if pattern.size < 2
      pattern.to_a.parts.each do |part|
        left, right = part.first, part.last
        other_pattern = frequent_patterns.find { |p| left.content_equal?(p.to_a) }
        # Without the support of the left side no confidence can be
        # computed; skip the split instead of failing on nil.
        next unless other_pattern
        confidence = pattern.support.to_f / other_pattern.support.to_f
        next unless confidence >= min_confidence
        result << {
          :left => left,
          :right => right,
          :support => pattern.support,
          :confidence => confidence
        }
      end
    end
    result
  end

end
|
46
|
+
|
47
|
+
# Frequent-pattern tree: a prefix tree of transactions plus a header list
# that links all tree nodes carrying the same item.
class FpGrowth::FpTree

  attr_reader :min_support, :root
  attr_accessor :header_list
  protected :header_list, :header_list=, :root

  # items is sorted array of [key, count]
  # min_support is not in %, but absolute
  def initialize(min_support, items, transactions = [])
    @min_support = min_support
    @root = FpGrowth::FpTreeNode.new(nil)

    # the count information is lost in this step
    # HeaderList calculates this value again if needed
    @header_list = FpGrowth::HeaderList.new(select_only_frequent_items(items))

    add_transactions(transactions)
  end

  # True when no transaction has been inserted below the root.
  def empty?
    @root.children.empty?
  end

  def add_transactions(transactions)
    transactions.each do |transaction|
      add_transaction(transaction)
    end
  end

  # TODO decide: build header list online or with rebuild_header_list afterwards
  #
  # Inserts one transaction (Array or PrefixPath). Existing nodes along the
  # path get their count raised by the transaction's support; missing nodes
  # are created and registered in the header list.
  def add_transaction(transaction)
    transaction = prepare_transaction(transaction)
    increment = transaction.support
    tree_node = @root
    transaction.each do |key|
      next_node = tree_node.find_child(key)
      if next_node
        next_node.increase_count(increment)
      else
        next_node = FpGrowth::FpTreeNode.new(key, increment)
        tree_node.add_child(next_node)

        @header_list[key] << next_node
      end

      tree_node = next_node
    end
  end

  # Textual dump of the tree followed by the header list.
  def to_s
    result = "Tree (min_s = #{@min_support})\n"
    result << @root.to_s
    result << "\nHeader list\n"
    result << @header_list.to_s
    result
  end

  # arg is new root
  # Returns a new tree holding clones of root_node's descendants.
  def subtree(root_node)
    result = FpGrowth::FpTree.new(@min_support, [])
    root_node.clone_children(result.root)
    result.rebuild_header_list
    result
  end

  # Returns the frequent patterns of the tree as an array of PrefixPath.
  # The tree is split into its single prefix path and the multipath rest;
  # alpha (a PrefixPath or nil) is appended to every resulting pattern.
  def fp_growth(alpha = nil)
    single, multi = split_paths
    single_part = []
    multi_part = []
    if single
      single_part = FpGrowth::Helper.subsets_as_prefix_path(single)
    end
    if multi
      multi.header_list.each do |node|
        immediate_frequent_pattern = node.immediate_frequent_pattern
        multi_part << immediate_frequent_pattern
        new_tree = node.conditional_fp_tree(@min_support)
        if !new_tree.empty?
          multi_part += new_tree.fp_growth(immediate_frequent_pattern)
        end
      end
    end
    combined = single_part + multi_part + FpGrowth::Helper.cross_product(single_part, multi_part)
    FpGrowth::Helper.cross_product(combined, [alpha])
  end

  # slow version used only as reference for testing
  # does not split tree in single- and multipath parts
  def fp_growth_slow(alpha = nil)
    result = []
    header_list.each do |node|
      base_path = node.immediate_frequent_pattern + alpha
      result << base_path
      new_tree = node.conditional_fp_tree(@min_support)
      if !new_tree.empty?
        result += new_tree.fp_growth_slow(base_path)
      end
    end
    result
  end

  # can be used with Array or PrefixPath
  # returns sorted array of [key, count]
  # A transaction that answers support contributes that value per item,
  # any other transaction contributes 1.
  def self.get_items(transactions)
    result = {}

    transactions.each do |transaction|

      increment = transaction.respond_to?(:support) ? transaction.support : 1

      transaction.each do |item|
        result[item] ||= 0
        result[item] += increment
      end
    end
    result.to_a.
      sort { |a, b| b.last <=> a.last }
  end

  # Writes the marshalled tree to filename.
  # Marshal output is binary, so the file is opened in binary mode.
  def save(filename = 'fp_tree.txt')
    File.open(filename, 'wb') do |file|
      file.write Marshal.dump(self)
    end
  end

  # Restores a tree written by save.
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only load
  # files from a trusted source.
  def self.load(filename = 'fp_tree.txt')
    File.open(filename, 'rb') do |file|
      Marshal.load(file.read)
    end
  end

  protected

  # Registers every node below the root in the header list and sorts it.
  def rebuild_header_list
    @root.each do |node|
      @header_list[node.key] << node
    end
    @header_list.sort
  end

  private

  # returnvalue is array of two elements
  # first element is nil or array of tree_nodes which form the single prefix path
  # second element is nil or FpTree which forms the multipath part
  def split_paths
    return [nil, self] if @root.children.size != 1
    single = []
    node = @root
    while node.children.size == 1
      # this leaves out root
      node = node.children.first
      single << node
    end
    multi = node.children.empty? ? nil : subtree(node)
    [single, multi]
  end

  # removes non-frequent-items and sorts remaining frequent-items on stored ordered-items
  # arg can be Array or PrefixPath
  # returns PrefixPath
  def prepare_transaction(transaction)
    support = transaction.respond_to?(:support) ? transaction.support : 1
    # relies on the fact that Array#& keeps order of first array
    FpGrowth::PrefixPath.new(support, @header_list.all_keys & transaction.to_a)
  end

  # takes array of [key,count]
  # keeps order
  def select_only_frequent_items(items)
    items.select { |item| item.last >= @min_support }
  end

end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# knows nothing about header list
#
# One node of the FP-tree: an item key, the accumulated count, a link to the
# parent and the list of children. The root is the node without parent.
class FpGrowth::FpTreeNode

  attr_reader :key, :children, :count
  attr_accessor :parent

  protected :parent=

  def initialize(key, count = 1)
    @key = key
    @count = count
    @parent = nil
    @children = []
  end

  # Appends child and makes this node its parent.
  def add_child(child)
    @children << child
    child.parent = self
  end

  # Returns the direct child carrying key, or nil.
  def find_child(key)
    children.find { |child| child.key == key }
  end

  # prints tree structure
  # Children are joined by a newline followed by 4 * depth spaces.
  def to_s(depth = 0)
    label = key ? "#{@key}:#{count} " : ''
    separator = "\n" + ' ' * 4 * depth
    label + @children.collect { |child| child.to_s(depth + 1) }.join(separator)
  end

  def increase_count(value = 1)
    @count += value
  end

  # Returns a PrefixPath with this node's count holding the keys of all
  # ancestors below the root, nearest ancestor first.
  def prefix_path
    result = FpGrowth::PrefixPath.new(count)
    node = parent
    # `node &&` keeps the call from failing when invoked on the root itself
    while node && !node.is_root?
      result << node.key
      node = node.parent
    end
    result
  end

  # Attaches deep copies of all children to new_parent.
  def clone_children(new_parent)
    @children.each do |child|
      cloned_child = child.clone
      cloned_child.parent = new_parent
      new_parent.children << cloned_child
    end
  end

  # walks recursive through whole tree, without self
  def each(&block)
    children.each do |child|
      block.call(child)
      child.each(&block)
    end
  end

  def is_root?
    !parent
  end

  protected

  # Deep copy: same key and count, cloned descendants, no parent.
  def clone
    result = FpGrowth::FpTreeNode.new(@key, @count)
    clone_children(result)
    result
  end

end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Uses array as storage. Slow lookup but keeps order of items
class FpGrowth::HeaderList

  # takes array of [key,count] or array of keys
  def initialize(items)
    items = items.first.respond_to?(:last) ? get_item_keys(items) : items
    @array = items.collect { |item| FpGrowth::HeaderListNode.new(item) }
  end

  # lookup element via key
  # if key is not found, a new element is added
  def [](key)
    existing_node = @array.find { |item| item.key == key }
    return existing_node if existing_node

    new_node = FpGrowth::HeaderListNode.new(key)
    @array << new_node
    new_node
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  def each_key
    @array.each { |node| yield(node.key) }
  end

  def all_keys
    @array.collect { |node| node.key }
  end

  def to_s
    @array.collect { |node| node.to_s }.join("\n")
  end

  # needed for cloning of fptree
  #
  # Reorders the list by descending count and returns the sorted nodes as a
  # new array. The previous version only returned a sorted copy and left the
  # list itself untouched, although FpTree#rebuild_header_list ignores the
  # return value. Entries with equal count keep their relative order.
  def sort
    keyed = []
    # count is computed once per node; the unique index breaks ties, so the
    # nodes themselves are never compared
    @array.each_with_index { |node, index| keyed << [-node.count, index, node] }
    @array = keyed.sort.collect { |entry| entry.last }
    @array.dup
  end

  private

  # takes array of [key,count]
  # returns array of keys
  def get_item_keys(items)
    items.collect { |item| item.first }
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Header list entry: an item key together with all tree nodes carrying it.
class FpGrowth::HeaderListNode

  attr_reader :key

  def initialize(key)
    @key = key
    @array = []
  end

  # delegate to @array
  def <<(tree_node)
    @array << tree_node
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  # returns array of all prefix paths
  def conditional_pattern_base
    paths = @array.collect { |tree_node| tree_node.prefix_path }
    paths.reject { |path| path.to_a.empty? } # strip empty ones
  end

  # Builds the tree of this item's conditional pattern base.
  def conditional_fp_tree(min_support)
    base = conditional_pattern_base
    items = FpGrowth::FpTree.get_items(base)
    FpGrowth::FpTree.new(min_support, items, base)
  end

  # PrefixPath holding only this key, with the summed count as support.
  def immediate_frequent_pattern
    FpGrowth::PrefixPath.new(count, [key])
  end

  # TODO count could be given at initialize if known
  def count
    total = 0
    @array.each { |tree_node| total += tree_node.count }
    total
  end

  def to_s
    links = @array.collect { |tree_node| "#{tree_node.key}:#{tree_node.count}" }
    "#{key}:#{count} -> " + links.join(' ')
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Set-like helpers added to Array. Elements are compared with
# content_equal? when they respond to it and with == otherwise.
class Array
  #  # slow, but nonrecursive solution
  #  def subsets_slow(include_empty = false, just_real = false)
  #    bits = size
  #    limit = 2 ** bits - 1
  #    result = []
  #    (include_empty ? 0 : 1).upto(just_real ? (limit - 1) : limit) do |count|
  #      mask = sprintf('%01$*2$b', count, bits).split(//).collect { |bit| bit == '1' }
  #
  #      temp = []
  #      mask.each_index do |index|
  #        temp << self[index] if mask[index]
  #      end
  #      result << temp
  #    end
  #    result
  #  end

  # modified from http://branch14.org/snippets/subsets_in_ruby.html
  #
  # Returns all subsets. The empty set is dropped unless include_empty is
  # set, the full set is dropped when just_real is set. ranks restricts the
  # result to subsets of the given size (or sizes, when it answers include?).
  def subsets(include_empty = false, just_real = false, ranks = nil)
    result = all_subsets
    # all_subsets puts the empty set first and the full set last
    result.shift unless include_empty
    result.pop if just_real
    if ranks
      result = result.select do |i|
        if ranks.respond_to?(:include?)
          ranks.include?(i.size)
        else
          ranks == i.size
        end
      end
    end
    result
  end

  # Returns all 2**size subsets, empty set first, full set last.
  def all_subsets
    return [[]] if empty?
    set = clone
    first = set.shift
    sets = set.all_subsets
    sets.concat(sets.collect { |s| [first] + s })
    return sets
  end
  protected :all_subsets

  # TODO faster
  # Truthy when other equals a non-empty proper subset of self.
  def is_superset_of?(other)
    subsets(false, true).find { |subset| other.content_equal?(subset) }
  end

  # TODO faster
  # Truthy when self equals a non-empty subset of other.
  def is_subset_of?(other)
    other.subsets.find { |subset| content_equal?(subset) }
  end

  # TODO faster
  # Truthy when self equals a non-empty proper subset of other.
  def is_real_subset_of?(other)
    other.subsets(false, true).find { |subset| content_equal?(subset) }
  end

  # Returns every split of self as [subset, self - subset].
  def parts(allow_empty = false)
    subsets(allow_empty, !allow_empty).collect do |subset|
      [ subset, self - subset]
    end
  end

  # True when both collections hold the same elements regardless of order.
  # Every element of self is matched against a distinct element of other,
  # so duplicates count: [1, 1, 2] is no longer equal to [1, 2, 3].
  def content_equal?(other)
    return false if other.size != size
    unmatched = other.to_a.dup
    each do |item|
      index = unmatched.index do |other_item|
        item.respond_to?(:content_equal?) ? item.content_equal?(other_item) : item == other_item
      end
      return false unless index
      unmatched.delete_at(index)
    end
    true
  end
end
|
77
|
+
|
78
|
+
#puts [1,2].is_subset_of?([1,2]).inspect
|
79
|
+
#puts [1,2].is_subset_of?([1,2,3]).inspect
|
80
|
+
#puts [1,2].is_real_subset_of?([1,2]).inspect
|
81
|
+
#puts [1,2].is_real_subset_of?([1,2,3]).inspect
|
82
|
+
#puts [1,2,3].is_superset_of?([1,2]).inspect
|
83
|
+
#puts [1,2].is_superset_of?([1,2]).inspect
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# A list of item keys together with a support value. Used both for the
# prefix paths of tree nodes and for mined frequent patterns.
class FpGrowth::PrefixPath

  attr_reader :support

  # The given array is stored as is (not copied) and modified by <<.
  def initialize(support = 1, array = [])
    @array = array
    @support = support
  end

  # delegate
  def each(&block)
    @array.each(&block)
  end

  # delegate
  def <<(arg)
    @array << arg
  end

  #delegate
  def size
    @array.size
  end

  # Returns a new path holding the items of both operands with the smaller
  # of the two supports; returns self when other is nil or false.
  def +(other)
    return self unless other
    FpGrowth::PrefixPath.new([@support, other.support].min, @array + other.to_a)
  end

  def to_s
    "[#{@array}:#{support}]"
  end

  def to_a
    @array
  end

  # Equal when the supports match and both hold the same items in any order.
  # Objects that answer neither support nor to_a (nil, for example) are
  # simply unequal instead of raising NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:support) && other.respond_to?(:to_a)
    (support == other.support) && (to_a.content_equal?(other.to_a))
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
+
require 'fp_growth'
|
3
|
+
|
4
|
+
# Switch to true to profile the small example built below instead of the
# tree stored in the default dump file.
use_sample_data = false

if use_sample_data
  min_support = 3
  transactions = [
    ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
    ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
    ['b', 'f', 'h', 'j', 'o'],
    ['b', 'c', 'k', 's', 'p'],
    ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
  ]
  items = FpGrowth::FpTree.get_items(transactions)
  tree = FpGrowth::FpTree.new(min_support, items, transactions)
else
  tree = FpGrowth::FpTree.load
end

# Runs each algorithm 100 times and prints the elapsed wall-clock time.
[:fp_growth, :fp_growth_slow].each do |algorithm|
  print algorithm.to_s
  started_at = Time.now
  100.times { tree.send(algorithm) }
  elapsed = Time.now - started_at
  puts " took #{elapsed}s"
end
|
data/tests/tc_fp_tree.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Checks that FpTree#fp_growth yields the same frequent patterns as the
# reference implementation FpTree#fp_growth_slow.
class TestFpTree < Test::Unit::TestCase

  # Compares both algorithms on the tree stored in the default dump file.
  def test_file
    assert_both_algorithms_equal(FpGrowth::FpTree.load)
  end

  # fails with ruby-1.8.7, ruby-1.9.1, jruby-1.3.0rc1
  def test_something
    baskets = [
      [1, 2, 4, 3],
      [1, 2, 4],
      [1, 2],
      [1, 3, 4],
      [3]
    ]
    assert_both_algorithms_equal(create_tree(2, baskets))
  end

  private

  # fp_growth_slow is the reference, fp_growth the implementation under test.
  def assert_both_algorithms_equal(tree)
    reference = tree.fp_growth_slow
    candidate = tree.fp_growth

    #    text = "wrong has\n" << pp_hash(generate_difference_hash candidate, reference)
    #    text << "\nright has\n" << pp_hash(generate_difference_hash reference, candidate)
    assert reference.content_equal?(candidate)#, text
  end

  def pp_frequent_patterns(arg)
    arg.collect(&:to_s).join(' ')
  end

  def create_tree(min_support, transactions)
    items = FpGrowth::FpTree.get_items(transactions)
    FpGrowth::FpTree.new(min_support, items, transactions)
  end

  # result holds elements in a but not in b
  # Keys are the sorted item arrays, values the supports.
  def generate_difference_hash(a, b)
    remaining = {}
    a.each { |item| remaining[item.to_a.sort] = item.support }
    b.each do |item|
      signature = item.to_a.sort
      remaining.delete(signature) if remaining[signature] == item.support
    end
    remaining
  end

  def pp_hash(hash)
    lines = hash.collect do |key, value|
      "#{key.collect(&:to_s).join(' ')}:#{value}"
    end
    lines.join("\n")
  end
end
|
54
|
+
|
55
|
+
# what is used to test equality on array - array?
|
data/tests/ts_all.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
|
3
|
+
begin
|
4
|
+
require 'rubygems'
|
5
|
+
require 'fp_growth'
|
6
|
+
rescue LoadError
|
7
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
8
|
+
require 'fp_growth'
|
9
|
+
end
|
10
|
+
|
11
|
+
|
12
|
+
Dir.glob(File.join(File.dirname(__FILE__), 'tc_*.rb')).each do |file|
|
13
|
+
require file
|
14
|
+
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fp-growth
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stefan Achatz
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-07-08 00:00:00 +02:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: |
|
17
|
+
This is an implementation of the fp-growth frequent pattern mining algorithm as
|
18
|
+
stated in the paper
|
19
|
+
|
20
|
+
Mining Frequent Patterns without Candidate Generation: A Frequent-Pattern Tree Approach
|
21
|
+
Han et al, Data Mining and Knowledge Discovery, 8, 53-87, 2004
|
22
|
+
|
23
|
+
email: stefan_achatz@web.de
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files:
|
29
|
+
- README
|
30
|
+
files:
|
31
|
+
- examples/small_ones.rb
|
32
|
+
- examples/cao.rb
|
33
|
+
- lib/fp_growth.rb
|
34
|
+
- lib/fp_growth/fp_tree.rb
|
35
|
+
- lib/fp_growth/fp_tree_node.rb
|
36
|
+
- lib/fp_growth/header_list.rb
|
37
|
+
- lib/fp_growth/inflections.rb
|
38
|
+
- lib/fp_growth/prefix_path.rb
|
39
|
+
- lib/fp_growth/header_list_node.rb
|
40
|
+
- tests/tc_fp_tree.rb
|
41
|
+
- tests/profile_fp_tree.rb
|
42
|
+
- tests/ts_all.rb
|
43
|
+
- README
|
44
|
+
has_rdoc: true
|
45
|
+
homepage: http://rubyforge.org/projects/fp-growth/
|
46
|
+
licenses: []
|
47
|
+
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options:
|
50
|
+
- --main
|
51
|
+
- README
|
52
|
+
- --title
|
53
|
+
- fp-growth Documentation
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
version:
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: fp-growth
|
71
|
+
rubygems_version: 1.3.4
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Implementation of the fp-growth frequent pattern algorithm
|
75
|
+
test_files:
|
76
|
+
- tests/ts_all.rb
|