fp-growth 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +5 -0
- data/examples/cao.rb +207 -0
- data/examples/small_ones.rb +56 -0
- data/lib/fp_growth.rb +9 -0
- data/lib/fp_growth/fp_tree.rb +220 -0
- data/lib/fp_growth/fp_tree_node.rb +81 -0
- data/lib/fp_growth/header_list.rb +51 -0
- data/lib/fp_growth/header_list_node.rb +44 -0
- data/lib/fp_growth/inflections.rb +83 -0
- data/lib/fp_growth/prefix_path.rb +41 -0
- data/tests/profile_fp_tree.rb +24 -0
- data/tests/tc_fp_tree.rb +55 -0
- data/tests/ts_all.rb +14 -0
- metadata +76 -0
data/README
ADDED
data/examples/cao.rb
ADDED
@@ -0,0 +1,207 @@
|
|
1
|
+
begin
|
2
|
+
require 'rubygems'
|
3
|
+
require 'fp_growth'
|
4
|
+
rescue LoadError
|
5
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
require 'fp_growth'
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'mysql'
|
10
|
+
require 'singleton'
|
11
|
+
|
12
|
+
HOST =
|
13
|
+
USER =
|
14
|
+
PASSWD =
|
15
|
+
DB =
|
16
|
+
|
17
|
+
# This example shows the usage of fp-growth with a database from an
|
18
|
+
# inventory management system called cao_faktura (http://www.cao-faktura.de/)
|
19
|
+
# Singleton wrapper around a cao_faktura MySQL database. It reads the
# JOURNALPOS table and turns it into item counts and transactions that
# FpGrowth::FpTree can consume.
class CaoDb

  include Singleton

  # Opens the database connection as soon as the singleton is created.
  def initialize
    connect
  end

  # Builds an FP-tree over article ids.
  # min_support is an absolute number of receipts.
  def article_tree(min_support)
    FpGrowth::FpTree.new(min_support, article_ids(min_support), article_transactions)
  end

  # Builds an FP-tree over category ids; item counts are derived from the
  # transactions themselves instead of a separate query.
  def category_tree(min_support)
    transactions = category_transactions
    FpGrowth::FpTree.new(min_support, FpGrowth::FpTree.get_items(transactions), transactions)
  end

  # Returns the short name of the article with the given id, or nil when
  # no such article exists. The id is coerced to an integer before it is
  # interpolated into the statement.
  def article_name(id)
    first_value(%Q!
      SELECT KURZNAME
      FROM ARTIKEL
      WHERE REC_ID = #{id.to_i}
    !)
  end

  # Returns the name of the category with the given id, or nil when no
  # such category exists.
  def category_name(id)
    first_value(%Q!
      SELECT NAME
      FROM WARENGRUPPEN
      WHERE ID = #{id.to_i}
    !)
  end

  # Returns [[article_id, receipt_count], ...] for every article that
  # occurs on at least min_support receipts, most frequent first.
  # The embedded SQL comments say: a receipt may contain several positions
  # with the same article but must be counted only once; ordering by two
  # columns makes the sort order unambiguous.
  def article_ids(min_support)
    counted_ids(%Q!
      SELECT ARTIKEL_ID,
      # ein Bon kann mehrere journalpos mit ident. Artikel enthalten
      # und soll trotzdem nur einmal gezählt werden
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY ARTIKEL_ID
      HAVING anzahl >= #{min_support}
      # mit zwei spalten ist Sortierung eindeutig
      ORDER BY anzahl DESC, ARTIKEL_ID DESC
    !)
  end

  # Returns [[category_id, receipt_count], ...] for every category that
  # occurs on at least min_support receipts, most frequent first.
  # Rows are grouped by WARENGRUPPE (the selected and ordered column);
  # grouping by ARTIKEL_ID would yield one row per article and repeat
  # categories with per-article counts.
  def category_ids(min_support)
    counted_ids(%Q!
      SELECT WARENGRUPPE,
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY WARENGRUPPE
      HAVING anzahl >= #{min_support}
      ORDER BY anzahl DESC, WARENGRUPPE DESC
    !)
  end

  # Returns one array of article ids per receipt (JOURNAL_ID).
  def article_transactions
    id_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(ARTIKEL_ID SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns one array of category ids per receipt (JOURNAL_ID).
  def category_transactions
    id_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(WARENGRUPPE SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns the number of rows in JOURNAL as an Integer.
  def count_transactions
    first_value(%q!
      SELECT COUNT(*) FROM JOURNAL
    !).to_i
  end

  # Mines article patterns and prints every association rule that reaches
  # min_confidence. Rules are derived through FpGrowth::Helper because
  # FpTree itself has no rule-generation method.
  def do_articles(min_support, min_confidence)
    frequent_patterns = article_tree(min_support).fp_growth
    print_article_assoziation_rules(
      FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
    )
  end

  private

  # Connects with the file-level HOST/USER/PASSWD/DB constants and switches
  # the session to UTF-8.
  def connect
    @connection = Mysql::new(HOST, USER, PASSWD, DB)
    @connection.query('SET NAMES "UTF8"')
  end

  # Runs sql and returns the first column of the first row, or nil when the
  # result is empty. The result set is always released.
  def first_value(sql)
    data = @connection.query(sql)
    value = nil
    data.each do |row|
      value = row[0]
      break
    end
    data.free
    value
  end

  # Runs sql and returns its first two columns as pairs of integers.
  def counted_ids(sql)
    result = []
    data = @connection.query(sql)
    data.each do |row|
      result << [ row[0].to_i, row[1].to_i ]
    end
    data.free
    result
  end

  # Runs sql and splits the comma-separated second column of every row
  # into an array of integers.
  def id_lists(sql)
    result = []
    data = @connection.query(sql)
    data.each do |row|
      result << row[1].split(',').collect { |id| id.to_i }
    end
    data.free
    result
  end
end
|
141
|
+
|
142
|
+
# Tells whether some rule in +all+ makes +wanted+ redundant: it has the
# same right-hand side, a left-hand side that is a superset of the wanted
# one, and a confidence within +beta+ of the wanted confidence.
def is_unnecessary_rule?(all, wanted, beta)
  all.any? do |candidate|
    candidate[:right] == wanted[:right] &&
      candidate[:left].is_superset_of?(wanted[:left]) &&
      candidate[:confidence].between?(wanted[:confidence] - beta, wanted[:confidence] + beta)
  end
end
|
151
|
+
|
152
|
+
# Returns the rules from +all+ that are not made redundant by another
# rule of the same list (see is_unnecessary_rule?).
def remove_unnecessary_rules(all, beta)
  all.reject { |rule| is_unnecessary_rule?(all, rule, beta) }
end
|
155
|
+
|
156
|
+
# Prints every rule of +array+ on its own line, resolving ids to article
# names. Returns +array+.
def print_article_assoziation_rules(array)
  array.each do |rule|
    print_article_assoziation_rule(rule)
  end
end
|
159
|
+
|
160
|
+
# Prints one rule as "[(id:name)...] => [(id:name)...] = s<support> c<confidence>".
# Article names are looked up through CaoDb for every id on both sides.
def print_article_assoziation_rule(rule)
  describe = lambda { |id| "(#{id}:#{CaoDb.instance.article_name(id)})" }

  print '['
  rule[:left].each { |id| print describe.call(id) }
  print '] => ['
  rule[:right].each { |id| print describe.call(id) }
  print '] = '
  puts format('s%i c%.2f', rule[:support], rule[:confidence])
end
|
172
|
+
|
173
|
+
# Prints every rule of +array+ as
# "[(id:name)...] => [(id:name)...] = s<support> c<confidence>",
# resolving ids to category names through CaoDb. Returns +array+.
def print_category_assoziation_rules(array)
  describe = lambda { |id| "(#{id}:#{CaoDb.instance.category_name(id)})" }

  array.each do |rule|
    print '['
    rule[:left].each { |id| print describe.call(id) }
    print '] => ['
    rule[:right].each { |id| print describe.call(id) }
    print '] = '
    puts format('s%i c%.2f', rule[:support], rule[:confidence])
  end
end
|
187
|
+
|
188
|
+
# Mines frequent category sets from the database and prints every
# association rule that reaches min_confidence.
def do_categories(min_support, min_confidence)
  tree = CaoDb.instance.category_tree(min_support)
  rules = FpGrowth::Helper.create_assoziation_rules(tree.fp_growth, min_confidence)
  print_category_assoziation_rules(rules)
end
|
192
|
+
|
193
|
+
# Mines frequent article sets from the database, derives the association
# rules that reach min_confidence, drops redundant rules and prints the
# rest.
#
# beta is the confidence tolerance handed to remove_unnecessary_rules; it
# defaults to 0.2, the value that used to be hard-coded here.
def do_articles(min_support, min_confidence, beta = 0.2)
  frequent_patterns = CaoDb.instance.article_tree(min_support).fp_growth
  rules = FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
  print_article_assoziation_rules(remove_unnecessary_rules(rules, beta))
end
|
202
|
+
|
203
|
+
# Thresholds for the example run. min_support is absolute; a relative
# value would convert as abs = (rel * database.count_transactions).round
min_support, min_confidence = 9, 0.5

do_articles(min_support, min_confidence)
#CaoDb.instance.article_tree(min_support).save
|
@@ -0,0 +1,56 @@
|
|
1
|
+
begin
|
2
|
+
require 'rubygems'
|
3
|
+
require 'fp_growth'
|
4
|
+
rescue LoadError
|
5
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
require 'fp_growth'
|
7
|
+
end
|
8
|
+
|
9
|
+
# Example from the FP-growth paper: five transactions, minimum support 3.
# Item counts come from FpGrowth::FpTree.get_items (the previous receiver
# `v` was never defined and raised NameError).
def example_1 # paper
  transactions = [
    ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
    ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
    ['b', 'f', 'h', 'j', 'o'],
    ['b', 'c', 'k', 's', 'p'],
    ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
  ]
  FpGrowth::FpTree.new(3, FpGrowth::FpTree.get_items(transactions), transactions)
end
|
19
|
+
|
20
|
+
# Textbook example (book, p. 123): five numeric transactions, minimum
# support 3.
def example_2
  baskets = [
    [1, 3, 6, 8, 9],
    [1, 2, 3, 6, 7, 8],
    [2, 4, 6],
    [2, 3, 4, 9],
    [1, 3, 5, 6, 7, 8, 9]
  ]
  item_counts = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(3, item_counts, baskets)
end
|
30
|
+
|
31
|
+
# Eight small numeric transactions mined with minimum support 2.
def example_3
  baskets = [
    [1, 3, 4],
    [4],
    [1, 2, 4, 5],
    [1, 6],
    [1, 2],
    [1, 6],
    [1, 4],
    [1, 2, 4]
  ]
  item_counts = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(2, item_counts, baskets)
end
|
44
|
+
|
45
|
+
# Example from the adaptive-fp thesis: five transactions, minimum
# support 2.
def example_4
  baskets = [
    %w[a c d e f],
    %w[a b e],
    %w[c e f],
    %w[a c d f],
    %w[c e f]
  ]
  item_counts = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(2, item_counts, baskets)
end
|
55
|
+
|
56
|
+
puts example_4
|
data/lib/fp_growth.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require 'fp_growth/inflections'
|
2
|
+
|
3
|
+
# Namespace of the fp-growth library. Every constant is loaded lazily on
# first use.
module FpGrowth
  autoload :FpTreeNode, 'fp_growth/fp_tree_node'
  autoload :FpTree, 'fp_growth/fp_tree'
  # Helper is defined in the same file as FpTree; registering it here makes
  # FpGrowth::Helper resolvable even when FpTree has not been touched yet.
  autoload :Helper, 'fp_growth/fp_tree'
  autoload :HeaderListNode, 'fp_growth/header_list_node'
  autoload :HeaderList, 'fp_growth/header_list'
  autoload :PrefixPath, 'fp_growth/prefix_path'
end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
# Stateless helpers used by FpGrowth::FpTree#fp_growth and for deriving
# association rules from its result.
module FpGrowth::Helper

  # Turns every non-empty subset of +array+ (objects responding to #key and
  # #count, e.g. the tree nodes of a single prefix path) into a PrefixPath
  # whose support is the smallest count within that subset.
  def self.subsets_as_prefix_path(array)
    array.subsets.collect do |combination|
      min_support = combination.collect { |node| node.count }.min
      elements = combination.collect { |node| node.key }
      FpGrowth::PrefixPath.new(min_support, elements)
    end
  end

  # Takes two arrays of prefix paths and returns a_el + b_el for every
  # pair, iterating +a+ in the outer loop.
  def self.cross_product(a, b)
    result = []
    a.each do |a_el|
      b.each { |b_el| result << (a_el + b_el) }
    end
    result
  end

  # TODO put handling of assoziation rules in an extra namespace
  # and copy the unnecessary rules code from examples/cao.rb in it too
  #
  # Derives association rules from +frequent_patterns+ (objects responding
  # to #size, #to_a and #support). Every pattern with at least two items is
  # split into all (left, right) partitions; a rule is kept when
  # support(pattern) / support(left) >= min_confidence.
  #
  # Returns an array of hashes with the keys :left, :right, :support and
  # :confidence. Partitions whose left side is not contained in
  # +frequent_patterns+ are skipped, because their confidence cannot be
  # computed.
  def self.create_assoziation_rules(frequent_patterns, min_confidence)
    result = []
    frequent_patterns.each do |pattern|
      next if pattern.size < 2
      pattern.to_a.parts.each do |part|
        other_pattern = frequent_patterns.find { |p| part.first.content_equal?(p.to_a) }
        next unless other_pattern
        confidence = pattern.support.to_f / other_pattern.support.to_f
        if confidence >= min_confidence
          result << {
            :left => part.first,
            :right => part.last,
            :support => pattern.support,
            :confidence => confidence
          }
        end
      end
    end
    result
  end

end
|
46
|
+
|
47
|
+
# Frequent-pattern tree plus header list, with the fp-growth mining
# algorithm on top of it.
class FpGrowth::FpTree

  attr_reader :min_support, :root
  attr_accessor :header_list
  protected :header_list, :header_list=, :root

  # items is a sorted array of [key, count]; min_support is an absolute
  # count, not a percentage. Transactions may be Arrays or PrefixPaths.
  def initialize(min_support, items, transactions = [])
    @min_support = min_support
    @root = FpGrowth::FpTreeNode.new(nil)

    # the count information is lost in this step
    # HeaderList calculates this value again if needed
    @header_list = FpGrowth::HeaderList.new(select_only_frequent_items(items))

    add_transactions(transactions)
  end

  # True when no transaction has left a node in the tree.
  def empty?
    @root.children.empty?
  end

  # Inserts every transaction of the given collection.
  def add_transactions(transactions)
    transactions.each do |transaction|
      add_transaction(transaction)
    end
  end

  # TODO decide: build header list online or with rebuild_header_list afterwards
  #
  # Inserts one transaction: infrequent items are dropped, the rest is
  # ordered like the header list and merged into the tree, raising the
  # counts along the path by the support of the transaction.
  def add_transaction(transaction)
    transaction = prepare_transaction(transaction)
    increment = transaction.support
    tree_node = @root
    transaction.each do |key|
      next_node = tree_node.find_child(key)
      if next_node
        next_node.increase_count(increment)
      else
        next_node = FpGrowth::FpTreeNode.new(key, increment)
        tree_node.add_child(next_node)

        @header_list[key] << next_node
      end

      tree_node = next_node
    end
  end

  # Multi-line dump of the tree followed by the header list.
  def to_s
    "Tree (min_s = #{@min_support})\n#{@root}\nHeader list\n#{@header_list}"
  end

  # Returns a new tree holding deep copies of the children of root_node;
  # root_node itself becomes the (keyless) root of the copy.
  def subtree(root_node)
    result = FpGrowth::FpTree.new(@min_support, [])
    root_node.clone_children(result.root)
    result.rebuild_header_list
    result
  end

  # Mines all frequent patterns and returns them as an array of
  # PrefixPaths. The tree is split into a single prefix path and a
  # multipath rest; both parts are mined separately and then combined.
  # alpha is the pattern that led to this (conditional) tree and is
  # appended to every result.
  def fp_growth(alpha = nil)
    single, multi = split_paths
    single_teil = []
    multi_teil = []
    if single
      single_teil = FpGrowth::Helper.subsets_as_prefix_path(single)
    end
    if multi
      multi.header_list.each do |node|
        immediate_frequent_pattern = node.immediate_frequent_pattern
        multi_teil << immediate_frequent_pattern
        new_tree = node.conditional_fp_tree(@min_support)
        if !new_tree.empty?
          multi_teil += new_tree.fp_growth(immediate_frequent_pattern)
        end
      end
    end
    FpGrowth::Helper.cross_product(single_teil + multi_teil + FpGrowth::Helper.cross_product(single_teil, multi_teil), [alpha])
  end

  # slow version used only as reference for testing
  # does not split tree in single- and multipath parts
  def fp_growth_slow(alpha = nil)
    result = []
    header_list.each do |node|
      base_path = node.immediate_frequent_pattern + alpha
      result << base_path
      new_tree = node.conditional_fp_tree(@min_support)
      if !new_tree.empty?
        result += new_tree.fp_growth_slow(base_path)
      end
    end
    result
  end

  # can be used with Array or PrefixPath
  # returns array of [key, count] sorted by descending count; a PrefixPath
  # contributes its support per item, an Array contributes 1.
  def self.get_items(transactions)
    result = {}

    transactions.each do |transaction|

      increment = transaction.respond_to?(:support) ? transaction.support : 1

      transaction.each do |item|
        result[item] ||= 0
        result[item] += increment
      end
    end
    result.to_a.
      sort { |a, b| b.last <=> a.last }
  end

  # Serializes the tree with Marshal. The file is opened in binary mode
  # because Marshal output is binary data and must not be newline- or
  # encoding-translated.
  def save(filename = 'fp_tree.txt')
    File.open(filename, 'wb') do |file|
      file.write Marshal.dump(self)
    end
  end

  # Restores a tree written by #save.
  # NOTE: Marshal.load can instantiate arbitrary objects; only load files
  # from a trusted source.
  def self.load(filename = 'fp_tree.txt')
    File.open(filename, 'rb') do |file|
      Marshal.load(file.read)
    end
  end

  protected

  # Registers every node of the tree in the header list, then asks the
  # header list to sort itself.
  def rebuild_header_list
    @root.each do |node|
      @header_list[node.key] << node
    end
    @header_list.sort
  end

  private

  # returnvalue is array of two elements
  # first element is nil or array of tree_nodes which form the single prefix path
  # second element is nil or FpTree which forms the multipath part
  def split_paths
    return [nil, self] if @root.children.size != 1
    single = []
    node = @root
    while node.children.size == 1
      # this leaves out root
      node = node.children.first
      single << node
    end
    multi = node.children.empty? ? nil : subtree(node)
    [single, multi]
  end

  # removes non-frequent-items and sorts remaining frequent-items on stored ordered-items
  # arg can be Array or PrefixPath
  # returns PrefixPath
  def prepare_transaction(transaction)
    support = transaction.respond_to?(:support) ? transaction.support : 1
    # relies on the fact that Array#& keeps order of first array
    FpGrowth::PrefixPath.new(support, @header_list.all_keys & transaction.to_a)
  end

  # takes array of [key,count]
  # keeps order
  def select_only_frequent_items(items)
    items.select { |item| item.last >= @min_support }
  end

end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# knows nothing about header list
|
4
|
+
# One node of an FP-tree: an item key, its count and links to parent and
# children. The node knows nothing about the header list.
class FpGrowth::FpTreeNode

  attr_reader :key, :children, :count
  attr_accessor :parent

  protected :parent=

  # key is nil for the root node of a tree.
  def initialize(key, count = 1)
    @key = key
    @count = count
    @parent = nil
    @children = []
  end

  # Appends child and makes this node its parent.
  def add_child(child)
    @children << child
    child.parent = self
  end

  # Returns the direct child carrying key, or nil.
  def find_child(key)
    children.find { |child| child.key == key }
  end

  # prints tree structure: "key:count " followed by the children, which are
  # separated by a newline and 4 spaces of indentation per depth level.
  # A node without key (the root) contributes no label.
  def to_s(depth = 0)
    label = key ? "#{@key}:#{count} " : ''
    separator = "\n" + ' ' * 4 * depth
    label + @children.collect { |child| child.to_s(depth + 1) }.join(separator)
  end

  # Adds value to the count and returns the new count.
  def increase_count(value = 1)
    @count += value
  end

  # Returns the keys of all ancestors (nearest first, root excluded) as a
  # PrefixPath whose support is the count of this node. A node without
  # parent yields an empty path.
  def prefix_path
    result = FpGrowth::PrefixPath.new(count)
    node = parent
    while node && !node.is_root?
      result << node.key
      node = node.parent
    end
    result
  end

  # Deep-copies every child of this node and attaches the copies to
  # new_parent.
  def clone_children(new_parent)
    @children.each do |child|
      cloned_child = child.clone
      cloned_child.parent = new_parent
      new_parent.children << cloned_child
    end
  end

  # walks recursive through whole tree, without self
  def each &block
    children.each do |child|
      block.call(child)
      child.each(&block)
    end
  end

  # A node is a root when it has no parent.
  def is_root?
    !parent
  end

  protected

  # Deep copy: a fresh node with the same key and count and copies of all
  # descendants; the copy has no parent.
  def clone
    result = FpGrowth::FpTreeNode.new(@key, @count)
    clone_children(result)
    result
  end

end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Uses array as storage. Slow lookup but keeps order of items
|
2
|
+
# Ordered list of HeaderListNodes, one per item key. An Array is used as
# storage: lookups are linear but the item order is kept.
class FpGrowth::HeaderList

  # takes array of [key,count] or array of keys
  def initialize(items)
    items = items.first.respond_to?(:last) ? get_item_keys(items) : items
    @array = items.collect { |item| FpGrowth::HeaderListNode.new(item) }
  end

  # lookup element via key
  # if key is not found, a new element is added
  def [](key)
    existing_node = @array.find { |item| item.key == key }
    return existing_node if existing_node

    new_node = FpGrowth::HeaderListNode.new(key)
    @array << new_node
    new_node
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  # Yields the key of every entry in list order.
  def each_key
    @array.each { |node| yield(node.key) }
  end

  # Returns the keys of all entries in list order.
  def all_keys
    @array.collect { |node| node.key }
  end

  # One line per entry, see HeaderListNode#to_s.
  def to_s
    @array.collect { |node| node.to_s }.join("\n")
  end

  # needed for cloning of fptree
  #
  # Reorders the list itself by descending count and returns the sorted
  # entries as a new Array. Sorting in place matters because the caller
  # (FpTree#rebuild_header_list) ignores the return value; a non-mutating
  # sort left the list in insertion order.
  def sort
    @array.sort! { |a, b| b.count <=> a.count }
    @array.dup
  end

  private

  # takes array of [key,count]
  # returns array of keys
  def get_item_keys(items)
    items.collect { |item| item.first }
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Header-list entry for one item key: collects every tree node that
# carries this key.
class FpGrowth::HeaderListNode

  attr_reader :key

  def initialize(key)
    @key = key
    @array = []
  end

  # Registers a tree node for this key (delegates to the internal array).
  def <<(tree_node)
    @array << tree_node
  end

  # Iterates over the registered tree nodes (delegates to the internal array).
  def each(&block)
    @array.each(&block)
  end

  # Returns the prefix paths of all registered tree nodes, without the
  # empty ones.
  def conditional_pattern_base
    paths = @array.collect { |tree_node| tree_node.prefix_path }
    paths.reject { |path| path.to_a.empty? }
  end

  # Builds the conditional FP-tree for this key from its conditional
  # pattern base.
  def conditional_fp_tree(min_support)
    pattern_base = conditional_pattern_base
    item_counts = FpGrowth::FpTree.get_items(pattern_base)
    FpGrowth::FpTree.new(min_support, item_counts, pattern_base)
  end

  # The one-item pattern [key] with the total count as support.
  def immediate_frequent_pattern
    FpGrowth::PrefixPath.new(count, [key])
  end

  # TODO count could be given at initialize if known
  #
  # Sum of the counts of all registered tree nodes.
  def count
    total = 0
    @array.each { |tree_node| total += tree_node.count }
    total
  end

  # "key:count -> key:count key:count ..." listing the registered nodes.
  def to_s
    links = @array.collect { |tree_node| "#{tree_node.key}:#{tree_node.count}" }
    "#{key}:#{count} -> " << links.join(' ')
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Set-like helpers the fp-growth code adds to Array.
class Array

  # modified from http://branch14.org/snippets/subsets_in_ruby.html
  #
  # Returns the subsets of this array, smallest "suffix" subsets first.
  #   include_empty - keep the empty subset
  #   just_real     - drop the subset that contains every element
  #   ranks         - optional size filter: either a single size or anything
  #                   responding to include? (Array, Range, ...)
  def subsets(include_empty = false, just_real = false, ranks = nil)
    result = all_subsets
    result.shift unless include_empty
    result.pop if just_real
    return result unless ranks

    if ranks.respond_to?(:include?)
      result.select { |subset| ranks.include?(subset.size) }
    else
      result.select { |subset| ranks == subset.size }
    end
  end

  # Returns every subset, the empty one first and the complete one last.
  # Works on a slice instead of clone + shift: clone keeps the frozen state
  # of the receiver, so the previous implementation raised on frozen
  # arrays.
  def all_subsets
    return [[]] if empty?
    head = first
    without_head = self[1..-1].all_subsets
    without_head + without_head.collect { |subset| [head] + subset }
  end
  protected :all_subsets

  # TODO faster
  #
  # Truthy when +other+ has the same content as a non-empty subset of this
  # array that does not contain every element. Returns that subset or nil.
  def is_superset_of?(other)
    subsets(false, true).find { |subset| other.content_equal?(subset) }
  end

  # TODO faster
  #
  # Truthy when this array has the same content as some non-empty subset of
  # +other+ (+other+ itself included). Returns that subset or nil.
  def is_subset_of?(other)
    other.subsets.find { |subset| content_equal?(subset) }
  end

  # TODO faster
  #
  # Like is_subset_of?, but the subset containing every element of +other+
  # does not count.
  def is_real_subset_of?(other)
    other.subsets(false, true).find { |subset| content_equal?(subset) }
  end

  # Returns all [subset, rest] pairs of this array. Unless allow_empty is
  # set, neither side of a pair is empty.
  def parts(allow_empty = false)
    subsets(allow_empty, !allow_empty).collect do |subset|
      [ subset, self - subset ]
    end
  end

  # Order-insensitive comparison: both arrays have the same size and every
  # element of this array has an equal element in +other+. Elements that
  # respond to content_equal? are compared with it, all others with ==.
  def content_equal?(other)
    return false if other.size != size
    each do |item|
      found = other.find do |other_item|
        item.respond_to?(:content_equal?) ? item.content_equal?(other_item) : item == other_item
      end
      return false if !found
    end
    true
  end
end
|
77
|
+
|
78
|
+
#puts [1,2].is_subset_of?([1,2]).inspect
|
79
|
+
#puts [1,2].is_subset_of?([1,2,3]).inspect
|
80
|
+
#puts [1,2].is_real_subset_of?([1,2]).inspect
|
81
|
+
#puts [1,2].is_real_subset_of?([1,2,3]).inspect
|
82
|
+
#puts [1,2,3].is_superset_of?([1,2]).inspect
|
83
|
+
#puts [1,2].is_superset_of?([1,2]).inspect
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# A list of item keys together with a support count. Used for
# transactions, conditional pattern bases and mined frequent patterns.
class FpGrowth::PrefixPath

  attr_reader :support

  # The given array is stored as is (not copied).
  def initialize(support = 1, array = [])
    @array = array
    @support = support
  end

  # delegate
  def each(&block)
    @array.each(&block)
  end

  # delegate
  def <<(arg)
    @array << arg
  end

  # delegate
  def size
    @array.size
  end

  # Concatenates two paths; the result carries the smaller of both
  # supports. Adding nil returns the receiver itself.
  def +(other)
    return self unless other
    FpGrowth::PrefixPath.new([@support, other.support].min, @array + other.to_a)
  end

  def to_s
    "[#{@array}:#{support}]"
  end

  # Returns the underlying array of keys (not a copy).
  def to_a
    @array
  end

  # Two paths are equal when their supports match and they hold the same
  # keys in any order. Objects that do not offer support and to_a (nil
  # included) are never equal instead of raising NoMethodError.
  def ==(other)
    return false unless other.respond_to?(:support) && other.respond_to?(:to_a)
    (support == other.support) && (to_a.content_equal?(other.to_a))
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
+
require 'fp_growth'
|
3
|
+
|
4
|
+
# Pick the tree to profile: flip the condition to build the small paper
# example in memory instead of loading a tree saved with FpTree#save.
if false
  min_support = 3
  transactions = [
    %w[f a c d g i m p],
    %w[a b c f l m o],
    %w[b f h j o],
    %w[b c k s p],
    %w[a f c e l p m n]
  ]
  items = FpGrowth::FpTree.get_items(transactions)
  tree = FpGrowth::FpTree.new(min_support, items, transactions)
else
  tree = FpGrowth::FpTree.load
end

# Run both mining variants 100 times and report the wall-clock time.
[:fp_growth, :fp_growth_slow].each do |algorithm|
  print algorithm.to_s
  started_at = Time.now
  100.times { tree.send(algorithm) }
  elapsed = Time.now - started_at
  puts " took #{elapsed}s"
end
|
data/tests/tc_fp_tree.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Checks that the optimized FpTree#fp_growth finds the same patterns as
# the reference implementation FpTree#fp_growth_slow.
class TestFpTree < Test::Unit::TestCase

  # Uses the tree stored in the default file of FpTree.load.
  def test_file
    assert_both_algorithms_equal(FpGrowth::FpTree.load)
  end

  # fails with ruby-1.8.7, ruby-1.9.1, jruby-1.3.0rc1
  def test_something
    tree = create_tree(2, [
      [1, 2, 4, 3],
      [1, 2, 4],
      [1, 2],
      [1, 3, 4],
      [3]
    ])
    assert_both_algorithms_equal(tree)
  end

  private

  # Mines the tree with both algorithms and asserts that the results hold
  # the same patterns in any order. generate_difference_hash and pp_hash
  # can be used to build a failure message listing the differing patterns.
  def assert_both_algorithms_equal(tree)
    expected_patterns = tree.fp_growth_slow
    actual_patterns = tree.fp_growth

    assert expected_patterns.content_equal?(actual_patterns)
  end

  # Space-separated string form of a list of patterns.
  def pp_frequent_patterns(arg)
    arg.collect { |pattern| pattern.to_s }.join(' ')
  end

  # Builds a tree whose item counts are derived from the transactions.
  def create_tree(min_support, transactions)
    item_counts = FpGrowth::FpTree.get_items(transactions)
    FpGrowth::FpTree.new(min_support, item_counts, transactions)
  end

  # result holds elements in a but not in b
  # (sorted item list => support; an entry is dropped when b has the same
  # items with the same support)
  def generate_difference_hash(a, b)
    difference = {}
    a.each { |pattern| difference[pattern.to_a.sort] = pattern.support }
    b.each do |pattern|
      items = pattern.to_a.sort
      difference.delete(items) if difference[items] == pattern.support
    end
    difference
  end

  # One "item item ...:support" line per hash entry.
  def pp_hash(hash)
    lines = hash.collect do |items, support|
      "#{items.collect { |item| item.to_s }.join(' ')}:#{support}"
    end
    lines.join("\n")
  end
end
|
54
|
+
|
55
|
+
# what is used to test equality on array - array?
|
data/tests/ts_all.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
|
3
|
+
begin
|
4
|
+
require 'rubygems'
|
5
|
+
require 'fp_growth'
|
6
|
+
rescue LoadError
|
7
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
8
|
+
require 'fp_growth'
|
9
|
+
end
|
10
|
+
|
11
|
+
|
12
|
+
# Load every test case file (tc_*.rb) that sits next to this suite file.
test_case_pattern = File.join(File.dirname(__FILE__), 'tc_*.rb')
Dir.glob(test_case_pattern).each { |test_case| require test_case }
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fp-growth
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stefan Achatz
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-07-08 00:00:00 +02:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: |
|
17
|
+
This is an implementation of the fp-growth frequent pattern mining algorithm as
|
18
|
+
stated in the paper
|
19
|
+
|
20
|
+
Mining Frequent Patterns without Candidate Generation: A Frequent-Pattern Tree Approach
|
21
|
+
Han et al, Data Mining and Knowledge Discovery, 8, 53-87, 2004
|
22
|
+
|
23
|
+
email: stefan_achatz@web.de
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files:
|
29
|
+
- README
|
30
|
+
files:
|
31
|
+
- examples/small_ones.rb
|
32
|
+
- examples/cao.rb
|
33
|
+
- lib/fp_growth.rb
|
34
|
+
- lib/fp_growth/fp_tree.rb
|
35
|
+
- lib/fp_growth/fp_tree_node.rb
|
36
|
+
- lib/fp_growth/header_list.rb
|
37
|
+
- lib/fp_growth/inflections.rb
|
38
|
+
- lib/fp_growth/prefix_path.rb
|
39
|
+
- lib/fp_growth/header_list_node.rb
|
40
|
+
- tests/tc_fp_tree.rb
|
41
|
+
- tests/profile_fp_tree.rb
|
42
|
+
- tests/ts_all.rb
|
43
|
+
- README
|
44
|
+
has_rdoc: true
|
45
|
+
homepage: http://rubyforge.org/projects/fp-growth/
|
46
|
+
licenses: []
|
47
|
+
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options:
|
50
|
+
- --main
|
51
|
+
- README
|
52
|
+
- --title
|
53
|
+
- fp-growth Documentation
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
version:
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: fp-growth
|
71
|
+
rubygems_version: 1.3.4
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Implementation of the fp-growth frequent pattern algorithm
|
75
|
+
test_files:
|
76
|
+
- tests/ts_all.rb
|