fp-growth 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,5 @@
1
+ This is an implementation of the fp-growth frequent pattern mining algorithm as
2
+ stated in the paper
3
+
4
+ Mining Frequent Patterns without Candidate Generation: A Frequent-Pattern Tree Approach
5
+ Han et al, Data Mining and Knowledge Discovery, 8, 53-87, 2004
@@ -0,0 +1,207 @@
1
+ begin
2
+ require 'rubygems'
3
+ require 'fp_growth'
4
+ rescue LoadError
5
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
+ require 'fp_growth'
7
+ end
8
+
9
+ require 'mysql'
10
+ require 'singleton'
11
+
12
+ HOST =
13
+ USER =
14
+ PASSWD =
15
+ DB =
16
+
17
# This example shows the usage of fp-growth with a database from an
# inventory management system called cao_faktura (http://www.cao-faktura.de/)
#
# CaoDb is a singleton data-access layer: it turns the JOURNALPOS table
# (queried per JOURNAL_ID) into the item lists and transactions that
# FpGrowth::FpTree expects. Connection settings are read from the HOST, USER,
# PASSWD and DB constants.
class CaoDb

  include Singleton

  # Opens the database connection when the singleton instance is created.
  def initialize
    connect
  end

  # Builds an FP-tree over article ids.
  # min_support is an absolute count, not a percentage.
  def article_tree(min_support)
    FpGrowth::FpTree.new(min_support, article_ids(min_support), article_transactions)
  end

  # Builds an FP-tree over category (WARENGRUPPE) ids; the item list is
  # derived from the transactions themselves.
  def category_tree(min_support)
    transactions = category_transactions
    FpGrowth::FpTree.new(min_support, FpGrowth::FpTree.get_items(transactions), transactions)
  end

  # Returns ARTIKEL.KURZNAME for the given REC_ID, or nil if no row matches.
  # The id is coerced with to_i because it is interpolated into the SQL text.
  def article_name(id)
    first_value(%Q!
      SELECT KURZNAME
      FROM ARTIKEL
      WHERE REC_ID = #{id.to_i}
    !)
  end

  # Returns WARENGRUPPEN.NAME for the given ID, or nil if no row matches.
  # The id is coerced with to_i because it is interpolated into the SQL text.
  def category_name(id)
    first_value(%Q!
      SELECT NAME
      FROM WARENGRUPPEN
      WHERE ID = #{id.to_i}
    !)
  end

  # Returns [article_id, count] pairs for all articles that occur in at least
  # min_support distinct journals, most frequent first.
  #
  # The German SQL comments say: a receipt may contain several positions with
  # the same article and must still be counted once; ordering by two columns
  # makes the sort order unambiguous.
  def article_ids(min_support)
    id_count_pairs(%Q!
      SELECT ARTIKEL_ID,
      # ein Bon kann mehrere journalpos mit ident. Artikel enthalten
      # und soll trotzdem nur einmal gezählt werden
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY ARTIKEL_ID
      HAVING anzahl >= #{min_support.to_i}
      # mit zwei spalten ist Sortierung eindeutig
      ORDER BY anzahl DESC, ARTIKEL_ID DESC
    !)
  end

  # Returns [category_id, count] pairs for all categories that occur in at
  # least min_support distinct journals, most frequent first.
  #
  # Groups by WARENGRUPPE: the selected column and the grouping column must
  # agree, otherwise the counts are per article, not per category.
  def category_ids(min_support)
    id_count_pairs(%Q!
      SELECT WARENGRUPPE,
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY WARENGRUPPE
      HAVING anzahl >= #{min_support.to_i}
      ORDER BY anzahl DESC, WARENGRUPPE DESC
    !)
  end

  # Returns one array of article ids per journal.
  # NOTE(review): GROUP_CONCAT output is limited by the server setting
  # group_concat_max_len, so very long journals may be truncated - confirm.
  def article_transactions
    id_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(ARTIKEL_ID SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns one array of category ids per journal.
  def category_transactions
    id_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(WARENGRUPPE SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns the number of rows in JOURNAL.
  def count_transactions
    first_value(%q!
      SELECT COUNT(*) FROM JOURNAL
    !).to_i
  end

  # Mines the article tree and prints every association rule that reaches
  # min_confidence. Rules are built by FpGrowth::Helper from the mined
  # patterns; the tree itself has no rule-building method.
  def do_articles(min_support, min_confidence)
    frequent_patterns = article_tree(min_support).fp_growth
    print_article_assoziation_rules(
      FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
    )
  end

  private

  def connect
    @connection = Mysql::new(HOST, USER, PASSWD, DB)
    @connection.query('SET NAMES "UTF8"')
  end

  # Runs sql and returns the first column of the first row, or nil when the
  # result is empty. The result set is released in every case.
  def first_value(sql)
    data = @connection.query(sql)
    begin
      data.each { |row| return row[0] }
      nil
    ensure
      data.free
    end
  end

  # Runs sql and returns the first two columns of every row as integer pairs.
  def id_count_pairs(sql)
    data = @connection.query(sql)
    begin
      pairs = []
      data.each { |row| pairs << [ row[0].to_i, row[1].to_i ] }
      pairs
    ensure
      data.free
    end
  end

  # Runs sql and splits the comma separated second column of every row into
  # an array of integer ids.
  def id_lists(sql)
    data = @connection.query(sql)
    begin
      lists = []
      data.each { |row| lists << row[1].split(',').map { |id| id.to_i } }
      lists
    ensure
      data.free
    end
  end
end
141
+
142
# Tells whether the rule `wanted` is already covered by another rule in `all`:
# a rule with the same right side, a left side that is a superset of
# wanted's left side, and a confidence within +/- beta of wanted's confidence.
def is_unnecessary_rule?(all, wanted, beta)
  all.any? do |candidate|
    candidate[:right] == wanted[:right] &&
      candidate[:left].is_superset_of?(wanted[:left]) &&
      candidate[:confidence].between?(wanted[:confidence] - beta, wanted[:confidence] + beta)
  end
end
151
+
152
# Returns the rules from `all` that are not covered by another rule
# (see is_unnecessary_rule?); beta is the confidence tolerance.
def remove_unnecessary_rules(all, beta)
  all.reject { |rule| is_unnecessary_rule?(all, rule, beta) }
end
155
+
156
# Prints every rule of the array, one per line, and returns the array.
def print_article_assoziation_rules(array)
  array.each(&method(:print_article_assoziation_rule))
end
159
+
160
# Prints one rule as "[(id:name)...] => [(id:name)...] = s<support> c<confidence>".
# Article names are looked up through CaoDb.
def print_article_assoziation_rule(rule)
  label = lambda { |id| "(#{id}:#{CaoDb.instance.article_name(id)})" }
  left = rule[:left].map(&label).join
  right = rule[:right].map(&label).join
  puts "[#{left}] => [#{right}] = " + sprintf('s%i c%.2f', rule[:support], rule[:confidence])
end
172
+
173
# Prints every rule as "[(id:name)...] => [(id:name)...] = s<support> c<confidence>",
# resolving category names through CaoDb. Returns the array.
def print_category_assoziation_rules(array)
  describe = lambda do |ids|
    ids.map { |id| "(#{id}:#{CaoDb.instance.category_name(id)})" }.join
  end
  array.each do |rule|
    summary = sprintf('s%i c%.2f', rule[:support], rule[:confidence])
    puts "[#{describe.call(rule[:left])}] => [#{describe.call(rule[:right])}] = " + summary
  end
end
187
+
188
# Mines the category tree and prints all association rules that reach
# min_confidence.
def do_categories(min_support, min_confidence)
  tree = CaoDb.instance.category_tree(min_support)
  rules = FpGrowth::Helper.create_assoziation_rules(tree.fp_growth, min_confidence)
  print_category_assoziation_rules(rules)
end
192
+
193
# Mines the article tree, drops rules that are covered by a more specific rule
# with a similar confidence (tolerance 0.2) and prints the remaining ones.
def do_articles(min_support, min_confidence)
  tree = CaoDb.instance.article_tree(min_support)
  candidates = FpGrowth::Helper.create_assoziation_rules(tree.fp_growth, min_confidence)
  print_article_assoziation_rules(remove_unnecessary_rules(candidates, 0.2))
end
202
+
203
+ min_support = 9 # abs = (rel * database.count_transactions).round
204
+ min_confidence = 0.5
205
+
206
+ do_articles(min_support, min_confidence)
207
+ #CaoDb.instance.article_tree(min_support).save
@@ -0,0 +1,56 @@
1
+ begin
2
+ require 'rubygems'
3
+ require 'fp_growth'
4
+ rescue LoadError
5
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
+ require 'fp_growth'
7
+ end
8
+
9
# Example transactions taken from the paper; builds an FP-tree with an
# absolute minimum support of 3.
def example_1 # paper
  transactions = [
    ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
    ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
    ['b', 'f', 'h', 'j', 'o'],
    ['b', 'c', 'k', 's', 'p'],
    ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
  ]
  # get_items is a class method of FpTree (there is no local named `v`)
  FpGrowth::FpTree.new(3, FpGrowth::FpTree.get_items(transactions), transactions)
end
19
+
20
# Five integer transactions ("buch s 123" = book, page 123), mined with an
# absolute minimum support of 3.
def example_2 # buch s 123
  rows = [
    [1, 3, 6, 8, 9],
    [1, 2, 3, 6, 7, 8],
    [2, 4, 6],
    [2, 3, 4, 9],
    [1, 3, 5, 6, 7, 8, 9]
  ]
  items = FpGrowth::FpTree.get_items(rows)
  FpGrowth::FpTree.new(3, items, rows)
end
30
+
31
# Eight short integer transactions, mined with an absolute minimum support of 2.
def example_3
  rows = [
    [1, 3, 4],
    [4],
    [1, 2, 4, 5],
    [1, 6],
    [1, 2],
    [1, 6],
    [1, 4],
    [1, 2, 4]
  ]
  items = FpGrowth::FpTree.get_items(rows)
  FpGrowth::FpTree.new(2, items, rows)
end
44
+
45
# Five string transactions (source noted as "thesis adaptive-fp"), mined with
# an absolute minimum support of 2.
def example_4 # thesis adaptive-fp
  rows = [
    %w[a c d e f],
    %w[a b e],
    %w[c e f],
    %w[a c d f],
    %w[c e f]
  ]
  items = FpGrowth::FpTree.get_items(rows)
  FpGrowth::FpTree.new(2, items, rows)
end
55
+
56
+ puts example_4
@@ -0,0 +1,9 @@
1
+ require 'fp_growth/inflections'
2
+
3
# Namespace of the fp-growth library. All classes are loaded lazily on first
# reference.
module FpGrowth
  autoload :FpTreeNode, 'fp_growth/fp_tree_node'
  autoload :FpTree, 'fp_growth/fp_tree'
  # Helper is defined in fp_growth/fp_tree next to FpTree; without its own
  # entry, referencing FpGrowth::Helper before FpTree raises a NameError.
  autoload :Helper, 'fp_growth/fp_tree'
  autoload :HeaderListNode, 'fp_growth/header_list_node'
  autoload :HeaderList, 'fp_growth/header_list'
  autoload :PrefixPath, 'fp_growth/prefix_path'
end
@@ -0,0 +1,220 @@
1
# Stateless helpers used while mining patterns and deriving rules.
module FpGrowth::Helper
  # Turns every non-empty subset of the given tree nodes into a PrefixPath.
  # The support of a subset is the smallest count among its nodes.
  def self.subsets_as_prefix_path(array)
    array.subsets.map do |combination|
      lowest = combination.min { |a, b| a.count <=> b.count }.count
      keys = combination.map { |node| node.key }
      FpGrowth::PrefixPath.new(lowest, keys)
    end
  end

  # takes arrays of prefix_paths
  # Returns a_el + b_el for every pair, ordered by a first, then by b.
  def self.cross_product(a, b)
    a.inject([]) do |combined, a_el|
      combined.concat(b.map { |b_el| a_el + b_el })
    end
  end

  # TODO put handling of assoziation rules in an extra namespace
  # and copy the unnecessary rules code from examples/cao.rb in it too
  #
  # Splits every frequent pattern with at least two items into left => right
  # rules. The confidence of a rule is support(pattern) / support(left); only
  # rules with confidence >= min_confidence are returned, each as a hash with
  # the keys :left, :right, :support and :confidence.
  def self.create_assoziation_rules(frequent_patterns, min_confidence)
    rules = []
    frequent_patterns.each do |pattern|
      next if pattern.size < 2
      pattern.to_a.parts.each do |left, right|
        # the left side must itself be among the mined patterns
        left_pattern = frequent_patterns.find { |p| left.content_equal?(p.to_a) }
        confidence = pattern.support.to_f / left_pattern.support.to_f
        next unless confidence >= min_confidence
        rules << {
          :left => left,
          :right => right,
          :support => pattern.support,
          :confidence => confidence
        }
      end
    end
    rules
  end

end
46
+
47
# Frequent-pattern tree: a prefix tree of transactions plus a header list that
# links all tree nodes carrying the same item key.
class FpGrowth::FpTree

  attr_reader :min_support, :root
  attr_accessor :header_list
  protected :header_list, :header_list=, :root

  # items is sorted array of [key, count]
  # min_support is not in %, but absolute
  # transactions can hold Arrays or PrefixPaths
  def initialize(min_support, items, transactions = [])
    @min_support = min_support
    @root = FpGrowth::FpTreeNode.new(nil)

    # the count information is lost in this step
    # HeaderList calculates this value again if needed
    @header_list = FpGrowth::HeaderList.new(select_only_frequent_items(items))

    add_transactions(transactions)
  end

  # true if no transaction contributed a node to the tree
  def empty?
    @root.children.empty?
  end

  def add_transactions(transactions)
    transactions.each { |transaction| add_transaction(transaction) }
  end

  # TODO decide: build header list online or with rebuild_header_list afterwards
  #
  # Inserts one transaction. Only frequent items are kept, in header list
  # order; counts along the path are raised by the support of the transaction.
  def add_transaction(transaction)
    transaction = prepare_transaction(transaction)
    increment = transaction.support
    tree_node = @root
    transaction.each do |key|
      if next_node = tree_node.find_child(key)
        next_node.increase_count(increment)
      else
        next_node = FpGrowth::FpTreeNode.new(key, increment)
        tree_node.add_child(next_node)

        # new tree nodes are registered in the header list right away
        @header_list[key] << next_node
      end

      tree_node = next_node
    end
  end

  # Tree structure followed by the header list.
  def to_s
    "Tree (min_s = #{@min_support})\n" + @root.to_s + "\nHeader list\n" + @header_list.to_s
  end

  # arg is new root
  # Returns a new tree holding deep copies of the children of root_node.
  def subtree(root_node)
    result = FpGrowth::FpTree.new(@min_support, [])
    root_node.clone_children(result.root)
    result.rebuild_header_list
    result
  end

  # Mines all frequent patterns of this tree and returns them as PrefixPaths.
  # The tree is split into its single prefix path and the multipath rest;
  # both parts are mined separately and then combined. alpha is the pattern
  # this (conditional) tree belongs to and is appended to every result.
  def fp_growth(alpha = nil)
    single, multi = split_paths
    single_part = single ? FpGrowth::Helper.subsets_as_prefix_path(single) : []
    multi_part = []
    if multi
      multi.header_list.each do |node|
        pattern = node.immediate_frequent_pattern
        multi_part << pattern
        conditional_tree = node.conditional_fp_tree(@min_support)
        unless conditional_tree.empty?
          multi_part.concat(conditional_tree.fp_growth(pattern))
        end
      end
    end
    combined = single_part + multi_part + FpGrowth::Helper.cross_product(single_part, multi_part)
    FpGrowth::Helper.cross_product(combined, [alpha])
  end

  # slow version used only as reference for testing
  # does not split tree in single- and multipath parts
  def fp_growth_slow(alpha = nil)
    result = []
    header_list.each do |node|
      base_path = node.immediate_frequent_pattern + alpha
      result << base_path
      conditional_tree = node.conditional_fp_tree(@min_support)
      unless conditional_tree.empty?
        result.concat(conditional_tree.fp_growth_slow(base_path))
      end
    end
    result
  end

  # can be used with Array or PrefixPath
  # returns sorted array of [key, count], highest count first
  #
  # Items with the same count keep the order of their first occurrence, so
  # the result does not depend on the sort implementation.
  def self.get_items(transactions)
    counts = {}
    first_seen = []

    transactions.each do |transaction|
      increment = transaction.respond_to?(:support) ? transaction.support : 1

      transaction.each do |item|
        unless counts.key?(item)
          counts[item] = 0
          first_seen << item
        end
        counts[item] += increment
      end
    end

    ranked = []
    first_seen.each_with_index { |item, index| ranked << [-counts[item], index, item] }
    ranked.sort_by { |negative_count, index, _item| [negative_count, index] }.
      map { |_negative_count, _index, item| [item, counts[item]] }
  end

  # Writes the marshalled tree to filename.
  # Marshal output is binary, so the file is opened in binary mode.
  def save(filename = 'fp_tree.txt')
    File.open(filename, 'wb') do |file|
      file.write Marshal.dump(self)
    end
  end

  # Reads a tree written by save.
  # WARNING: Marshal.load can instantiate arbitrary objects; only load files
  # from a trusted source.
  def self.load(filename = 'fp_tree.txt')
    File.open(filename, 'rb') { |file| Marshal.load(file.read) }
  end

  protected

  # Registers every tree node in the header list and asks the header list to
  # sort itself (HeaderList#sort orders by descending count).
  def rebuild_header_list
    @root.each do |node|
      @header_list[node.key] << node
    end
    @header_list.sort
  end

  private

  # returnvalue is array of two elements
  # first element is nil or array of tree_nodes which form the single prefix path
  # second element is nil or FpTree which forms the multipath part
  def split_paths
    return [nil, self] if @root.children.size != 1
    single = []
    node = @root
    while node.children.size == 1
      # this leaves out root
      node = node.children.first
      single << node
    end
    multi = node.children.empty? ? nil : subtree(node)
    [single, multi]
  end

  # removes non-frequent-items and sorts remaining frequent-items on stored ordered-items
  # arg can be Array or PrefixPath
  # returns PrefixPath
  def prepare_transaction(transaction)
    support = transaction.respond_to?(:support) ? transaction.support : 1
    # relies on the fact that Array#& keeps order of first array
    FpGrowth::PrefixPath.new(support, @header_list.all_keys & transaction.to_a)
  end

  # takes array of [key,count]
  # keeps order
  def select_only_frequent_items(items)
    items.select { |item| item.last >= @min_support }
  end

end
@@ -0,0 +1,81 @@
1
+ require 'set'
2
+
3
# Node of an FpTree: an item key, a count, a link to the parent and the list
# of children. The root node has a nil key and no parent.
# knows nothing about header list
class FpGrowth::FpTreeNode

  attr_reader :key, :children, :count
  attr_accessor :parent

  protected :parent=

  def initialize(key, count = 1)
    @key = key
    @count = count
    @parent = nil
    @children = []
  end

  # Appends child and makes self its parent.
  def add_child(child)
    @children << child
    child.parent = self
  end

  # Returns the direct child with the given key or nil.
  def find_child(key)
    children.find { |child| child.key == key }
  end

  # prints tree structure
  # Children are separated by a newline plus 4 spaces per depth level.
  def to_s(depth = 0)
    label = key ? "#{@key}:#{count} " : ''
    separator = "\n" + ' ' * 4 * depth
    label + @children.map { |child| child.to_s(depth + 1) }.join(separator)
  end

  def increase_count(value = 1)
    @count += value
  end

  # Returns a PrefixPath with the keys of all ancestors, nearest ancestor
  # first and without the root; its support is the count of this node.
  # Called on a root node it returns an empty path.
  def prefix_path
    result = FpGrowth::PrefixPath.new(count)
    node = parent
    while node && !node.is_root?
      result << node.key
      node = node.parent
    end
    result
  end

  # Attaches deep copies of all children to new_parent.
  def clone_children(new_parent)
    @children.each do |child|
      cloned_child = child.clone
      cloned_child.parent = new_parent
      new_parent.children << cloned_child
    end
  end

  # walks recursive through whole tree, without self
  def each &block
    children.each do |child|
      block.call(child)
      child.each(&block)
    end
  end

  def is_root?
    !parent
  end

  protected

  # Deep copy of this node and its descendants; the copy has no parent.
  def clone
    result = FpGrowth::FpTreeNode.new(@key, @count)
    clone_children(result)
    result
  end

end
@@ -0,0 +1,51 @@
1
# Ordered list of HeaderListNodes, one per item key.
# Uses array as storage. Slow lookup but keeps order of items
class FpGrowth::HeaderList

  # takes array of [key,count] or array of keys
  # (the first element decides: if it responds to :last, pairs are assumed)
  def initialize(items)
    keys = items.first.respond_to?(:last) ? get_item_keys(items) : items
    @array = keys.map { |key| FpGrowth::HeaderListNode.new(key) }
  end

  # lookup element via key
  # if key is not found, a new element is added
  def [](key)
    node = @array.find { |item| item.key == key }
    unless node
      node = FpGrowth::HeaderListNode.new(key)
      @array << node
    end
    node
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  def each_key
    @array.each { |node| yield(node.key) }
  end

  def all_keys
    @array.map { |node| node.key }
  end

  def to_s
    @array.map { |node| node.to_s }.join("\n")
  end

  # needed for cloning of fptree
  #
  # Reorders the list by descending count and returns the sorted array of
  # nodes. The new order is stored, so callers that ignore the return value
  # still get a sorted list. Nodes with equal counts keep their relative
  # order; every count is computed only once.
  def sort
    ranked = []
    @array.each_with_index { |node, index| ranked << [-node.count, index, node] }
    @array = ranked.sort_by { |negative_count, index, _node| [negative_count, index] }.
      map { |entry| entry.last }
  end

  private

  # takes array of [key,count]
  # returns array of keys
  def get_item_keys(items)
    items.map { |item| item.first }
  end
end
@@ -0,0 +1,44 @@
1
# Header list entry for one item key: collects all tree nodes with that key.
class FpGrowth::HeaderListNode

  attr_reader :key

  def initialize(key)
    @key = key
    @array = []
  end

  # delegate to @array
  def <<(tree_node)
    @array << tree_node
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  # returns array of all prefix paths
  # (paths without any element are left out)
  def conditional_pattern_base
    paths = @array.map { |tree_node| tree_node.prefix_path }
    paths.reject { |path| path.to_a.empty? }
  end

  # Builds the FpTree of the conditional pattern base of this item.
  def conditional_fp_tree(min_support)
    base = conditional_pattern_base
    items = FpGrowth::FpTree.get_items(base)
    FpGrowth::FpTree.new(min_support, items, base)
  end

  # The pattern consisting of this item alone, with its total count as support.
  def immediate_frequent_pattern
    FpGrowth::PrefixPath.new(count, [key])
  end

  # TODO count could be given at initialize if known
  # Sum of the counts of all linked tree nodes.
  def count
    total = 0
    @array.each { |tree_node| total += tree_node.count }
    total
  end

  def to_s
    nodes = @array.map { |tree_node| "#{tree_node.key}:#{tree_node.count}" }
    "#{key}:#{count} -> #{nodes.join(' ')}"
  end
end
@@ -0,0 +1,83 @@
1
# Set-like helpers on Array used by the fp-growth implementation.
# Arrays are treated as sets: element order is ignored when comparing.
class Array
  # # slow, but nonrecursive solution
  # def subsets_slow(include_empty = false, just_real = false)
  #   bits = size
  #   limit = 2 ** bits - 1
  #   result = []
  #   (include_empty ? 0 : 1).upto(just_real ? (limit - 1) : limit) do |count|
  #     mask = sprintf('%01$*2$b', count, bits).split(//).collect { |bit| bit == '1' }
  #
  #     temp = []
  #     mask.each_index do |index|
  #       temp << self[index] if mask[index]
  #     end
  #     result << temp
  #   end
  #   result
  # end

  # modified from http://branch14.org/snippets/subsets_in_ruby.html
  #
  # Returns the subsets of self as an array of arrays.
  # include_empty:: also return the empty set
  # just_real::     leave out the last subset of the enumeration (self)
  # ranks::         nil, a single size or a collection of sizes; when given,
  #                 only subsets of these sizes are returned
  def subsets(include_empty = false, just_real = false, ranks = nil)
    result = all_subsets
    result.shift unless include_empty
    result.pop if just_real
    return result unless ranks
    result.select do |subset|
      ranks.respond_to?(:include?) ? ranks.include?(subset.size) : ranks == subset.size
    end
  end

  # All 2**size subsets; the empty set comes first, self comes last.
  # Built iteratively from the last element to the first, which yields the
  # same order as the recursive definition without one array copy per level.
  def all_subsets
    reverse.inject([[]]) do |sets, item|
      sets.concat(sets.map { |set| [item] + set })
    end
  end
  protected :all_subsets

  # true if other is a non-empty set with fewer elements than self and every
  # element of other has a counterpart in self (so [1,2] is not a superset of
  # [1,2]). Checked directly instead of enumerating all subsets of self.
  # Returns true or false.
  def is_superset_of?(other)
    return false unless other.size.between?(1, size - 1)
    other.all? { |item| any? { |candidate| content_match?(item, candidate) } }
  end

  # true if self is non-empty, not larger than other and every element of
  # self has a counterpart in other. Returns true or false.
  def is_subset_of?(other)
    return false unless size.between?(1, other.size)
    contained_in?(other)
  end

  # Like is_subset_of?, but self must have fewer elements than other.
  def is_real_subset_of?(other)
    return false unless size.between?(1, other.size - 1)
    contained_in?(other)
  end

  # Returns [subset, rest] pairs. By default the empty set and self are left
  # out; with allow_empty both are included.
  def parts(allow_empty = false)
    subsets(allow_empty, !allow_empty).map do |subset|
      [ subset, self - subset]
    end
  end

  # true if both arrays have the same size and every element of self has a
  # counterpart in other, regardless of order. Nested arrays are compared
  # with content_equal? as well. nil and false elements are matched like any
  # other value.
  # NOTE: duplicates are not counted, e.g. [1, 1] is content equal to [1, 2].
  def content_equal?(other)
    return false if other.size != size
    contained_in?(other)
  end

  private

  # true if every element of self has a counterpart in other
  def contained_in?(other)
    all? { |item| other.any? { |candidate| content_match?(item, candidate) } }
  end

  # element comparison shared by all predicates above
  def content_match?(item, candidate)
    item.respond_to?(:content_equal?) ? item.content_equal?(candidate) : item == candidate
  end
end
77
+
78
+ #puts [1,2].is_subset_of?([1,2]).inspect
79
+ #puts [1,2].is_subset_of?([1,2,3]).inspect
80
+ #puts [1,2].is_real_subset_of?([1,2]).inspect
81
+ #puts [1,2].is_real_subset_of?([1,2,3]).inspect
82
+ #puts [1,2,3].is_superset_of?([1,2]).inspect
83
+ #puts [1,2].is_superset_of?([1,2]).inspect
@@ -0,0 +1,41 @@
1
# A list of item keys together with a support count. Used for transactions,
# prefix paths and mined patterns.
class FpGrowth::PrefixPath

  attr_reader :support

  def initialize(support = 1, array = [])
    @array = array
    @support = support
  end

  # delegate
  def each(&block)
    @array.each(&block)
  end

  # delegate
  def <<(arg)
    @array << arg
  end

  #delegate
  def size
    @array.size
  end

  # Concatenates the items of both paths; the support of the result is the
  # smaller of both supports. Returns self (not a copy) if other is nil.
  def +(other)
    return self unless other
    FpGrowth::PrefixPath.new([@support, other.support].min, @array + other.to_a)
  end

  def to_s
    "[#{@array}:#{support}]"
  end

  # The underlying array itself, not a copy.
  def to_a
    @array
  end

  # Paths are equal if they have the same support and the same items,
  # regardless of item order. Objects that do not look like a path (no
  # support or no to_a, e.g. nil) are never equal instead of raising.
  def ==(other)
    return false unless other.respond_to?(:support) && other.respond_to?(:to_a)
    (support == other.support) && (to_a.content_equal?(other.to_a))
  end
end
@@ -0,0 +1,24 @@
1
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
2
+ require 'fp_growth'
3
+
4
# Micro benchmark: runs both mining variants 100 times on the same tree and
# prints the elapsed wall clock time of each.
#
# By default the tree is read from the file written by FpGrowth::FpTree#save.
# Set use_sample_data to true to build it from the paper's example instead.
use_sample_data = false

tree =
  if use_sample_data
    sample = [
      ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
      ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
      ['b', 'f', 'h', 'j', 'o'],
      ['b', 'c', 'k', 's', 'p'],
      ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
    ]
    FpGrowth::FpTree.new(3, FpGrowth::FpTree.get_items(sample), sample)
  else
    FpGrowth::FpTree.load
  end

%w[fp_growth fp_growth_slow].each do |name|
  print name
  started = Time.now
  100.times { tree.send(name) }
  puts " took #{Time.now - started}s"
end
@@ -0,0 +1,55 @@
1
# Checks that the optimized mining (fp_growth) finds exactly the patterns of
# the reference implementation (fp_growth_slow).
class TestFpTree < Test::Unit::TestCase

  # Uses the tree stored in the default file of FpGrowth::FpTree.load.
  def test_file
    assert_both_algorithms_equal FpGrowth::FpTree.load
  end

  # fails with ruby-1.8.7, ruby-1.9.1, jruby-1.3.0rc1
  def test_something
    rows = [
      [1, 2, 4, 3],
      [1, 2, 4],
      [1, 2],
      [1, 3, 4],
      [3]
    ]
    assert_both_algorithms_equal create_tree(2, rows)
  end

  private

  # fp_growth_slow is the reference, fp_growth the implementation under test.
  def assert_both_algorithms_equal(tree)
    reference = tree.fp_growth_slow
    candidate = tree.fp_growth

    # optional failure message, currently disabled:
    # text = "wrong has\n" << pp_hash(generate_difference_hash(candidate, reference))
    # text << "\nright has\n" << pp_hash(generate_difference_hash(reference, candidate))
    assert reference.content_equal?(candidate)#, text
  end

  # All patterns on one line, separated by blanks.
  def pp_frequent_patterns(arg)
    arg.map { |pattern| pattern.to_s }.join(' ')
  end

  def create_tree(min_support, transactions)
    items = FpGrowth::FpTree.get_items(transactions)
    FpGrowth::FpTree.new(min_support, items, transactions)
  end

  # result holds elements in a but not in b
  # keys are the sorted items of a pattern, values its support
  def generate_difference_hash(a, b)
    remaining = a.inject({}) do |memo, pattern|
      memo[pattern.to_a.sort] = pattern.support
      memo
    end
    b.each do |pattern|
      items = pattern.to_a.sort
      remaining.delete(items) if remaining[items] == pattern.support
    end
    remaining
  end

  # One "item item ...:support" line per entry.
  def pp_hash(hash)
    lines = hash.map do |items, support|
      "#{items.map { |item| item.to_s }.join(' ')}:#{support}"
    end
    lines.join("\n")
  end
end
54
+
55
+ # what is used to test equality on array - array?
@@ -0,0 +1,14 @@
1
+ require 'test/unit'
2
+
3
+ begin
4
+ require 'rubygems'
5
+ require 'fp_growth'
6
+ rescue LoadError
7
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
8
+ require 'fp_growth'
9
+ end
10
+
11
+
12
# Load every test case file (tc_*.rb) that lives next to this suite file.
test_case_pattern = File.join(File.dirname(__FILE__), 'tc_*.rb')
Dir[test_case_pattern].each { |test_case| require test_case }
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fp-growth
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Stefan Achatz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-08 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: |
17
+ This is an implementation of the fp-growth frequent pattern mining algorithm as
18
+ stated in the paper
19
+
20
+ Mining Frequent Patterns without Candidate Generation: A Frequent-Pattern Tree Approach
21
+ Han et al, Data Mining and Knowledge Discovery, 8, 53-87, 2004
22
+
23
+ email: stefan_achatz@web.de
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ files:
31
+ - examples/small_ones.rb
32
+ - examples/cao.rb
33
+ - lib/fp_growth.rb
34
+ - lib/fp_growth/fp_tree.rb
35
+ - lib/fp_growth/fp_tree_node.rb
36
+ - lib/fp_growth/header_list.rb
37
+ - lib/fp_growth/inflections.rb
38
+ - lib/fp_growth/prefix_path.rb
39
+ - lib/fp_growth/header_list_node.rb
40
+ - tests/tc_fp_tree.rb
41
+ - tests/profile_fp_tree.rb
42
+ - tests/ts_all.rb
43
+ - README
44
+ has_rdoc: true
45
+ homepage: http://rubyforge.org/projects/fp-growth/
46
+ licenses: []
47
+
48
+ post_install_message:
49
+ rdoc_options:
50
+ - --main
51
+ - README
52
+ - --title
53
+ - fp-growth Documentation
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ requirements: []
69
+
70
+ rubyforge_project: fp-growth
71
+ rubygems_version: 1.3.4
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Implementation of the fp-growth frequent pattern algorithm
75
+ test_files:
76
+ - tests/ts_all.rb