fp-growth 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,5 @@
1
+ This is an implementation of the fp-growth frequent pattern mining algorithm as
2
+ stated in the paper
3
+
4
+ Mining Frequent Patterns without Candidate Generation: A Frequent-Pattern Tree Approach
5
+ Han et al, Data Mining and Knowledge Discovery, 8, 53-87, 2004
@@ -0,0 +1,207 @@
1
+ begin
2
+ require 'rubygems'
3
+ require 'fp_growth'
4
+ rescue LoadError
5
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
+ require 'fp_growth'
7
+ end
8
+
9
+ require 'mysql'
10
+ require 'singleton'
11
+
12
+ HOST =
13
+ USER =
14
+ PASSWD =
15
+ DB =
16
+
17
+ # This example shows the usage of fp-growth with a database from an
18
+ # inventory management system called cao_faktura (http://www.cao-faktura.de/)
19
# Singleton wrapper around the MySQL connection to a cao_faktura database.
# It reads the JOURNALPOS table and converts it into the inputs expected by
# FpGrowth::FpTree: sorted [key, count] item lists and arrays of transactions.
#
# The connection parameters come from the top-level HOST, USER, PASSWD and DB
# constants.
class CaoDb

  include Singleton

  def initialize
    connect
  end

  # Builds an fp-tree over article ids (one transaction per JOURNAL_ID).
  # min_support is an absolute count.
  def article_tree(min_support)
    FpGrowth::FpTree.new(min_support, article_ids(min_support), article_transactions)
  end

  # Builds an fp-tree over category ids (WARENGRUPPE); the item counts are
  # derived from the transactions themselves.
  def category_tree(min_support)
    transactions = category_transactions
    FpGrowth::FpTree.new(min_support, FpGrowth::FpTree.get_items(transactions), transactions)
  end

  # Returns ARTIKEL.KURZNAME for the given REC_ID, or nil if no row matches.
  def article_name(id)
    first_value(%Q!
      SELECT KURZNAME
      FROM ARTIKEL
      WHERE REC_ID = #{id}
    !)
  end

  # Returns WARENGRUPPEN.NAME for the given ID, or nil if no row matches.
  def category_name(id)
    first_value(%Q!
      SELECT NAME
      FROM WARENGRUPPEN
      WHERE ID = #{id}
    !)
  end

  # Returns [article_id, count] pairs for all articles that occur in at least
  # min_support journals, most frequent first.
  #
  # The SQL comments (German) say: a receipt may contain several positions
  # with the same article and must still be counted only once; ordering by
  # two columns makes the sort order unambiguous.
  def article_ids(min_support)
    integer_pairs(%Q!
      SELECT ARTIKEL_ID,
      # ein Bon kann mehrere journalpos mit ident. Artikel enthalten
      # und soll trotzdem nur einmal gezählt werden
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY ARTIKEL_ID
      HAVING anzahl >= #{min_support}
      # mit zwei spalten ist Sortierung eindeutig
      ORDER BY anzahl DESC, ARTIKEL_ID DESC
    !)
  end

  # Returns [category_id, count] pairs for all categories that occur in at
  # least min_support journals, most frequent first.
  def category_ids(min_support)
    # Group by the selected column; grouping by ARTIKEL_ID (as the article
    # query does) would count per article instead of per category.
    integer_pairs(%Q!
      SELECT WARENGRUPPE,
      COUNT(DISTINCT JOURNAL_ID) AS anzahl
      FROM JOURNALPOS
      GROUP BY WARENGRUPPE
      HAVING anzahl >= #{min_support}
      ORDER BY anzahl DESC, WARENGRUPPE DESC
    !)
  end

  # Returns one array of article ids per journal.
  def article_transactions
    id_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(ARTIKEL_ID SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Returns one array of category ids per journal.
  def category_transactions
    id_lists(%q!
      SELECT JOURNAL_ID,
      CONVERT(GROUP_CONCAT(WARENGRUPPE SEPARATOR ',') USING UTF8)
      FROM JOURNALPOS
      GROUP BY JOURNAL_ID
    !)
  end

  # Number of rows in JOURNAL.
  def count_transactions
    first_value(%q!
      SELECT COUNT(*) FROM JOURNAL
    !).to_i
  end

  # Mines the article tree and prints all association rules that reach
  # min_confidence.
  def do_articles(min_support, min_confidence)
    # FpTree has no create_assoziation_rules method; the rules are derived
    # from the mined patterns by FpGrowth::Helper. The tree is built first so
    # that the file defining Helper is already loaded when it is referenced.
    frequent_patterns = article_tree(min_support).fp_growth
    print_article_assoziation_rules(
      FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
    )
  end

  private

  def connect
    @connection = Mysql::new(HOST, USER, PASSWD, DB)
    @connection.query('SET NAMES "UTF8"')
  end

  # Runs sql, yields every row and returns the collected block results.
  # The result set is always freed, even if the block raises.
  def collect_rows(sql)
    data = @connection.query(sql)
    begin
      result = []
      data.each { |row| result << yield(row) }
      result
    ensure
      data.free
    end
  end

  # First column of the first row of sql, or nil when there is no row.
  def first_value(sql)
    collect_rows(sql) { |row| row[0] }.first
  end

  # Rows of sql as [integer, integer] pairs built from the first two columns.
  def integer_pairs(sql)
    collect_rows(sql) { |row| [ row[0].to_i, row[1].to_i ] }
  end

  # Second column of every row of sql, split at ',' and converted to integers.
  def id_lists(sql)
    collect_rows(sql) { |row| row[1].split(',').collect { |id| id.to_i } }
  end
end
141
+
142
# Tells whether +wanted+ is made redundant by another rule in +all+: a rule
# with the same right side, a left side for which is_superset_of? holds, and
# a confidence within +/- beta of the confidence of +wanted+.
def is_unnecessary_rule?(all, wanted, beta)
  all.any? do |candidate|
    candidate[:right] == wanted[:right] &&
      candidate[:left].is_superset_of?(wanted[:left]) &&
      candidate[:confidence].between?(wanted[:confidence] - beta, wanted[:confidence] + beta)
  end
end
151
+
152
# Returns the rules of +all+ that are not flagged by is_unnecessary_rule?
# (checked against the complete list, using tolerance +beta+).
def remove_unnecessary_rules(all, beta)
  all.reject { |candidate| is_unnecessary_rule?(all, candidate, beta) }
end
155
+
156
# Prints every rule of +array+ on its own line, resolving article names.
def print_article_assoziation_rules(array)
  array.each do |rule|
    print_article_assoziation_rule(rule)
  end
end
159
+
160
# Prints one rule as "[(id:name)...] => [(id:name)...] = s<support> c<confidence>".
# Article names are looked up through CaoDb, left side first.
def print_article_assoziation_rule(rule)
  left = rule[:left].collect { |id| "(#{id}:#{CaoDb.instance.article_name(id)})" }.join
  right = rule[:right].collect { |id| "(#{id}:#{CaoDb.instance.article_name(id)})" }.join
  stats = sprintf('s%i c%.2f', rule[:support], rule[:confidence])
  puts '[' + left + '] => [' + right + '] = ' + stats
end
172
+
173
# Prints every rule of +array+ as
# "[(id:name)...] => [(id:name)...] = s<support> c<confidence>",
# resolving category names through CaoDb, left side first.
def print_category_assoziation_rules(array)
  array.each do |rule|
    left = rule[:left].collect { |id| "(#{id}:#{CaoDb.instance.category_name(id)})" }.join
    right = rule[:right].collect { |id| "(#{id}:#{CaoDb.instance.category_name(id)})" }.join
    stats = sprintf('s%i c%.2f', rule[:support], rule[:confidence])
    puts '[' + left + '] => [' + right + '] = ' + stats
  end
end
187
+
188
# Mines frequent category sets and prints the association rules that reach
# min_confidence.
def do_categories(min_support, min_confidence)
  tree = CaoDb.instance.category_tree(min_support)
  frequent_patterns = tree.fp_growth
  rules = FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
  print_category_assoziation_rules(rules)
end
192
+
193
# Mines frequent article sets, derives the association rules that reach
# min_confidence, drops the unnecessary ones (tolerance 0.2) and prints the
# rest.
def do_articles(min_support, min_confidence)
  tree = CaoDb.instance.article_tree(min_support)
  frequent_patterns = tree.fp_growth
  rules = FpGrowth::Helper.create_assoziation_rules(frequent_patterns, min_confidence)
  print_article_assoziation_rules(remove_unnecessary_rules(rules, 0.2))
end
202
+
203
# Driver: mine article rules with an absolute support threshold.
# abs = (rel * database.count_transactions).round
min_support, min_confidence = 9, 0.5

do_articles(min_support, min_confidence)
#CaoDb.instance.article_tree(min_support).save
@@ -0,0 +1,56 @@
1
+ begin
2
+ require 'rubygems'
3
+ require 'fp_growth'
4
+ rescue LoadError
5
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
+ require 'fp_growth'
7
+ end
8
+
9
# Example from the fp-growth paper: five transactions, minimum support 3.
# Returns the resulting FpGrowth::FpTree.
def example_1 # paper
  transactions = [
    ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
    ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
    ['b', 'f', 'h', 'j', 'o'],
    ['b', 'c', 'k', 's', 'p'],
    ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
  ]
  # get_items is a class method of FpTree; the original referenced an
  # undefined local `v` here and raised NameError.
  FpGrowth::FpTree.new(3, FpGrowth::FpTree.get_items(transactions), transactions)
end
19
+
20
# Example from the book, p. 123: five transactions, minimum support 3.
# Returns the resulting FpGrowth::FpTree.
def example_2
  baskets = [
    [1, 3, 6, 8, 9],
    [1, 2, 3, 6, 7, 8],
    [2, 4, 6],
    [2, 3, 4, 9],
    [1, 3, 5, 6, 7, 8, 9]
  ]
  items = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(3, items, baskets)
end
30
+
31
# Eight small integer transactions, minimum support 2.
# Returns the resulting FpGrowth::FpTree.
def example_3
  baskets = [
    [1, 3, 4],
    [4],
    [1, 2, 4, 5],
    [1, 6],
    [1, 2],
    [1, 6],
    [1, 4],
    [1, 2, 4]
  ]
  items = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(2, items, baskets)
end
44
+
45
# Example from the adaptive-fp thesis: five transactions, minimum support 2.
# Returns the resulting FpGrowth::FpTree.
def example_4
  baskets = [
    %w[a c d e f],
    %w[a b e],
    %w[c e f],
    %w[a c d f],
    %w[c e f]
  ]
  items = FpGrowth::FpTree.get_items(baskets)
  FpGrowth::FpTree.new(2, items, baskets)
end
55
+
56
+ puts example_4
@@ -0,0 +1,9 @@
1
+ require 'fp_growth/inflections'
2
+
3
# Namespace of the fp-growth implementation. All constants are loaded lazily
# on first use.
module FpGrowth
  autoload :FpTreeNode, 'fp_growth/fp_tree_node'
  autoload :FpTree, 'fp_growth/fp_tree'
  # Helper is defined in the same file as FpTree. Without its own entry,
  # referencing FpGrowth::Helper before FpTree raises NameError.
  autoload :Helper, 'fp_growth/fp_tree'
  autoload :HeaderListNode, 'fp_growth/header_list_node'
  autoload :HeaderList, 'fp_growth/header_list'
  autoload :PrefixPath, 'fp_growth/prefix_path'
end
@@ -0,0 +1,220 @@
1
# Stateless helpers used by FpTree#fp_growth and for deriving association
# rules from mined patterns.
module FpGrowth::Helper
  # Takes an array of objects responding to key and count and returns one
  # PrefixPath per non-empty subset. The support of each path is the
  # smallest count within the subset.
  def self.subsets_as_prefix_path(array)
    array.subsets.collect do |combination|
      lowest_count = combination.collect { |node| node.count }.min
      keys = combination.collect { |node| node.key }
      FpGrowth::PrefixPath.new(lowest_count, keys)
    end
  end

  # takes arrays of prefix_paths
  # returns a_el + b_el for every pair, ordered by a first, then by b
  def self.cross_product(a, b)
    a.inject([]) do |pairs, a_el|
      pairs.concat(b.collect { |b_el| a_el + b_el })
    end
  end

  # TODO put handling of assoziation rules in an extra namespace
  # and copy the unnecessary rules code from examples/cao.rb in it too
  #
  # Splits every pattern with at least two items into all (left, right)
  # partitions and keeps those whose confidence
  # support(pattern) / support(left) reaches min_confidence.
  # Returns an array of hashes with the keys :left, :right, :support and
  # :confidence.
  def self.create_assoziation_rules(frequent_patterns, min_confidence)
    rules = []
    frequent_patterns.each do |pattern|
      next if pattern.size < 2
      pattern.to_a.parts.each do |left, right|
        # the pattern consisting of exactly the left-hand items
        left_pattern = frequent_patterns.find { |candidate| left.content_equal?(candidate.to_a) }
        confidence = pattern.support.to_f / left_pattern.support.to_f
        next unless confidence >= min_confidence
        rules << {
          :left => left,
          :right => right,
          :support => pattern.support,
          :confidence => confidence
        }
      end
    end
    rules
  end

end
46
+
47
# Frequent-pattern tree: a prefix tree of transactions plus a header list
# that links, per item key, all tree nodes carrying that key.
class FpGrowth::FpTree

  attr_reader :min_support, :root
  attr_accessor :header_list
  protected :header_list, :header_list=, :root

  # items is sorted array of [key, count]
  # min_support is not in %, but absolute
  # transactions (optional) are inserted immediately
  def initialize(min_support, items, transactions = [])
    @min_support = min_support
    @root = FpGrowth::FpTreeNode.new(nil)

    # the count information is lost in this step
    # HeaderList calculates this value again if needed
    @header_list = FpGrowth::HeaderList.new(select_only_frequent_items(items))

    add_transactions(transactions)
  end

  # true if no transaction contributed a node to the tree
  def empty?
    @root.children.empty?
  end

  def add_transactions(transactions)
    transactions.each do |transaction|
      add_transaction(transaction)
    end
  end

  # TODO decide: build header list online or with rebuild_header_list afterwards
  #
  # Inserts one transaction (Array or PrefixPath). Existing nodes along the
  # path get their count increased by the transaction's support; missing
  # nodes are created and registered in the header list.
  def add_transaction(transaction)
    transaction = prepare_transaction(transaction)
    increment = transaction.support
    tree_node = @root
    transaction.each do |key|
      if next_node = tree_node.find_child(key)
        next_node.increase_count(increment)
      else
        next_node = FpGrowth::FpTreeNode.new(key, increment)
        tree_node.add_child(next_node)

        @header_list[key] << next_node
      end

      tree_node = next_node
    end
  end

  # Tree structure followed by the header list.
  def to_s
    result = "Tree (min_s = #{@min_support})\n"
    result << @root.to_s
    result << "\nHeader list\n"
    result << @header_list.to_s
    result
  end

  # arg is new root
  # returns a new FpTree holding clones of everything below root_node
  def subtree(root_node)
    result = FpGrowth::FpTree.new(@min_support, [])
    root_node.clone_children(result.root)
    result.rebuild_header_list
    result
  end

  # Mines the tree and returns an array of PrefixPath (items plus support).
  # The tree is split into its single prefix path and the multipath rest;
  # both parts are mined separately and then combined. alpha is the suffix
  # pattern appended to every result (nil at the top level).
  def fp_growth(alpha = nil)
    single, multi = split_paths
    single_teil = single ? FpGrowth::Helper.subsets_as_prefix_path(single) : []
    multi_teil = []
    if multi
      multi.header_list.each do |node|
        immediate_frequent_pattern = node.immediate_frequent_pattern
        multi_teil << immediate_frequent_pattern
        new_tree = node.conditional_fp_tree(@min_support)
        if !new_tree.empty?
          multi_teil += new_tree.fp_growth(immediate_frequent_pattern)
        end
      end
    end
    FpGrowth::Helper.cross_product(single_teil + multi_teil + FpGrowth::Helper.cross_product(single_teil, multi_teil), [alpha])
  end

  # slow version used only as reference for testing
  # does not split tree in single- and multipath parts
  def fp_growth_slow(alpha = nil)
    result = []
    header_list.each do |node|
      base_path = node.immediate_frequent_pattern + alpha
      result << base_path
      new_tree = node.conditional_fp_tree(@min_support)
      if !new_tree.empty?
        result += new_tree.fp_growth_slow(base_path)
      end
    end
    result
  end

  # can be used with Array or PrefixPath
  # returns sorted array of [key, count], highest count first
  # a transaction that responds to support counts with that weight, else 1
  def self.get_items(transactions)
    result = {}

    transactions.each do |transaction|

      increment = transaction.respond_to?(:support) ? transaction.support : 1

      transaction.each do |item|
        result[item] ||= 0
        result[item] += increment
      end
    end
    result.to_a.
      sort { |a, b| b.last <=> a.last }
  end

  # Serializes the tree with Marshal.
  # Marshal output is binary, so the file is opened in binary mode; text
  # mode corrupts the data on platforms that translate line endings.
  def save(filename = 'fp_tree.txt')
    File.open(filename, 'wb') do |file|
      file.write Marshal.dump(self)
    end
  end

  # Reads a tree written by save.
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only load
  # files from a trusted source.
  def self.load(filename = 'fp_tree.txt')
    File.open(filename, 'rb') do |file|
      Marshal.load(file.read)
    end
  end

  protected

  # Registers every node below the root in the header list, then asks the
  # header list to sort itself.
  def rebuild_header_list
    @root.each do |node|
      @header_list[node.key] << node
    end
    @header_list.sort
  end

  private

  # returnvalue is array of two elements
  # first element is nil or array of tree_nodes which form the single prefix path
  # second element is nil or FpTree which forms the multipath part
  def split_paths
    return [nil, self] if @root.children.size != 1
    single = []
    node = @root
    while node.children.size == 1
      # this leaves out root
      node = node.children.first
      single << node
    end
    multi = node.children.empty? ? nil : subtree(node)
    [single, multi]
  end

  # removes non-frequent-items and sorts remaining frequent-items on stored ordered-items
  # arg can be Array or PrefixPath
  # returns PrefixPath
  def prepare_transaction(transaction)
    support = transaction.respond_to?(:support) ? transaction.support : 1
    # relies on the fact that Array#& keeps order of first array
    FpGrowth::PrefixPath.new(support, @header_list.all_keys & transaction.to_a)
  end

  # takes array of [key,count]
  # keeps order
  def select_only_frequent_items(items)
    items.select { |item| item.last >= @min_support }
  end

end
@@ -0,0 +1,81 @@
1
+ require 'set'
2
+
3
+ # knows nothing about header list
4
# One node of the fp-tree: an item key, its count, a parent link and an
# array of children. The root node has key nil and no parent.
class FpGrowth::FpTreeNode

  attr_reader :key, :children, :count
  attr_accessor :parent

  protected :parent=

  def initialize(key, count = 1)
    @key = key
    @count = count
    @parent = nil
    @children = []
  end

  # Appends child and makes this node its parent.
  def add_child(child)
    @children << child
    child.parent = self
  end

  # Returns the direct child with the given key, or nil.
  def find_child(key)
    children.find { |child| child.key == key }
  end

  # prints tree structure
  # every node is written as "key:count "; siblings are separated by a
  # newline followed by 4 * depth spaces
  def to_s(depth = 0)
    result = key ? "#{@key}:#{count} " : ''
    result << @children.collect { |child| child.to_s(depth + 1) }.join("\n" << ' ' * 4 * depth)
    result
  end

  def increase_count(value = 1)
    @count += value
  end

  # Returns a PrefixPath holding the keys of all ancestors (nearest first,
  # root excluded) with this node's count as support.
  # A node without a parent yields an empty path instead of raising.
  def prefix_path
    result = FpGrowth::PrefixPath.new(count)
    node = parent
    while node && !node.is_root?
      result << node.key
      node = node.parent
    end
    result
  end

  # Deep-copies all children of this node and attaches the copies to
  # new_parent.
  def clone_children(new_parent)
    @children.each do |child|
      cloned_child = child.clone
      cloned_child.parent = new_parent
      new_parent.children << cloned_child
    end
  end

  # walks recursive through whole tree, without self
  # each child is yielded before its own descendants
  def each(&block)
    children.each do |child|
      block.call(child)
      child.each(&block)
    end
  end

  def is_root?
    !parent
  end

  protected

  # Deep copy of this node and everything below it; the copy has no parent.
  def clone
    result = FpGrowth::FpTreeNode.new(@key, @count)
    clone_children(result)
    result
  end

end
@@ -0,0 +1,51 @@
1
+ # Uses array as storage. Slow lookup but keeps order of items
2
# Ordered list of HeaderListNode, one per item key.
# Array-backed: lookup is linear, but the order of the items is kept.
class FpGrowth::HeaderList

  # takes array of [key,count] or array of keys
  def initialize(items)
    items = items.first.respond_to?(:last) ? get_item_keys(items) : items
    @array = items.collect { |item| FpGrowth::HeaderListNode.new(item) }
  end

  # lookup element via key
  # if key is not found, a new element is added
  def [](key)
    if existing_node = @array.find { |item| item.key == key }
      existing_node
    else
      new_node = FpGrowth::HeaderListNode.new(key)
      @array << new_node
      new_node
    end
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  def each_key
    @array.each { |node| yield(node.key) }
  end

  def all_keys
    @array.collect { |node| node.key }
  end

  def to_s
    @array.collect { |node| node.to_s }.join("\n")
  end

  # needed for cloning of fptree
  # Orders the entries by descending count and returns the sorted array.
  # Sorts in place: the non-destructive Array#sort used before left the list
  # unchanged, because the only caller discards the return value.
  def sort
    @array.sort! { |a, b| b.count <=> a.count }
  end

  private

  # takes array of [key,count]
  # returns array of keys
  def get_item_keys(items)
    items.collect { |item| item.first }
  end
end
@@ -0,0 +1,44 @@
1
# Header list entry: collects all tree nodes that carry the same item key.
class FpGrowth::HeaderListNode

  attr_reader :key

  def initialize(key)
    @key = key
    @array = []
  end

  # delegate to @array
  def <<(tree_node)
    @array << tree_node
  end

  # delegate to @array
  def each(&block)
    @array.each(&block)
  end

  # returns array of all prefix paths, empty ones removed
  def conditional_pattern_base
    paths = @array.collect { |tree_node| tree_node.prefix_path }
    paths.reject { |path| path.to_a.empty? }
  end

  # FpTree built from the conditional pattern base of this item
  def conditional_fp_tree(min_support)
    pattern_base = conditional_pattern_base
    items = FpGrowth::FpTree.get_items(pattern_base)
    FpGrowth::FpTree.new(min_support, items, pattern_base)
  end

  # PrefixPath holding only this key, with the summed count as support
  def immediate_frequent_pattern
    FpGrowth::PrefixPath.new(count, [key])
  end

  # TODO count could be given at initialize if known
  # sum of the counts of all linked tree nodes
  def count
    @array.collect { |tree_node| tree_node.count }.inject(0) { |total, value| total + value }
  end

  def to_s
    linked = @array.collect { |tree_node| "#{tree_node.key}:#{tree_node.count}" }
    "#{key}:#{count} -> " + linked.join(' ')
  end
end
@@ -0,0 +1,83 @@
1
# Set-like helpers on Array used by the fp-growth implementation.
class Array
  # # slow, but nonrecursive solution
  # def subsets_slow(include_empty = false, just_real = false)
  #   bits = size
  #   limit = 2 ** bits - 1
  #   result = []
  #   (include_empty ? 0 : 1).upto(just_real ? (limit - 1) : limit) do |count|
  #     mask = sprintf('%01$*2$b', count, bits).split(//).collect { |bit| bit == '1' }
  #
  #     temp = []
  #     mask.each_index do |index|
  #       temp << self[index] if mask[index]
  #     end
  #     result << temp
  #   end
  #   result
  # end

  # modified from http://branch14.org/snippets/subsets_in_ruby.html
  #
  # Returns all subsets as arrays.
  # include_empty:: keep the empty subset (first element)
  # just_real::     drop the complete set (last element)
  # ranks::         a size, or a collection of sizes, the subsets must have
  def subsets(include_empty = false, just_real = false, ranks = nil)
    result = all_subsets
    result.shift unless include_empty
    result.pop if just_real
    if ranks
      result = result.select do |subset|
        ranks.respond_to?(:include?) ? ranks.include?(subset.size) : ranks == subset.size
      end
    end
    result
  end

  # All 2**size subsets; the empty one comes first, the complete set last.
  # Built iteratively from the last element to the first, which yields the
  # same order as the former recursive version.
  def all_subsets
    reverse.inject([[]]) do |sets, item|
      sets + sets.collect { |set| [item] + set }
    end
  end
  protected :all_subsets

  # True if other is non-empty, smaller than self and every element of
  # other has a match in self (i.e. self is a real superset).
  # Checks membership directly instead of enumerating all 2**size subsets.
  def is_superset_of?(other)
    !other.empty? && other.size < size &&
      other.all? { |item| any? { |own| items_match?(item, own) } }
  end

  # True if self is non-empty, not larger than other and every element of
  # self has a match in other.
  def is_subset_of?(other)
    !empty? && size <= other.size && contained_in?(other)
  end

  # Like is_subset_of?, but self must be smaller than other.
  def is_real_subset_of?(other)
    !empty? && size < other.size && contained_in?(other)
  end

  # Returns every split of self into [subset, rest]. Unless allow_empty is
  # set, neither side is empty.
  def parts(allow_empty = false)
    subsets(allow_empty, !allow_empty).collect do |subset|
      [ subset, self - subset]
    end
  end

  # Order-independent comparison: the sizes are equal and every element of
  # self has a match in other. Elements that respond to content_equal? are
  # compared with it, all others with ==.
  # Uses any? so that nil and false elements can match as well; with find a
  # matching nil or false was treated as "not found".
  def content_equal?(other)
    return false if other.size != size
    contained_in?(other)
  end

  private

  # true if every element of self has a match in other
  def contained_in?(other)
    all? { |item| other.any? { |other_item| items_match?(item, other_item) } }
  end

  # element comparison used by all set predicates
  def items_match?(item, other_item)
    item.respond_to?(:content_equal?) ? item.content_equal?(other_item) : item == other_item
  end
end
77
+
78
+ #puts [1,2].is_subset_of?([1,2]).inspect
79
+ #puts [1,2].is_subset_of?([1,2,3]).inspect
80
+ #puts [1,2].is_real_subset_of?([1,2]).inspect
81
+ #puts [1,2].is_real_subset_of?([1,2,3]).inspect
82
+ #puts [1,2,3].is_superset_of?([1,2]).inspect
83
+ #puts [1,2].is_superset_of?([1,2]).inspect
@@ -0,0 +1,41 @@
1
# An array of item keys together with a support value. Used for
# transactions, for conditional pattern bases and for mined patterns.
class FpGrowth::PrefixPath

  attr_reader :support

  def initialize(support = 1, array = [])
    @support = support
    @array = array
  end

  # delegate
  def each(&block)
    @array.each(&block)
  end

  # delegate
  def <<(arg)
    @array << arg
  end

  #delegate
  def size
    @array.size
  end

  # Concatenates the items of both paths; the support of the result is the
  # smaller of the two supports. Returns self when other is nil or false.
  def +(other)
    return self unless other
    combined_support = [@support, other.support].min
    combined_items = @array + other.to_a
    FpGrowth::PrefixPath.new(combined_support, combined_items)
  end

  def to_s
    '[' + @array.to_s + ':' + support.to_s + ']'
  end

  def to_a
    @array
  end

  # Equal when the supports match and the items are content_equal?
  # (order-independent).
  def ==(other)
    return false unless support == other.support
    to_a.content_equal?(other.to_a)
  end
end
@@ -0,0 +1,24 @@
1
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
2
+ require 'fp_growth'
3
+
4
# Set to true to profile the small example from the paper; by default the
# tree is read from the file written by FpTree#save.
use_paper_example = false

tree =
  if use_paper_example
    min_support = 3
    transactions = [
      ['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'],
      ['a', 'b', 'c', 'f', 'l', 'm', 'o'],
      ['b', 'f', 'h', 'j', 'o'],
      ['b', 'c', 'k', 's', 'p'],
      ['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']
    ]
    items = FpGrowth::FpTree.get_items(transactions)
    FpGrowth::FpTree.new(min_support, items, transactions)
  else
    FpGrowth::FpTree.load
  end

# Run each algorithm 100 times and print the elapsed wall-clock time.
[:fp_growth, :fp_growth_slow].each do |m|
  print m.to_s
  start = Time.now
  100.times { tree.send(m) }
  elapsed = Time.now - start
  puts " took #{elapsed}s"
end
@@ -0,0 +1,55 @@
1
# Checks that the optimized fp_growth returns the same patterns as the
# reference implementation fp_growth_slow.
class TestFpTree < Test::Unit::TestCase

  # uses the tree stored in the default file of FpTree.load
  def test_file
    assert_both_algorithms_equal FpGrowth::FpTree.load
  end

  # fails with ruby-1.8.7, ruby-1.9.1, jruby-1.3.0rc1
  def test_something
    tree = create_tree(2, [
      [1, 2, 4, 3],
      [1, 2, 4],
      [1, 2],
      [1, 3, 4],
      [3]
    ])
    assert_both_algorithms_equal tree
  end

  private

  # fp_growth_slow is the reference, fp_growth the version under test
  def assert_both_algorithms_equal(tree)
    reference_patterns = tree.fp_growth_slow
    tested_patterns = tree.fp_growth

    # text = "wrong has\n" << pp_hash(generate_difference_hash tested_patterns, reference_patterns)
    # text << "\nright has\n" << pp_hash(generate_difference_hash reference_patterns, tested_patterns)
    assert reference_patterns.content_equal?(tested_patterns)#, text
  end

  def pp_frequent_patterns(arg)
    arg.collect { |pattern| pattern.to_s }.join(' ')
  end

  def create_tree(min_support, transactions)
    items = FpGrowth::FpTree.get_items(transactions)
    FpGrowth::FpTree.new(min_support, items, transactions)
  end

  # result holds elements in a but not in b
  # keys are the sorted item arrays, values the supports
  def generate_difference_hash(a, b)
    hash = {}
    a.each { |item| hash[item.to_a.sort] = item.support }
    b.each do |item|
      sorted_items = item.to_a.sort
      hash.delete(sorted_items) if hash[sorted_items] == item.support
    end
    hash
  end

  # one "items:support" line per entry
  def pp_hash(hash)
    lines = hash.collect do |key, value|
      "#{key.collect { |item| item.to_s }.join(' ')}:#{value}"
    end
    lines.join("\n")
  end
end
54
+
55
+ # what is used to test equality on array - array?
@@ -0,0 +1,14 @@
1
+ require 'test/unit'
2
+
3
+ begin
4
+ require 'rubygems'
5
+ require 'fp_growth'
6
+ rescue LoadError
7
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
8
+ require 'fp_growth'
9
+ end
10
+
11
+
12
# Load every test case file (tc_*.rb) that sits next to this suite file.
# The directory is expanded to an absolute path: since Ruby 1.9.2 the current
# directory is no longer on the load path, so requiring a relative name such
# as "tests/tc_fp_tree.rb" fails. The list is sorted for a stable load order.
test_dir = File.expand_path(File.dirname(__FILE__))
Dir.glob(File.join(test_dir, 'tc_*.rb')).sort.each do |file|
  require file
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fp-growth
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Stefan Achatz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-08 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: |
17
+ This is an implementation of the fp-growth frequent pattern mining algorithm as
18
+ stated in the paper
19
+
20
+ Mining Frequent Patterns without Candidate Generation: A Frequent-Pattern Tree Approach
21
+ Han et al, Data Mining and Knowledge Discovery, 8, 53-87, 2004
22
+
23
+ email: stefan_achatz@web.de
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ files:
31
+ - examples/small_ones.rb
32
+ - examples/cao.rb
33
+ - lib/fp_growth.rb
34
+ - lib/fp_growth/fp_tree.rb
35
+ - lib/fp_growth/fp_tree_node.rb
36
+ - lib/fp_growth/header_list.rb
37
+ - lib/fp_growth/inflections.rb
38
+ - lib/fp_growth/prefix_path.rb
39
+ - lib/fp_growth/header_list_node.rb
40
+ - tests/tc_fp_tree.rb
41
+ - tests/profile_fp_tree.rb
42
+ - tests/ts_all.rb
43
+ - README
44
+ has_rdoc: true
45
+ homepage: http://rubyforge.org/projects/fp-growth/
46
+ licenses: []
47
+
48
+ post_install_message:
49
+ rdoc_options:
50
+ - --main
51
+ - README
52
+ - --title
53
+ - fp-growth Documentation
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ requirements: []
69
+
70
+ rubyforge_project: fp-growth
71
+ rubygems_version: 1.3.4
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Implementation of the fp-growth frequent pattern algorithm
75
+ test_files:
76
+ - tests/ts_all.rb