fpgrowth 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +88 -0
- data/Rakefile +11 -0
- data/fpgrowth-ruby.gemspec +23 -0
- data/lib/fpgrowth.rb +8 -0
- data/lib/fpgrowth/fp_tree.rb +153 -0
- data/lib/fpgrowth/fp_tree/builder.rb +19 -0
- data/lib/fpgrowth/fp_tree/builder/first_pass.rb +72 -0
- data/lib/fpgrowth/fp_tree/builder/second_pass.rb +70 -0
- data/lib/fpgrowth/fp_tree/node.rb +46 -0
- data/lib/fpgrowth/miner.rb +62 -0
- data/lib/fpgrowth/miner/conditional_tree_builder.rb +123 -0
- data/lib/fpgrowth/miner/pattern.rb +47 -0
- data/lib/fpgrowth/miner/pattern_base_extractor.rb +70 -0
- data/lib/fpgrowth/version.rb +3 -0
- data/test/montreal-sondage/mtlsat12.csv +1202 -0
- data/test/montreal-velos-comptage/2009.csv +366 -0
- data/test/montreal-velos-comptage/2010.csv +366 -0
- data/test/montreal-velos-comptage/2011.csv +366 -0
- data/test/montreal-velos-comptage/2012.csv +311 -0
- data/test/tc_builder.rb +65 -0
- data/test/tc_first_pass.rb +119 -0
- data/test/tc_fp_tree.rb +168 -0
- data/test/tc_miner.rb +116 -0
- data/test/tc_node.rb +101 -0
- data/test/tc_open_data_sondage_montreal.rb +49 -0
- data/test/tc_open_data_velo_montreal.rb +58 -0
- data/test/tc_pattern.rb +56 -0
- data/test/tc_second_pass.rb +148 -0
- metadata +131 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 thedamfr
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# Fp-Growth-Ruby
|
2
|
+
|
3
|
+
Ruby implementation of FP-Growth
|
4
|
+
|
5
|
+
## What's FP-Growth
|
6
|
+
|
7
|
+
FP-Growth is an algorithm used for mining frequent patterns in an item set. Such patterns are then used to build association rules.
|
8
|
+
A classic example from the literature is { Potatoes, Onions } => { Burger } or { Beer, Chips } => { Dippers }.
|
9
|
+
|
10
|
+
FP-Growth is known as a solution for Mining without Candidate generation (http://dl.acm.org/citation.cfm?id=335372).
|
11
|
+
The main alternative to FP-Growth is Apriori, which is pretty much a naive solution. Apriori consists in generating candidates, then scanning the database looking for them.
|
12
|
+
|
13
|
+
The FP-Growth solution is about reducing the database to one simple tree structure: the FP-Tree. The FP-Tree makes it easy to extract frequent patterns.
|
14
|
+
The FP-Growth algorithm then extracts patterns from the tree, starting with the smallest ones at the leaves of the tree, and making them grow until every pattern has been found.
|
15
|
+
|
16
|
+
## When to use FP-Growth?
|
17
|
+
|
18
|
+
FP-Growth is about working on a very large number of transactions. The FpTree building is a single pass linear operation.
|
19
|
+
Most of the time, an FP-Growth operation is an O(h²) operation, where h is the height of the tree. By design, the height is the maximal length of a pattern.
|
20
|
+
|
21
|
+
The worst case is a dataset with long transactions where each item is significantly frequent. Such a tree is very big and the reduction factor is very low. Performance would be... not what you expect ^^
|
22
|
+
|
23
|
+
## So why use FP-Growth in Ruby?
|
24
|
+
|
25
|
+
Imagine a web app allowing you to connect with other users. Such a website wants to help users engage with each other. It will scan the relationships between users and make suggestions like: "People who connect with the following persons often connect with those persons".
|
26
|
+
|
27
|
+
Imagine a commercial website showing some products to its users. It will scan users' actions and make suggestions like: "People who like this product often like this other one!"
|
28
|
+
|
29
|
+
Those applications are now easy thanks to fpgrowth-ruby !
|
30
|
+
|
31
|
+
## Installation
|
32
|
+
|
33
|
+
Add this line to your application's Gemfile:
|
34
|
+
|
35
|
+
gem 'fpgrowth'
|
36
|
+
|
37
|
+
And then execute:
|
38
|
+
|
39
|
+
$ bundle install
|
40
|
+
|
41
|
+
Or install it yourself as:
|
42
|
+
|
43
|
+
$ gem install fpgrowth
|
44
|
+
|
45
|
+
## Usage
|
46
|
+
|
47
|
+
Build a tree from transactions and mine it
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
51
|
+
fp_tree = FpGrowth::FpTree.build(transactions)
|
52
|
+
FpGrowth::Miner.fp_growth(fp_tree)
|
53
|
+
|
54
|
+
```
|
55
|
+
|
56
|
+
By default, threshold for an item to be considered as "frequent" is 1% of the transactions.
|
57
|
+
You can edit the threshold when building the tree. We recommend using a threshold around 1%.
|
58
|
+
The larger the number of transactions, the smaller the threshold should be. If the tree is too large, you should use a higher threshold.
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
62
|
+
fp_tree = FpGrowth::FpTree.build(transactions, 30) # 30 stands for 30% of transactions. Here, 'c' would be pruned.
|
63
|
+
FpGrowth::Miner.fp_growth(fp_tree)
|
64
|
+
|
65
|
+
```
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
## Development : Next steps
|
70
|
+
|
71
|
+
As we said, the worst case is a dataset with long transactions where each item is significantly frequent. A solution would be to raise the threshold level, but this would result in data loss; critical data might be lost...
|
72
|
+
|
73
|
+
A better solution is described in the following articles: [http://dl.acm.org/citation.cfm?id=1133907 , http://link.springer.com/chapter/10.1007/978-3-540-24775-3_19]
|
74
|
+
The main concept is pruning the tree once it is built, in order to remove the least significant patterns. This is necessary to allow the developer to prune the tree, losing the least frequent patterns, in order to quickly obtain the most frequent ones.
|
75
|
+
|
76
|
+
This is the next step in our roadmap.
|
77
|
+
|
78
|
+
This is also a necessary step for allowing a Top-Down FP-Growth implementation as described in : http://link.springer.com/chapter/10.1007/3-540-47887-6_34
|
79
|
+
|
80
|
+
This last implementation is more scalable and more efficient than the current one.
|
81
|
+
|
82
|
+
## Contributing
|
83
|
+
|
84
|
+
1. Fork it
|
85
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
86
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
87
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
88
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'fpgrowth/version'
|
5
|
+
|
6
|
+
# Gem specification for fpgrowth.
Gem::Specification.new do |spec|
  spec.name          = "fpgrowth"
  spec.version       = FpGrowth::VERSION
  spec.authors       = ["thedamfr"]
  spec.email         = ["dam.cavailles@laposte.net"]
  spec.description   = %q{FP-Growth implementation}
  spec.summary       = %q{FP-Growth is meant to detect frequent patterns in transactions}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every file tracked by git; executables and tests are derived
  # from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  # The FpTree source does `require 'graphviz'` at load time, so ruby-graphviz
  # has to be installed for consumers of the gem, not only for developers.
  spec.add_dependency "ruby-graphviz"
end
|
data/lib/fpgrowth.rb
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
require_relative 'fp_tree/node'
|
2
|
+
require_relative 'fp_tree/builder'
|
3
|
+
|
4
|
+
require 'graphviz'
|
5
|
+
require 'etc'
|
6
|
+
|
7
|
+
module FpGrowth
  module FpTree

    # Convenience entry point: builds an FP-Tree from raw transactions.
    #
    # @param transactions [Array<Array>] list of transactions, each a list of items
    # @param threshold [Numeric] minimum support, forwarded to Builder.build
    # @return whatever Builder.build returns
    def self.build(transactions, threshold=1)
      Builder.build(transactions, threshold)
    end

    # Prefix tree of transactions plus a header table (`heads`) that links,
    # per item, every node carrying that item through `Node#lateral`.
    class FpTree
      attr_reader :root, :heads, :supports, :threshold

      # Same as FpGrowth::FpTree.build.
      def self.build(transactions, threshold=1)
        Builder.build(transactions, threshold)
      end

      # @param supports [Hash] item => support count; its key order defines
      #   the item order used when sorting children
      # @param threshold [Numeric] minimum support, stored as given
      def initialize(supports={}, threshold=1)
        @root = Node.new
        @heads = {}
        @supports = supports
        # Initialize the header table keys so every known item has an entry.
        @supports.keys.each { |item| @heads[item] = nil }
        @threshold = threshold
      end

      # Maps each item to its rank in `supports` (position of its key).
      # The table is computed once and memoized.
      #
      # @return [Hash] item => index
      def item_order_lookup
        unless @lookup
          @lookup = {}
          @supports.keys.each_with_index { |item, index| @lookup[item] = index }
        end
        @lookup
      end

      # Follows the lateral chain of `item` to its last node.
      #
      # @return the last node of the chain, or nil when the item has no node yet
      def find_lateral_leaf_for_item(item)
        cursor = heads[item]
        cursor = cursor.lateral while cursor && cursor.lateral
        cursor
      end

      # Renders the tree as a PNG file under ./graphs/ using GraphViz.
      # Solid edges are parent -> child links, dashed edges are lateral links.
      #
      # @param fancy_name [String, nil] prefix used in the output file name
      # @return the result of GraphViz#output
      def graphviz(fancy_name=nil)
        g = GraphViz.new(:G, :type => :digraph)
        nodonode = {}
        nodonode[root] = g.add_nodes(root.to_s, :label => "nil")

        # First pass: one graph node per tree node.
        each_node do |node|
          nodonode[node] = g.add_nodes(node.to_s, :label => node.item.to_s + " : " + node.support.to_s)
        end

        # Second pass: parent -> child edges (the root is not in the header table).
        root.children.each { |child| g.add_edges(nodonode[root], nodonode[child]) }
        each_node do |node|
          node.children.each { |child| g.add_edges(nodonode[node], nodonode[child]) }
        end

        # Third pass: lateral links, drawn dashed and excluded from ranking.
        each_node do |node|
          g.add_edges(nodonode[node], nodonode[node.lateral], :style => :dashed, :constraint => :false) if node.lateral
        end

        # The output path is fixed, so make sure its directory exists.
        Dir.mkdir("./graphs") unless File.directory?("./graphs")
        g.output(:png => "./graphs/#{fancy_name}-#{Etc.getlogin}-#{items_count}-items-#{Time.now.to_s.gsub(" ", "-")}.png")
      end

      # Sorts `nodes` in place by the rank of their item; items missing from
      # the lookup table are placed last.
      #
      # @return [Array] the same array, sorted
      def sort_children_by_support(nodes)
        lookup = item_order_lookup
        nodes.sort_by! { |node| lookup.fetch(node.item, lookup.size + 1) }
      end

      # Attaches `node` under `cursor_tree`, keeps the children ordered, and
      # links the node at the end of its item's lateral chain (or registers it
      # as the chain head when it is the first node for that item).
      def append_node(cursor_tree, node)
        cursor_tree.children << node
        node.parent = cursor_tree
        sort_children_by_support(cursor_tree.children)
        left = find_lateral_leaf_for_item(node.item)
        if left
          left.lateral = node
        else
          @heads[node.item] = node
        end
      end

      # @return [Numeric] sum of all support counts
      def items_count
        supports.values.inject(0) { |sum, value| sum + value }
      end

      # @return [Boolean] true when no node reachable by following first
      #   children from the root has more than one child
      def single_path?
        cursor = @root
        while cursor
          return false if cursor.children.size > 1
          cursor = cursor.children.first
        end
        true
      end

      # Every non-empty combination of the nodes on the single path.
      #
      # @return [Array<Array>] combinations ordered by size
      # @raise [RuntimeError] when the tree has more than one path
      def combinations
        raise "Tree contains multiple paths" unless single_path?
        path = []
        node = @root.children.first
        while node
          path << node
          node = node.children.first
        end
        1.upto(path.size).flat_map { |n| path.combination(n).to_a }
      end

      # @return [Boolean] true when the header table has no entry. Note that
      #   #initialize seeds one entry per key of `supports`.
      def empty?
        @heads.empty?
      end

      private

      # Yields every node reachable from the header table, chain by chain.
      def each_node
        heads.each_value do |head|
          node = head
          while node
            yield node
            node = node.lateral
          end
        end
      end

    end
  end
end
|
152
|
+
|
153
|
+
require_relative 'fp_tree/builder'
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require_relative 'builder/first_pass'
|
2
|
+
require_relative 'builder/second_pass'
|
3
|
+
|
4
|
+
|
5
|
+
module FpGrowth
  module FpTree
    # Two-pass construction of an FP-Tree.
    module Builder

      # Runs the first pass to obtain the item supports, then feeds them to
      # the second pass, which builds the tree from the same `transactions`.
      #
      # @param transactions [Array<Array>] list of transactions
      # @param threshold [Numeric] minimum support, handed to both passes
      # @return the tree returned by SecondPass#execute
      def self.build(transactions, threshold=1)
        item_supports = FirstPass.new(threshold).execute(transactions)
        SecondPass.new(item_supports, threshold).execute(transactions)
      end

    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require "fpgrowth/models/transaction"
|
2
|
+
|
3
|
+
module FpGrowth
  module FpTree
    module Builder
      # First pass over the transactions: counts item supports, discards the
      # infrequent items and orders the remaining ones by support.
      class FirstPass

        attr_accessor :supports

        # @param threshold [Numeric] minimum support, as a percentage of the
        #   number of transactions
        def initialize(threshold=1)
          @supports = Hash.new 0
          @threshold = threshold
        end

        # Scans the data and computes the support of each item.
        #
        # @param transactions [Array<Array>] list of transactions
        # @return [Hash] item => number of occurrences (default value 0)
        def scan(transactions=@transactions)
          @supports = Hash.new(0)
          transactions.each do |transaction|
            transaction.each { |item| @supports[item] += 1 }
          end
          @supports
        end

        # Discards infrequent items. Both `transactions` and `supports` are
        # modified in place: items below the minimum are removed from every
        # transaction, emptied transactions are dropped, and the matching
        # entries are deleted from `supports`.
        #
        # @param transactions [Array<Array>] list of transactions
        # @param supports [Hash] item => support count
        # @param threshold [Numeric] percentage of transactions an item must reach
        # @return [Hash] the pruned `supports`
        def pruning(transactions=@transactions, supports=@supports, threshold=@threshold)
          minimum = transactions.size.to_f / 100 * threshold

          transactions.each do |transaction|
            # reject! filters in one sweep; deleting while iterating would
            # skip the element that follows each removed item.
            transaction.reject! { |item| supports[item] < minimum }
          end
          transactions.delete([])
          supports.delete_if { |_key, value| value < minimum }

          supports
        end

        # Orders the items by decreasing support.
        # This order is used to build the tree during the second pass.
        #
        # @return [Hash] a new hash whose key order follows decreasing support
        def sort(supports=@supports)
          Hash[supports.sort_by { |_key, value| value }.reverse]
        end

        # Actually makes the first pass: scan, prune, then sort.
        #
        # @param transactions [Array<Array>] list of transactions (pruned in place)
        # @param threshold [Numeric] minimum support percentage
        # @return [Hash] frequent items ordered by decreasing support
        def execute(transactions, threshold=@threshold)
          @transactions = transactions
          @threshold = threshold
          scan
          pruning
          sort
        end

      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'fpgrowth/fp_tree'
|
2
|
+
module FpGrowth
  module FpTree
    module Builder
      # Second pass over the transactions: inserts each transaction into an
      # FpTree, following the item order given by `supports`.
      class SecondPass

        attr_accessor :fp_tree

        # @param supports [Hash] item => support count, in insertion order
        # @param threshold [Numeric] minimum support, recorded on the tree
        def initialize(supports, threshold=1)
          @supports = supports
          # Kept so that #execute can rebuild a tree with the same threshold.
          @threshold = threshold
          @fp_tree = FpTree.new(supports, threshold)
        end

        # Builds a fresh tree from `transactions`. Each transaction is
        # reordered in place before being inserted.
        #
        # @param transactions [Array<Array>] list of transactions
        # @return the populated tree
        def execute(transactions)
          @fp_tree = FpTree.new(@supports, @threshold)
          transactions.each do |transaction|
            traverse(@fp_tree.root, sort_by_support(transaction))
          end
          @fp_tree
        end

        # Sorts `transaction` in place by the rank of its items in the tree's
        # lookup table; unknown items go last.
        #
        # @return [Array] the same array, sorted
        def sort_by_support(transaction)
          lookup = @fp_tree.item_order_lookup
          transaction.sort_by! { |item| lookup.fetch(item, lookup.size + 1) }
        end

        # Inserts the remaining items of `transaction` below `cursor_tree`.
        # The head item is matched against the cursor itself, then against its
        # children; a new branch is forked when nothing matches.
        #
        # @param cursor_tree current node
        # @param transaction [Array, nil] items still to insert
        def traverse(cursor_tree, transaction)
          return if transaction.nil? || transaction.empty?

          head = transaction.first
          return continue_pattern(cursor_tree, transaction) if cursor_tree.item == head

          match = cursor_tree.children.find { |child| child.item == head }
          if match
            continue_pattern(match, transaction)
          else
            fork_pattern(cursor_tree, transaction)
          end
        end

        # Creates a node (support 1) for the head item under `cursor_tree`
        # and carries on with the rest of the transaction from that node.
        def fork_pattern(cursor_tree, transaction)
          node = Node.new(transaction.first, 1)
          @fp_tree.append_node(cursor_tree, node)
          traverse(node, transaction.drop(1))
        end

        # Counts one more occurrence on `cursor_tree` and carries on with the
        # rest of the transaction from that node.
        def continue_pattern(cursor_tree, transaction)
          cursor_tree.support += 1
          traverse(cursor_tree, transaction.drop(1))
        end

      end
    end
  end
end
|