fpgrowth 0.0.2 → 1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +43 -2
- data/Rakefile +1 -1
- data/lib/fpgrowth.rb +16 -4
- data/lib/fpgrowth/fp_tree.rb +145 -5
- data/lib/fpgrowth/fp_tree/bonzai_secateur.rb +30 -0
- data/lib/fpgrowth/fp_tree/builder.rb +1 -0
- data/lib/fpgrowth/fp_tree/builder/first_pass.rb +2 -7
- data/lib/fpgrowth/fp_tree/builder/header_table_builder.rb +37 -0
- data/lib/fpgrowth/fp_tree/header_table.rb +41 -0
- data/lib/fpgrowth/fp_tree/node.rb +1 -0
- data/lib/fpgrowth/miner.rb +32 -2
- data/lib/fpgrowth/miner/pattern_base_extractor.rb +51 -3
- data/lib/fpgrowth/version.rb +1 -1
- data/test/tc_conditional_tree_builder.rb +74 -3
- data/test/tc_fp_tree.rb +80 -29
- data/test/tc_miner.rb +4 -4
- data/test/tc_open_data_enel.rb +77 -10
- data/test/tc_open_data_sondage_montreal.rb +62 -19
- data/test/tc_open_data_velo_montreal.rb +43 -9
- data/test/tc_pattern_base_extractor.rb +116 -0
- metadata +7 -2
data/README.md
CHANGED
@@ -44,12 +44,24 @@ Or install it yourself as:
|
|
44
44
|
|
45
45
|
## Usage
|
46
46
|
|
47
|
+
### Basic Usage
|
48
|
+
|
49
|
+
Just do it :
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
53
|
+
patterns = FpGrowth.mine(transactions)
|
54
|
+
```
|
55
|
+
|
56
|
+
### Advanced Usage
|
57
|
+
|
58
|
+
|
47
59
|
Build a tree from transactions and mine it
|
48
60
|
|
49
61
|
```ruby
|
50
62
|
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
51
63
|
fp_tree = FpGrowth::FpTree.build(transactions)
|
52
|
-
FpGrowth::Miner.
|
64
|
+
FpGrowth::Miner.td_fp_growth(fp_tree)
|
53
65
|
|
54
66
|
```
|
55
67
|
|
@@ -61,10 +73,39 @@ The larger is the number of transactions, the smaller should be the threshold. I
|
|
61
73
|
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
62
74
|
fp_tree = FpGrowth::FpTree.build(transactions, 30)
|
63
75
|
# 30 stands for 30% of transactions. Here, 'c' would be pruned.
|
64
|
-
FpGrowth::Miner.
|
76
|
+
FpGrowth::Miner.td_fp_growth(fp_tree)
|
65
77
|
|
66
78
|
```
|
67
79
|
|
80
|
+
If you want to avoid the worst case, you should turn the tree into a Bonzai!
|
81
|
+
```ruby
|
82
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
83
|
+
fp_tree = FpGrowth::FpTree.build(transactions, 30)
|
84
|
+
bonzai = fp_tree.to_bonzai(20)
|
85
|
+
FpGrowth::Miner.td_fp_growth(bonzai)
|
86
|
+
|
87
|
+
```
|
88
|
+
20 stands for a hardness of 20%. It means that a node is cut from the tree if its support is lower than 20% of its parent's support.
|
89
|
+
|
90
|
+
There are two variants of FP-Growth.
|
91
|
+
The first one is the top-down variant, which is the most efficient in most cases.
|
92
|
+
Its alternative, the classical FP-Growth, may be more efficient on very small data sets.
|
93
|
+
Use it this way :
|
94
|
+
```ruby
|
95
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
96
|
+
patterns = FpGrowth.fp_growth(transactions)
|
97
|
+
```
|
98
|
+
or
|
99
|
+
```ruby
|
100
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
101
|
+
fp_tree = FpGrowth::FpTree.build(transactions, 30)
|
102
|
+
bonzai = fp_tree.to_bonzai(20)
|
103
|
+
FpGrowth::Miner.fp_growth(bonzai)
|
104
|
+
|
105
|
+
```
|
106
|
+
|
107
|
+
|
108
|
+
|
68
109
|
### Examples
|
69
110
|
|
70
111
|
You can find a few concrete examples using Open Data in the test directory.
|
data/Rakefile
CHANGED
data/lib/fpgrowth.rb
CHANGED
@@ -1,8 +1,20 @@
|
|
1
1
|
require "fpgrowth/version.rb"
|
2
|
+
require 'fpgrowth/fp_tree'
|
3
|
+
require 'fpgrowth/miner'
|
2
4
|
|
3
5
|
# Convenience entry points: build an FP-Tree from raw transactions and mine it
# in a single call.
module FpGrowth

  # Default mining entry point; delegates to the top-down variant.
  #
  # @param transactions list of transactions, each one a list of items
  # @param threshold value forwarded to FpTree.build
  # @return whatever Miner.td_fp_growth returns
  def self.mine(transactions, threshold=1)
    td_fp_growth(transactions, threshold)
  end

  # Builds the tree, then mines it with Miner.fp_growth.
  def self.fp_growth(transactions, threshold=1)
    Miner.fp_growth(FpTree.build(transactions, threshold))
  end

  # Builds the tree, then mines it with Miner.td_fp_growth.
  def self.td_fp_growth(transactions, threshold=1)
    Miner.td_fp_growth(FpTree.build(transactions, threshold))
  end

end
|
data/lib/fpgrowth/fp_tree.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require_relative 'fp_tree/node'
|
2
2
|
require_relative 'fp_tree/builder'
|
3
|
+
require_relative 'fp_tree/bonzai_secateur'
|
4
|
+
require_relative 'fp_tree/header_table'
|
3
5
|
|
4
6
|
require 'graphviz'
|
5
7
|
require 'etc'
|
@@ -18,8 +20,16 @@ module FpGrowth
|
|
18
20
|
Builder.build(transactions, threshold)
|
19
21
|
end
|
20
22
|
|
21
|
-
def
|
22
|
-
|
23
|
+
# Runs a BonzaiSecateur over this tree with the given hardness and returns
# the result of its non-bang #execute.
#
# @param hardness value handed to BonzaiSecateur.new
def to_bonzai(hardness=20)
  secateur = BonzaiSecateur.new(self, hardness)
  secateur.execute
end
|
26
|
+
|
27
|
+
# Runs a BonzaiSecateur over this tree with the given hardness and returns
# the result of its bang #execute!.
#
# @param hardness value handed to BonzaiSecateur.new
def to_bonzai!(hardness=20)
  secateur = BonzaiSecateur.new(self, hardness)
  secateur.execute!
end
|
30
|
+
|
31
|
+
def initialize(supports={}, threshold=1, root=Node.new())
|
32
|
+
@root = root
|
23
33
|
@heads = Hash.new nil
|
24
34
|
@supports = supports
|
25
35
|
#initialiser les clés
|
@@ -69,7 +79,7 @@ module FpGrowth
|
|
69
79
|
node=row
|
70
80
|
while node != nil
|
71
81
|
for child in node.children
|
72
|
-
g.add_edges(nodonode[node], nodonode[child])
|
82
|
+
g.add_edges(nodonode[node], nodonode[child]) if nodonode[child]
|
73
83
|
end
|
74
84
|
node = node.lateral
|
75
85
|
end
|
@@ -79,8 +89,8 @@ module FpGrowth
|
|
79
89
|
for row in self.heads.values
|
80
90
|
node=row
|
81
91
|
while node != nil
|
82
|
-
|
83
|
-
g.add_edges(nodonode[node], nodonode[node.
|
92
|
+
g.add_edges(nodonode[node], nodonode[node.lateral], :style => :dashed, :color=> :green, :constraint => :false) if node.lateral
|
93
|
+
g.add_edges(nodonode[node], nodonode[node.parent], :style => :dashed, :color=> :red, :constraint => :false) if node.parent
|
84
94
|
node = node.lateral
|
85
95
|
end
|
86
96
|
end
|
@@ -109,6 +119,49 @@ module FpGrowth
|
|
109
119
|
end
|
110
120
|
end
|
111
121
|
|
122
|
+
# Removes a whole branch: every descendant of +node+ is removed first
# (depth-first), then +node+ itself.
def cut_branch(node)
  node.children.each do |descendant|
    cut_branch(descendant)
  end
  remove(node)
end
|
126
|
+
|
127
|
+
# Unlinks +node+ from the lateral (same-item) linked list that starts at
# @heads[node.item].
#
# If +node+ is the head, the head moves to its lateral successor, or the
# entry is deleted when there is none. Otherwise the list is walked to find
# the node's left neighbour, which is re-linked past +node+. A node that is
# not in the list at all is tolerated: nothing is re-linked.
#
# @param node the node to unlink
# @param verbose when true, prints debugging traces to stdout
# @return nil; node.lateral is cleared in every case
def remove_from_lateral(node, verbose=false)
  head = @heads[node.item]
  if head.equal?(node)
    if node.lateral
      @heads[node.item] = node.lateral
    else
      @heads.delete(node.item)
    end
  else
    puts "node #{node.to_s}" if verbose
    puts "pas head" if verbose
    left = head
    left = left.lateral while left && !left.equal?(node) && !left.lateral.equal?(node)
    # left is nil when the node was not found; the unguarded verbose traces
    # used to raise NoMethodError in that case.
    if left
      puts "left found #{left.lateral}" if verbose
      left.lateral = node.lateral
      puts "left found #{left.lateral}" if verbose
    end
  end
  node.lateral = nil
end
|
147
|
+
|
148
|
+
# Removes a single node from the tree while keeping its children.
#
# The node is unlinked from its lateral list, its children are re-attached
# to its parent, it is dropped from the parent's children, and its support
# is subtracted from @supports when the item has an entry there.
def remove(node)
  # Unlink from the lateral linked list
  remove_from_lateral(node)

  # Hand the children over to the grandparent
  parent = node.parent
  parent.children += node.children
  node.children.each { |orphan| orphan.parent = parent }

  # Detach from the parent
  parent.children.delete(node)

  # Deduct the support
  return unless @supports[node.item]
  @supports[node.item] -= node.support
end
|
164
|
+
|
112
165
|
def items_count
|
113
166
|
sum=0
|
114
167
|
for val in supports.values
|
@@ -146,6 +199,93 @@ module FpGrowth
|
|
146
199
|
return @heads.empty?
|
147
200
|
end
|
148
201
|
|
202
|
+
# Returns an independent copy of this tree.
#
# The node structure is duplicated with Node#clone_deep and the copy is
# re-linked with #link_down. The supports hash is duplicated too: #remove
# mutates @supports, so sharing it would let pruning the copy change the
# original tree's supports.
#
# @return [FpTree] the copy
def clone
  copy = FpTree.new(@supports.dup, @threshold, @root.clone_deep)
  copy.link_down
  copy
end
|
207
|
+
|
208
|
+
# Re-attaches the subtree below +cursor+ through #append_node.
#
# The children of +cursor+ are detached, appended back one by one, and only
# then is each child processed recursively. The two passes are deliberately
# kept separate so a whole level is appended before descending.
#
# @param cursor node to start from; defaults to the root
def link_down(cursor=@root)
  detached = cursor.children.clone
  cursor.children = []
  detached.each do |child|
    append_node(cursor, child)
  end
  detached.each do |child|
    link_down(child)
  end
end
|
216
|
+
|
217
|
+
# Counts the nodes of a subtree, the subtree's own root included.
#
# @param subtree node to start from; defaults to the tree root
# @return [Integer] number of nodes
def size(subtree=@root)
  subtree.children.inject(1) { |total, child| total + size(child) }
end
|
222
|
+
|
223
|
+
# Adds up every value stored in @supports.
#
# @return [Numeric] total support, 0 when @supports is empty
def sum
  @supports.values.inject(0) { |total, support| total + support }
end
|
228
|
+
|
229
|
+
# Adds up the support of every node reachable from @heads by following the
# lateral links.
#
# @return [Numeric] total support over all lateral chains
def lateral_sum
  total = 0
  @heads.each_value do |cursor|
    while cursor
      total += cursor.support
      cursor = cursor.lateral
    end
  end
  total
end
|
240
|
+
|
241
|
+
# Length of the longest lateral chain starting from @heads.
#
# @return [Integer] number of nodes in the longest chain, 0 without heads
def max_width
  widest = 0
  @heads.each_value do |cursor|
    width = 0
    while cursor
      width += 1
      cursor = cursor.lateral
    end
    widest = width if width > widest
  end
  widest
end
|
253
|
+
|
254
|
+
# Checks every lateral chain for a cycle.
#
# Visited nodes are tracked by object_id in a Hash, so each membership check
# is a constant-time lookup instead of an Array scan.
#
# @return the key of the first head whose chain loops back on itself, or
#   false when no chain contains a cycle
def has_lateral_cycle?
  @heads.each do |key, cursor|
    seen = {}
    while cursor
      return key if seen[cursor.object_id]
      seen[cursor.object_id] = true
      cursor = cursor.lateral
    end
  end
  false
end
|
274
|
+
|
275
|
+
# Builds the HeaderTable of this tree on first use and memoizes it.
#
# One row [item, support, node] is appended for every node reachable from
# @heads through the lateral links. Later calls return the memoized table;
# previously the return statement sat inside the `unless`, so they returned
# nil.
#
# NOTE(review): the memoized table is not refreshed here if the tree is
# modified afterwards.
#
# @return [HeaderTable] the memoized header table
def header_table
  return @header_table if @header_table

  table = HeaderTable.new
  @heads.each_value do |node|
    while node
      table << [node.item, node.support, node]
      node = node.lateral
    end
  end
  @header_table = table
end
|
288
|
+
|
149
289
|
end
|
150
290
|
end
|
151
291
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module FpGrowth
  module FpTree
    # Prunes an FP-Tree: a child is cut, with its whole branch, when its
    # support is below a percentage of its parent's support.
    class BonzaiSecateur

      # @param fp_tree tree to prune
      # @param hardness base percentage used for the cut threshold
      def initialize(fp_tree, hardness=20)
        @fp_tree = fp_tree
        @hardness = hardness
      end

      # Prunes +fp_tree+ (by default a clone of the tree given at
      # construction) and returns it.
      #
      # @param hardness percentage used for this run; now honoured, where it
      #   was previously ignored in favour of @hardness
      # @param fp_tree tree that is pruned
      # @return the pruned tree
      def execute(hardness=@hardness, fp_tree=@fp_tree.clone)
        traverse(fp_tree, fp_tree.root, 0, hardness)
        fp_tree
      end

      # Same as #execute, but prunes the tree given at construction itself.
      def execute!(hardness=@hardness)
        execute(hardness, @fp_tree)
      end

      # Walks the tree depth-first. Under +cursor+, every child whose support
      # is below (hardness + deepness) percent of the cursor's support is cut
      # through fp_tree.cut_branch; the surviving children are then visited
      # one level deeper.
      #
      # @param fp_tree tree being pruned
      # @param cursor node whose children are examined
      # @param deepness current depth, added to the percentage
      # @param hardness base percentage; defaults to the constructor value
      def traverse(fp_tree, cursor = fp_tree.root, deepness=0, hardness=@hardness)
        threshold = cursor.support.to_f / 100 * (hardness + deepness)
        # Iterate over a copy: cutting a branch changes cursor.children.
        cursor.children.clone.each do |child|
          fp_tree.cut_branch(child) if child.support < threshold
        end
        cursor.children.each { |child| traverse(fp_tree, child, deepness + 1, hardness) }
      end

    end
  end
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module FpGrowth
|
3
2
|
module FpTree
|
4
3
|
module Builder
|
@@ -35,16 +34,12 @@ module FpGrowth
|
|
35
34
|
# Drops infrequent items from the transactions and from the supports table.
#
# An item is infrequent when its support is below +threshold+ percent of the
# number of transactions. Transactions left empty are removed. Both
# +transactions+ and +supports+ are modified in place.
#
# @param transactions list of transactions (arrays of items)
# @param supports hash mapping each item to its support
# @param threshold minimum support, as a percentage of the transaction count
# @return the pruned +supports+ hash
def pruning(transactions=@transactions, supports=@supports, threshold=@threshold)
  minimum = transactions.size.to_f / 100 * threshold

  transactions.each do |transaction|
    transaction.delete_if { |item| supports[item] < minimum }
  end
  transactions.delete([])

  supports.delete_if { |_item, count| count < minimum }
  supports
end
|
50
45
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require_relative "../header_table"
|
2
|
+
|
3
|
+
module FpGrowth
  module FpTree
    module Builder
      # Builds a new HeaderTable for one item of an existing header table,
      # from the ancestors of that item's nodes.
      class HeaderTableBuilder

        # @param item item whose nodes are the starting points
        # @param header_table header table the nodes are read from
        def initialize(item, header_table)
          @item = item
          @header_table = header_table
          @new_header_table = HeaderTable.new
        end

        # Climbs from the parent of every node registered for the item,
        # carrying that node's support.
        #
        # @return the new header table
        def execute
          @header_table.nodes[@item].each do |node|
            traverse_from_node_top_top(node.parent, node.support)
          end
          @new_header_table
        end

        # Follows the parent links from +node+ upwards, appending a row
        # [item, support, node] for each ancestor, and stops at the first
        # node whose item is nil or false.
        def traverse_from_node_top_top(node, support)
          while node.item
            @new_header_table << [node.item, support, node]
            node = node.parent
          end
        end

      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'set'
|
2
|
+
module FpGrowth
  module FpTree
    # Per-item index: the set of nodes carrying the item and the accumulated
    # support recorded for it.
    class HeaderTable

      attr_accessor :count, :nodes

      # Builds the header table derived from +header_table+ for +item+ by
      # delegating to Builder::HeaderTableBuilder.
      def self.build(item, header_table)
        Builder::HeaderTableBuilder.new(item, header_table).execute
      end

      def initialize
        @count = Hash.new 0
        # The default block does not store the set; #<< assigns it back.
        @nodes = Hash.new { Set.new }
      end

      # Items that have at least one node stored.
      def keys
        @nodes.keys
      end

      # Append a Row
      # @param row Array as [item, support, node]
      #
      def <<(row)
        item = row[0]
        # Register the node under its item
        @nodes[item] = @nodes[item] << row[2]
        # Accumulate the support for that item
        @count[item] += row[1]
      end

    end
  end
end
|
36
|
+
|
37
|
+
# Reopens the standard Set so that #to_s lists the elements separated by
# ", ".
class Set
  def to_s
    entries.join(', ')
  end
end
|
data/lib/fpgrowth/miner.rb
CHANGED
@@ -15,6 +15,11 @@ module FpGrowth
|
|
15
15
|
return miner.pattern_set
|
16
16
|
end
|
17
17
|
|
18
|
+
# Mines +fp_tree+ with the top-down algorithm on a fresh Miner.
#
# @return the miner's pattern_set after mining
def self.td_fp_growth(fp_tree)
  Miner.new.tap { |miner| miner.top_down_fp_growth(fp_tree) }.pattern_set
end
|
18
23
|
|
19
24
|
class Miner
|
20
25
|
|
@@ -42,7 +47,7 @@ module FpGrowth
|
|
42
47
|
pattern_beta << node
|
43
48
|
end
|
44
49
|
@pattern_set << pattern_beta
|
45
|
-
#puts "Pattern extracted : #{pattern_beta.content.to_s}"
|
50
|
+
#puts "Pattern extracted : #{pattern_beta.content.to_s} - #{pattern_beta.support}"
|
46
51
|
end
|
47
52
|
else
|
48
53
|
for item in fp_tree.supports.keys
|
@@ -56,7 +61,32 @@ module FpGrowth
|
|
56
61
|
end
|
57
62
|
end
|
58
63
|
end
|
59
|
-
|
64
|
+
|
65
|
+
# Top-down FP-Growth: records one pattern per sufficiently supported row of
# the header table, then recurses on the header table built for that row.
#
# +min_support+ is now passed on to the recursive calls; it used to fall
# back to its default of 0 below the first level.
#
# @param header_table a header table, or an FpTree::FpTree whose
#   #header_table is used instead
# @param pattern_alpha pattern that each row extends
# @param min_support a row is kept only when its count is strictly greater
def top_down_fp_growth(header_table, pattern_alpha=Pattern.new(), min_support=0)
  header_table = header_table.header_table if header_table.instance_of? FpTree::FpTree

  header_table.keys.each do |row|
    support = header_table.count[row]
    next unless support > min_support

    # Output the pattern extended with this row's item
    pattern_beta = Pattern.new(pattern_alpha.content + [row], support)
    @pattern_set << pattern_beta

    # Mine the extended pattern against its own header table
    header_table_new = FpTree::HeaderTable.build(row, header_table)
    top_down_fp_growth(header_table_new, pattern_beta, min_support)
  end
end
|
88
|
+
|
89
|
+
end #class
|
60
90
|
|
61
91
|
end
|
62
92
|
end
|