fpgrowth 0.0.2 → 1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +43 -2
- data/Rakefile +1 -1
- data/lib/fpgrowth.rb +16 -4
- data/lib/fpgrowth/fp_tree.rb +145 -5
- data/lib/fpgrowth/fp_tree/bonzai_secateur.rb +30 -0
- data/lib/fpgrowth/fp_tree/builder.rb +1 -0
- data/lib/fpgrowth/fp_tree/builder/first_pass.rb +2 -7
- data/lib/fpgrowth/fp_tree/builder/header_table_builder.rb +37 -0
- data/lib/fpgrowth/fp_tree/header_table.rb +41 -0
- data/lib/fpgrowth/fp_tree/node.rb +1 -0
- data/lib/fpgrowth/miner.rb +32 -2
- data/lib/fpgrowth/miner/pattern_base_extractor.rb +51 -3
- data/lib/fpgrowth/version.rb +1 -1
- data/test/tc_conditional_tree_builder.rb +74 -3
- data/test/tc_fp_tree.rb +80 -29
- data/test/tc_miner.rb +4 -4
- data/test/tc_open_data_enel.rb +77 -10
- data/test/tc_open_data_sondage_montreal.rb +62 -19
- data/test/tc_open_data_velo_montreal.rb +43 -9
- data/test/tc_pattern_base_extractor.rb +116 -0
- metadata +7 -2
data/README.md
CHANGED
@@ -44,12 +44,24 @@ Or install it yourself as:
|
|
44
44
|
|
45
45
|
## Usage
|
46
46
|
|
47
|
+
### Basic Usage
|
48
|
+
|
49
|
+
Just do it :
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
53
|
+
patterns = FpGrowth.mine(transactions)
|
54
|
+
```
|
55
|
+
|
56
|
+
### Advanced Usage
|
57
|
+
|
58
|
+
|
47
59
|
Build a tree from transactions and mine it
|
48
60
|
|
49
61
|
```ruby
|
50
62
|
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
51
63
|
fp_tree = FpGrowth::FpTree.build(transactions)
|
52
|
-
FpGrowth::Miner.
|
64
|
+
FpGrowth::Miner.td_fp_growth(fp_tree)
|
53
65
|
|
54
66
|
```
|
55
67
|
|
@@ -61,10 +73,39 @@ The larger is the number of transactions, the smaller should be the threshold. I
|
|
61
73
|
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
62
74
|
fp_tree = FpGrowth::FpTree.build(transactions, 30)
|
63
75
|
# 30 stands for 30% of transactions. Here, 'c' would be pruned.
|
64
|
-
FpGrowth::Miner.
|
76
|
+
FpGrowth::Miner.td_fp_growth(fp_tree)
|
65
77
|
|
66
78
|
```
|
67
79
|
|
80
|
+
If you want to avoid the worst case, you should turn the tree into a Bonzai!
|
81
|
+
```ruby
|
82
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
83
|
+
fp_tree = FpGrowth::FpTree.build(transactions, 30)
|
84
|
+
bonzai = fp_tree.to_bonzai(20)
|
85
|
+
FpGrowth::Miner.td_fp_growth(bonzai)
|
86
|
+
|
87
|
+
```
|
88
|
+
20 stands for a hardness of 20%. It means that a node is cut from the tree if its support is lower than 20% of its parent's support (the percentage grows by one point for each level of depth).
|
89
|
+
|
90
|
+
There are two variants of FP-Growth.
|
91
|
+
The first one is the TopDown variant, which is the most efficient in most cases.
|
92
|
+
Its alternative, the classical FP-Growth, might be more efficient on a very small data set.
|
93
|
+
Use it this way :
|
94
|
+
```ruby
|
95
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
96
|
+
patterns = FpGrowth.fp_growth(transactions)
|
97
|
+
```
|
98
|
+
or
|
99
|
+
```ruby
|
100
|
+
transactions = [['a', 'b'], ['b'], ['b', 'c'], ['a', 'b']]
|
101
|
+
fp_tree = FpGrowth::FpTree.build(transactions, 30)
|
102
|
+
bonzai = fp_tree.to_bonzai(20)
|
103
|
+
FpGrowth::Miner.fp_growth(bonzai)
|
104
|
+
|
105
|
+
```
|
106
|
+
|
107
|
+
|
108
|
+
|
68
109
|
### Examples
|
69
110
|
|
70
111
|
You can find a few concrete examples on Open Data in the test directory.
|
data/Rakefile
CHANGED
data/lib/fpgrowth.rb
CHANGED
@@ -1,8 +1,20 @@
|
|
1
1
|
require "fpgrowth/version.rb"
|
2
|
+
require 'fpgrowth/fp_tree'
|
3
|
+
require 'fpgrowth/miner'
|
2
4
|
|
3
5
|
# Convenience entry points: each one builds an FP-Tree from raw transactions
# and hands it to the matching Miner routine.
module FpGrowth
  class << self
    # Default mining entry point; delegates to the top-down variant.
    #
    # @param transactions [Array<Array>] list of item lists
    # @param threshold [Numeric] value forwarded to FpTree.build (default 1)
    # @return whatever Miner.td_fp_growth returns
    def mine(transactions, threshold=1)
      td_fp_growth(transactions, threshold)
    end

    # Builds the tree and mines it with Miner.fp_growth.
    #
    # @param transactions [Array<Array>] list of item lists
    # @param threshold [Numeric] value forwarded to FpTree.build (default 1)
    def fp_growth(transactions, threshold=1)
      Miner.fp_growth(FpTree.build(transactions, threshold))
    end

    # Builds the tree and mines it with Miner.td_fp_growth.
    #
    # @param transactions [Array<Array>] list of item lists
    # @param threshold [Numeric] value forwarded to FpTree.build (default 1)
    def td_fp_growth(transactions, threshold=1)
      Miner.td_fp_growth(FpTree.build(transactions, threshold))
    end
  end
end
|
data/lib/fpgrowth/fp_tree.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require_relative 'fp_tree/node'
|
2
2
|
require_relative 'fp_tree/builder'
|
3
|
+
require_relative 'fp_tree/bonzai_secateur'
|
4
|
+
require_relative 'fp_tree/header_table'
|
3
5
|
|
4
6
|
require 'graphviz'
|
5
7
|
require 'etc'
|
@@ -18,8 +20,16 @@ module FpGrowth
|
|
18
20
|
Builder.build(transactions, threshold)
|
19
21
|
end
|
20
22
|
|
21
|
-
def
|
22
|
-
|
23
|
+
# Runs a BonzaiSecateur over this tree through its non-bang +execute+.
#
# @param hardness [Numeric] percentage handed to the secateur (default 20)
# @return the value returned by BonzaiSecateur#execute
def to_bonzai(hardness=20)
  secateur = BonzaiSecateur.new(self, hardness)
  secateur.execute
end
|
26
|
+
|
27
|
+
# Runs a BonzaiSecateur over this tree through its bang +execute!+.
#
# @param hardness [Numeric] percentage handed to the secateur (default 20)
# @return the value returned by BonzaiSecateur#execute!
def to_bonzai!(hardness=20)
  secateur = BonzaiSecateur.new(self, hardness)
  secateur.execute!
end
|
30
|
+
|
31
|
+
def initialize(supports={}, threshold=1, root=Node.new())
|
32
|
+
@root = root
|
23
33
|
@heads = Hash.new nil
|
24
34
|
@supports = supports
|
25
35
|
#initialiser les clés
|
@@ -69,7 +79,7 @@ module FpGrowth
|
|
69
79
|
node=row
|
70
80
|
while node != nil
|
71
81
|
for child in node.children
|
72
|
-
g.add_edges(nodonode[node], nodonode[child])
|
82
|
+
g.add_edges(nodonode[node], nodonode[child]) if nodonode[child]
|
73
83
|
end
|
74
84
|
node = node.lateral
|
75
85
|
end
|
@@ -79,8 +89,8 @@ module FpGrowth
|
|
79
89
|
for row in self.heads.values
|
80
90
|
node=row
|
81
91
|
while node != nil
|
82
|
-
|
83
|
-
g.add_edges(nodonode[node], nodonode[node.
|
92
|
+
g.add_edges(nodonode[node], nodonode[node.lateral], :style => :dashed, :color=> :green, :constraint => :false) if node.lateral
|
93
|
+
g.add_edges(nodonode[node], nodonode[node.parent], :style => :dashed, :color=> :red, :constraint => :false) if node.parent
|
84
94
|
node = node.lateral
|
85
95
|
end
|
86
96
|
end
|
@@ -109,6 +119,49 @@ module FpGrowth
|
|
109
119
|
end
|
110
120
|
end
|
111
121
|
|
122
|
+
# Removes +node+ together with its whole subtree: every child is cut
# recursively first, then the node itself is handed to +remove+.
#
# @param node the subtree root to cut
def cut_branch(node)
  node.children.each do |descendant|
    cut_branch(descendant)
  end
  remove(node)
end
|
126
|
+
|
127
|
+
# Unlinks +node+ from the lateral chain of its item.
#
# If the node is the chain head, the head moves to the next node, or the
# item's entry in @heads is deleted when the chain becomes empty. Otherwise
# the predecessor is searched from the head and re-linked past the node.
# A node that is not present in the chain is left alone instead of raising.
#
# @param node the node to unlink; its +lateral+ is reset to nil
# @param verbose [Boolean] print debugging traces to stdout
# @return [nil]
def remove_from_lateral(node, verbose=false)
  head = @heads[node.item]
  if head.equal?(node)
    if node.lateral
      @heads[node.item] = node.lateral
    else
      @heads.delete(node.item)
    end
  else
    puts "node #{node.to_s}" if verbose
    puts "pas head" if verbose
    # Walk until +left+ is the node just before +node+ (or nil if absent).
    left = head
    left = left.lateral while left && !left.lateral.equal?(node)
    if left
      puts "left found #{left.lateral}" if verbose
      left.lateral = node.lateral
      puts "left found #{left.lateral}" if verbose
    end
  end
  node.lateral = nil
end
|
147
|
+
|
148
|
+
# Detaches a single node from the tree while keeping its children.
#
# Steps: unlink from the lateral chain, hand the children over to the
# node's parent, drop the node from the parent's child list, then subtract
# the node's support from the per-item support table when the item is known.
#
# @param node the node to remove
def remove(node)
  remove_from_lateral(node)

  # Adopt the children: the parent receives them and they point back to it.
  parent = node.parent
  parent.children += node.children
  node.children.each { |orphan| orphan.parent = parent }

  parent.children.delete(node)

  @supports[node.item] -= node.support if @supports[node.item]
end
|
164
|
+
|
112
165
|
def items_count
|
113
166
|
sum=0
|
114
167
|
for val in supports.values
|
@@ -146,6 +199,93 @@ module FpGrowth
|
|
146
199
|
return @heads.empty?
|
147
200
|
end
|
148
201
|
|
202
|
+
# Builds an independent copy of the tree.
#
# The root is deep-cloned and the copy is re-linked with +link_down+.
# The supports table is duplicated as well: +remove+ decrements @supports
# in place, so sharing the hash would let pruning a copy alter the support
# counts of the original tree.
#
# @return [FpTree] the copy
def clone
  copy = FpTree.new(@supports.dup, @threshold, @root.clone_deep)
  copy.link_down
  copy
end
|
207
|
+
|
208
|
+
# Re-attaches every node below +cursor+ through +append_node+.
#
# The cursor's child list is emptied first, each former child is appended
# again, and the same treatment is then applied to each child's subtree.
#
# @param cursor the node to start from (defaults to the tree root)
def link_down(cursor=@root)
  detached = cursor.children.clone
  cursor.children = []
  detached.each do |child|
    append_node(cursor, child)
  end
  detached.each do |child|
    link_down(child)
  end
end
|
216
|
+
|
217
|
+
# Counts the nodes of a subtree, the subtree root included.
#
# @param subtree the node to count from (defaults to the tree root)
# @return [Integer] number of nodes
def size(subtree=@root)
  subtree.children.inject(1) { |total, child| total + size(child) }
end
|
222
|
+
|
223
|
+
# Adds up every value of the supports table.
#
# @return [Numeric] total support, 0 when the table is empty
def sum
  @supports.values.inject(0) { |total, value| total + value }
end
|
228
|
+
|
229
|
+
# Adds up the support of every node reachable through the lateral chains
# registered in @heads.
#
# @return [Numeric] total support over all chains, 0 when there is no chain
def lateral_sum
  total = 0
  @heads.each_value do |cursor|
    while cursor
      total += cursor.support
      cursor = cursor.lateral
    end
  end
  total
end
|
240
|
+
|
241
|
+
# Length of the longest lateral chain registered in @heads.
#
# @return [Integer] number of nodes in the longest chain, 0 without chains
def max_width
  widest = 0
  @heads.each_value do |cursor|
    width = 0
    while cursor
      width += 1
      cursor = cursor.lateral
    end
    widest = width if widest < width
  end
  widest
end
|
253
|
+
|
254
|
+
# Looks for a lateral chain that loops back on itself.
#
# Chains are inspected in the order of @heads. Visited nodes are tracked by
# object identity in a hash, so each chain is checked in linear time.
#
# @return the key of the first chain containing a cycle, or false when no
#   chain has one
def has_lateral_cycle?
  @heads.each do |key, head|
    seen = {}.compare_by_identity
    cursor = head
    while cursor
      return key if seen.key?(cursor)
      seen[cursor] = true
      cursor = cursor.lateral
    end
  end
  false
end
|
274
|
+
|
275
|
+
# Lazily builds, memoizes and returns the HeaderTable of this tree.
#
# Every node of every lateral chain in @heads is appended as an
# [item, support, node] row. The table is built once; later calls return the
# same object (previously they returned nil because the +return+ lived
# inside the +unless+ guard).
#
# @return [HeaderTable] the memoized header table
def header_table
  return @header_table if @header_table

  table = HeaderTable.new
  @heads.each_value do |node|
    while node
      table << [node.item, node.support, node]
      node = node.lateral
    end
  end
  @header_table = table
end
|
288
|
+
|
149
289
|
end
|
150
290
|
end
|
151
291
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module FpGrowth
  module FpTree
    # Prunes an FP-Tree: a child is cut (with its subtree) when its support
    # is lower than (hardness + depth) percent of its parent's support.
    class BonzaiSecateur

      # @param fp_tree the tree to prune
      # @param hardness [Numeric] base percentage used for cutting (default 20)
      def initialize(fp_tree, hardness=20)
        @fp_tree = fp_tree
        @hardness = hardness
      end

      # Prunes +fp_tree+ and returns it. Without an explicit tree, a clone of
      # the tree given at construction is pruned.
      #
      # @param hardness [Numeric] percentage to use for this run; it is now
      #   honoured (it used to be ignored in favour of @hardness)
      # @param fp_tree the tree to prune in place
      # @return the pruned tree
      def execute(hardness=@hardness, fp_tree=@fp_tree.clone)
        traverse(fp_tree, fp_tree.root, 0, hardness)
        fp_tree
      end

      # Prunes the tree given at construction in place.
      #
      # @param hardness [Numeric] percentage to use for this run
      # @return the pruned tree
      def execute!(hardness=@hardness)
        execute(hardness, @fp_tree)
      end

      # Cuts the too-weak children of +cursor+, then descends into the
      # children that remain.
      #
      # @param fp_tree the tree being pruned (must respond to +cut_branch+)
      # @param cursor the node whose children are examined
      # @param deepness [Integer] depth of +cursor+, added to the percentage
      # @param hardness [Numeric] base percentage (defaults to @hardness)
      def traverse(fp_tree, cursor = fp_tree.root, deepness=0, hardness=@hardness)
        # Iterate over a copy: cutting rewrites the cursor's child list.
        candidates = cursor.children.clone
        threshold = cursor.support.to_f / 100 * (hardness + deepness)
        candidates.each do |child|
          fp_tree.cut_branch(child) if child.support < threshold
        end
        cursor.children.each { |child| traverse(fp_tree, child, deepness + 1, hardness) }
      end

    end
  end
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module FpGrowth
|
3
2
|
module FpTree
|
4
3
|
module Builder
|
@@ -35,16 +34,12 @@ module FpGrowth
|
|
35
34
|
# Drops infrequent items from the transactions and from the supports table.
#
# The minimum support is +threshold+ percent of the number of transactions
# (computed before empty transactions are removed). Items whose support is
# below it are deleted from every transaction; items without any entry in
# +supports+ are treated as having a support of 0 instead of raising.
# Transactions left empty are removed. Both arguments are modified in place.
#
# @param transactions [Array<Array>] list of item lists
# @param supports [Hash] item => support count
# @param threshold [Numeric] percentage of the transaction count
# @return [Hash] the pruned +supports+
def pruning(transactions=@transactions, supports=@supports, threshold=@threshold)
  minimum = transactions.size.to_f / 100 * threshold
  transactions.each do |transaction|
    transaction.delete_if { |item| (supports[item] || 0) < minimum }
  end
  transactions.delete([])

  supports.delete_if { |_item, support| support < minimum }
  supports
end
|
50
45
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require_relative "../header_table"
|
2
|
+
|
3
|
+
module FpGrowth
  module FpTree
    module Builder
      # Builds the conditional header table of one item: for every node of
      # that item, all ancestors carrying an item are recorded with the
      # node's support.
      class HeaderTableBuilder

        # @param item the item whose conditional table is built
        # @param header_table the source table; its +nodes[item]+ is iterated
        def initialize(item, header_table)
          @header_table = header_table
          @item = item
          @new_header_table = HeaderTable.new
        end

        # Walks up from each node of the item and fills the new table.
        #
        # @return [HeaderTable] the conditional header table
        def execute
          @header_table.nodes[@item].each do |node|
            traverse_from_node_top_top(node.parent, node.support)
          end
          @new_header_table
        end

        # Records +node+ and each of its ancestors until a node without item
        # is met or the parent chain ends. Iterative, so the depth of the
        # tree does not grow the call stack, and a nil parent is tolerated.
        #
        # @param node the first ancestor to record (may be nil)
        # @param support [Numeric] support credited to every recorded ancestor
        # @return [nil]
        def traverse_from_node_top_top(node, support)
          while node && node.item
            @new_header_table << [node.item, support, node]
            node = node.parent
          end
          nil
        end

      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'set'
|
2
|
+
module FpGrowth
  module FpTree
    # Per-item index used by the top-down miner: for each item it keeps the
    # set of nodes carrying it and the sum of the supports appended for it.
    class HeaderTable

      # Builds the conditional header table of +item+ from +header_table+.
      #
      # @return the value returned by Builder::HeaderTableBuilder#execute
      def self.build(item, header_table)
        Builder::HeaderTableBuilder.new(item, header_table).execute
      end

      attr_accessor :count, :nodes

      def initialize
        @count = Hash.new 0
        # The default block does not store: reading an unknown item yields a
        # fresh empty Set without adding the item to +keys+.
        @nodes = Hash.new { Set.new }
      end

      # @return [Array] the items that received at least one row
      def keys
        @nodes.keys
      end

      # Appends a row.
      #
      # @param row [Array] as [item, support, node]
      # @return [HeaderTable] self, so appends can be chained
      def <<(row)
        item, support, node = row
        # Add a link for the node in the table
        @nodes[item] = @nodes[item] << node
        # Add the support to the item's running total
        @count[item] += support
        self
      end

    end
  end
end
|
36
|
+
|
37
|
+
# Reopens the standard Set so that it prints as a comma-separated list.
class Set
  # @return [String] the members joined with ", "
  def to_s
    entries.join(', ')
  end
end
|
data/lib/fpgrowth/miner.rb
CHANGED
@@ -15,6 +15,11 @@ module FpGrowth
|
|
15
15
|
return miner.pattern_set
|
16
16
|
end
|
17
17
|
|
18
|
+
# Mines +fp_tree+ with the top-down variant using a fresh Miner.
#
# @param fp_tree the tree to mine
# @return the miner's pattern_set after mining
def self.td_fp_growth(fp_tree)
  Miner.new.tap { |miner| miner.top_down_fp_growth(fp_tree) }.pattern_set
end
|
18
23
|
|
19
24
|
class Miner
|
20
25
|
|
@@ -42,7 +47,7 @@ module FpGrowth
|
|
42
47
|
pattern_beta << node
|
43
48
|
end
|
44
49
|
@pattern_set << pattern_beta
|
45
|
-
#puts "Pattern extracted : #{pattern_beta.content.to_s}"
|
50
|
+
#puts "Pattern extracted : #{pattern_beta.content.to_s} - #{pattern_beta.support}"
|
46
51
|
end
|
47
52
|
else
|
48
53
|
for item in fp_tree.supports.keys
|
@@ -56,7 +61,32 @@ module FpGrowth
|
|
56
61
|
end
|
57
62
|
end
|
58
63
|
end
|
59
|
-
|
64
|
+
|
65
|
+
# Top-down FP-Growth: emits one pattern per header-table row whose count is
# strictly greater than +min_support+, then recurses on the conditional
# header table of that row.
#
# +min_support+ is passed down to every recursive call (it used to fall
# back to 0 below the first level).
#
# @param header_table a header table, or an FpTree::FpTree whose
#   +header_table+ is used instead
# @param pattern_alpha [Pattern] the pattern being extended
# @param min_support [Numeric] exclusive lower bound on a row's count
def top_down_fp_growth(header_table, pattern_alpha=Pattern.new(), min_support=0)
  header_table = header_table.header_table if header_table.instance_of? FpTree::FpTree

  header_table.keys.each do |row|
    next unless header_table.count[row] > min_support

    # Output the pattern extended with the row's item
    pattern_beta = Pattern.new(pattern_alpha.content + [row], header_table.count[row])
    @pattern_set << pattern_beta

    # Mine the extended pattern against its conditional header table
    header_table_new = FpTree::HeaderTable.build(row, header_table)
    top_down_fp_growth(header_table_new, pattern_beta, min_support)
  end
end
|
88
|
+
|
89
|
+
end #class
|
60
90
|
|
61
91
|
end
|
62
92
|
end
|