tyler-collaborative_filter 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +686 -0
- data/README.textile +109 -0
- data/Rakefile +1 -0
- data/lib/boosters/simple_booster.rb +137 -0
- data/lib/collaborative_filter.rb +43 -0
- data/lib/collaborative_filter/config.rb +58 -0
- data/lib/collaborative_filter/content_booster.rb +51 -0
- data/lib/collaborative_filter/data_set.rb +75 -0
- data/lib/collaborative_filter/output.rb +13 -0
- data/lib/collaborative_filter/output/mysql_adapter.rb +42 -0
- data/lib/collaborative_filter/output/yaml_adapter.rb +13 -0
- data/lib/correlators/simple_svd.rb +55 -0
- data/lib/recommenders/simplest_recommender.rb +92 -0
- metadata +66 -0
data/README.textile
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
h1. CollaborativeFilter
|
2
|
+
|
3
|
+
Introduction goes here.
|
4
|
+
|
5
|
+
h2. Example
|
6
|
+
|
7
|
+
<pre>
|
8
|
+
<code>
|
9
|
+
CollaborativeFilter.filter(:logger => LOGGER) do |cf|
|
10
|
+
# We're using a low threshold setting because we're using a content booster
|
11
|
+
# for post processing. We keep it low so the content booster has plenty of
|
12
|
+
# data to work with. There is a second threshold in the content booster that
|
13
|
+
# is set high.
|
14
|
+
#
|
15
|
+
# The threshold of the content booster is set at 4.2, with a factor of 0.3
|
16
|
+
# and a single gene. So the lowest recommendation which could theoretically
|
17
|
+
# make it through the content booster would be:
|
18
|
+
#
|
19
|
+
# 4.2 - (5 - crossover) * factor =
|
20
|
+
# 4.2 - (5 - 2.5) * 0.3 = 3.45
|
21
|
+
#
|
22
|
+
# So we set it to 3.45. Anything lower than that will never be recommended.
|
23
|
+
|
24
|
+
cf.recommender CollaborativeFilter::SimplestRecommender, :threshold => 3.45, :max_per_user => 90
|
25
|
+
|
26
|
+
|
27
|
+
cf.output :type => :sql, :options => { :table_name => 'recommendations',
|
28
|
+
:mapping => { :user_id => :customer_id, :score => :relevance } }
|
29
|
+
|
30
|
+
cf.content_booster do |cb|
|
31
|
+
cb.booster CollaborativeFilter::SimpleBooster
|
32
|
+
|
33
|
+
# The crossover determines what we consider to be positive or negative.
|
34
|
+
# With a crossover of 2.5, a rating of 3 counts as +0.5, whereas a
|
35
|
+
# rating of 1 counts as -1.5.
|
36
|
+
cb.crossover 2.5
|
37
|
+
|
38
|
+
# The threshold is a quality control that determines what we allow to be
|
39
|
+
# output. A threshold of 4.2 means we won't recommend anything that we
|
40
|
+
# think the user will rate lower than 4.2.
|
41
|
+
cb.threshold 4.2
|
42
|
+
|
43
|
+
# The factor determines how much weight we give to content. A factor of
|
44
|
+
# 1 would be "full weight". In other words, if your average Superhero
|
45
|
+
# rating is -2, we will subtract 2 from all Superhero recommendations
|
46
|
+
# before thresholding them again. Whereas a factor of 0.5 would mean
|
47
|
+
# the same person would only have 1 subtracted from the Superhero
|
48
|
+
# recommendations.
|
49
|
+
cb.factor 0.3
|
50
|
+
|
51
|
+
# Genes determine content. You can have multiple of these blocks. Please
|
52
|
+
# be sure to knock the factor down for each gene, as they all count
|
53
|
+
# independently.
|
54
|
+
cb.gene :genres do |items|
|
55
|
+
items.map do |(id,type)|
|
56
|
+
i = type.constantize.find(id)
|
57
|
+
next i.genres.map(&:id) if i.respond_to?(:genres)
|
58
|
+
next [i.genre.id] if i.respond_to?(:genre) && i.genre
|
59
|
+
[]
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
|
65
|
+
cf.dataset :ratings do |ds|
|
66
|
+
ds.correlator CollaborativeFilter::SimpleSvd
|
67
|
+
|
68
|
+
# The cosine_similarity option determines what the minimum cosine similarity
|
69
|
+
# should be between two users to consider them similar. The
|
70
|
+
# max_similar_users option determines the maximum number of users we'll
|
71
|
+
# use for determining recommendations. 20 seems like a decent number for
|
72
|
+
# this. Be aware that changing this will significantly impact how long it
|
73
|
+
# takes to run.
|
74
|
+
ds.options :cosine_similarity => 0.96, :max_similar_users => 20
|
75
|
+
|
76
|
+
ds.users Rating.find(:all).map(&:customer).uniq
|
77
|
+
ds.items Rating.find(:all).map(&:rateable).uniq
|
78
|
+
ds.nodes do |m|
|
79
|
+
Rating.find(:all).each do |r|
|
80
|
+
score = r.not_interested ? 0.1 : r.score
|
81
|
+
m[ds.item_index(r.rateable_id, r.rateable_type), ds.user_index(r.customer_id)] = score
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
cf.dataset :purchases do |ds|
|
87
|
+
ds.correlator CollaborativeFilter::SimpleSvd
|
88
|
+
ds.options :cosine_similarity => 0.985, :max_similar_users => 20
|
89
|
+
|
90
|
+
ds.users Order.find(:all).map(&:customer).uniq
|
91
|
+
ds.items LineItem.find(:all).map(&:product).uniq
|
92
|
+
ds.nodes do |m|
|
93
|
+
Order.find(:all).each do |o|
|
94
|
+
o.line_items.each do |li|
|
95
|
+
next if li.cancelled?
|
96
|
+
m[ds.item_index(li.product_id), ds.user_index(o.customer_id)] = 5
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
</code>
|
103
|
+
</pre>
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
Many thanks to SmartFlix for letting me spend an inordinate amount of time on this.
|
108
|
+
|
109
|
+
Copyright (c) 2008 Tyler McMullen, released under the GPL license
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'rake'
|
@@ -0,0 +1,137 @@
|
|
1
|
+
# SimpleBooster - a content booster for CollaborativeFiltration
|
2
|
+
#
|
3
|
+
# The purpose of a content booster is to improve the purely collaborative output
|
4
|
+
# from a recommender. Collaborative filtration relies on the idea that if you
|
5
|
+
# have similar ratings/purchases as someone else in the past, you are likely to
|
6
|
+
# continue to in the future.
|
7
|
+
#
|
8
|
+
# The fallacy of this is obvious when you consider that Bob may really enjoy
|
9
|
+
# Superhero comics and Horror comics. Joe really enjoys Superhero comics and
|
10
|
+
# Humor comics. Depending on how many things they rate and other factors, Joe
|
11
|
+
# and Bob may still have a high correlation. However, Bob's love of Horror
|
12
|
+
# comics may infiltrate Joe's ratings despite the fact that Joe really dislikes
|
13
|
+
# them.
|
14
|
+
#
|
15
|
+
# So, a content booster allows us to nudge the value of your recommendations up
|
16
|
+
# or down. One strategy for using a content booster is to set the threshold on
|
17
|
+
# your recommender low (say... 2.5 or 3), but the threshold on the content
|
18
|
+
# booster high.
|
19
|
+
class CollaborativeFilter
  # Adjusts raw recommendation scores using per-user content profiles.
  #
  # A profile records, for every gene type (e.g. genres) and every gene value
  # (e.g. horror), how far on average the user's own ratings sit above or
  # below the crossover, scaled by the factor option. Each recommendation is
  # then shifted by the average modifier of the genes its item carries and
  # re-checked against the threshold option.
  class SimpleBooster
    # recs     - array of [user_id, [[item_id, score], ...]] pairs
    # datasets - hash of name => dataset; each dataset must respond to
    #            #users, #items and #m (m.col(user_index).to_a is read)
    # genes    - hash of name => gene; each gene must respond to #all(items)
    #            and return one array of gene values per item
    # options  - hash with :crossover, :threshold and :factor
    #
    # Returns a new array shaped like recs, containing only the
    # recommendations whose boosted score reaches the threshold.
    def run(recs, datasets, genes, options)
      @recs, @datasets, @genes, @options = recs, datasets, genes, options
      generate_profiles
    end

    # Builds the user profiles and applies them to @recs. See #run for the
    # shape of the inputs and of the returned value.
    def generate_profiles
      all_items = @datasets.inject([]) { |acc, (_name, ds)| acc.concat(ds.items) }.uniq

      # Position of every item in the master list, so the loops below do not
      # have to rescan all_items for each lookup.
      item_positions = {}
      all_items.each_with_index { |item, idx| item_positions[item] = idx }

      gene_lists = @genes.map { |_name, gene| gene.all(all_items) }

      CollaborativeFilter.log " Generating user_profs: #{Time.now}"
      profiles = build_profiles(gene_lists, item_positions)

      CollaborativeFilter.log " Boosting recommendations: #{Time.now}"

      # Profiles are looked up by user id rather than by position: the order
      # (and membership) of recs is not guaranteed to match the order in
      # which users appear in the datasets. A user without a profile keeps
      # unmodified scores and is only thresholded.
      @recs.map do |user_id, user_recs|
        profile = profiles[user_id] || []
        [user_id, boost_user_recs(user_recs, profile, gene_lists, item_positions)]
      end
    end

    private

    # Returns user_id => [ { gene_value => modifier }, ... ] with one hash per
    # gene type, in the same order as gene_lists.
    def build_profiles(gene_lists, item_positions)
      # user_id => [ { gene_value => [count, total_adjustment] }, ... ]
      tallies = {}

      # Iterate through each dataset, as we take all of them into account.
      @datasets.each do |_name, ds|
        CollaborativeFilter.log " Starting new dataset: #{Time.now}"

        ds.users.each_with_index do |user_id, user_idx|
          tally = (tallies[user_id] ||= Array.new(gene_lists.size) { {} })

          # The user's ratings are their column in the input matrix.
          ds.m.col(user_idx).to_a.each_with_index do |score, item_idx|
            next if score == 0

            item_pos = item_positions[ds.items[item_idx]]

            gene_lists.each_with_index do |gene_list, gene_type_idx|
              gene_values = gene_list[item_pos]
              next if gene_values.nil? || gene_values.empty?

              # Spread the rating's distance from the crossover evenly over
              # the item's gene values; to_f keeps integer scores from
              # truncating.
              adj = (score - @options[:crossover]).to_f / gene_values.size

              gene_values.each do |gene_value|
                entry = (tally[gene_type_idx][gene_value] ||= [0, 0.0])
                entry[0] += 1
                entry[1] += adj
              end
            end
          end
        end
      end

      # Turn each [count, total] tuple into an average scaled by the factor.
      # E.g. rating Superman 2 above the crossover and Spiderman 1 above it
      # gives [2, 3]; 3 / 2 = 1.5, and with a factor of 0.5 the recorded
      # modifier is 0.75. This limits the power of the content booster.
      profiles = {}
      tallies.each do |user_id, tally|
        profiles[user_id] = tally.map do |by_gene_value|
          modifiers = {}
          by_gene_value.each do |gene_value, (count, total)|
            modifiers[gene_value] = (total / count) * @options[:factor]
          end
          modifiers
        end
      end
      profiles
    end

    # Applies one user's profile to their [item_id, score] pairs and returns
    # the pairs that still meet the threshold, with scores capped at 5.
    def boost_user_recs(user_recs, profile, gene_lists, item_positions)
      boosted = user_recs.map do |item_id, score|
        item_pos = item_positions[item_id]

        profile.each_with_index do |modifiers, gene_type_idx|
          # An item unknown to every dataset has no genes to boost with.
          gene_values = item_pos && gene_lists[gene_type_idx][item_pos]
          next if gene_values.nil? || gene_values.empty?

          # An item can have several gene values for one gene type; use the
          # average of the modifiers the user actually has.
          known = gene_values.map { |gene_value| modifiers[gene_value] }.compact
          next if known.empty?

          score += known.inject(0.0) { |sum, mod| sum + mod } / known.size
        end

        next if score < @options[:threshold]
        [item_id, score > 5 ? 5 : score]
      end

      boosted.compact
    end
  end
end
|
137
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'gsl'
|
2
|
+
require 'collaborative_filter/data_set'
|
3
|
+
require 'collaborative_filter/config'
|
4
|
+
require 'collaborative_filter/content_booster'
|
5
|
+
require 'collaborative_filter/output'
|
6
|
+
require 'collaborative_filter/output/mysql_adapter'
|
7
|
+
require 'collaborative_filter/output/yaml_adapter'
|
8
|
+
require 'correlators/simple_svd'
|
9
|
+
require 'recommenders/simplest_recommender'
|
10
|
+
require 'boosters/simple_booster'
|
11
|
+
|
12
|
+
class CollaborativeFilter
  # Runs the whole pipeline.
  #
  # Yields the shared config object so the caller can declare datasets, a
  # recommender, an optional content booster and an output. Then runs every
  # dataset, runs the recommender over them, passes the result through the
  # content booster when one was configured, and hands the final
  # recommendations to Output.store.
  #
  # options - hash; :logger, when given, must respond to #info and receives
  #           progress messages.
  #
  # Raises RuntimeError when called without a block.
  def self.filter(options={})
    @@logger = options[:logger]

    log "Starting configuration: #{Time.now}"
    raise '#filter must be sent a block' unless block_given?

    yield config

    log "Starting correlations: #{Time.now}"
    config.datasets.each do |name, ds|
      log " Correlating '#{name}': #{Time.now}"
      ds.run
    end

    log "Starting recommender: #{Time.now}"
    recommendations = config.recommender.new.run(config.datasets, config.recommender_options)

    # The content booster is optional; without one the recommender's output
    # is stored as-is.
    booster = config.content_booster
    if booster
      log "Starting booster: #{Time.now}"
      recommendations = booster.run(recommendations, config.datasets)
    end

    log "Output: #{Time.now}"
    Output.store config.output, recommendations

    log "Done: #{Time.now}"
  end

  # Sends msg to the logger passed to .filter, if any. Does nothing when no
  # logger was given or when .filter has not been called yet.
  def self.log(msg)
    @@logger.info(msg) if defined?(@@logger) && @@logger
  end
end
|
42
|
+
|
43
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
class CollaborativeFilter

  # Returns the single Config instance shared by the whole process, creating
  # it on first use.
  def self.config
    @@config ||= Config.new
  end

  # Holds everything declared inside a CollaborativeFilter.filter block:
  # datasets, the recommender class and its options, the content booster and
  # the simple options declared with config_option.
  class Config
    class Error < StandardError; end

    # Declares a simple option. Defines an instance method named after the
    # option that returns the current value when called without an argument
    # (or with nil/false) and stores the argument otherwise. The default is
    # assigned to every new instance by #setup_defaults.
    def self.config_option(option, default=nil)
      @@config_defaults ||= {}
      @@config_defaults[option] = default

      class_eval <<-END
        def #{option}(input=nil)
          return @#{option} unless input
          @#{option} = input
        end
      END
    end

    config_option :item_genes, []
    config_option :output, { :type => :yaml }
    attr_reader :datasets, :recommender_options

    def initialize
      setup_defaults
      @datasets = {}
    end

    # Assigns every declared default to its instance variable. Array and Hash
    # defaults are copied so that mutating one instance's value does not
    # leak into the declared default or into other instances.
    def setup_defaults
      @@config_defaults.each do |name, default|
        value = case default
                when Array, Hash then default.dup
                else default
                end
        instance_variable_set("@#{name}", value)
      end
    end

    # Creates a new DataSet, yields it for configuration and registers it
    # under name. Raises Error when called without a block.
    def dataset(name)
      raise Error.new('#dataset requires a block') unless block_given?

      ds = CollaborativeFilter::DataSet.new
      yield ds
      @datasets[name] = ds
    end

    # With a block: creates a ContentBooster, yields it for configuration and
    # stores it. Without a block: returns the stored booster (nil when none
    # was configured).
    def content_booster
      return @content_booster unless block_given?

      cb = CollaborativeFilter::ContentBooster.new
      yield cb
      @content_booster = cb
    end

    # With an argument: stores the recommender class and its options.
    # Without: returns the stored recommender class.
    def recommender(input=nil, options={})
      return @recommender unless input
      @recommender = input
      @recommender_options = options
    end
  end
end
|
58
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
class CollaborativeFilter
  # Configuration holder for the content-boosting stage: which booster class
  # to use, its crossover/threshold/factor settings, and the named genes.
  #
  # booster, crossover, threshold and factor all work the same way: called
  # with an argument they store it, called without (or with nil/false) they
  # return the stored value.
  class ContentBooster
    class Error < StandardError; end

    attr_reader :genes

    def initialize
      @genes = {}
    end

    def booster(booster_class=nil)
      return @booster unless booster_class
      @booster = booster_class
    end

    # Registers a gene under name. The block receives the list of items and
    # is expected to return one array of gene values per item.
    # Raises Error when no block is given, since the gene would be unusable.
    def gene(name, &block)
      raise Error.new('#gene requires a block') unless block

      gene = Gene.new
      gene.finder = block
      @genes[name] = gene
    end

    def crossover(point=nil)
      return @crossover unless point
      @crossover = point
    end

    def threshold(input=nil)
      return @threshold unless input
      @threshold = input
    end

    def factor(input=nil)
      return @factor unless input
      @factor = input
    end

    # Instantiates the configured booster class and runs it with the
    # recommendations, the datasets, the registered genes and a hash of the
    # :crossover, :threshold and :factor settings. Returns whatever the
    # booster returns. Raises Error when no booster class has been set.
    def run(recommendations, datasets)
      raise Error.new('a booster class must be set before running') unless @booster

      options = { :crossover => @crossover, :threshold => @threshold, :factor => @factor }
      @booster.new.run(recommendations, datasets, @genes, options)
    end

    # Wraps the block given to ContentBooster#gene.
    class Gene
      attr_accessor :finder

      # Calls the finder with items and returns its result.
      def all(items)
        @finder[items]
      end
    end
  end
end
|
51
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
class CollaborativeFilter
  # One input matrix together with the users (columns) and items (rows) it
  # describes, the correlator class used on it and that correlator's options.
  #
  # users, items, nodes, options and correlator are combined getter/setters:
  # called with an argument they store it, called without they return the
  # stored value.
  class DataSet
    class Error < StandardError; end

    attr_writer :users, :items
    attr_accessor :m, :similarities

    # Sets the users from a list of objects responding to #id. Stored as bare
    # ids when every object has the same class, otherwise as [id, class name]
    # pairs. Raises Error on duplicates or fewer than two entries.
    def users(input=nil)
      return @users unless input
      @users = identifiers_for(input, 'users')
    end

    # Same as #users, for items.
    def items(input=nil)
      return @items unless input
      @items = identifiers_for(input, 'items')
    end

    # With a block: allocates an items x users matrix and yields it to be
    # filled in (m[item_index, user_index] = score). With an argument: uses
    # it as the matrix, converting it unless it already is a GSL::Matrix.
    # With neither: returns the current matrix.
    def nodes(input=nil)
      if block_given?
        unless @items && @users
          raise Error.new("users and items must be set before nodes")
        end
        @m = GSL::Matrix[@items.size,@users.size]
        yield @m
      elsif input
        @m = input.is_a?(GSL::Matrix) ? input : GSL::Matrix[*input]
      else
        @m
      end
    end

    def options(opts=nil)
      return @options unless opts
      @options = opts
    end

    def correlator(input=nil)
      return @correlator unless input
      @correlator = input
    end

    # Row index of the item, or nil when it is not part of this dataset.
    def item_index(id,type=nil)
      find_index(id,type,@items)
    end

    # Column index of the user, or nil when it is not part of this dataset.
    def user_index(id,type=nil)
      find_index(id,type,@users)
    end

    # Runs a new instance of the correlator class over the matrix and stores
    # its result in #similarities.
    def run
      @similarities = @correlator.new.run(@m, @users, @items, @options)
    end

    private

    # Validates input and converts it to the stored identifier form.
    # label is 'users' or 'items' and only affects the error messages.
    def identifiers_for(input, label)
      raise Error.new("all #{label} must be unique") if input.size != input.uniq.size
      raise Error.new("must have at least two #{label}") if input.size < 2

      if input.map(&:class).uniq.size == 1
        input.map(&:id)
      else
        input.map { |member| [member.id, member.class.to_s] }
      end
    end

    # A homogeneous collection is stored as bare ids, so a caller that always
    # passes a type (e.g. a polymorphic association whose rows happen to be
    # of one class) must still be able to find its entry: fall back to the
    # bare id when the [id, type] pair is not present. In that case the type
    # cannot be checked, because it was never stored.
    def find_index(id,type,collection)
      return collection.index(id) unless type
      collection.index([id,type]) || collection.index(id)
    end
  end
end
|