tyler-collaborative_filter 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile ADDED
@@ -0,0 +1,109 @@
1
+ h1. CollaborativeFilter
2
+
3
+ Introduction goes here.
4
+
5
+ h2. Example
6
+
7
+ <pre>
8
+ <code>
9
+ CollaborativeFilter.filter(:logger => LOGGER) do |cf|
10
+ # We're using a low threshold setting because we're using a content booster
11
+ # for post processing. We keep it low so the content booster has plenty of
12
+ # data to work with. There is a second threshold in the content booster that
13
+ # is set high.
14
+ #
15
+ # The threshold of the content booster is set at 4.2, with a factor of 0.3
16
+ # and a single gene. So the lowest recommendation which could theoretically
17
+ # make it through the content booster would be:
18
+ #
19
+ # 4.2 - (5 - crossover) * factor =
20
+ # 4.2 - (5 - 2.5) * 0.3 = 3.45
21
+ #
22
+ # So we set it to 3.45. Anything lower than that will never be recommended.
23
+
24
+ cf.recommender CollaborativeFilter::SimplestRecommender, :threshold => 3.45, :max_per_user => 90
25
+
26
+
27
+ cf.output :type => :sql, :options => { :table_name => 'recommendations',
28
+ :mapping => { :user_id => :customer_id, :score => :relevance } }
29
+
30
+ cf.content_booster do |cb|
31
+ cb.booster CollaborativeFilter::SimpleBooster
32
+
33
+ # The crossover determines what we consider to be positive or negative.
34
+ # With a crossover of 2.5, a rating of 3 counts as +0.5, whereas a
35
+ # rating of 1 counts as -1.5.
36
+ cb.crossover 2.5
37
+
38
+ # The threshold is a quality control that determines what we allow to be
39
+ # output. A threshold of 4.2 means we won't recommend anything that we
40
+ # think the user will rate lower than 4.2.
41
+ cb.threshold 4.2
42
+
43
+ # The factor determines how much weight we give to content. A factor of
44
+ # 1 would be "full weight". In other words, if your average Superhero
45
+ # rating is -2, we will subtract 2 from all Superhero recommendations
46
+ # before thresholding them again. Whereas a factor of 0.5 would mean
47
+ # the same person would only have 1 subtracted from the Superhero
48
+ # recommendations.
49
+ cb.factor 0.3
50
+
51
+ # Genes determine content. You can have multiple of these blocks. Please
52
+ # be sure to knock the factor down for each gene, as they all count
53
+ # independently.
54
+ cb.gene :genres do |items|
55
+ items.map do |(id,type)|
56
+ i = type.constantize.find(id)
57
+ next i.genres.map(&:id) if i.respond_to?(:genres)
58
+ next [i.genre.id] if i.respond_to?(:genre) && i.genre
59
+ []
60
+ end
61
+ end
62
+ end
63
+
64
+
65
+ cf.dataset :ratings do |ds|
66
+ ds.correlator CollaborativeFilter::SimpleSvd
67
+
68
+ # The cosine_similarity option determines what the minimum cosine similarity
69
+ # should be between two users to consider them similar. The
70
+ # max_similar_users option determines the maximum number of users we'll
71
+ # use for determining recommendations. 20 seems like a decent number for
72
+ # this. Be aware that changing this will significantly impact how long it
73
+ # takes to run.
74
+ ds.options :cosine_similarity => 0.96, :max_similar_users => 20
75
+
76
+ ds.users Rating.find(:all).map(&:customer).uniq
77
+ ds.items Rating.find(:all).map(&:rateable).uniq
78
+ ds.nodes do |m|
79
+ Rating.find(:all).each do |r|
80
+ score = r.not_interested ? 0.1 : r.score
81
+ m[ds.item_index(r.rateable_id, r.rateable_type), ds.user_index(r.customer_id)] = score
82
+ end
83
+ end
84
+ end
85
+
86
+ cf.dataset :purchases do |ds|
87
+ ds.correlator CollaborativeFilter::SimpleSvd
88
+ ds.options :cosine_similarity => 0.985, :max_similar_users => 20
89
+
90
+ ds.users Order.find(:all).map(&:customer).uniq
91
+ ds.items LineItem.find(:all).map(&:product).uniq
92
+ ds.nodes do |m|
93
+ Order.find(:all).each do |o|
94
+ o.line_items.each do |li|
95
+ next if li.cancelled?
96
+ m[ds.item_index(li.product_id), ds.user_index(o.customer_id)] = 5
97
+ end
98
+ end
99
+ end
100
+ end
101
+ end
102
+ </code>
103
+ </pre>
104
+
105
+
106
+
107
+ Many thanks to SmartFlix for letting me spend an inordinate amount of time on this.
108
+
109
+ Copyright (c) 2008 Tyler McMullen, released under the GPL license
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'rake'
@@ -0,0 +1,137 @@
1
+ # SimpleBooster - a content booster for CollaborativeFiltration
2
+ #
3
+ # The purpose of a content booster is to improve the purely collaborative output
4
+ # from a recommender. Collaborative filtration relies on the idea that if you
5
+ # have similar ratings/purchases as someone else in the past, you are likely to
6
+ # continue to in the future.
7
+ #
8
+ # The fallacy of this is obvious when you consider that Bob may really enjoy
9
+ # Superhero comics and Horror comics. Joe really enjoys Superhero comics and
10
+ # Humor comics. Depending on how many things they rate and other factors, Joe
11
+ # and Bob may still have a high correlation. However, Bob's love of Horror
12
+ # comics may infiltrate Joe's ratings despite the fact that Joe really dislikes
13
+ # them.
14
+ #
15
+ # So, a content booster allows us to nudge the value of your recommendations up
16
+ # or down. One strategy for using a content booster is to set the threshold on
17
+ # your recommender low (say... 2.5 or 3), but the threshold on the content
18
+ # booster high.
19
class CollaborativeFilter
  # Content booster that nudges collaborative recommendation scores up or down
  # according to how each user has historically rated items sharing the same
  # "genes" (genres, franchises, ...), then re-applies a quality threshold.
  class SimpleBooster
    # Upper bound applied to every boosted score.
    MAX_SCORE = 5

    # Boosts a set of raw recommendations.
    #
    # recs     - Array of [user_id, user_recs] pairs, where user_recs is an
    #            enumerable of [item_id, score] pairs.
    # datasets - Hash of name => dataset. Each dataset must respond to
    #            #users, #items and #m (a matrix whose #col(i).to_a yields the
    #            per-item scores of user i).
    # genes    - Hash of name => gene. Each gene must respond to #all(items)
    #            and return one array of gene values per item, in order.
    # options  - Hash with :crossover, :threshold and :factor.
    #
    # Returns an Array of [user_id, [[item_id, boosted_score], ...]] pairs in
    # the same order as +recs+; entries below the threshold are dropped.
    def run(recs, datasets, genes, options)
      @recs, @datasets, @genes, @options = recs, datasets, genes, options
      generate_profiles
    end

    # Builds the per-user gene profiles from every dataset and applies them to
    # the recommendations handed to #run. Returns the boosted recommendations.
    def generate_profiles
      # Master list of items across all datasets so profiles can span datasets.
      all_items = @datasets.inject([]) { |acc, (_name, ds)| acc.concat(ds.items) }.uniq

      # item => position in the master list; replaces repeated Array#index scans.
      item_index = {}
      all_items.each_with_index { |item, idx| item_index[item] = idx }

      # One list per gene type; each list holds an array of gene values per item.
      gene_lists = @genes.map { |_name, gene| gene.all(all_items) }

      CollaborativeFilter.log " Generating user_profs: #{Time.now}"
      modifiers = build_modifiers(gene_lists, item_index)

      CollaborativeFilter.log " Boosting recommendations: #{Time.now}"
      @recs.map do |user_id, user_recs|
        # Profiles are looked up by user id rather than by position, so the
        # order (or subset) of users in the recommendations does not matter.
        profile = modifiers[user_id] || []
        [user_id, boost(user_recs, profile, gene_lists, item_index)]
      end
    end

    private

    # Returns a Hash of user_id => [ { gene_value => modifier }, ... ] with one
    # hash per gene type. A modifier is the user's average distance from the
    # crossover for items carrying that gene value, scaled by the factor.
    def build_modifiers(gene_lists, item_index)
      tallies = {}

      @datasets.each do |_name, ds|
        CollaborativeFilter.log " Starting new dataset: #{Time.now}"

        ds.users.each_with_index do |user_id, user_idx|
          profile = (tallies[user_id] ||= [])

          # The user's column of the input matrix: one score per item.
          ds.m.col(user_idx).to_a.each_with_index do |score, item_idx|
            next if score == 0 # zero means "no rating"

            master_idx = item_index[ds.items[item_idx]]

            gene_lists.each_with_index do |gene_list, gene_type_idx|
              tally  = (profile[gene_type_idx] ||= {})
              values = gene_list[master_idx]
              next if values.nil? || values.empty?

              # Spread the adjustment evenly over the item's gene values;
              # to_f guards against integer truncation for non-float scores.
              adj = (score - @options[:crossover]).to_f / values.size

              values.each do |value|
                # [count of items seen, total adjustment] - averaged below.
                entry = (tally[value] ||= [0, 0])
                entry[0] += 1
                entry[1] += adj
              end
            end
          end
        end
      end

      modifiers = {}
      tallies.each do |user_id, profile|
        modifiers[user_id] = profile.map do |tally|
          scaled = {}
          (tally || {}).each do |value, (count, total)|
            scaled[value] = (total / count) * @options[:factor]
          end
          scaled
        end
      end
      modifiers
    end

    # Applies one user's profile to their raw recommendations and returns the
    # [item_id, score] pairs that still meet the threshold, capped at MAX_SCORE.
    def boost(user_recs, profile, gene_lists, item_index)
      boosted = user_recs.map do |item_id, score|
        master_idx = item_index[item_id]

        # Items unknown to every dataset have no genes; leave their score as is.
        if master_idx
          profile.each_with_index do |gene_mods, gene_type_idx|
            item_genes = gene_lists[gene_type_idx][master_idx]
            next if item_genes.nil? || item_genes.empty?

            # An item may carry several values for a gene type; average the
            # modifiers of those the user actually has an opinion on.
            known = item_genes.map { |g| gene_mods[g] }.compact
            next if known.empty?

            score += known.inject(0.0) { |sum, mod| sum + mod } / known.size
          end
        end

        next if score < @options[:threshold]
        [item_id, score > MAX_SCORE ? MAX_SCORE : score]
      end

      boosted.compact
    end
  end
end
137
+
@@ -0,0 +1,43 @@
1
+ require 'gsl'
2
+ require 'collaborative_filter/data_set'
3
+ require 'collaborative_filter/config'
4
+ require 'collaborative_filter/content_booster'
5
+ require 'collaborative_filter/output'
6
+ require 'collaborative_filter/output/mysql_adapter'
7
+ require 'collaborative_filter/output/yaml_adapter'
8
+ require 'correlators/simple_svd'
9
+ require 'recommenders/simplest_recommender'
10
+ require 'boosters/simple_booster'
11
+
12
class CollaborativeFilter
  # Runs the whole pipeline: configuration, correlation of every dataset,
  # recommendation, optional content boosting, and output.
  #
  # options - Hash; :logger may be any object responding to #info (optional).
  #
  # Yields the shared configuration object so the caller can set up datasets,
  # the recommender, the content booster and the output.
  #
  # Raises RuntimeError when no block is given.
  def self.filter(options={})
    @@logger = options[:logger]

    log "Starting configuration: #{Time.now}"
    raise '#filter must be sent a block' unless block_given?

    yield config

    log "Starting correlations: #{Time.now}"
    config.datasets.each do |name, ds|
      log " Correlating '#{name}': #{Time.now}"
      ds.run
    end

    log "Starting recommender: #{Time.now}"
    recommendations = config.recommender.new.run(config.datasets, config.recommender_options)

    # The content booster is a post-processing step; skip it when the
    # configuration block did not set one up instead of failing on nil.
    booster = config.content_booster
    if booster
      log "Starting booster: #{Time.now}"
      recommendations = booster.run(recommendations, config.datasets)
    end

    log "Output: #{Time.now}"
    Output.store config.output, recommendations

    log "Done: #{Time.now}"
  end

  # Sends +msg+ to the logger passed to .filter, if any. Safe to call before
  # .filter has ever run (the logger is simply treated as absent).
  def self.log(msg)
    logger = class_variable_defined?(:@@logger) ? @@logger : nil
    logger.info(msg) if logger
  end
end
42
+
43
+
@@ -0,0 +1,58 @@
1
class CollaborativeFilter

  # Returns the shared, lazily created configuration object.
  def self.config
    @@config ||= Config.new
  end

  # Holds everything set up inside the CollaborativeFilter.filter block:
  # datasets, the recommender (and its options), the content booster, the
  # output settings and any simple options declared with .config_option.
  class Config
    # Declares a combined getter/setter named +option+ with a default value.
    # Calling the generated method without an argument returns the current
    # value; calling it with a (truthy) argument stores that value.
    def self.config_option(option, default=nil)
      @@config_defaults ||= {}
      @@config_defaults[option] = default

      class_eval <<-END, __FILE__, __LINE__ + 1
        def #{option}(input=nil)
          return @#{option} unless input
          @#{option} = input
        end
      END
    end

    # Assigns every declared default to this instance. Mutable defaults are
    # copied so that one instance mutating its value (e.g. item_genes << x)
    # cannot leak into the defaults seen by other instances.
    def setup_defaults
      @@config_defaults.each do |name, default|
        value = case default
                when Array, Hash, String then default.dup
                else default
                end
        instance_variable_set("@#{name}".to_sym, value)
      end
    end

    config_option :item_genes, []
    config_option :output, { :type => :yaml }
    attr_reader :datasets, :recommender_options

    def initialize
      setup_defaults
      @datasets = {}
    end

    # Creates a new DataSet, yields it for configuration and registers it
    # under +name+. Raises Config::Error when no block is given.
    def dataset(name)
      raise Error, '#dataset requires a block' unless block_given?

      ds = CollaborativeFilter::DataSet.new
      yield ds
      @datasets[name] = ds
    end

    # Without a block, returns the configured ContentBooster (nil if none).
    # With a block, creates a ContentBooster, yields it and stores it.
    def content_booster
      return @content_booster unless block_given?

      cb = CollaborativeFilter::ContentBooster.new
      yield cb
      @content_booster = cb
    end

    # Without arguments, returns the recommender class. With arguments,
    # stores the recommender class and the options to run it with.
    def recommender(input=nil, options={})
      return @recommender unless input
      @recommender = input
      @recommender_options = options
    end

    class Error < StandardError; end
  end
end
58
+
@@ -0,0 +1,51 @@
1
class CollaborativeFilter
  # Configuration wrapper around a booster implementation. It collects the
  # booster class, its numeric options and the gene finders, then hands them
  # to a fresh booster instance in #run.
  class ContentBooster
    # Hash of gene name => Gene.
    attr_reader :genes

    def initialize
      @genes = {}
    end

    # Without an argument, returns the booster class; with one, stores it.
    def booster(booster_class=nil)
      return @booster unless booster_class
      @booster = booster_class
    end

    # Registers a gene called +name+ whose values are computed by the block.
    # The block receives the list of items and is expected to return one array
    # of gene values per item.
    #
    # Raises ArgumentError when no block is given, rather than storing a nil
    # finder that would only fail later when the booster runs.
    def gene(name, &block)
      raise ArgumentError, '#gene requires a block' unless block

      gene = Gene.new
      gene.finder = block
      @genes[name] = gene
    end

    # Without an argument, returns the crossover point; with one, stores it.
    def crossover(point=nil)
      return @crossover unless point
      @crossover = point
    end

    # Without an argument, returns the output threshold; with one, stores it.
    def threshold(input=nil)
      return @threshold unless input
      @threshold = input
    end

    # Without an argument, returns the content weight factor; with one,
    # stores it.
    def factor(input=nil)
      return @factor unless input
      @factor = input
    end

    # Instantiates the configured booster class and runs it with the given
    # recommendations and datasets plus the collected genes and options.
    # Returns whatever the booster's #run returns.
    def run(recommendations, datasets)
      options = { :crossover => @crossover, :threshold => @threshold, :factor => @factor }
      @booster.new.run(recommendations, datasets, @genes, options)
    end

    # A named content dimension; +finder+ is the callable supplied to #gene.
    class Gene
      attr_accessor :finder

      # Invokes the finder with +items+ and returns its result.
      def all(items)
        @finder.call(items)
      end
    end
  end
end
51
+
@@ -0,0 +1,75 @@
1
class CollaborativeFilter
  # One input matrix (items x users) together with the user and item lists
  # that label its axes, the correlator to run on it and the correlator's
  # options.
  class DataSet
    # Readers for users/items are the explicit methods below; only the plain
    # writers are generated here to avoid redefining methods.
    attr_writer :users, :items
    attr_accessor :m
    attr_accessor :similarities

    # Without an argument, returns the stored user keys. With a list of
    # objects, validates it and stores each object's id (or [id, class name]
    # when the list mixes classes).
    #
    # Raises DataSet::Error on duplicates or fewer than two entries.
    def users(input=nil)
      return @users unless input
      @users = keys_for(input, 'users')
    end

    # Same contract as #users, for the item axis.
    def items(input=nil)
      return @items unless input
      @items = keys_for(input, 'items')
    end

    # With a block, allocates an items x users matrix and yields it to be
    # filled in. Otherwise stores +input+, converting it to a GSL::Matrix
    # unless it already is one.
    def nodes(input=nil)
      if block_given?
        @m = GSL::Matrix[@items.size,@users.size]
        yield @m
      else
        @m = input.is_a?(GSL::Matrix) ? input : GSL::Matrix[*input]
      end
    end

    # Without an argument, returns the correlator options; with one, stores it.
    def options(opts=nil)
      return @options unless opts
      @options = opts
    end

    # Without an argument, returns the correlator class; with one, stores it.
    def correlator(input=nil)
      return @correlator unless input
      @correlator = input
    end

    # Row index of the item with the given id (and optional class name), or
    # nil when it is not part of this dataset.
    def item_index(id,type=nil)
      find_index(id,type,@items)
    end

    # Column index of the user with the given id (and optional class name),
    # or nil when it is not part of this dataset.
    def user_index(id,type=nil)
      find_index(id,type,@users)
    end

    # Runs a new correlator instance over the matrix and stores its result
    # in #similarities.
    def run
      @similarities = @correlator.new.run(@m, @users, @items, @options)
    end

    private

    # Validates +input+ and converts it to the keys stored for an axis.
    # +label+ ("users" or "items") is only used in the error messages.
    def keys_for(input, label)
      raise Error, "all #{label} must be unique" if input.size != input.uniq.size
      raise Error, "must have at least two #{label}" if input.size < 2

      if input.map(&:class).uniq.size == 1
        input.map(&:id)
      else
        input.map { |obj| [obj.id, obj.class.to_s] }
      end
    end

    # A homogeneous collection is stored as bare ids, so a caller that
    # supplies a type anyway (as polymorphic callers do) must still match.
    # Try the typed key first, then fall back to the bare id.
    def find_index(id,type,collection)
      typed = type ? collection.index([id,type]) : nil
      typed || collection.index(id)
    end

    class Error < StandardError; end
  end
end