commendo 1.2.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +3 -0
  3. data/bin/commendo-create-mysql-db +3 -0
  4. data/bin/commendo-create.sql +99 -0
  5. data/bin/commendo-load-tsv +11 -5
  6. data/bin/commendo-load-tsv-mysql.rb +43 -0
  7. data/bin/commendo-time-mysql.rb +31 -0
  8. data/commendo.gemspec +4 -2
  9. data/lib/commendo.rb +24 -0
  10. data/lib/commendo/configuration.rb +25 -0
  11. data/lib/commendo/content_set.rb +13 -182
  12. data/lib/commendo/mysql-backed/content_set.rb +152 -0
  13. data/lib/commendo/mysql-backed/tag_set.rb +81 -0
  14. data/lib/commendo/mysql-backed/weighted_group.rb +40 -0
  15. data/lib/commendo/redis-backed/content_set.rb +194 -0
  16. data/lib/commendo/{pair_comparison.lua → redis-backed/pair_comparison.lua} +0 -0
  17. data/lib/commendo/{similarity.lua → redis-backed/similarity.lua} +0 -0
  18. data/lib/commendo/redis-backed/tag_set.rb +54 -0
  19. data/lib/commendo/redis-backed/weighted_group.rb +54 -0
  20. data/lib/commendo/tag_set.rb +6 -42
  21. data/lib/commendo/version.rb +1 -1
  22. data/lib/commendo/weighted_group.rb +7 -41
  23. data/lib/mysql2/client.rb +17 -0
  24. data/model 2.mwb +0 -0
  25. data/sql_model.mwb +0 -0
  26. data/test/configuration_test.rb +71 -0
  27. data/test/mysql_content_set_test.rb +40 -0
  28. data/test/mysql_tag_set_test.rb +34 -0
  29. data/test/mysql_weighted_group_test.rb +54 -0
  30. data/test/redis_content_set_test.rb +57 -0
  31. data/test/redis_tag_set_test.rb +31 -0
  32. data/test/redis_weighted_group_test.rb +49 -0
  33. data/test/tests_for_content_sets.rb +379 -0
  34. data/test/tests_for_tag_sets.rb +130 -0
  35. data/test/tests_for_weighted_groups.rb +106 -0
  36. metadata +72 -12
  37. data/test/content_set_test.rb +0 -408
  38. data/test/tag_set_test.rb +0 -128
  39. data/test/weighted_group_test.rb +0 -191
@@ -0,0 +1,152 @@
1
module Commendo
  module MySqlBacked

    # Content set persisted in MySQL.
    #
    # Every (key_base, resource, group) membership is one row of the
    # +Resources+ table carrying a score. Similarity between resources is not
    # precomputed: it is derived by the SQL issued from #similar_to.
    class ContentSet

      attr_accessor :mysql, :key_base, :tag_set

      # Row limit applied by #similar_to when the caller does not supply one.
      DEFAULT_LIMIT = 1000

      # @param key_base [String] value stored in the +keybase+ column of every row
      # @param tag_set [#empty?, #matches, nil] optional tag set used by #filtered_similar_to
      def initialize(key_base, tag_set = nil)
        # merge builds a new hash, so the hash handed back by the configuration is left untouched.
        config_hash = Commendo.config.to_hash.merge(flags: Mysql2::Client::MULTI_STATEMENTS)
        @mysql = Mysql2::Client.new(config_hash)
        @key_base = key_base
        @tag_set = tag_set
        @threshold = nil
      end

      # Adds several resources to one group.
      #
      # @param group [String] group name
      # @param resources [Array<String, Array(String, Numeric)>] resource names,
      #   or [name, score] pairs; a bare name gets a score of 1
      def add_by_group(group, *resources)
        scored = resources.map { |entry| entry.is_a?(Array) ? entry : [entry, 1] }
        scored.each { |(resource, score)| add_single(resource, group, score) }
      end

      # Adds one resource to several groups. Adding to a group the resource is
      # already in increments the stored score (see the INSERT statement).
      #
      # @param resource [String] resource name
      # @param groups [Array<String, Array(String, Numeric)>] group names,
      #   or [name, score] pairs; a bare name gets a score of 1
      def add(resource, *groups)
        scored = groups.map { |entry| entry.is_a?(Array) ? entry : [entry, 1] }
        statement = add_single_prepared_query
        # The score is bound twice: once for the INSERT, once for the duplicate-key increment.
        scored.each { |(group, score)| statement.execute(@key_base, resource, group, score, score) }
      end

      # Adds one resource to one group with an explicit score.
      def add_single(resource, group, score)
        add(resource, [group, score])
      end

      # Same as #add: this backend has no separate calculation step.
      def add_and_calculate(resource, *groups)
        add(resource, *groups)
      end

      # @return [Array<String>] distinct group names the resource belongs to
      def groups(resource)
        groups_prepared_query.execute(@key_base, resource).map { |row| row['groupname'] }
      end

      # Removes every row of the resource under this key base.
      def delete(resource)
        delete_prepared_query.execute(@key_base, resource)
      end

      # Only records the threshold later applied by #similar_to; a threshold of
      # 0 is stored as nil, which selects the query without a threshold.
      #
      # @return [Numeric, nil] the stored threshold
      def calculate_similarity(threshold = nil)
        @threshold = threshold == 0 ? nil : threshold
      end

      # Intentionally a no-op: this class does not store per-resource
      # similarity, so there is nothing to recalculate.
      def calculate_similarity_for_resource(resource, threshold = 0)
      end

      # Finds the resources sharing groups with the given resource(s).
      #
      # @param resource [String, Array<String>] one resource name or several
      # @param limit [Integer] maximum number of rows requested from MySQL
      # @return [Array<Hash>] hashes with :resource and :similarity (rounded to
      #   3 places per row). For several input resources the per-row
      #   similarities of each match are summed and the list is ordered by
      #   similarity, then name, descending.
      # @raise [ArgumentError, TypeError] when limit is not an integer value
      def similar_to(resource, limit = DEFAULT_LIMIT)
        resources = resource.is_a?(Array) ? resource : [resource]
        # An empty list would otherwise render "IN ()", which MySQL rejects.
        return [] if resources.empty?

        sql = if @threshold.nil?
                similar_to_query(@key_base, resources, limit)
              else
                similar_to_with_threshold_query(@key_base, resources, @threshold, limit)
              end
        similar = @mysql.query(sql).map { |row| {resource: row['similar'], similarity: row['similarity'].round(3)} }
        return similar if resources.length == 1

        totals = similar.group_by { |match| match[:resource] }.map do |name, matches|
          {resource: name, similarity: matches.inject(0.0) { |sum, match| sum + match[:similarity] }}
        end
        totals.sort_by { |total| [total[:similarity], total[:resource]] }.reverse
      end

      # Like #similar_to, optionally filtered through the tag set.
      #
      # @param resource [String, Array<String>]
      # @param options [Hash] :include and :exclude tag lists, :limit
      # @return [Array<Hash>] same shape as #similar_to
      def filtered_similar_to(resource, options = {})
        unfiltered = @tag_set.nil? || (options[:include].nil? && options[:exclude].nil?) || @tag_set.empty?
        return similar_to(resource, options[:limit] || DEFAULT_LIMIT) if unfiltered

        similar = similar_to(resource)
        limit = options[:limit] || similar.length
        filtered = []
        similar.each do |match|
          break if filtered.length >= limit
          filtered << match if @tag_set.matches(match[:resource], options[:include], options[:exclude])
        end
        filtered
      end

      # Removes the resource from the named groups; does nothing when no group is given.
      def remove_from_groups(resource, *groups)
        # An empty list would otherwise render "IN ()", which MySQL rejects.
        return if groups.empty?
        @mysql.query(remove_from_groups_prepared_query(@key_base, resource, groups))
      end

      # Removes the resource from the groups, then runs the (no-op) recalculation hook.
      def remove_from_groups_and_calculate(resource, *groups)
        remove_from_groups(resource, *groups)
        calculate_similarity_for_resource(resource)
      end

      private

      # Renders a value as an escaped, single-quoted SQL string literal.
      def quote(value)
        "'#{@mysql.escape(value.to_s)}'"
      end

      # Renders values as a comma separated list of escaped SQL string literals.
      def quoted_list(values)
        values.map { |value| quote(value) }.join(',')
      end

      def add_single_prepared_query
        @add_single_prepared_query ||= @mysql.prepare('INSERT INTO Resources (keybase, name, groupname, score) VALUES (?,?,?,?) ON DUPLICATE KEY UPDATE score = score + ?')
      end

      def groups_prepared_query
        @groups_prepared_query ||= @mysql.prepare('SELECT DISTINCT groupname FROM Resources WHERE keybase=? AND name=?')
      end

      def delete_prepared_query
        @delete_prepared_query ||= @mysql.prepare('DELETE FROM Resources WHERE keybase = ? AND name = ?')
      end

      # Despite the name this returns SQL text, not a prepared statement,
      # because of the variable-length IN list; every value is escaped.
      def remove_from_groups_prepared_query(keybase, name, groups)
        "
        DELETE FROM Resources WHERE keybase = #{quote(keybase)} AND name = #{quote(name)} AND groupname IN (#{quoted_list(groups)})"
      end

      # SQL for #similar_to without a threshold. Similarity is the summed
      # scores over shared groups divided by the two UnionScores entries.
      def similar_to_query(keybase, resources, limit)
        <<-SQL
          SELECT similar, intersect_score, l_union, r_union, intersect_score / (l_union + r_union) AS similarity
          FROM (
            SELECT r.name AS similar,
              SUM(l.score + r.score) AS intersect_score,
              l_us.union_score AS l_union,
              r_us.union_score AS r_union
            FROM Resources AS l
            JOIN Resources AS r ON l.keybase = r.keybase AND l.groupname = r.groupname
            JOIN UnionScores as l_us ON l_us.keybase = l.keybase AND l_us.name = l.name
            JOIN UnionScores as r_us ON r_us.keybase = r.keybase AND r_us.name = r.name
            WHERE l.keybase = #{quote(keybase)}
              AND l.name IN (#{quoted_list(resources)})
              AND l.name <> r.name
            GROUP BY l.name, r.name
          ) AS similar_resources
          ORDER BY similarity DESC, similar DESC
          LIMIT #{Integer(limit)}
        SQL
      end

      # SQL for #similar_to when a threshold is set: same ratio, computed from
      # per-resource score sums, keeping only rows above the threshold.
      def similar_to_with_threshold_query(keybase, resources, threshold, limit)
        <<-SQL
          SELECT similar, intersect_score, l_union, r_union, similarity FROM (
            SELECT similar, intersect_score, l_union, r_union, intersect_score / (l_union + r_union) AS similarity FROM (
              SELECT r.name AS similar,
                SUM(l.score + r.score) AS intersect_score,
                (SELECT SUM(score) FROM Resources WHERE keybase = l.keybase AND name = l.name) AS l_union,
                (SELECT SUM(score) FROM Resources WHERE keybase = r.keybase AND name = r.name) AS r_union
              FROM Resources AS l
              JOIN Resources AS r ON l.keybase = r.keybase AND l.groupname = r.groupname
              WHERE l.keybase = #{quote(keybase)}
                AND l.name IN (#{quoted_list(resources)})
                AND l.name <> r.name
              GROUP BY l.name, r.name
            ) AS similar_resources
          ) AS similar
          WHERE similarity > #{Float(threshold)}
          ORDER BY similarity DESC, similar DESC
          LIMIT #{Integer(limit)}
        SQL
      end

    end

  end
end
@@ -0,0 +1,81 @@
1
module Commendo
  module MySqlBacked
    # Tags attached to resources, stored as (keybase, name, tag) rows of the
    # +Tags+ table and accessed through memoized prepared statements.
    class TagSet

      attr_accessor :mysql, :key_base

      # @param key_base [String] value bound to the +keybase+ column in every statement
      def initialize(key_base)
        @mysql = Mysql2::Client.new(Commendo.config.to_hash)
        @key_base = key_base
      end

      # @return [Boolean] true when no tag row exists for this key base
      def empty?
        empty_prepared_query.execute(@key_base).count.zero?
      end

      # @return [Array<String>] tags stored for the resource
      def get(resource)
        get_tags_prepared_query.execute(@key_base, resource).map { |row| row['tag'] }
      end

      # Adds tags to the resource, keeping the existing ones; does nothing
      # when no tag is given.
      # NOTE(review): `transaction` is assumed to come from the project's
      # Mysql2::Client extension — confirm.
      def add(resource, *tags)
        return if tags.empty?
        @mysql.transaction { |_client| insert_tags(resource, tags) }
      end

      # Replaces the resource's tags with the given ones (none given clears them).
      def set(resource, *tags)
        @mysql.transaction do |_client|
          delete(resource)
          insert_tags(resource, tags) unless tags.empty?
        end
      end

      # Decides whether a resource passes an include/exclude tag filter.
      #
      # @param include [Array<String>, nil] passes when nil, empty, or sharing a tag with the resource
      # @param exclude [Array<String>, nil] fails when it shares a tag with the resource
      # @return [Boolean]
      def matches(resource, include, exclude = [])
        current = get(resource)
        included = include.nil? || include.empty? || !(current & include).empty?
        excluded = !(exclude.nil? || exclude.empty?) && !(current & exclude).empty?
        included && !excluded
      end

      # Deletes the listed tags of the resource, or all of its tags when none is listed.
      def delete(resource, *tags)
        return delete_all_tags_prepared_query.execute(@key_base, resource) if tags.empty?
        tags.each { |tag| delete_tags_prepared_query.execute(@key_base, resource, tag) }
      end

      private

      # Inserts one row per tag; INSERT IGNORE is used by the statement.
      def insert_tags(resource, tags)
        statement = insert_prepared_query
        tags.each { |tag| statement.execute(@key_base, resource, tag) }
      end

      def get_tags_prepared_query
        @get_tags_prepared_query ||= @mysql.prepare('SELECT tag FROM Tags t WHERE keybase = ? AND name = ?')
      end

      def delete_all_tags_prepared_query
        @delete_all_tags_prepared_query ||= @mysql.prepare('DELETE FROM Tags WHERE keybase = ? AND name = ?')
      end

      def delete_tags_prepared_query
        @delete_tags_prepared_query ||= @mysql.prepare('DELETE FROM Tags WHERE keybase = ? AND name = ? AND tag = ?')
      end

      def insert_prepared_query
        @insert_prepared_query ||= @mysql.prepare('INSERT IGNORE INTO Tags (keybase, name, tag) VALUES (?,?,?)')
      end

      def empty_prepared_query
        @empty_prepared_query ||= @mysql.prepare('SELECT tag FROM Tags WHERE keybase = ? LIMIT 1')
      end

    end
  end
end
81
+
@@ -0,0 +1,40 @@
1
module Commendo
  module MySqlBacked

    # Combines the similarity results of several content sets, scaling each
    # set's scores by a weight before adding them up per resource.
    class WeightedGroup

      attr_accessor :mysql, :content_sets, :key_base, :tag_set

      # @param key_base [String]
      # @param content_sets [Array<Hash>] entries read as +entry[:cs]+ (an object
      #   responding to +similar_to+) and +entry[:weight]+ (a multiplier)
      #
      # The tag set is not a constructor argument; it is assigned through the
      # +tag_set+ accessor.
      def initialize(key_base, *content_sets)
        @mysql = Mysql2::Client.new(Commendo.config.to_hash)
        @key_base = key_base
        @content_sets = content_sets
      end

      # Weighted similarity across all content sets.
      #
      # @param resource [Object] passed unchanged to every content set
      # @param limit [Integer] maximum number of results; 0 or less means all
      # @return [Array<Hash>] hashes with :resource and :similarity, ordered by
      #   similarity, then resource, descending
      def similar_to(resource, limit = 0)
        weighted = []
        @content_sets.each do |entry|
          entry[:cs].similar_to(resource).each do |match|
            # Each weighted score is rounded before the per-resource totals are built.
            weighted << {resource: match[:resource], similarity: (match[:similarity] * entry[:weight]).round(3)}
          end
        end

        totals = weighted.group_by { |match| match[:resource] }.map do |name, matches|
          {resource: name, similarity: matches.inject(0.0) { |sum, match| sum + match[:similarity] }}
        end
        ranked = totals.sort_by { |total| [total[:similarity], total[:resource]] }.reverse

        limit > 0 ? ranked.first(limit) : ranked
      end

      # Like #similar_to, keeping only resources accepted by the tag set when a
      # tag set is assigned and an :include or :exclude option is given.
      #
      # @param options [Hash] :include, :exclude, :limit
      # @return [Array<Hash>] same shape as #similar_to
      def filtered_similar_to(resource, options = {})
        no_filter = @tag_set.nil? || (options[:include].nil? && options[:exclude].nil?)
        return similar_to(resource, options[:limit] || 0) if no_filter

        candidates = similar_to(resource)
        limit = options[:limit] || candidates.length
        accepted = []
        candidates.each do |candidate|
          break if accepted.length >= limit
          accepted << candidate if @tag_set.matches(candidate[:resource], options[:include], options[:exclude])
        end
        accepted
      end

    end

  end
end
@@ -0,0 +1,194 @@
1
module Commendo
  module RedisBacked

    # Content set persisted in Redis sorted sets.
    #
    # Keys are derived from +key_base+:
    #   <key_base>:groups:<group>        members are resources, scored
    #   <key_base>:resources:<resource>  members are groups, scored
    #   <key_base>:similar:<resource>    read by #similar_to, highest score first
    #   <key_base>:tmp:<uuid>            short-lived union results
    class ContentSet

      attr_accessor :redis, :key_base, :tag_set

      # Value compared against the result of the similarity Lua script; when
      # the script returns it, the pair-by-pair path driven from Ruby is used.
      SET_TOO_LARGE_FOR_LUA = 999

      # @param key_base [String] prefix of every Redis key used by this set
      # @param tag_set [#empty?, #matches, nil] optional tag set used by #filtered_similar_to
      def initialize(key_base, tag_set = nil)
        @redis = Redis.new(host: Commendo.config.host, port: Commendo.config.port, db: Commendo.config.database, timeout: 120)
        @key_base = key_base
        @tag_set = tag_set
      end

      # Adds several resources to one group.
      #
      # @param resources [Array<String, Array(String, Numeric)>] resource names,
      #   or [name, score] pairs; a bare name gets a score of 1
      def add_by_group(group, *resources)
        resources.each do |entry|
          resource, score = entry.kind_of?(Array) ? entry : [entry, 1]
          add_single(resource, group, score)
        end
      end

      # Adds one resource to several groups.
      #
      # @param groups [Array<String, Array(String, Numeric)>] group names,
      #   or [name, score] pairs; a bare name gets a score of 1
      def add(resource, *groups)
        groups.each do |entry|
          group, score = entry.kind_of?(Array) ? entry : [entry, 1]
          add_single(resource, group, score)
        end
      end

      # Increments the score of the membership in both directions
      # (group -> resource and resource -> group).
      def add_single(resource, group, score)
        redis.zincrby(group_key(group), score, resource)
        redis.zincrby(resource_key(resource), score, group)
      end

      # Adds the memberships, then recalculates similarity for the resource with threshold 0.
      def add_and_calculate(resource, *groups)
        add(resource, *groups)
        calculate_similarity_for_resource(resource, 0)
      end

      # @return [Array<String>] groups the resource belongs to
      def groups(resource)
        redis.zrange(resource_key(resource), 0, -1)
      end

      # Removes the resource from the similarity sets of everything it is
      # similar to, then drops its own similarity and resource keys.
      def delete(resource)
        similar_to(resource).each do |other|
          redis.zrem(similarity_key(other[:resource]), "#{resource}")
        end
        #TODO delete from groups?
        redis.del(similarity_key(resource))
        redis.del(resource_key(resource))
      end

      # Recalculates similarity for every resource key under this key base.
      #
      # @param threshold [Numeric] passed through to the Lua scripts
      # @yield [key, index, total] before each resource is processed
      def calculate_similarity(threshold = 0)
        #TODO make this use scan for scaling
        # The prefix is escaped and anchored so regex metacharacters in
        # key_base cannot corrupt the extracted resource name.
        prefix = /\A#{Regexp.escape(resource_key_base)}:/
        keys = redis.keys("#{resource_key_base}:*")
        keys.each_with_index do |key, i|
          resource = key.sub(prefix, '')
          redis.del(similarity_key(resource))
          yield(key, i, keys.length) if block_given?
          completed = redis.eval(similarity_lua, keys: [key], argv: [tmp_key_base, resource_key_base, similar_key_base, group_key_base, threshold])
          calculate_similarity_for_key_resource(key, resource, threshold) if completed == SET_TOO_LARGE_FOR_LUA
        end
      end

      # Recalculates similarity for a single resource, pair by pair.
      def calculate_similarity_for_resource(resource, threshold)
        calculate_similarity_for_key_resource(resource_key(resource), resource, threshold)
      end

      # Reads stored similarities, highest first.
      #
      # @param resource [String, Array<String>] one resource, or several whose
      #   similarity sets are unioned first
      # @param limit [Integer] maximum number of results; 0 returns everything
      #   (the range end becomes -1)
      # @return [Array<Hash>] hashes with :resource and :similarity (Float)
      def similar_to(resource, limit = 0)
        finish = limit - 1
        scored = if resource.kind_of?(Array)
                   union_similar(resource, finish)
                 else
                   redis.zrevrange(similarity_key(resource), 0, finish, with_scores: true)
                 end
        scored.map { |(name, score)| {resource: name, similarity: score.to_f} }
      end

      # Like #similar_to, optionally filtered through the tag set.
      #
      # @param options [Hash] :include and :exclude tag lists, :limit
      # @return [Array<Hash>] same shape as #similar_to
      def filtered_similar_to(resource, options = {})
        unfiltered = @tag_set.nil? || (options[:include].nil? && options[:exclude].nil?) || @tag_set.empty?
        return similar_to(resource, options[:limit] || 0) if unfiltered

        similar = similar_to(resource)
        limit = options[:limit] || similar.length
        filtered = []
        similar.each do |match|
          break if filtered.length >= limit
          filtered << match if @tag_set.matches(match[:resource], options[:include], options[:exclude])
        end
        filtered
      end

      # @return [String] key of the similarity set of the resource
      def similarity_key(resource)
        "#{similar_key_base}:#{resource}"
      end

      # Removes the memberships in both directions; does nothing when no group is given.
      def remove_from_groups(resource, *groups)
        # Without members there is nothing to send to ZREM.
        return groups if groups.empty?
        redis.zrem(resource_key(resource), groups)
        groups.each { |group| redis.zrem(group_key(group), resource) }
      end

      # Removes the memberships, then recalculates similarity for the resource with threshold 0.
      def remove_from_groups_and_calculate(resource, *groups)
        remove_from_groups(resource, *groups)
        calculate_similarity_for_resource(resource, 0)
      end

      private

      # Unions the similarity sets of several resources into a temporary key
      # and reads the top of it; the temporary key is removed even on failure.
      def union_similar(resources, finish)
        return [] if resources.empty?
        tmp_key = "#{key_base}:tmp:#{SecureRandom.uuid}"
        begin
          redis.zunionstore(tmp_key, resources.map { |res| similarity_key(res) })
          redis.zrevrange(tmp_key, 0, finish, with_scores: true)
        ensure
          redis.del(tmp_key)
        end
      end

      # Resets the resource's similarity set and runs the pair comparison
      # script against every resource sharing at least one group with it.
      def calculate_similarity_for_key_resource(key, resource, threshold)
        groups = groups(resource)
        return if groups.empty?
        candidates = resources_in_groups(groups)
        redis.del(similarity_key(resource))
        candidates.each do |to_compare|
          next if resource == to_compare
          redis.eval(pair_comparison_lua, keys: [key, resource_key(to_compare), similarity_key(resource), similarity_key(to_compare)], argv: [tmp_key_base, resource, to_compare, threshold])
        end
      end

      # Members of the union of the given groups; the temporary union key is
      # removed even on failure.
      def resources_in_groups(groups)
        tmp_key = "#{tmp_key_base}:#{SecureRandom.uuid}"
        begin
          redis.zunionstore(tmp_key, groups.map { |group| group_key(group) })
          redis.zrange(tmp_key, 0, -1)
        ensure
          redis.del(tmp_key)
        end
      end

      def similarity_lua
        @similarity_lua ||= load_similarity_lua
      end

      # File.read closes the file once its content has been read.
      def load_similarity_lua
        File.read(File.expand_path('../similarity.lua', __FILE__))
      end

      def pair_comparison_lua
        @pair_comparison_lua ||= load_pair_comparison_lua
      end

      # File.read closes the file once its content has been read.
      def load_pair_comparison_lua
        File.read(File.expand_path('../pair_comparison.lua', __FILE__))
      end

      def tmp_key_base
        "#{key_base}:tmp"
      end

      def similar_key_base
        "#{key_base}:similar"
      end

      def resource_key_base
        "#{key_base}:resources"
      end

      def resource_key(resource)
        "#{resource_key_base}:#{resource}"
      end

      def group_key_base
        "#{key_base}:groups"
      end

      def group_key(group)
        "#{group_key_base}:#{group}"
      end

    end

  end
end