commendo 1.2.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +3 -0
  3. data/bin/commendo-create-mysql-db +3 -0
  4. data/bin/commendo-create.sql +99 -0
  5. data/bin/commendo-load-tsv +11 -5
  6. data/bin/commendo-load-tsv-mysql.rb +43 -0
  7. data/bin/commendo-time-mysql.rb +31 -0
  8. data/commendo.gemspec +4 -2
  9. data/lib/commendo.rb +24 -0
  10. data/lib/commendo/configuration.rb +25 -0
  11. data/lib/commendo/content_set.rb +13 -182
  12. data/lib/commendo/mysql-backed/content_set.rb +152 -0
  13. data/lib/commendo/mysql-backed/tag_set.rb +81 -0
  14. data/lib/commendo/mysql-backed/weighted_group.rb +40 -0
  15. data/lib/commendo/redis-backed/content_set.rb +194 -0
  16. data/lib/commendo/{pair_comparison.lua → redis-backed/pair_comparison.lua} +0 -0
  17. data/lib/commendo/{similarity.lua → redis-backed/similarity.lua} +0 -0
  18. data/lib/commendo/redis-backed/tag_set.rb +54 -0
  19. data/lib/commendo/redis-backed/weighted_group.rb +54 -0
  20. data/lib/commendo/tag_set.rb +6 -42
  21. data/lib/commendo/version.rb +1 -1
  22. data/lib/commendo/weighted_group.rb +7 -41
  23. data/lib/mysql2/client.rb +17 -0
  24. data/model 2.mwb +0 -0
  25. data/sql_model.mwb +0 -0
  26. data/test/configuration_test.rb +71 -0
  27. data/test/mysql_content_set_test.rb +40 -0
  28. data/test/mysql_tag_set_test.rb +34 -0
  29. data/test/mysql_weighted_group_test.rb +54 -0
  30. data/test/redis_content_set_test.rb +57 -0
  31. data/test/redis_tag_set_test.rb +31 -0
  32. data/test/redis_weighted_group_test.rb +49 -0
  33. data/test/tests_for_content_sets.rb +379 -0
  34. data/test/tests_for_tag_sets.rb +130 -0
  35. data/test/tests_for_weighted_groups.rb +106 -0
  36. metadata +72 -12
  37. data/test/content_set_test.rb +0 -408
  38. data/test/tag_set_test.rb +0 -128
  39. data/test/weighted_group_test.rb +0 -191
@@ -0,0 +1,152 @@
1
module Commendo
  module MySqlBacked

    # Content set stored in MySQL.
    #
    # Every (resource, group, score) membership is a row in the `Resources`
    # table, namespaced by +key_base+ (column `keybase`). Similarity is not
    # precomputed: #similar_to derives it with a SQL query each time it is
    # called.
    class ContentSet

      attr_accessor :mysql, :key_base, :tag_set

      # Row limit used by #similar_to when the caller does not supply one.
      DEFAULT_LIMIT = 1000

      # key_base - namespace written to / matched against the `keybase` column.
      # tag_set  - optional object responding to #empty? and #matches, used by
      #            #filtered_similar_to.
      def initialize(key_base, tag_set = nil)
        config_hash = Commendo.config.to_hash
        # The connection accepts multiple statements per query, so every value
        # interpolated into SQL below must pass through #quote, Integer() or
        # Float().
        config_hash[:flags] = Mysql2::Client::MULTI_STATEMENTS
        @mysql = Mysql2::Client.new(config_hash)
        @key_base = key_base
        @tag_set = tag_set
      end

      # Adds several resources to one group. Each resource is either a bare
      # name (score 1) or a [name, score] pair.
      def add_by_group(group, *resources)
        scored = resources.map { |r| r.is_a?(Array) ? r : [r, 1] } # default score of 1
        scored.each { |(name, score)| add_single(name, group, score) }
      end

      # Adds one resource to several groups. Each group is either a bare name
      # (score 1) or a [name, score] pair. If the row already exists its score
      # is incremented by the given score.
      def add(resource, *groups)
        scored = groups.map { |g| g.is_a?(Array) ? g : [g, 1] } # default score of 1
        query = add_single_prepared_query
        scored.each { |(name, score)| query.execute(@key_base, resource, name, score, score) }
      end

      # Adds a single resource/group membership with an explicit score.
      def add_single(resource, group, score)
        add(resource, [group, score])
      end

      # Same as #add: this backend computes similarity at query time, so there
      # is nothing further to calculate here.
      def add_and_calculate(resource, *groups)
        add(resource, *groups)
      end

      # Returns the distinct group names the resource belongs to.
      def groups(resource)
        groups_prepared_query.execute(@key_base, resource).map { |row| row['groupname'] }
      end

      # Removes every row for the resource under this key base.
      def delete(resource)
        delete_prepared_query.execute(@key_base, resource)
      end

      # Only records the threshold that later #similar_to calls apply.
      # A threshold of 0 is treated as "no threshold".
      def calculate_similarity(threshold = nil)
        threshold = nil if threshold == 0
        @threshold = threshold
      end

      # Intentionally a no-op in this backend; the method exists so callers can
      # invoke it regardless of backend.
      def calculate_similarity_for_resource(resource, threshold = 0)
      end

      # Returns an array of {resource:, similarity:} hashes for resources that
      # share at least one group with +resource+ (a name or an array of names).
      #
      # For a single resource the rows come back in the SQL order (similarity
      # descending, then name descending). For several resources the
      # per-resource similarities are summed and re-sorted the same way; note
      # that +limit+ caps the SQL rows, not the number of summed results.
      def similar_to(resource, limit = DEFAULT_LIMIT)
        resources = resource.is_a?(Array) ? resource : [resource]
        # An empty list would render as `IN ()`, which MySQL rejects.
        return [] if resources.empty?

        sql = if @threshold.nil?
                similar_to_query(@key_base, resources, limit)
              else
                similar_to_with_threshold_query(@key_base, resources, @threshold, limit)
              end
        similar = @mysql.query(sql).map do |row|
          {resource: row['similar'], similarity: row['similarity'].round(3)}
        end
        return similar if resources.length == 1

        totals = similar.group_by { |s| s[:resource] }.map do |name, matches|
          # Each term is already rounded to 3 places; rounding the sum again
          # strips float accumulation noise such as 0.30000000000000004.
          {resource: name, similarity: matches.inject(0.0) { |sum, m| sum + m[:similarity] }.round(3)}
        end
        totals.sort_by { |h| [h[:similarity], h[:resource]] }.reverse
      end

      # Like #similar_to, but keeps only results accepted by
      # @tag_set.matches(resource, options[:include], options[:exclude]).
      # Falls back to plain #similar_to when there is no tag set, no filter was
      # given, or the tag set is empty. options[:limit] caps the result size.
      def filtered_similar_to(resource, options = {})
        unfiltered = @tag_set.nil? || (options[:include].nil? && options[:exclude].nil?) || @tag_set.empty?
        return similar_to(resource, options[:limit] || DEFAULT_LIMIT) if unfiltered

        similar = similar_to(resource)
        limit = options[:limit] || similar.length
        filtered = []
        similar.each do |s|
          break if filtered.length >= limit
          filtered << s if @tag_set.matches(s[:resource], options[:include], options[:exclude])
        end
        filtered
      end

      # Removes the resource from the named groups. Does nothing when no
      # groups are given (the generated `IN ()` would be a SQL syntax error).
      def remove_from_groups(resource, *groups)
        return if groups.empty?
        @mysql.query(remove_from_groups_prepared_query(@key_base, resource, groups))
      end

      def remove_from_groups_and_calculate(resource, *groups)
        remove_from_groups(resource, *groups)
        calculate_similarity_for_resource(resource)
      end

      private

      # Renders a value as an escaped, single-quoted SQL string literal.
      def quote(value)
        "'#{@mysql.escape(value.to_s)}'"
      end

      # Renders values as a comma-separated list of escaped SQL literals.
      def quoted_list(values)
        values.map { |v| quote(v) }.join(',')
      end

      def add_single_prepared_query
        @add_single_prepared_query ||= @mysql.prepare('INSERT INTO Resources (keybase, name, groupname, score) VALUES (?,?,?,?) ON DUPLICATE KEY UPDATE score = score + ?')
      end

      def groups_prepared_query
        @groups_prepared_query ||= @mysql.prepare('SELECT DISTINCT groupname FROM Resources WHERE keybase=? AND name=?')
      end

      def delete_prepared_query
        @delete_prepared_query ||= @mysql.prepare('DELETE FROM Resources WHERE keybase = ? AND name = ?')
      end

      # Builds (despite the name, does not prepare) the DELETE statement used
      # by #remove_from_groups. All values are escaped before interpolation.
      def remove_from_groups_prepared_query(keybase, name, groups)
        <<-SQL
          DELETE FROM Resources WHERE keybase = #{quote(keybase)} AND name = #{quote(name)} AND groupname IN (#{quoted_list(groups)})
        SQL
      end

      # Similarity query without a threshold. Per-resource totals are read from
      # `UnionScores`; similarity = intersect_score / (l_union + r_union).
      def similar_to_query(keybase, resources, limit)
        <<-SQL
          SELECT similar, intersect_score, l_union, r_union, intersect_score / (l_union + r_union) AS similarity
          FROM (
            SELECT r.name AS similar,
              SUM(l.score + r.score) AS intersect_score,
              l_us.union_score AS l_union,
              r_us.union_score AS r_union
            FROM Resources AS l
            JOIN Resources AS r ON l.keybase = r.keybase AND l.groupname = r.groupname
            JOIN UnionScores as l_us ON l_us.keybase = l.keybase AND l_us.name = l.name
            JOIN UnionScores as r_us ON r_us.keybase = r.keybase AND r_us.name = r.name
            WHERE l.keybase = #{quote(keybase)}
              AND l.name IN (#{quoted_list(resources)})
              AND l.name <> r.name
            GROUP BY l.name, r.name
          ) AS similar_resources
          ORDER BY similarity DESC, similar DESC
          LIMIT #{Integer(limit)}
        SQL
      end

      # Similarity query that keeps only rows with similarity > threshold.
      # Per-resource totals are computed with correlated subqueries on
      # `Resources` rather than read from `UnionScores`.
      def similar_to_with_threshold_query(keybase, resources, threshold, limit)
        <<-SQL
          SELECT similar, intersect_score, l_union, r_union, similarity FROM (
            SELECT similar, intersect_score, l_union, r_union, intersect_score / (l_union + r_union) AS similarity FROM (
              SELECT r.name AS similar,
                SUM(l.score + r.score) AS intersect_score,
                (SELECT SUM(score) FROM Resources WHERE keybase = l.keybase AND name = l.name) AS l_union,
                (SELECT SUM(score) FROM Resources WHERE keybase = r.keybase AND name = r.name) AS r_union
              FROM Resources AS l
              JOIN Resources AS r ON l.keybase = r.keybase AND l.groupname = r.groupname
              WHERE l.keybase = #{quote(keybase)}
                AND l.name IN (#{quoted_list(resources)})
                AND l.name <> r.name
              GROUP BY l.name, r.name
            ) AS similar_resources
          ) AS similar
          WHERE similarity > #{Float(threshold)}
          ORDER BY similarity DESC, similar DESC
          LIMIT #{Integer(limit)}
        SQL
      end

    end

  end
end
@@ -0,0 +1,81 @@
1
module Commendo
  module MySqlBacked
    # Tag set stored in MySQL. Each (resource, tag) pair is a row in the `Tags`
    # table, namespaced by +key_base+ (column `keybase`).
    class TagSet

      attr_accessor :mysql, :key_base

      def initialize(key_base)
        config_hash = Commendo.config.to_hash
        @mysql = Mysql2::Client.new(config_hash)
        @key_base = key_base
      end

      # True when no tag row exists for this key base.
      def empty?
        empty_prepared_query.execute(@key_base).count.zero?
      end

      # Returns the tags stored for the resource.
      def get(resource)
        get_tags_prepared_query.execute(@key_base, resource).map { |row| row['tag'] }
      end

      # Adds tags to the resource; rows that already exist are left alone
      # (INSERT IGNORE). Does nothing when no tags are given.
      def add(resource, *tags)
        return if tags.empty?
        # NOTE(review): #transaction is not part of the stock Mysql2::Client
        # API; presumably it is added elsewhere in this gem — confirm.
        @mysql.transaction { insert_tags(resource, tags) }
      end

      # Replaces the resource's tags with the given ones, inside one
      # transaction. With no tags this just clears the resource's tags.
      def set(resource, *tags)
        @mysql.transaction do
          delete(resource)
          insert_tags(resource, tags) unless tags.empty?
        end
      end

      # Returns true when the resource has at least one of the +include+ tags
      # (or +include+ is nil/empty) and none of the +exclude+ tags.
      #
      # +include+ and +exclude+ may each be nil, a single tag, or any
      # collection of tags; they are normalised with Array() so a lone tag or
      # a Set no longer raises a TypeError in the intersection.
      def matches(resource, include, exclude = [])
        wanted = Array(include)
        unwanted = Array(exclude)
        resource_tags = get(resource)
        can_include = wanted.empty? || !(resource_tags & wanted).empty?
        should_exclude = !(resource_tags & unwanted).empty?
        can_include && !should_exclude
      end

      # Deletes the given tags from the resource, or all of its tags when none
      # are given.
      def delete(resource, *tags)
        if tags.empty?
          delete_all_tags_prepared_query.execute(@key_base, resource)
        else
          tags.each { |tag| delete_tags_prepared_query.execute(@key_base, resource, tag) }
        end
      end

      private

      def insert_tags(resource, tags)
        tags.each { |tag| insert_prepared_query.execute(@key_base, resource, tag) }
      end

      def get_tags_prepared_query
        @get_tags_prepared_query ||= @mysql.prepare('SELECT tag FROM Tags t WHERE keybase = ? AND name = ?')
      end

      def delete_all_tags_prepared_query
        @delete_all_tags_prepared_query ||= @mysql.prepare('DELETE FROM Tags WHERE keybase = ? AND name = ?')
      end

      def delete_tags_prepared_query
        @delete_tags_prepared_query ||= @mysql.prepare('DELETE FROM Tags WHERE keybase = ? AND name = ? AND tag = ?')
      end

      def insert_prepared_query
        @insert_prepared_query ||= @mysql.prepare('INSERT IGNORE INTO Tags (keybase, name, tag) VALUES (?,?,?)')
      end

      # LIMIT 1: only existence matters to #empty?.
      def empty_prepared_query
        @empty_prepared_query ||= @mysql.prepare('SELECT tag FROM Tags WHERE keybase = ? LIMIT 1')
      end

    end
  end
end
81
+
@@ -0,0 +1,40 @@
1
module Commendo
  module MySqlBacked

    # Combines several content sets into one ranking by weighting each set's
    # similarity scores and summing them per resource.
    class WeightedGroup

      attr_accessor :mysql, :content_sets, :key_base, :tag_set

      # key_base     - stored on the instance; not read by the methods below.
      # content_sets - hashes of the form {cs: <content set>, weight: <number>}.
      #
      # @tag_set is not assigned here; set it through the accessor to enable
      # filtering in #filtered_similar_to.
      def initialize(key_base, *content_sets)
        config_hash = Commendo.config.to_hash
        # NOTE(review): this connection is exposed via #mysql but no method in
        # this class uses it — confirm whether it is still needed.
        @mysql = Mysql2::Client.new(config_hash)
        @key_base = key_base
        @content_sets = content_sets
      end

      # Returns {resource:, similarity:} hashes ordered by similarity and then
      # resource name, both descending. A +limit+ of 0 (the default) means no
      # limit.
      def similar_to(resource, limit = 0)
        weighted = @content_sets.flat_map do |cs|
          cs[:cs].similar_to(resource).map do |s|
            {resource: s[:resource], similarity: (s[:similarity] * cs[:weight]).round(3)}
          end
        end
        totals = weighted.group_by { |s| s[:resource] }.map do |name, matches|
          # Every term is already rounded to 3 places, so rounding the sum only
          # removes float accumulation noise (e.g. 0.1 + 0.2) that would
          # otherwise leak into the result and the sort order.
          {resource: name, similarity: matches.inject(0.0) { |sum, m| sum + m[:similarity] }.round(3)}
        end
        ranked = totals.sort_by { |h| [h[:similarity], h[:resource]] }.reverse
        limit > 0 ? ranked.first(limit) : ranked
      end

      # Like #similar_to, but keeps only results accepted by
      # @tag_set.matches(resource, options[:include], options[:exclude]).
      # Falls back to plain #similar_to when there is no tag set or no filter
      # was given. options[:limit] caps the result size.
      def filtered_similar_to(resource, options = {})
        if @tag_set.nil? || (options[:include].nil? && options[:exclude].nil?)
          return similar_to(resource, options[:limit] || 0)
        end

        similar = similar_to(resource)
        limit = options[:limit] || similar.length
        filtered = []
        similar.each do |s|
          break if filtered.length >= limit
          filtered << s if @tag_set.matches(s[:resource], options[:include], options[:exclude])
        end
        filtered
      end

    end

  end
end
@@ -0,0 +1,194 @@
1
module Commendo
  module RedisBacked

    # Content set stored in Redis sorted sets, all prefixed with +key_base+:
    #
    #   <key_base>:groups:<group>        members = resources, scored
    #   <key_base>:resources:<resource>  members = groups, scored
    #   <key_base>:similar:<resource>    members = similar resources, scored
    #   <key_base>:tmp:<uuid>            short-lived union results
    class ContentSet

      attr_accessor :redis, :key_base, :tag_set

      # Return value of the similarity Lua script that makes
      # #calculate_similarity fall back to pair-by-pair comparison
      # (presumably the script bails out on large sets — see similarity.lua).
      SET_TOO_LARGE_FOR_LUA = 999

      def initialize(key_base, tag_set = nil)
        @redis = Redis.new(host: Commendo.config.host, port: Commendo.config.port, db: Commendo.config.database, timeout: 120)
        @key_base = key_base
        @tag_set = tag_set
      end

      # Adds several resources to one group. Each resource is either a bare
      # name (score 1) or a [name, score] pair.
      def add_by_group(group, *resources)
        resources.each do |resource|
          name, score = resource.kind_of?(Array) ? resource : [resource, 1]
          add_single(name, group, score)
        end
      end

      # Adds one resource to several groups. Each group is either a bare name
      # (score 1) or a [name, score] pair.
      def add(resource, *groups)
        groups.each do |group|
          name, score = group.kind_of?(Array) ? group : [group, 1]
          add_single(resource, name, score)
        end
      end

      # Increments the score of the membership in both directions
      # (group -> resource and resource -> group).
      def add_single(resource, group, score)
        redis.zincrby(group_key(group), score, resource)
        redis.zincrby(resource_key(resource), score, group)
      end

      def add_and_calculate(resource, *groups)
        add(resource, *groups)
        calculate_similarity_for_resource(resource, 0)
      end

      # Returns the names of the groups the resource belongs to.
      def groups(resource)
        redis.zrange(resource_key(resource), 0, -1)
      end

      # Removes the resource's similarity data (its own set and its entry in
      # every similar resource's set) and its resource -> groups set.
      def delete(resource)
        similar_to(resource).each do |other_resource|
          redis.zrem(similarity_key(other_resource[:resource]), "#{resource}")
        end
        #TODO delete from groups?
        redis.del(similarity_key(resource))
        redis.del(resource_key(resource))
      end

      # Recalculates similarity for every resource under this key base.
      # Yields (key, index, total) before each resource when a block is given.
      def calculate_similarity(threshold = 0)
        #TODO make this use scan for scaling
        # Escaped and anchored so a key base containing regex metacharacters
        # cannot break or mis-strip the prefix.
        prefix = /\A#{Regexp.escape(resource_key_base)}:/
        keys = redis.keys("#{resource_key_base}:*")
        keys.each_with_index do |key, i|
          resource = key.sub(prefix, '')
          redis.del(similarity_key(resource))
          yield(key, i, keys.length) if block_given?
          completed = redis.eval(similarity_lua, keys: [key], argv: [tmp_key_base, resource_key_base, similar_key_base, group_key_base, threshold])
          calculate_similarity_for_key_resource(key, resource, threshold) if completed == SET_TOO_LARGE_FOR_LUA
        end
      end

      # Recalculates similarity for a single resource. +threshold+ defaults to
      # 0, matching the MySQL-backed content set's signature.
      def calculate_similarity_for_resource(resource, threshold = 0)
        calculate_similarity_for_key_resource(resource_key(resource), resource, threshold)
      end

      # Returns {resource:, similarity:} hashes, highest score first.
      # +resource+ may be a name or an array of names; for an array the stored
      # similarity sets are unioned. A +limit+ of 0 (the default) returns all.
      def similar_to(resource, limit = 0)
        finish = limit - 1
        similar_resources =
          if resource.kind_of?(Array)
            unioned_similar(resource, finish)
          else
            redis.zrevrange(similarity_key(resource), 0, finish, with_scores: true)
          end
        similar_resources.map { |(name, score)| {resource: name, similarity: score.to_f} }
      end

      # Like #similar_to, but keeps only results accepted by
      # @tag_set.matches(resource, options[:include], options[:exclude]).
      # Falls back to plain #similar_to when there is no tag set, no filter was
      # given, or the tag set is empty. options[:limit] caps the result size.
      def filtered_similar_to(resource, options = {})
        unfiltered = @tag_set.nil? || (options[:include].nil? && options[:exclude].nil?) || @tag_set.empty?
        return similar_to(resource, options[:limit] || 0) if unfiltered

        similar = similar_to(resource)
        limit = options[:limit] || similar.length
        filtered = []
        similar.each do |s|
          break if filtered.length >= limit
          filtered << s if @tag_set.matches(s[:resource], options[:include], options[:exclude])
        end
        filtered
      end

      def similarity_key(resource)
        "#{similar_key_base}:#{resource}"
      end

      # Removes the resource from the named groups, in both directions.
      # Does nothing when no groups are given (ZREM needs at least one member).
      def remove_from_groups(resource, *groups)
        return if groups.empty?
        redis.zrem(resource_key(resource), groups)
        groups.each { |group| redis.zrem(group_key(group), resource) }
      end

      def remove_from_groups_and_calculate(resource, *groups)
        remove_from_groups(resource, *groups)
        calculate_similarity_for_resource(resource, 0)
      end

      private

      # Unions the similarity sets of several resources through a temporary
      # key and returns [member, score] pairs, highest score first.
      def unioned_similar(resources, finish)
        # ZUNIONSTORE requires at least one source key.
        return [] if resources.empty?
        tmp_key = "#{key_base}:tmp:#{SecureRandom.uuid}"
        begin
          redis.zunionstore(tmp_key, resources.map { |res| similarity_key(res) })
          redis.zrevrange(tmp_key, 0, finish, with_scores: true)
        ensure
          # Always remove the temporary key, even if a call above raised.
          redis.del(tmp_key)
        end
      end

      # Returns every resource that is a member of at least one of the groups.
      def resources_in_groups(group_names)
        tmp_key = "#{tmp_key_base}:#{SecureRandom.uuid}"
        begin
          redis.zunionstore(tmp_key, group_names.map { |group| group_key(group) })
          redis.zrange(tmp_key, 0, -1)
        ensure
          redis.del(tmp_key)
        end
      end

      # Rebuilds the similarity set of +resource+ by running the pair
      # comparison script against every resource it shares a group with.
      def calculate_similarity_for_key_resource(key, resource, threshold)
        group_names = groups(resource)
        return if group_names.empty?
        candidates = resources_in_groups(group_names)
        redis.del(similarity_key(resource))
        candidates.each do |to_compare|
          next if resource == to_compare
          redis.eval(pair_comparison_lua, keys: [key, resource_key(to_compare), similarity_key(resource), similarity_key(to_compare)], argv: [tmp_key_base, resource, to_compare, threshold])
        end
      end

      def similarity_lua
        @similarity_lua ||= load_similarity_lua
      end

      # File.read closes the file; the previous File.open left the handle open.
      def load_similarity_lua
        File.read(File.expand_path('../similarity.lua', __FILE__))
      end

      def pair_comparison_lua
        @pair_comparison_lua ||= load_pair_comparison_lua
      end

      def load_pair_comparison_lua
        File.read(File.expand_path('../pair_comparison.lua', __FILE__))
      end

      def tmp_key_base
        "#{key_base}:tmp"
      end

      def similar_key_base
        "#{key_base}:similar"
      end

      def resource_key_base
        "#{key_base}:resources"
      end

      def resource_key(resource)
        "#{resource_key_base}:#{resource}"
      end

      def group_key_base
        "#{key_base}:groups"
      end

      def group_key(group)
        "#{group_key_base}:#{group}"
      end

    end

  end
end