harmonizer_redis 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +89 -0
- data/Rakefile +19 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/ext/white_similarity/extconf.rb +5 -0
- data/ext/white_similarity/white_similarity.c +159 -0
- data/ext/white_similarity/white_similarity.h +16 -0
- data/harmonizer_redis.gemspec +30 -0
- data/lib/harmonizer_redis.rb +59 -0
- data/lib/harmonizer_redis/base_object.rb +24 -0
- data/lib/harmonizer_redis/category.rb +236 -0
- data/lib/harmonizer_redis/idf_scorer.rb +113 -0
- data/lib/harmonizer_redis/linkage.rb +188 -0
- data/lib/harmonizer_redis/phrase.rb +87 -0
- data/lib/harmonizer_redis/version.rb +3 -0
- metadata +150 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1b7c2cbd0cf8dbb603f66c6b7dc37fe6cb780cbf
|
4
|
+
data.tar.gz: e803d14a56ca973ac52e02e6804c6ee5965cb6c0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 551d4c226fa3cf6f8badd822f7442b6d7c5673e09fb4687f169afe8e57da82f686c6bd0084cf68f714feadecdf53274c89aee08d0a99c55d46a84ad4f24d1b2b
|
7
|
+
data.tar.gz: 9cf6d8c7ddce775cda2dff601421c961fc2326e0c9233805d3d21a99f782653feb70b19b30f6b374ddef76ea8a51691acb77b3478261946d0152ac286e5cc39e
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Tian Wang
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
# HarmonizerRedis
|
2
|
+
|
3
|
+
HarmonizerRedis is a Ruby gem that aids the process of relabeling/grouping free text phrases to
|
4
|
+
resolve the many ways people spell or describe something. It uses fuzzy string matching along with inverse
|
5
|
+
term frequencies to score and rank similarities between phrases. The gem uses Redis for performance.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
### Configuration
|
10
|
+
|
11
|
+
Redis must be configured first. Refer to the [redis-rb documentation](https://github.com/redis/redis-rb) for more information.
|
12
|
+
`Redis.current` should be set to the Redis connection.
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
Redis.current = Redis.new
|
16
|
+
```
|
17
|
+
|
18
|
+
### Adding an entry
|
19
|
+
|
20
|
+
`HarmonizerRedis::Linkage` represents the connection between your data structures and the gem. Linkages contain
|
21
|
+
string content, an `id` (which will be a uniquely generated uuid), and a `category_id` which identifies the collection this entry belongs to.
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
my_category_id = 100
|
25
|
+
linkage = HarmonizerRedis::Linkage.new(content: 'harmonizer redis',
|
26
|
+
category_id: my_category_id)
|
27
|
+
linkage.save
|
28
|
+
my_linkage_id = linkage.id # "520c488b-e9f8-4a6f-aaea-0d5e37b97644"
|
29
|
+
```
|
30
|
+
|
31
|
+
### Retrieving an entry
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
my_linkage = HarmonizerRedis::Linkage.find(my_linkage_id)
|
35
|
+
```
|
36
|
+
|
37
|
+
### Calculating and Retrieving Similarities
|
38
|
+
|
39
|
+
Calculate similarities for all the linkages in a category in a batch. New calculations will need to
|
40
|
+
be performed if new linkages are added.
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
HarmonizerRedis.calculate_similarities(my_category_id)
|
44
|
+
```
|
45
|
+
|
46
|
+
To get an Array of similar phrases, call `get_similarities`. The default is to return the top 20 phrases. If new linkages have
|
47
|
+
been added or if the similarities have not yet been computed for this linkage, it will be computed
|
48
|
+
automatically with this call.
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
my_linkage.get_similarities
|
52
|
+
```
|
53
|
+
|
54
|
+
### Merging into groups, labeling groups, and getting recommended labels
|
55
|
+
|
56
|
+
Each entry in the array returned by `get_similarities` is itself an array in the following format: `[text_label, group_label, similarity_score, phrase_id]`
|
57
|
+
|
58
|
+
After deciding which phrase the linkage should be combined with, use the accompanying `phrase_id` to merge the phrases into a group:
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
my_linkage.merge_with_phrase(phrase_id)
|
62
|
+
```
|
63
|
+
|
64
|
+
To label everything in the same group:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
my_linkage.set_corrected_label('HarmonizerRedis')
|
68
|
+
```
|
69
|
+
|
70
|
+
To suggest labels for this group (this works better the more HarmonizerRedis is used)
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
my_linkage.recommend_labels
|
74
|
+
```
|
75
|
+
|
76
|
+
Lastly to get the final corrected label of a linkage:
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
my_linkage.corrected
|
80
|
+
```
|
81
|
+
|
82
|
+
## Contributing
|
83
|
+
|
84
|
+
Feel free to fork this repo and change it as you wish. We prefer pull requests on GitHub, but you can also send us emails. All contributions need to be tested as well.
|
85
|
+
|
86
|
+
## License
|
87
|
+
|
88
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
89
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
require "rake/extensiontask"
|
4
|
+
|
5
|
+
# Defines the :spec task.
RSpec::Core::RakeTask.new

task :default => :spec

# The specs exercise the native extension, so it has to be compiled BEFORE
# the specs run. Rake invokes prerequisites in the order listed; appending
# :compile after :spec (as was done previously) built the extension too late.
task :test => [:compile, :spec]

# Opens an IRB session with the gem loaded from ./lib.
task :console do
  exec "irb -r harmonizer_redis -I ./lib"
end

# Builds ext/white_similarity and places the shared object under
# lib/harmonizer_redis so `require 'harmonizer_redis/white_similarity'` finds it.
Rake::ExtensionTask.new('white_similarity') do |extension|
  extension.lib_dir = 'lib/harmonizer_redis'
end

# Packaging always starts from a clean, freshly compiled extension.
task :build => [:clean, :compile]
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "harmonizer_redis"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <stdbool.h>
|
5
|
+
#include "white_similarity.h"
|
6
|
+
|
7
|
+
void Init_white_similarity()
|
8
|
+
{
|
9
|
+
WhiteSimilarity = rb_define_module("WhiteSimilarity");
|
10
|
+
rb_define_method(WhiteSimilarity, "score", method_score, 2);
|
11
|
+
rb_define_method(WhiteSimilarity, "soft_cos_similarity", method_soft_cos_similarity, 2);
|
12
|
+
}
|
13
|
+
|
14
|
+
/*
 * Returns true when the two-character window starting at str[index] contains
 * a space, i.e. the window straddles a word boundary and must not be counted
 * as a character pair. The caller guarantees that str[index + 1] is readable.
 *
 * Defined without a bare `inline`: under C99 rules a definition whose every
 * declaration is `inline` (no `extern`/`static`) provides no external symbol,
 * so the extension fails to link/load whenever the compiler chooses not to
 * inline the call (e.g. at -O0). A plain definition is always emitted.
 */
bool makes_bad_pair(char *str, int index)
{
    return (str[index] == ' ' || str[index + 1] == ' ');
}
|
18
|
+
|
19
|
+
/*
 * Compares the first two characters of x_pair and y_pair. Both arguments
 * point into longer strings, so only bytes [0] and [1] are inspected.
 *
 * Defined without a bare `inline`: under C99 rules such a definition emits no
 * external symbol, which breaks linking/loading when the call is not inlined.
 */
bool is_pair_equal(char *x_pair, char *y_pair)
{
    return (x_pair[0] == y_pair[0] && x_pair[1] == y_pair[1]);
}
|
23
|
+
|
24
|
+
/* Result of generate_pairs(): a table of pointers into the source string. */
struct PairHolder {
    int pair_count;   /* number of valid entries in pairs */
    char **pairs;     /* each entry points at the first char of a 2-char window */
};
|
28
|
+
|
29
|
+
Pair generate_pairs(char *str)
|
30
|
+
{
|
31
|
+
Pair str_pairs;
|
32
|
+
int max_pair_number = strlen(str) - 1;
|
33
|
+
str_pairs.pairs = malloc(max_pair_number * sizeof(char *));
|
34
|
+
|
35
|
+
int pair_count = 0;
|
36
|
+
for (int i = 0; i < max_pair_number; i++) {
|
37
|
+
if (!makes_bad_pair(str, i))
|
38
|
+
{
|
39
|
+
str_pairs.pairs[pair_count] = str + i;
|
40
|
+
pair_count++;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
str_pairs.pair_count = pair_count;
|
44
|
+
return str_pairs;
|
45
|
+
}
|
46
|
+
|
47
|
+
double white_similarity(char *x_str, char *y_str)
|
48
|
+
{
|
49
|
+
Pair x_pairs = generate_pairs(x_str);
|
50
|
+
Pair y_pairs = generate_pairs(y_str);
|
51
|
+
int intersect = 0;
|
52
|
+
int sum = x_pairs.pair_count + y_pairs.pair_count;
|
53
|
+
|
54
|
+
for (int i = 0; i < x_pairs.pair_count; i++) {
|
55
|
+
for (int j = 0; j < y_pairs.pair_count; j++) {
|
56
|
+
if (x_pairs.pairs[i] != NULL && y_pairs.pairs[j] != NULL &&
|
57
|
+
is_pair_equal(x_pairs.pairs[i], y_pairs.pairs[j])) {
|
58
|
+
intersect++;
|
59
|
+
y_pairs.pairs[j] = NULL;
|
60
|
+
break;
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
64
|
+
free(x_pairs.pairs);
|
65
|
+
free(y_pairs.pairs);
|
66
|
+
|
67
|
+
return 2.0 * (double)intersect / (double)sum;
|
68
|
+
}
|
69
|
+
|
70
|
+
/*
 * Ruby binding: WhiteSimilarity#score(x_string, y_string) -> Float.
 * Both arguments are converted with StringValueCStr, which raises for
 * non-strings and for strings containing NUL bytes.
 *
 * Strings shorter than two characters cannot contain a character pair, so
 * the score is 0.0. The previous guard only caught length 1 and let empty
 * strings through to the pair generator.
 */
VALUE method_score(VALUE self, VALUE x_string, VALUE y_string)
{
    char *x_str = StringValueCStr(x_string);
    char *y_str = StringValueCStr(y_string);

    if (strlen(x_str) < 2 || strlen(y_str) < 2) {
        return rb_float_new(0.0);
    }
    return rb_float_new(white_similarity(x_str, y_str));
}
|
80
|
+
|
81
|
+
/* code needed for soft cos similarity calculations */
|
82
|
+
|
83
|
+
/* Parsed form of a serialized word/weight list (see generate_matrix). */
struct MatrixHolder {
    int len;            /* number of entries in words and values */
    char *raw_string;   /* private, mutable copy of the payload; words point into it */
    char **words;       /* words[i] is a NUL-terminated slice of raw_string */
    double *values;     /* values[i] is the weight parsed for words[i] */
};
|
89
|
+
|
90
|
+
Matrix generate_matrix(char *matrix_cstr)
|
91
|
+
{
|
92
|
+
Matrix new_matrix;
|
93
|
+
|
94
|
+
new_matrix.raw_string = malloc(strlen(matrix_cstr + 1) + 1);
|
95
|
+
strcpy(new_matrix.raw_string, matrix_cstr + 1);
|
96
|
+
|
97
|
+
//printf("%s", new_matrix.raw_string);
|
98
|
+
|
99
|
+
new_matrix.len = (int)matrix_cstr[0];
|
100
|
+
new_matrix.words = malloc(new_matrix.len * sizeof(char *));
|
101
|
+
new_matrix.values = malloc(new_matrix.len * sizeof(double));
|
102
|
+
|
103
|
+
char *head = new_matrix.raw_string;
|
104
|
+
for (int i = 0; i < new_matrix.len; i++) {
|
105
|
+
new_matrix.words[i] = head;
|
106
|
+
while (*head != ',') {
|
107
|
+
head++;
|
108
|
+
}
|
109
|
+
*head = '\0';
|
110
|
+
head++;
|
111
|
+
}
|
112
|
+
|
113
|
+
for (int i = 0; i < new_matrix.len; i++) {
|
114
|
+
char *curr_value = head;
|
115
|
+
while(*head != ',' && *head != '\0') {
|
116
|
+
head++;
|
117
|
+
}
|
118
|
+
if (*head == ',') {
|
119
|
+
*head = '\0';
|
120
|
+
head++;
|
121
|
+
}
|
122
|
+
sscanf(curr_value, "%lf", new_matrix.values + i);
|
123
|
+
}
|
124
|
+
|
125
|
+
return new_matrix;
|
126
|
+
}
|
127
|
+
|
128
|
+
/*
 * Releases the three heap buffers owned by a Matrix. The struct is passed by
 * value, so the caller's copy still holds the (now dangling) pointers.
 */
void free_matrix(Matrix to_free)
{
    free(to_free.raw_string);   /* backing storage the words point into */
    free(to_free.words);
    free(to_free.values);
}
|
134
|
+
|
135
|
+
double soft_cos_similarity(char *x_matrix_str, char *y_matrix_str)
|
136
|
+
{
|
137
|
+
Matrix x_matrix = generate_matrix(x_matrix_str);
|
138
|
+
Matrix y_matrix = generate_matrix(y_matrix_str);
|
139
|
+
double similarity = 0.0;
|
140
|
+
|
141
|
+
for (int i = 0; i < x_matrix.len; i++) {
|
142
|
+
for (int j = 0; j < y_matrix.len; j++) {
|
143
|
+
double word_sim = white_similarity(x_matrix.words[i], y_matrix.words[j]);
|
144
|
+
similarity += (word_sim * x_matrix.values[i] * y_matrix.values[j]);
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
free_matrix(x_matrix);
|
149
|
+
free_matrix(y_matrix);
|
150
|
+
|
151
|
+
return similarity;
|
152
|
+
}
|
153
|
+
|
154
|
+
/*
 * Ruby binding: WhiteSimilarity#soft_cos_similarity(x_matrix, y_matrix) -> Float.
 * Both arguments are serialized matrices; StringValueCStr raises for
 * non-strings and for strings containing NUL bytes.
 */
VALUE method_soft_cos_similarity(VALUE self, VALUE x_matrix, VALUE y_matrix)
{
    char *x_matrix_cstr = StringValueCStr(x_matrix);
    char *y_matrix_cstr = StringValueCStr(y_matrix);
    double score = soft_cos_similarity(x_matrix_cstr, y_matrix_cstr);

    return rb_float_new(score);
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
VALUE WhiteSimilarity = Qnil;
|
2
|
+
void Init_white_similarity();
|
3
|
+
|
4
|
+
typedef struct PairHolder Pair;
|
5
|
+
typedef struct MatrixHolder Matrix;
|
6
|
+
|
7
|
+
VALUE method_score(VALUE self, VALUE x_string, VALUE y_string);
|
8
|
+
double white_similarity(char *x_str, char *y_str);
|
9
|
+
Pair generate_pairs(char *str);
|
10
|
+
inline bool makes_bad_pair(char *str, int index);
|
11
|
+
inline bool is_pair_equal(char *x_pair, char *y_pair);
|
12
|
+
|
13
|
+
Matrix generate_matrix(char *matrix_cstr);
|
14
|
+
void free_matrix(Matrix to_free);
|
15
|
+
double soft_cos_similarity(char *x_matrix_str, char *y_matrix_str);
|
16
|
+
VALUE method_soft_cos_similarity(VALUE self, VALUE x_matrix, VALUE y_matrix);
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'harmonizer_redis/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  spec.name          = "harmonizer_redis"
  spec.version       = HarmonizerRedis::VERSION
  spec.authors       = ["Tian Wang"]
  spec.email         = ["twang95@stanford.edu"]

  spec.summary       = %q{Harmonizes records}
  spec.description   = %q{Harmonizes records based on fuzzy string/phrase matching. Built on redis for speed}
  spec.homepage      = "https://github.com/POSpulse/harmonizer_redis"
  spec.license       = "MIT"

  # Ship everything tracked by git except the test suites.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # The library talks to Redis through `Redis.current` and loads the
  # `redis/connection/hiredis` driver. Both were removed in redis-rb 5, so the
  # dependency is capped below that major version.
  spec.add_dependency "redis", "< 5"
  spec.add_dependency "hiredis"
  spec.add_dependency "activesupport"
  spec.extensions = ["ext/white_similarity/extconf.rb"]

  spec.add_development_dependency "rspec"
  spec.add_development_dependency "bundler", "~> 1.10"
  spec.add_development_dependency "rake", "~> 10.0"
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'harmonizer_redis/version'
|
2
|
+
require 'harmonizer_redis/base_object'
|
3
|
+
require 'harmonizer_redis/linkage'
|
4
|
+
require 'harmonizer_redis/phrase'
|
5
|
+
require 'harmonizer_redis/idf_scorer'
|
6
|
+
require 'harmonizer_redis/white_similarity'
|
7
|
+
require 'harmonizer_redis/category'
|
8
|
+
require 'active_support/all'
|
9
|
+
require 'redis/connection/hiredis'
|
10
|
+
require 'redis'
|
11
|
+
|
12
|
+
include WhiteSimilarity
|
13
|
+
|
14
|
+
module HarmonizerRedis
  class << self
    # Scores every unordered pair of phrases in the category and stores each
    # score of at least 0.2 in both phrases' ":sims" sorted sets, then marks
    # the category as unchanged.
    #
    # Raises RuntimeError when Category.valid? rejects the id.
    def calculate_similarities(category_id)
      raise "Category ID: #{category_id} is invalid" unless Category.valid?(category_id)

      phrase_ids = Category.get_phrase_list(category_id)
      matrices = Category.get_matrices(category_id, phrase_ids)
      last_index = phrase_ids.length - 1

      Redis.current.pipelined do
        phrase_ids.each_with_index do |id_x, i|
          (i + 1).upto(last_index) do |j|
            score = Phrase.calc_soft_pair_similarity(matrices[i], matrices[j])
            next if score < 0.2

            add_similarity_entry(id_x, phrase_ids[j], score, category_id)
          end
        end
      end

      Category.reset_changed(category_id)
    end

    # Pre-processing applied to phrase text: strip, downcase, transliterate,
    # split on every character that is neither a letter nor a digit, drop the
    # empty pieces and join what is left with single spaces.
    def normalize_string(string)
      transliterated = ActiveSupport::Inflector.transliterate(string.strip.downcase)
      tokens = transliterated.split(/[^\p{L}0-9]/)
      tokens.reject(&:empty?).join(' ')
    end

    # Records the score symmetrically: y in x's sorted set, x in y's.
    def add_similarity_entry(id_x, id_y, score, category_id)
      sims_key = ->(owner_id) { "HarmonizerRedis::Category:#{category_id}:#{owner_id}:sims" }

      Redis.current.zadd(sims_key.call(id_x), score, id_y)
      Redis.current.zadd(sims_key.call(id_y), score, id_x)
    end

    private :add_similarity_entry
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module HarmonizerRedis
  # Minimal persistence base class: every instance variable of an object is
  # stored under its own Redis key, "<ClassName>:<id>:<variable name>".
  class BaseObject
    attr_accessor :id

    # Next sequential id for this class, starting at 0. The counter lives in
    # the Redis key named after the class.
    def generate_id
      Redis.current.incr(self.class.to_s).to_i - 1
    end

    # Writes every instance variable to Redis and registers the id in
    # "<ClassName>:set". An id is generated only if none is set yet.
    def save
      prefix = self.class.to_s
      record_id = @id || generate_id

      instance_variables.each do |ivar|
        field = ivar.to_s.sub(/\A@/, '')
        Redis.current.set("#{prefix}:#{record_id}:#{field}", instance_variable_get(ivar))
      end

      # Assigned only after the loop, so a freshly generated id is not itself
      # written out as a field.
      @id = record_id

      Redis.current.sadd("#{prefix}:set", @id)
    end
  end
end
|
@@ -0,0 +1,236 @@
|
|
1
|
+
module HarmonizerRedis
  # Redis bookkeeping for a category: which linkages and phrases belong to it,
  # whether its matrices/similarities are up to date, and how its phrases are
  # partitioned into (optionally labelled) groups.
  #
  # Key layout used below ("<C>" is this class' name):
  #   <C>:set                          set of known category ids
  #   <C>:<cat>:linkage_set            linkage ids of the category
  #   <C>:<cat>:phrase_set             phrase ids of the category
  #   <C>:changed / <C>:calculated     bitmaps indexed by category id
  #   <C>:<cat>:calculated             bitmap indexed by phrase id
  #   <C>:<cat>:<phrase>:group         group key the phrase belongs to
  #   <C>:<cat>:group:<n>              set of phrase ids in group n
  #   <C>:<cat>:group:<n>:label        label of group n
  #
  # Instance #save is inherited unchanged from BaseObject.
  class Category < BaseObject
    attr_reader :id

    def initialize(id)
      @id = id
    end

    class << self
      # Registers a saved linkage with its category (creating the category on
      # first use), flags the category as changed/not calculated, makes sure
      # the phrase has a group, and links the linkage to its phrase.
      def add_linkage(linkage)
        category_id = linkage.category_id
        linkage_id = linkage.id
        phrase_id = linkage.phrase_id

        new(category_id).save unless valid?(category_id)

        add_to_linkage_set(category_id, linkage_id)
        add_to_phrase_set(category_id, phrase_id)

        set_changed(category_id, 1)
        set_calculated(category_id, 0)
        set_phrase_calculated(category_id, phrase_id, 0)

        # Creating/adding to a group
        add_group(category_id, phrase_id)

        # Adding linkage to the phrase
        Phrase.add_linkage(phrase_id, linkage_id, category_id)
      end

      # Linkage ids included in the category.
      def get_linkage_list(category_id)
        Redis.current.smembers("#{self}:#{category_id}:linkage_set")
      end

      # Phrase ids included in the category.
      def get_phrase_list(category_id)
        Redis.current.smembers("#{self}:#{category_id}:phrase_set")
      end

      # Returns the serialized matrices for phrase_id_list, in the same order.
      # They are recomputed and stored when the category is flagged as changed
      # or not yet calculated; otherwise the stored matrices are read back.
      def get_matrices(category_id, phrase_id_list)
        matrices_list = []
        if !matrices_calculated?(category_id) || changed?(category_id)
          phrase_id_list.each do |id|
            content = Phrase.get_content(id)
            new_matrix = IdfScorer.serialize_matrix(IdfScorer.calc_soft_matrix(content))
            Redis.current.set("HarmonizerRedis::Phrase:#{id}:matrix", new_matrix)
            matrices_list << new_matrix
          end
          set_calculated(category_id, 1)
        else
          phrase_id_list.each do |id|
            matrices_list << Phrase.get_matrix(id)
          end
        end
        matrices_list
      end

      # Set the category as "unchanged". Should be called after Category
      # similarities have been calculated.
      def reset_changed(category_id)
        set_changed(category_id, 0)
      end

      # Check to see if id is valid (i.e. the category has been saved).
      def valid?(category_id)
        Redis.current.sismember("#{self}:set", "#{category_id}")
      end

      def changed?(category_id)
        !Redis.current.getbit("#{self}:changed", category_id).zero?
      end

      def matrices_calculated?(category_id)
        !Redis.current.getbit("#{self}:calculated", category_id).zero?
      end

      def is_phrase_calculated?(category_id, phrase_id)
        !Redis.current.getbit("#{self}:#{category_id}:calculated", phrase_id).zero?
      end

      # Merges the groups of two phrases into one.
      #
      # Label handling:
      #   * both groups labelled, labels differ -> both labels are removed
      #   * both groups labelled, labels equal  -> the surviving group keeps it
      #   * only one group labelled             -> that group survives
      #   * neither labelled                    -> group a survives
      #
      # Raises RuntimeError if either phrase has no group. Does nothing when
      # both phrases are already in the same group.
      def merge_phrase_groups(category_id, phrase_a_id, phrase_b_id)
        group_a = get_group_key(category_id, phrase_a_id)
        group_b = get_group_key(category_id, phrase_b_id)
        raise 'Invalid Phrase ID(s) given!' if group_a.nil? || group_b.nil?
        return if group_a == group_b

        # Labels are stored under "<group key>:label". These KEYS are what has
        # to be deleted; the previous code passed the label TEXT to DEL, which
        # left the real label keys untouched.
        label_key_a = "#{group_a}:label"
        label_key_b = "#{group_b}:label"
        label_a = Redis.current.get(label_key_a)
        label_b = Redis.current.get(label_key_b)

        if label_a && label_b
          if label_a != label_b
            # delete both labels due to conflict
            Redis.current.del(label_key_a, label_key_b)
          else
            # delete only the label that belongs to the group getting destroyed
            Redis.current.del(label_key_b)
          end
        end

        if label_a.nil? && label_b
          # only b is labelled: fold a into b so the label survives
          merge_phrase_group_helper(category_id, group_a, group_b)
        else
          # every other case: fold b into a (the helper deletes the source set)
          merge_phrase_group_helper(category_id, group_b, group_a)
        end
      end

      ### Getting popular linkages and generating possible labels

      # The `number` entries with the highest last element among the popular
      # linkages of every phrase in the group.
      def get_group_popular_linkages(category_id, phrase_id, number = 5)
        linkages = []
        get_group(category_id, phrase_id).each do |member_id|
          linkages += Phrase.get_popular_linkages(member_id)
        end
        linkages.sort_by! { |entry| -1 * entry[-1] }
        linkages.first(number)
      end

      # Collects, across every category any phrase of the group appears in,
      # the labels given to the group's phrases. Each label is weighted by the
      # linkage counts of the phrases carrying it. Returns [label, weight]
      # pairs sorted by ascending weight.
      def get_all_group_labels(category_id, phrase_id)
        phrases_in_group = get_group(category_id, phrase_id)

        # Block parameters get their own names so they no longer shadow the
        # method's category_id / phrase_id arguments.
        categories = []
        phrases_in_group.each do |member_id|
          categories += Phrase.get_categories(member_id)
        end

        labels = Hash.new { |hash, key| hash[key] = 0.0 }
        categories.each do |other_category_id|
          phrases_in_group.each do |member_id|
            group_label = get_group_label(other_category_id, member_id)
            next if group_label.nil?

            labels[group_label] += Phrase.get_linkage_count(member_id)
          end
        end
        labels.to_a.sort_by { |x| x[-1] }
      end

      def set_group_label(category_id, phrase_id, label)
        Redis.current.set("#{get_group_key(category_id, phrase_id)}:label", label)
      end

      def get_group_label(category_id, phrase_id)
        Redis.current.get("#{get_group_key(category_id, phrase_id)}:label")
      end

      def get_group_count(category_id, phrase_id)
        Redis.current.scard(get_group_key(category_id, phrase_id))
      end

      def get_group(category_id, phrase_id)
        Redis.current.smembers(get_group_key(category_id, phrase_id))
      end

      def in_same_group?(category_id, phrase_a_id, phrase_b_id)
        get_group_key(category_id, phrase_a_id) == get_group_key(category_id, phrase_b_id)
      end

      ### Helpers ####
      def add_to_linkage_set(category_id, linkage_id)
        Redis.current.sadd("#{self}:#{category_id}:linkage_set", linkage_id)
      end

      def add_to_phrase_set(category_id, phrase_id)
        Redis.current.sadd("#{self}:#{category_id}:phrase_set", phrase_id)
      end

      def set_changed(category_id, value)
        Redis.current.setbit("#{self}:changed", category_id, value)
      end

      def set_calculated(category_id, value)
        Redis.current.setbit("#{self}:calculated", category_id, value)
      end

      def set_phrase_calculated(category_id, phrase_id, value)
        Redis.current.setbit("#{self}:#{category_id}:calculated", phrase_id, value)
      end

      # Gives the phrase a fresh single-member group unless it already has one.
      def add_group(category_id, phrase_id)
        group_key = get_group_key(category_id, phrase_id)
        if group_key.nil?
          group_key = create_group(category_id, phrase_id)
          Redis.current.sadd(group_key, phrase_id)
        end
      end

      # Allocates the next group key of the category and points the phrase at
      # it. The caller is responsible for adding the phrase to the group set.
      def create_group(category_id, phrase_id)
        new_group_id = Redis.current.incr("#{self}:#{category_id}:group_count") - 1
        new_group_key = "#{self}:#{category_id}:group:#{new_group_id}"
        set_phrase_group(category_id, phrase_id, new_group_key)
        new_group_key
      end

      def get_group_key(category_id, phrase_id)
        Redis.current.get("#{self}:#{category_id}:#{phrase_id}:group")
      end

      # Re-points every phrase of old_group_key at new_group_key.
      def change_phrases_group(category_id, old_group_key, new_group_key)
        Redis.current.smembers(old_group_key).each do |member_id|
          set_phrase_group(category_id, member_id, new_group_key)
        end
      end

      # Unions source into dest, re-points the source's phrases and deletes
      # the source set.
      def merge_phrase_group_helper(category_id, source_group, dest_group)
        Redis.current.sunionstore(dest_group, source_group, dest_group)
        change_phrases_group(category_id, source_group, dest_group)
        Redis.current.del(source_group)
      end

      def set_phrase_group(category_id, phrase_id, group_key)
        Redis.current.set("#{self}:#{category_id}:#{phrase_id}:group", group_key)
      end
    end
    private_class_method :set_changed, :create_group, :get_group_key, :set_phrase_group, :change_phrases_group
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module HarmonizerRedis
  # Document-frequency statistics (kept in Redis) and the weighted word
  # "matrices" derived from them for soft cosine similarity.
  module IdfScorer
    # The serialized form stores the entry count in a single byte.
    MAX_MATRIX_WORDS = 255

    class << self
      # Counts a phrase as one more document: bumps the document count, the
      # document frequency of each distinct word, and the total count of
      # every word occurrence.
      def add_document(phrase_id)
        incr_doc_count
        text = HarmonizerRedis::Phrase.get_content(phrase_id)
        seen = Set.new
        text.split.each do |word|
          # Set#add? returns nil when the word was already present.
          Redis.current.incr(word_doc_freq_key(word)) if seen.add?(word)
          Redis.current.incr(word_count_key(word))
        end
      end

      # Smoothed inverse document frequency of a word. 0.1 is added to both
      # counts so an unseen word does not divide by zero.
      def get_score(word)
        doc_freq = get_doc_freq(word) + 0.1
        total_docs = doc_count + 0.1
        Math.log(0.1 + (total_docs / doc_freq))
      end

      # Serializes a word => weight hash for the C extension:
      #   <count byte><word_1>,...,<word_n>,<weight_1>,...,<weight_n>
      # Words and weights are both comma separated, words first.
      #
      # Only the first MAX_MATRIX_WORDS entries are written. The count has to
      # fit in one byte, and Integer#chr raised RangeError for larger hashes.
      #
      # NOTE(review): an empty hash produces a leading NUL byte, which the C
      # side's StringValueCStr rejects; confirm callers never pass one.
      def serialize_matrix(matrix)
        entries = matrix.first(MAX_MATRIX_WORDS)
        words = entries.map(&:first)
        weights = entries.map(&:last)
        "#{entries.length.chr}#{words.join(',')},#{weights.join(',')}"
      end

      # Builds the normalized weight of every distinct word of the phrase,
      # used for soft cosine similarity. Raw weight is
      # (1 + log10(term count)) * get_score(word); weights are then divided by
      # the square root of the soft norm (sum over all word pairs of
      # similarity * weight_a * weight_b).
      #
      # When the norm is not a positive number (for instance every
      # WhiteSimilarity.score is 0.0), all weights are set to 0.0 instead of
      # dividing by zero and producing Infinity/NaN.
      def calc_soft_matrix(phrase_content)
        matrix = Hash.new(0.0)
        phrase_content.split.each { |word| matrix[word] += 1.0 }

        matrix.each do |word, count|
          matrix[word] = (1.0 + Math.log10(count)) * get_score(word)
        end

        # calculate normalization factor
        norm_factor_sqrd = 0.0
        matrix.each do |word_a, value_a|
          matrix.each do |word_b, value_b|
            similarity = WhiteSimilarity.score(word_a, word_b)
            norm_factor_sqrd += (similarity * value_a * value_b)
          end
        end

        # normalize
        if norm_factor_sqrd > 0
          factor = Math.sqrt(norm_factor_sqrd)
          matrix.each { |word, value| matrix[word] = value / factor }
        else
          matrix.each_key { |word| matrix[word] = 0.0 }
        end
        matrix
      end

      # Plain dot product over the words of matrix_a.
      def cos_similarity(matrix_a, matrix_b)
        similarity = 0.0
        matrix_a.each do |word, value|
          similarity += (value * matrix_b[word])
        end
        similarity
      end

      # Dot product over all word pairs, weighted by word similarity
      # (identical words count as 1.0).
      def soft_cos_similarity(matrix_a, matrix_b)
        similarity = 0.0
        matrix_a.each do |word_a, value_a|
          matrix_b.each do |word_b, value_b|
            word_similarity = word_a == word_b ? 1.0 : WhiteSimilarity.score(word_a, word_b)
            similarity += (word_similarity * value_a * value_b)
          end
        end
        similarity
      end

      def get_doc_freq(word)
        Redis.current.get(word_doc_freq_key(word)).to_f
      end

      def get_count(word)
        Redis.current.get(word_count_key(word)).to_f
      end

      # Decrements the word's document frequency and the global document count.
      def decr_doc_freq(word)
        Redis.current.decr(word_doc_freq_key(word))
        Redis.current.decr("#{self}:doc_count")
      end

      def doc_count
        Redis.current.get("#{self}:doc_count").to_f
      end

      def incr_doc_count
        Redis.current.incr("#{self}:doc_count")
      end

      def word_doc_freq_key(word)
        "Word:[#{word}]:doc_freq"
      end

      def word_count_key(word)
        "Word:[#{word}]:count"
      end
    end
  end
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
module HarmonizerRedis
|
2
|
+
class Linkage < BaseObject
|
3
|
+
attr_reader :id
|
4
|
+
|
5
|
+
# Linkages are identified by a random UUID rather than the sequential counter
# used by BaseObject#generate_id.
def generate_id
  uuid = SecureRandom.uuid
  uuid
end
|
8
|
+
|
9
|
+
# Builds an unsaved linkage from a params hash. Reads params[:content] (the
# raw text) and params[:category_id]; missing keys leave the field nil.
def initialize(params={})
  @content, @category_id = params[:content], params[:category_id]
end
|
13
|
+
|
14
|
+
# Persists the linkage and wires it into its phrase and category.
#
# The content is normalized and matched against existing phrases: an existing
# phrase id is reused, otherwise a new Phrase is created and saved. The
# linkage's fields are then written by BaseObject#save and the linkage is
# registered with its category via Category.add_linkage.
#
# The id is generated once and kept on later saves. Previously a new UUID was
# minted on every call, so saving the same object twice left an orphaned
# record under the old id.
#
# Raises RuntimeError when content or category_id is missing.
def save
  @id ||= generate_id

  unless @id && @content && @category_id
    raise "id, content, and category_id are not all set"
  end

  @content_normalized = HarmonizerRedis.normalize_string(@content)
  existing_phrase_id = HarmonizerRedis::Phrase.find_by_content(@content_normalized)
  if existing_phrase_id
    @phrase = existing_phrase_id
  else
    new_phrase = HarmonizerRedis::Phrase.new(@content_normalized)
    new_phrase.save
    @phrase = new_phrase.id
  end
  super()
  Category.add_linkage(self)
end
|
37
|
+
|
38
|
+
# Readers
|
39
|
+
|
40
|
+
# Lazy readers: each returns the in-memory value when present and otherwise
# loads it once through the matching class-level getter, keyed by @id.

def content
  @content = self.class.get_content(@id) unless @content
  @content
end

def content_normalized
  @content_normalized = self.class.get_content_normalized(@id) unless @content_normalized
  @content_normalized
end

def category_id
  @category_id = self.class.get_category_id(@id) unless @category_id
  @category_id
end
|
51
|
+
|
52
|
+
# Final label of this linkage: the label of its phrase's group when one is
# set, otherwise a fallback.
def corrected
  label = Category.get_group_label(category_id, phrase_id)
  return label unless label.nil?

  # NOTE(review): the group count is used directly as the condition. It comes
  # from SCARD and is presumably an Integer, which is always truthy in Ruby,
  # so the '(LABEL NOT SET)' branch looks unreachable. Confirm whether a
  # comparison was intended here.
  if Category.get_group_count(category_id, phrase_id)
    content
  else
    '(LABEL NOT SET)'
  end
end
|
64
|
+
|
65
|
+
# Id of the phrase this linkage maps to; loaded once via the class-level
# getter when not already held in @phrase.
def phrase_id
  @phrase = self.class.get_phrase_id(@id) unless @phrase
  @phrase
end
|
68
|
+
|
69
|
+
# Writers
|
70
|
+
|
71
|
+
# Sets the raw content. Raises RuntimeError once the linkage has been saved.
def content=(value)
  raise "Saved linkage content cannot be edited" if self.is_saved?

  @content = value
end
|
78
|
+
|
79
|
+
# Sets the category id. Raises RuntimeError once the linkage has been saved.
def category_id=(value)
  raise "Saved linkage category_id cannot be edited" if self.is_saved?

  @category_id = value
end
|
86
|
+
|
87
|
+
### Functionality
|
88
|
+
# Scores this linkage's phrase against every phrase in its category and stores
# the results in the sorted set
# "HarmonizerRedis::Category:<category_id>:<phrase_id>:sims".
#
# Only scores strictly above 0.2 are written. All ZADD calls are issued inside
# one Redis pipeline. Afterwards the phrase is flagged as calculated and the
# category's changed marker is reset.
def calculate_similarities
  source_phrase = phrase_id
  source_category = category_id
  candidates = Category.get_phrase_list(source_category)
  candidate_matrices = Category.get_matrices(category_id, candidates)
  source_matrix = Phrase.get_matrix(source_phrase)
  sims_key = "HarmonizerRedis::Category:#{source_category}:#{source_phrase}:sims"

  Redis.current.pipelined do
    candidates.each_with_index do |candidate_id, position|
      similarity = Phrase.calc_soft_pair_similarity(source_matrix, candidate_matrices[position])
      next unless similarity > 0.2

      Redis.current.zadd(sims_key, similarity, candidate_id)
    end
  end

  Category.set_phrase_calculated(source_category, source_phrase, 1)
  Category.reset_changed(source_category)
end
|
105
|
+
|
106
|
+
# Returns the phrases most similar to this linkage's phrase, best match first.
#
# Similarities are (re)computed first when is_calculated? reports false.
# Phrases that Category.in_same_group? places in the same group as this
# linkage's phrase are left out.
#
# num_phrases - maximum number of rows to return (default 20). A negative
#               value is passed straight to ZREVRANGE as the stop index and
#               the result is not capped.
#
# Each row is [phrase content, group label, score, phrase id].
def get_similarities(num_phrases = 20)
  own_phrase_id = phrase_id
  calculate_similarities unless is_calculated?

  sims_key = "HarmonizerRedis::Category:#{category_id}:#{own_phrase_id}:sims"
  # ZREVRANGE's stop index is inclusive, so 0..num_phrases asks for one row more
  # than requested. The extra row is kept in the query because rows from the
  # phrase's own group are filtered out below; the final result is capped instead.
  scored_ids = Redis.current.zrevrange(sims_key, 0, num_phrases, with_scores: true)

  rows = []
  scored_ids.each do |other_id, score|
    next if Category.in_same_group?(category_id, own_phrase_id, other_id)

    rows << [Phrase.get_content(other_id), Category.get_group_label(category_id, other_id), score, other_id]
  end

  num_phrases < 0 ? rows : rows.first(num_phrases)
end
|
121
|
+
|
122
|
+
# Candidate labels for this linkage: the labels returned by
# Category.get_all_group_labels followed by the entries returned by
# Category.get_group_popular_linkages, both looked up for this linkage's
# category and phrase.
def recommend_labels
  suggestions = Category.get_all_group_labels(category_id, phrase_id)
  suggestions + Category.get_group_popular_linkages(category_id, phrase_id)
end
|
128
|
+
|
129
|
+
# Merges the group of this linkage's phrase with the group of +phrase_id+
# inside this linkage's category, via Category.merge_phrase_groups.
# The parameter shadows the #phrase_id reader, so the linkage's own phrase is
# read through an explicit receiver.
def merge_with_phrase(phrase_id)
  own_phrase_id = self.phrase_id
  Category.merge_phrase_groups(category_id, own_phrase_id, phrase_id)
end
|
132
|
+
|
133
|
+
# Stores +label+ as the group label for this linkage's phrase within its
# category by delegating to Category.set_group_label.
def set_corrected_label(label)
  target_category = category_id
  Category.set_group_label(target_category, phrase_id, label)
end
|
136
|
+
|
137
|
+
### Helpers
|
138
|
+
# Reports what Category.changed? returns for this linkage's category.
# Raises RuntimeError when the linkage has not been saved yet.
def is_category_changed?
  raise "Linkage must be saved first" unless is_saved?

  Category.changed?(category_id)
end
|
144
|
+
|
145
|
+
# Whether this linkage's @id is registered as saved, as reported by the
# class-level +is_linkage_saved?+ check.
def is_saved?
  owner = self.class
  owner.is_linkage_saved?(@id)
end
|
148
|
+
|
149
|
+
# Whether stored similarities for this linkage's phrase can be reused.
# Returns false when the category is reported as changed (or the linkage is
# unsaved); otherwise returns what Category.is_phrase_calculated? reports.
def is_calculated?
  # is_category_changed? itself raises for an unsaved linkage, so the
  # !is_saved? operand is only evaluated after that call has returned false.
  return false if is_category_changed? || !is_saved?

  Category.is_phrase_calculated?(category_id, phrase_id)
end
|
156
|
+
|
157
|
+
class << self
  # Returns a linkage object for +linkage_id+, or nil when that id is not in
  # the saved set. Only @id is assigned on the returned object.
  def find(linkage_id)
    return nil unless is_linkage_saved?(linkage_id)

    new.tap { |linkage| linkage.instance_variable_set('@id', linkage_id) }
  end

  # Membership test of the stringified id in the "<class>:set" Redis set.
  def is_linkage_saved?(linkage_id)
    Redis.current.sismember("#{self}:set", linkage_id.to_s)
  end

  # Reads "<class>:<linkage_id>:category_id" from Redis.
  def get_category_id(linkage_id)
    key = "#{self}:#{linkage_id}:category_id"
    Redis.current.get(key)
  end

  # Reads "<class>:<linkage_id>:phrase" from Redis.
  def get_phrase_id(linkage_id)
    key = "#{self}:#{linkage_id}:phrase"
    Redis.current.get(key)
  end

  # Reads "<class>:<linkage_id>:content" from Redis.
  def get_content(linkage_id)
    key = "#{self}:#{linkage_id}:content"
    Redis.current.get(key)
  end

  # Reads "<class>:<linkage_id>:content_normalized" from Redis.
  def get_content_normalized(linkage_id)
    key = "#{self}:#{linkage_id}:content_normalized"
    Redis.current.get(key)
  end
end
|
187
|
+
end
|
188
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module HarmonizerRedis
|
2
|
+
class Phrase < BaseObject
|
3
|
+
attr_accessor :content
|
4
|
+
|
5
|
+
# Builds an unsaved phrase holding +content+ in @content.
# Nothing else is assigned here; #save relies on @id being available after
# its call to super.
def initialize(content)
  @content = content
end
|
8
|
+
|
9
|
+
# Persists the phrase: runs the superclass #save, registers @id with
# HarmonizerRedis::IdfScorer.add_document, then writes the lookup entry
# "<class>:[<content>]" => id that find_by_content reads.
# Returns the result of the final Redis SET.
def save
  super()
  HarmonizerRedis::IdfScorer.add_document(@id)
  lookup_key = "#{self.class}:[#{@content}]"
  Redis.current.set(lookup_key, @id.to_s)
end
|
14
|
+
|
15
|
+
class << self
  # Returns the value stored under the lookup key "<class>:[<content>]"
  # (written by #save with the phrase id), or nil when Redis has no such key.
  def find_by_content(content)
    Redis.current.get("#{self}:[#{content}]")
  end

  # Reads "<class>:<phrase_id>:content" from Redis.
  def get_content(phrase_id)
    Redis.current.get("#{self}:#{phrase_id}:content")
  end

  # Setup
  # Records both the linkage and the category against the phrase.
  def add_linkage(phrase_id, linkage_id, category_id)
    add_linkage_id(phrase_id, linkage_id)
    add_category_id(phrase_id, category_id)
  end

  # Linkages
  # Increments by 1 the score of the linkage's content (not its id) in the
  # phrase's linkage sorted set.
  def add_linkage_id(phrase_id, linkage_id)
    Redis.current.zincrby(linkage_set_key(phrase_id), 1, Linkage.get_content(linkage_id))
  end

  # Number of members in the phrase's linkage sorted set.
  def get_linkage_count(phrase_id)
    Redis.current.zcard(linkage_set_key(phrase_id))
  end

  # Returns the top +number+ entries of the phrase's linkage sorted set,
  # highest score first, with scores.
  #
  # Raises RuntimeError unless +number+ is positive. Zero is rejected as well:
  # it would turn the stop index into -1, which ZREVRANGE treats as "through
  # the last element".
  def get_popular_linkages(phrase_id, number = 5)
    raise "number must be > 0" if number <= 0

    Redis.current.zrevrange(linkage_set_key(phrase_id), 0, number - 1, with_scores: true)
  end

  # Categories
  # Adds +category_id+ to the phrase's category set.
  def add_category_id(phrase_id, category_id)
    Redis.current.sadd(category_set_key(phrase_id), category_id)
  end

  # All members of the phrase's category set.
  def get_categories(phrase_id)
    Redis.current.smembers(category_set_key(phrase_id))
  end

  # Returns the serialized matrix stored under "<class>:<phrase_id>:matrix",
  # or nil when nothing is stored.
  def get_matrix(phrase_id)
    Redis.current.get("#{self}:#{phrase_id}:matrix") || nil
  end

  # Mean of IdfScorer.cos_similarity (on the two matrices) and
  # WhiteSimilarity.score (on the two phrases).
  # The mean is kept positive so that a larger value means "more similar",
  # matching how similarity scores are ranked and thresholded elsewhere.
  def calc_pair_similarity(phrase_a, phrase_b, phrase_a_matrix, phrase_b_matrix)
    idf_similarity = IdfScorer.cos_similarity(phrase_a_matrix, phrase_b_matrix)
    white_similarity = WhiteSimilarity.score(phrase_a, phrase_b)
    (idf_similarity + white_similarity) * 0.5
  end

  # Delegates to WhiteSimilarity.soft_cos_similarity on the two matrices.
  def calc_soft_pair_similarity(phrase_a_matrix, phrase_b_matrix)
    WhiteSimilarity.soft_cos_similarity(phrase_a_matrix, phrase_b_matrix)
  end

  ### Helpers ### (consider refactoring with metaprogramming)
  # Redis key of the sorted set of linkage contents for a phrase.
  def linkage_set_key(phrase_id)
    "#{self}:#{phrase_id}:linkage_set"
  end

  # Redis key of the set of category ids for a phrase.
  def category_set_key(phrase_id)
    "#{self}:#{phrase_id}:category_set"
  end
end
|
85
|
+
private_class_method :linkage_set_key, :category_set_key
|
86
|
+
end
|
87
|
+
end
|
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: harmonizer_redis
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tian Wang
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-09-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: redis
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: hiredis
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: activesupport
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.10'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.10'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '10.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '10.0'
|
97
|
+
description: Harmonizes records based on fuzzy string/phrase matching. Built on redis
|
98
|
+
for speed
|
99
|
+
email:
|
100
|
+
- twang95@stanford.edu
|
101
|
+
executables: []
|
102
|
+
extensions:
|
103
|
+
- ext/white_similarity/extconf.rb
|
104
|
+
extra_rdoc_files: []
|
105
|
+
files:
|
106
|
+
- ".gitignore"
|
107
|
+
- ".rspec"
|
108
|
+
- ".travis.yml"
|
109
|
+
- Gemfile
|
110
|
+
- LICENSE.txt
|
111
|
+
- README.md
|
112
|
+
- Rakefile
|
113
|
+
- bin/console
|
114
|
+
- bin/setup
|
115
|
+
- ext/white_similarity/extconf.rb
|
116
|
+
- ext/white_similarity/white_similarity.c
|
117
|
+
- ext/white_similarity/white_similarity.h
|
118
|
+
- harmonizer_redis.gemspec
|
119
|
+
- lib/harmonizer_redis.rb
|
120
|
+
- lib/harmonizer_redis/base_object.rb
|
121
|
+
- lib/harmonizer_redis/category.rb
|
122
|
+
- lib/harmonizer_redis/idf_scorer.rb
|
123
|
+
- lib/harmonizer_redis/linkage.rb
|
124
|
+
- lib/harmonizer_redis/phrase.rb
|
125
|
+
- lib/harmonizer_redis/version.rb
|
126
|
+
homepage: https://github.com/POSpulse/harmonizer_redis
|
127
|
+
licenses:
|
128
|
+
- MIT
|
129
|
+
metadata: {}
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project:
|
146
|
+
rubygems_version: 2.4.8
|
147
|
+
signing_key:
|
148
|
+
specification_version: 4
|
149
|
+
summary: Harmonizes records
|
150
|
+
test_files: []
|