harmonizer_redis 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1b7c2cbd0cf8dbb603f66c6b7dc37fe6cb780cbf
4
+ data.tar.gz: e803d14a56ca973ac52e02e6804c6ee5965cb6c0
5
+ SHA512:
6
+ metadata.gz: 551d4c226fa3cf6f8badd822f7442b6d7c5673e09fb4687f169afe8e57da82f686c6bd0084cf68f714feadecdf53274c89aee08d0a99c55d46a84ad4f24d1b2b
7
+ data.tar.gz: 9cf6d8c7ddce775cda2dff601421c961fc2326e0c9233805d3d21a99f782653feb70b19b30f6b374ddef76ea8a51691acb77b3478261946d0152ac286e5cc39e
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ Makefile
11
+ *.bundle
12
+ .DS_Store
13
+ .idea/
14
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
4
+ before_install: gem install bundler -v 1.10.4
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in harmonizer_redis.gemspec
4
+
5
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Tian Wang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # HarmonizerRedis
2
+
3
+ HarmonizerRedis is a Ruby gem that aids the process of relabeling/grouping free text phrases to
4
+ resolve the many ways people spell or describe something. It uses fuzzy string matching along with inverse
5
+ term frequencies to score and rank similarities between phrases. The gem uses Redis for performance.
6
+
7
+ ## Usage
8
+
9
+ ### Configuration
10
+
11
+ Redis must be configured first. Refer to the [redis-rb documentation](https://github.com/redis/redis-rb) for more information.
12
+ `Redis.current` should be set to the Redis connection.
13
+
14
+ ```ruby
15
+ Redis.current = Redis.new
16
+ ```
17
+
18
+ ### Adding an entry
19
+
20
+ `HarmonizerRedis::Linkage` represents the connection between your data structures and the gem. Linkages contain
21
+ string content, an `id` (which will be a uniquely generated uuid), and a `category_id` which identifies the collection this entry belongs to.
22
+
23
+ ```ruby
24
+ my_category_id = 100
25
+ linkage = HarmonizerRedis::Linkage.new(content: 'harmonizer redis',
26
+ category_id: my_category_id)
27
+ linkage.save
28
+ my_linkage_id = linkage.id # "520c488b-e9f8-4a6f-aaea-0d5e37b97644"
29
+ ```
30
+
31
+ ### Retrieving an entry
32
+
33
+ ```ruby
34
+ my_linkage = HarmonizerRedis::Linkage.find(my_linkage_id)
35
+ ```
36
+
37
+ ### Calculating and Retrieving Similarities
38
+
39
+ Calculate similarities for all the linkages in a category in a batch. New calculations will need to
40
+ be performed if new linkages are added.
41
+
42
+ ```ruby
43
+ HarmonizerRedis.calculate_similarities(my_category_id)
44
+ ```
45
+
46
+ To get an Array of similar phrases, call `get_similarities`. By default the top 20 phrases are returned. If new linkages have
47
+ been added or if the similarities have not yet been computed for this linkage, it will be computed
48
+ automatically with this call.
49
+
50
+ ```ruby
51
+ my_linkage.get_similarities
52
+ ```
53
+
54
+ ### Merging into groups, labeling groups, and getting recommended labels
55
+
56
+ Each entry in this array is an array in the following format `[text_label, group_label, similarity_score, phrase_id]`
57
+
58
+ After deciding which phrase the linkage should be combined with - use the accompanying phrase_id data to merge the phrases into a group
59
+
60
+ ```ruby
61
+ my_linkage.merge_with_phrase(phrase_id)
62
+ ```
63
+
64
+ To label everything in the same group:
65
+
66
+ ```ruby
67
+ my_linkage.set_corrected_label('HarmonizerRedis')
68
+ ```
69
+
70
+ To suggest labels for this group (this works better the more HarmonizerRedis is used)
71
+
72
+ ```ruby
73
+ my_linkage.recommend_labels
74
+ ```
75
+
76
+ Lastly to get the final corrected label of a linkage:
77
+
78
+ ```ruby
79
+ my_linkage.corrected
80
+ ```
81
+
82
+ ## Contributing
83
+
84
+ Feel free to fork this repo and change it as you wish. We prefer pull requests on GitHub, but you can also send us email. All contributions need to be tested as well.
85
+
86
+ ## License
87
+
88
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
89
+
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+ require "rake/extensiontask"
4
+
5
+ RSpec::Core::RakeTask.new
6
+
7
+ task :default => :spec
8
+ task :test => :spec
9
+ Rake::Task[:test].prerequisites << :compile
10
+
11
+ task :console do
12
+ exec "irb -r harmonizer_redis -I ./lib"
13
+ end
14
+
15
+ Rake::ExtensionTask.new('white_similarity') do |extension|
16
+ extension.lib_dir = 'lib/harmonizer_redis'
17
+ end
18
+
19
+ task :build => [:clean, :compile]
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "harmonizer_redis"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,5 @@
1
require 'mkmf'

# Build configuration for the white_similarity C extension.
#
# Append to $CFLAGS instead of overwriting it: mkmf pre-populates the variable
# with the flags Ruby itself was configured with (and anything passed through
# --with-cflags), and a plain assignment throws all of that away.
# The extension's own flags go last so the optimisation level requested here
# still takes effect.
$CFLAGS = "#{$CFLAGS} --std=c99 -O"

create_makefile('harmonizer_redis/white_similarity')
@@ -0,0 +1,159 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <stdbool.h>
5
+ #include "white_similarity.h"
6
+
7
+ void Init_white_similarity()
8
+ {
9
+ WhiteSimilarity = rb_define_module("WhiteSimilarity");
10
+ rb_define_method(WhiteSimilarity, "score", method_score, 2);
11
+ rb_define_method(WhiteSimilarity, "soft_cos_similarity", method_soft_cos_similarity, 2);
12
+ }
13
+
14
+ inline bool makes_bad_pair(char *str, int index)
15
+ {
16
+ return (str[index] == ' ' || str[index+1] == ' ');
17
+ }
18
+
19
+ inline bool is_pair_equal(char *x_pair, char *y_pair)
20
+ {
21
+ return (x_pair[0] == y_pair[0] && x_pair[1] == y_pair[1]);
22
+ }
23
+
24
+ struct PairHolder {
25
+ int pair_count;
26
+ char **pairs;
27
+ };
28
+
29
+ Pair generate_pairs(char *str)
30
+ {
31
+ Pair str_pairs;
32
+ int max_pair_number = strlen(str) - 1;
33
+ str_pairs.pairs = malloc(max_pair_number * sizeof(char *));
34
+
35
+ int pair_count = 0;
36
+ for (int i = 0; i < max_pair_number; i++) {
37
+ if (!makes_bad_pair(str, i))
38
+ {
39
+ str_pairs.pairs[pair_count] = str + i;
40
+ pair_count++;
41
+ }
42
+ }
43
+ str_pairs.pair_count = pair_count;
44
+ return str_pairs;
45
+ }
46
+
47
+ double white_similarity(char *x_str, char *y_str)
48
+ {
49
+ Pair x_pairs = generate_pairs(x_str);
50
+ Pair y_pairs = generate_pairs(y_str);
51
+ int intersect = 0;
52
+ int sum = x_pairs.pair_count + y_pairs.pair_count;
53
+
54
+ for (int i = 0; i < x_pairs.pair_count; i++) {
55
+ for (int j = 0; j < y_pairs.pair_count; j++) {
56
+ if (x_pairs.pairs[i] != NULL && y_pairs.pairs[j] != NULL &&
57
+ is_pair_equal(x_pairs.pairs[i], y_pairs.pairs[j])) {
58
+ intersect++;
59
+ y_pairs.pairs[j] = NULL;
60
+ break;
61
+ }
62
+ }
63
+ }
64
+ free(x_pairs.pairs);
65
+ free(y_pairs.pairs);
66
+
67
+ return 2.0 * (double)intersect / (double)sum;
68
+ }
69
+
70
+ VALUE method_score(VALUE self, VALUE x_string, VALUE y_string)
71
+ {
72
+ char *x_str = StringValueCStr(x_string);
73
+ char *y_str = StringValueCStr(y_string);
74
+ if (strlen(x_str) == 1 || strlen(y_str) == 1) {
75
+ return rb_float_new(0.0);
76
+ }
77
+ double similarity = white_similarity(x_str, y_str);
78
+ return rb_float_new(similarity);
79
+ }
80
+
81
+ /* code needed for soft cos similarity calculations */
82
+
83
+ struct MatrixHolder {
84
+ int len;
85
+ char *raw_string;
86
+ char **words;
87
+ double *values;
88
+ };
89
+
90
+ Matrix generate_matrix(char *matrix_cstr)
91
+ {
92
+ Matrix new_matrix;
93
+
94
+ new_matrix.raw_string = malloc(strlen(matrix_cstr + 1) + 1);
95
+ strcpy(new_matrix.raw_string, matrix_cstr + 1);
96
+
97
+ //printf("%s", new_matrix.raw_string);
98
+
99
+ new_matrix.len = (int)matrix_cstr[0];
100
+ new_matrix.words = malloc(new_matrix.len * sizeof(char *));
101
+ new_matrix.values = malloc(new_matrix.len * sizeof(double));
102
+
103
+ char *head = new_matrix.raw_string;
104
+ for (int i = 0; i < new_matrix.len; i++) {
105
+ new_matrix.words[i] = head;
106
+ while (*head != ',') {
107
+ head++;
108
+ }
109
+ *head = '\0';
110
+ head++;
111
+ }
112
+
113
+ for (int i = 0; i < new_matrix.len; i++) {
114
+ char *curr_value = head;
115
+ while(*head != ',' && *head != '\0') {
116
+ head++;
117
+ }
118
+ if (*head == ',') {
119
+ *head = '\0';
120
+ head++;
121
+ }
122
+ sscanf(curr_value, "%lf", new_matrix.values + i);
123
+ }
124
+
125
+ return new_matrix;
126
+ }
127
+
128
+ void free_matrix(Matrix to_free)
129
+ {
130
+ free(to_free.raw_string);
131
+ free(to_free.words);
132
+ free(to_free.values);
133
+ }
134
+
135
+ double soft_cos_similarity(char *x_matrix_str, char *y_matrix_str)
136
+ {
137
+ Matrix x_matrix = generate_matrix(x_matrix_str);
138
+ Matrix y_matrix = generate_matrix(y_matrix_str);
139
+ double similarity = 0.0;
140
+
141
+ for (int i = 0; i < x_matrix.len; i++) {
142
+ for (int j = 0; j < y_matrix.len; j++) {
143
+ double word_sim = white_similarity(x_matrix.words[i], y_matrix.words[j]);
144
+ similarity += (word_sim * x_matrix.values[i] * y_matrix.values[j]);
145
+ }
146
+ }
147
+
148
+ free_matrix(x_matrix);
149
+ free_matrix(y_matrix);
150
+
151
+ return similarity;
152
+ }
153
+
154
+ VALUE method_soft_cos_similarity(VALUE self, VALUE x_matrix, VALUE y_matrix)
155
+ {
156
+ char *x_matrix_cstr = StringValueCStr(x_matrix);
157
+ char *y_matrix_cstr = StringValueCStr(y_matrix);
158
+ return rb_float_new(soft_cos_similarity(x_matrix_cstr, y_matrix_cstr));
159
+ }
@@ -0,0 +1,16 @@
1
+ VALUE WhiteSimilarity = Qnil;
2
+ void Init_white_similarity();
3
+
4
+ typedef struct PairHolder Pair;
5
+ typedef struct MatrixHolder Matrix;
6
+
7
+ VALUE method_score(VALUE self, VALUE x_string, VALUE y_string);
8
+ double white_similarity(char *x_str, char *y_str);
9
+ Pair generate_pairs(char *str);
10
+ inline bool makes_bad_pair(char *str, int index);
11
+ inline bool is_pair_equal(char *x_pair, char *y_pair);
12
+
13
+ Matrix generate_matrix(char *matrix_cstr);
14
+ void free_matrix(Matrix to_free);
15
+ double soft_cos_similarity(char *x_matrix_str, char *y_matrix_str);
16
+ VALUE method_soft_cos_similarity(VALUE self, VALUE x_matrix, VALUE y_matrix);
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'harmonizer_redis/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "harmonizer_redis"
8
+ spec.version = HarmonizerRedis::VERSION
9
+ spec.authors = ["Tian Wang"]
10
+ spec.email = ["twang95@stanford.edu"]
11
+
12
+ spec.summary = %q{Harmonizes records}
13
+ spec.description = %q{Harmonizes records based on fuzzy string/phrase matching. Built on redis for speed}
14
+ spec.homepage = "https://github.com/POSpulse/harmonizer_redis"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency "redis"
23
+ spec.add_dependency "hiredis"
24
+ spec.add_dependency "activesupport"
25
+ spec.extensions = ["ext/white_similarity/extconf.rb"]
26
+
27
+ spec.add_development_dependency "rspec"
28
+ spec.add_development_dependency "bundler", "~> 1.10"
29
+ spec.add_development_dependency "rake", "~> 10.0"
30
+ end
@@ -0,0 +1,59 @@
1
+ require 'harmonizer_redis/version'
2
+ require 'harmonizer_redis/base_object'
3
+ require 'harmonizer_redis/linkage'
4
+ require 'harmonizer_redis/phrase'
5
+ require 'harmonizer_redis/idf_scorer'
6
+ require 'harmonizer_redis/white_similarity'
7
+ require 'harmonizer_redis/category'
8
+ require 'active_support/all'
9
+ require 'redis/connection/hiredis'
10
+ require 'redis'
11
+
12
+ include WhiteSimilarity
13
+
14
+ module HarmonizerRedis
15
+ ### Calculate Similarities. Store them with the category
16
+ class << self
17
# Score every unordered pair of phrases in a category and store the pairs
# that reach the 0.2 threshold, then mark the category as unchanged.
#
# All Redis writes are issued inside one pipelined block.
#
# Raises RuntimeError when category_id is not a known category.
def calculate_similarities(category_id)
  raise "Category ID: #{category_id} is invalid" unless Category.valid?(category_id)

  phrase_ids = Category.get_phrase_list(category_id)
  matrices = Category.get_matrices(category_id, phrase_ids)

  Redis.current.pipelined do
    # combination(2) visits index pairs (i, j) with i < j in the same order
    # as two nested ascending loops.
    phrase_ids.each_index.to_a.combination(2) do |i, j|
      score = Phrase.calc_soft_pair_similarity(matrices[i], matrices[j])
      next if score < 0.2

      add_similarity_entry(phrase_ids[i], phrase_ids[j], score, category_id)
    end
  end

  Category.reset_changed(category_id)
end
41
+
42
+
43
+ ### String PreProcessing
44
# Canonical form of a free-text phrase: trimmed, lower-cased, transliterated
# by ActiveSupport, split on every character that is neither a letter nor a
# digit, and re-joined with single spaces.
def normalize_string(string)
  transliterated = ActiveSupport::Inflector.transliterate(string.strip.downcase)
  tokens = transliterated.split(/[^\p{L}0-9]/).reject { |token| token.empty? }
  tokens.join(' ')
end
48
+
49
+ ### Helper
50
+
51
+ def add_similarity_entry(id_x, id_y, score, category_id)
52
+ Redis.current.zadd("HarmonizerRedis::Category:#{category_id}:#{id_x}:sims", score, id_y)
53
+ Redis.current.zadd("HarmonizerRedis::Category:#{category_id}:#{id_y}:sims", score, id_x)
54
+ end
55
+
56
+ private :add_similarity_entry
57
+ end
58
+
59
+ end
@@ -0,0 +1,24 @@
1
+ module HarmonizerRedis
2
+ class BaseObject
3
+ attr_accessor :id
4
+
5
+ def generate_id
6
+ Redis.current.incr("#{self.class}").to_i - 1
7
+ end
8
+
9
+ def save
10
+ #creates a new id only when object is being saved
11
+ klass = "#{self.class}"
12
+ new_id = @id || self.generate_id
13
+ self.instance_variables.each do |variable|
14
+ var_name = variable.to_s[1..-1]
15
+ Redis.current.set("#{klass}:#{new_id}:#{var_name}", instance_variable_get(variable))
16
+ end
17
+
18
+ @id = new_id
19
+
20
+ #add id to HarmonizerRedis::ClassName:set
21
+ Redis.current.sadd("#{klass}:set", @id)
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,236 @@
1
+ module HarmonizerRedis
2
+ class Category < BaseObject
3
+ attr_reader :id
4
+ def initialize(id)
5
+ @id = id
6
+ end
7
+
8
+ def save
9
+ super()
10
+ end
11
+
12
+ class << self
13
+ # Add linkage to category group
14
+ def add_linkage(linkage)
15
+ category_id = linkage.category_id
16
+ linkage_id = linkage.id
17
+ phrase_id = linkage.phrase_id
18
+
19
+ unless self.valid?(category_id)
20
+ new_category = self.new(category_id)
21
+ new_category.save
22
+ end
23
+
24
+ self.add_to_linkage_set(category_id, linkage_id)
25
+ self.add_to_phrase_set(category_id, phrase_id)
26
+
27
+ set_changed(category_id, 1)
28
+ set_calculated(category_id, 0)
29
+ set_phrase_calculated(category_id, phrase_id, 0)
30
+
31
+ # Creating/adding to a group
32
+ add_group(category_id, phrase_id)
33
+
34
+ # Adding linkage to the phrase
35
+ Phrase.add_linkage(phrase_id, linkage_id, category_id)
36
+ end
37
+
38
+ # Gets list of linkages included in category group
39
+ def get_linkage_list(category_id)
40
+ Redis.current.smembers("#{self}:#{category_id}:linkage_set")
41
+ end
42
+
43
+ def get_phrase_list(category_id)
44
+ Redis.current.smembers("#{self}:#{category_id}:phrase_set")
45
+ end
46
+
47
+ def get_matrices(category_id, phrase_id_list)
48
+ matrices_list = []
49
+ if !matrices_calculated?(category_id) || changed?(category_id)
50
+ phrase_id_list.each do |id|
51
+ content = Phrase.get_content(id)
52
+ new_matrix = IdfScorer.serialize_matrix(IdfScorer.calc_soft_matrix(content))
53
+ Redis.current.set("HarmonizerRedis::Phrase:#{id}:matrix", new_matrix)
54
+ matrices_list << new_matrix
55
+ end
56
+ set_calculated(category_id, 1)
57
+ else
58
+ phrase_id_list.each do |phrase_id|
59
+ matrices_list << Phrase.get_matrix(phrase_id)
60
+ end
61
+ end
62
+ matrices_list
63
+ end
64
+
65
+ # Set the category as "unchanged". Should be called after Category similarities
66
+ # have been calculated
67
+ def reset_changed(category_id)
68
+ set_changed(category_id, 0)
69
+ end
70
+
71
+ # Check to see if id is valid
72
+ def valid?(category_id)
73
+ Redis.current.sismember("#{self}:set", "#{category_id}")
74
+ end
75
+
76
+ def changed?(category_id)
77
+ !Redis.current.getbit("#{self}:changed", category_id).zero?
78
+ end
79
+
80
+ def matrices_calculated?(category_id)
81
+ !Redis.current.getbit("#{self}:calculated", category_id).zero?
82
+ end
83
+
84
+ def is_phrase_calculated?(category_id, phrase_id)
85
+ !Redis.current.getbit("#{self}:#{category_id}:calculated", phrase_id).zero?
86
+ end
87
+
88
+ # Merge 2 phrases' groups
89
# Merge the groups that two phrases belong to, within one category.
#
# Label handling:
# * both groups labelled, labels differ -> the conflict is resolved by
#   dropping both labels;
# * both groups labelled, labels equal  -> the surviving group keeps its label;
# * only phrase b's group labelled      -> a's group is folded into b's so the
#   label survives;
# * otherwise                           -> b's group is folded into a's.
#
# The label of a group lives under "<group key>:label". That key is what
# gets deleted here -- not a key named after the label's text, which is what
# the previous implementation passed to DEL.
#
# Does nothing (returns nil) when both phrases already share a group.
# Raises RuntimeError when either phrase has no group in this category.
def merge_phrase_groups(category_id, phrase_a_id, phrase_b_id)
  group_a = get_group_key(category_id, phrase_a_id)
  group_b = get_group_key(category_id, phrase_b_id)
  raise 'Invalid Phrase ID(s) given!' if group_a.nil? || group_b.nil?
  return if group_a == group_b

  label_key_a = "#{group_a}:label"
  label_key_b = "#{group_b}:label"
  label_a = Redis.current.get(label_key_a)
  label_b = Redis.current.get(label_key_b)

  if label_a.nil? && label_b
    # Only b carries a label: keep b's group so the label is preserved.
    merge_phrase_group_helper(category_id, group_a, group_b)
  else
    # Conflicting labels cannot both be right, so the survivor loses its too.
    Redis.current.del(label_key_a) if label_a && label_b && label_a != label_b

    merge_phrase_group_helper(category_id, group_b, group_a)
    # Group b no longer exists; remove its label so no orphan key is left.
    Redis.current.del(label_key_b)
  end
end
125
+
126
+ ### Getting popular linkages and generating possible labels
127
+
128
+ def get_group_popular_linkages(category_id, phrase_id, number = 5)
129
+ phrases = get_group(category_id, phrase_id)
130
+ linkages = []
131
+ phrases.each do |id|
132
+ linkages += Phrase.get_popular_linkages(id)
133
+ end
134
+ linkages.sort_by! { |entry| -1 * entry[-1] }
135
+ linkages.first(number)
136
+ end
137
+
138
+ def get_all_group_labels(category_id, phrase_id)
139
+ phrases_in_group = get_group(category_id, phrase_id)
140
+ categories = []
141
+ phrases_in_group.each do |phrase_id|
142
+ categories += Phrase.get_categories(phrase_id)
143
+ end
144
+ labels = Hash.new { |hash, key| hash[key] = 0.0 }
145
+ categories.each do |category_id|
146
+ phrases_in_group.each do |phrase_id|
147
+ group_label = get_group_label(category_id, phrase_id)
148
+ unless group_label.nil?
149
+ labels[group_label] += Phrase.get_linkage_count(phrase_id)
150
+ end
151
+ end
152
+ end
153
+ labels.to_a.sort_by { |x| x[-1] }
154
+ end
155
+
156
+ def set_group_label(category_id, phrase_id, label)
157
+ Redis.current.set("#{get_group_key(category_id, phrase_id)}:label", label)
158
+ end
159
+
160
+ def get_group_label(category_id, phrase_id)
161
+ Redis.current.get("#{get_group_key(category_id, phrase_id)}:label")
162
+ end
163
+
164
+ def get_group_count(category_id, phrase_id)
165
+ Redis.current.scard(get_group_key(category_id, phrase_id))
166
+ end
167
+
168
+ def get_group(category_id, phrase_id)
169
+ Redis.current.smembers(get_group_key(category_id, phrase_id))
170
+ end
171
+
172
+ def in_same_group?(category_id, phrase_a_id, phrase_b_id)
173
+ get_group_key(category_id, phrase_a_id) == get_group_key(category_id, phrase_b_id)
174
+ end
175
+
176
+ ### Helpers ####
177
+ def add_to_linkage_set(category_id, linkage_id)
178
+ Redis.current.sadd("#{self}:#{category_id}:linkage_set", linkage_id)
179
+ end
180
+
181
+ def add_to_phrase_set(category_id, phrase_id)
182
+ Redis.current.sadd("#{self}:#{category_id}:phrase_set", phrase_id)
183
+ end
184
+
185
+ def set_changed(category_id, value)
186
+ Redis.current.setbit("#{self}:changed", category_id, value)
187
+ end
188
+
189
+ def set_calculated(category_id, value)
190
+ Redis.current.setbit("#{self}:calculated", category_id, value)
191
+ end
192
+
193
+ def set_phrase_calculated(category_id, phrase_id, value)
194
+ Redis.current.setbit("#{self}:#{category_id}:calculated", phrase_id, value)
195
+ end
196
+
197
+ def add_group(category_id, phrase_id)
198
+ group_key = get_group_key(category_id, phrase_id)
199
+ if group_key.nil?
200
+ group_key = create_group(category_id, phrase_id)
201
+ Redis.current.sadd(group_key, phrase_id)
202
+ end
203
+ end
204
+
205
+ def create_group(category_id, phrase_id)
206
+ new_group_id = Redis.current.incr("#{self}:#{category_id}:group_count") - 1
207
+ new_group_key = "#{self}:#{category_id}:group:#{new_group_id}"
208
+ set_phrase_group(category_id, phrase_id, new_group_key)
209
+ new_group_key
210
+ end
211
+
212
+ def get_group_key(category_id, phrase_id)
213
+ Redis.current.get("#{self}:#{category_id}:#{phrase_id}:group")
214
+ end
215
+
216
+ def change_phrases_group(category_id, old_group_key, new_group_key)
217
+ phrase_list = Redis.current.smembers(old_group_key)
218
+ phrase_list.each do |phrase_id|
219
+ set_phrase_group(category_id, phrase_id, new_group_key)
220
+ end
221
+ end
222
+
223
+ def merge_phrase_group_helper(category_id, source_group, dest_group)
224
+ Redis.current.sunionstore(dest_group, source_group, dest_group)
225
+ change_phrases_group(category_id, source_group, dest_group)
226
+ Redis.current.del(source_group)
227
+ end
228
+
229
+ def set_phrase_group(category_id, phrase_id, group_key)
230
+ Redis.current.set("#{self}:#{category_id}:#{phrase_id}:group", group_key)
231
+ end
232
+
233
+ end
234
+ private_class_method :set_changed, :create_group, :get_group_key, :set_phrase_group, :change_phrases_group
235
+ end
236
+ end
@@ -0,0 +1,113 @@
1
+ module HarmonizerRedis
2
+ module IdfScorer
3
+ # class self
4
+ class << self
5
+ def add_document(phrase_id)
6
+ self.incr_doc_count
7
+ text = HarmonizerRedis::Phrase.get_content(phrase_id)
8
+ word_set = Set.new
9
+ text.split.each do |word|
10
+ unless word_set.include? word
11
+ word_set.add(word)
12
+ Redis.current.incr(word_doc_freq_key(word))
13
+ end
14
+ Redis.current.incr(word_count_key(word))
15
+ end
16
+ end
17
+
18
+ def get_score(word)
19
+ doc_freq = self.get_doc_freq(word) + 0.1
20
+ doc_count = self.doc_count + 0.1
21
+ Math.log(0.1+(doc_count / doc_freq))
22
+ end
23
+
24
# Flatten a word => weight hash into the string format parsed by the C
# extension.
#
# Layout: one leading character whose code is the number of entries, then the
# words joined by commas, one more comma, then the weights joined by commas.
# Integer#chr raises RangeError for more than 255 entries; nothing here caps
# the count.
def serialize_matrix(matrix)
  count_char = matrix.length.chr
  words = matrix.keys.join(',')
  weights = matrix.values.join(',')
  "#{count_char}#{words},#{weights}"
end
30
+
31
+ # Used for soft cosine similarity
32
# Build the normalised weight vector of a phrase for soft cosine similarity.
#
# Steps:
# 1. count each whitespace-separated word;
# 2. weight it with (1 + log10(count)) * get_score(word);
# 3. divide every weight by the soft norm
#    sqrt(sum over word pairs of WhiteSimilarity.score(a, b) * w_a * w_b).
#
# Returns a Hash of word => Float. When the soft norm is not a positive
# number (for example a phrase made of a single one-letter word, whose
# WhiteSimilarity.score with itself is 0.0) the vector cannot be normalised;
# every weight is then set to 0.0 instead of dividing by zero and storing
# Infinity/NaN.
def calc_soft_matrix(phrase_content)
  matrix = Hash.new(0.0)
  phrase_content.split.each { |word| matrix[word] += 1.0 }

  # Only existing keys are reassigned, which is safe while iterating.
  matrix.each do |word, count|
    matrix[word] = (1.0 + Math.log10(count)) * get_score(word)
  end

  norm_factor_sqrd = 0.0
  matrix.each do |word_a, value_a|
    matrix.each do |word_b, value_b|
      norm_factor_sqrd += WhiteSimilarity.score(word_a, word_b) * value_a * value_b
    end
  end

  # `> 0.0` is false for zero, negative values and NaN alike, so all three
  # degenerate cases take the zero-vector path (sqrt of a negative number
  # would raise Math::DomainError).
  factor = norm_factor_sqrd > 0.0 ? Math.sqrt(norm_factor_sqrd) : nil
  matrix.each do |word, value|
    matrix[word] = factor ? value / factor : 0.0
  end
  matrix
end
59
+
60
# Dot product of two word => weight hashes over the words of matrix_a.
# matrix_b is indexed with each word of matrix_a, so it must return a number
# for words it does not contain (e.g. a Hash with a numeric default).
def cos_similarity(matrix_a, matrix_b)
  matrix_a.inject(0.0) do |total, (word, weight)|
    total + (weight * matrix_b[word])
  end
end
67
+
68
+ def soft_cos_similarity(matrix_a, matrix_b)
69
+ similarity = 0.0
70
+ matrix_a.each do |word_a, value_a|
71
+ matrix_b.each do |word_b, value_b|
72
+ if word_a != word_b
73
+ white_similarity = WhiteSimilarity.score(word_a, word_b)
74
+ else
75
+ white_similarity = 1.0
76
+ end
77
+ similarity += (white_similarity * value_a * value_b)
78
+ end
79
+ end
80
+ similarity
81
+ end
82
+
83
+ def get_doc_freq(word)
84
+ Redis.current.get(word_doc_freq_key(word)).to_f
85
+ end
86
+
87
+ def get_count(word)
88
+ Redis.current.get(word_count_key(word)).to_f
89
+ end
90
+
91
+ def decr_doc_freq(word)
92
+ Redis.current.decr(word_doc_freq_key(word))
93
+ Redis.current.decr("#{self}:doc_count")
94
+ end
95
+
96
+ def doc_count
97
+ Redis.current.get("#{self}:doc_count").to_f
98
+ end
99
+
100
+ def incr_doc_count
101
+ Redis.current.incr("#{self}:doc_count")
102
+ end
103
+
104
+ def word_doc_freq_key(word)
105
+ "Word:[#{word}]:doc_freq"
106
+ end
107
+
108
+ def word_count_key(word)
109
+ "Word:[#{word}]:count"
110
+ end
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,188 @@
1
+ module HarmonizerRedis
2
+ class Linkage < BaseObject
3
+ attr_reader :id
4
+
5
+ def generate_id
6
+ SecureRandom.uuid
7
+ end
8
+
9
+ def initialize(params={})
10
+ @content = params[:content]
11
+ @category_id = params[:category_id]
12
+ end
13
+
14
+ def save # make sure that new phrase is saved
15
+ # if phrase already exists : set to that phrase
16
+ # otherwise : create a new phrase and set linkage:phrase to that phrase
17
+ # linkage is also added to the category with certain id (can be used to divide tasks)
18
+ # Assert: all required fields have been set
19
+ @id = generate_id
20
+
21
+ unless @id && @content && @category_id
22
+ raise "id, content, and category_id are not all set"
23
+ end
24
+
25
+ @content_normalized = HarmonizerRedis.normalize_string(@content)
26
+ existing_phrase_id = HarmonizerRedis::Phrase.find_by_content(@content_normalized)
27
+ if existing_phrase_id
28
+ @phrase = existing_phrase_id
29
+ else
30
+ new_phrase = HarmonizerRedis::Phrase.new(@content_normalized)
31
+ new_phrase.save
32
+ @phrase = new_phrase.id
33
+ end
34
+ super()
35
+ Category.add_linkage(self)
36
+ end
37
+
38
+ # Readers
39
+
40
+ def content
41
+ @content ||= self.class.get_content(@id)
42
+ end
43
+
44
+ def content_normalized
45
+ @content_normalized ||= self.class.get_content_normalized(@id)
46
+ end
47
+
48
+ def category_id
49
+ @category_id ||= self.class.get_category_id(@id)
50
+ end
51
+
52
# Final label for this linkage.
#
# Returns the label of the group this linkage's phrase belongs to. When the
# group has no label, falls back to the linkage's own content as long as the
# group has at least one member, and to '(LABEL NOT SET)' otherwise.
def corrected
  label = Category.get_group_label(category_id, phrase_id)
  return label unless label.nil?

  # The count is an Integer, and 0 is truthy in Ruby, so it has to be
  # compared explicitly; a bare `if count` made the fallback unreachable.
  if Category.get_group_count(category_id, phrase_id) > 0
    content
  else
    '(LABEL NOT SET)'
  end
end
64
+
65
+ def phrase_id
66
+ @phrase ||= self.class.get_phrase_id(@id)
67
+ end
68
+
69
+ # Writers
70
+
71
+ def content=(value)
72
+ if self.is_saved?
73
+ raise "Saved linkage content cannot be edited"
74
+ else
75
+ @content = value
76
+ end
77
+ end
78
+
79
+ def category_id=(value)
80
+ if self.is_saved?
81
+ raise "Saved linkage category_id cannot be edited"
82
+ else
83
+ @category_id = value
84
+ end
85
+ end
86
+
87
+ ### Functionality
88
+ def calculate_similarities
89
+ own_phrase_id = phrase_id
90
+ own_cat_id = category_id
91
+ phrase_list = Category.get_phrase_list(own_cat_id)
92
+ matrix_list = Category.get_matrices(category_id, phrase_list)
93
+ own_matrix = Phrase.get_matrix(own_phrase_id)
94
+ Redis.current.pipelined do
95
+ phrase_list.each_with_index do |other_id, index|
96
+ score = Phrase.calc_soft_pair_similarity(own_matrix, matrix_list[index])
97
+ if score > 0.2
98
+ Redis.current.zadd("HarmonizerRedis::Category:#{own_cat_id}:#{own_phrase_id}:sims", score, other_id)
99
+ end
100
+ end
101
+ end
102
+ Category.set_phrase_calculated(own_cat_id, own_phrase_id, 1)
103
+ Category.reset_changed(own_cat_id)
104
+ end
105
+
106
+ def get_similarities(num_phrases = 20)
107
+ self_phrase_id = phrase_id
108
+ unless is_calculated?
109
+ calculate_similarities
110
+ end
111
+ phrase_id_list = Redis.current.zrevrange("HarmonizerRedis::Category:#{self.category_id}:#{self_phrase_id}:sims",
112
+ 0, num_phrases, :with_scores => true)
113
+ results = []
114
+ phrase_id_list.each do |phrase, score|
115
+ unless Category.in_same_group?(category_id, self_phrase_id, phrase)
116
+ results << [Phrase.get_content(phrase), Category.get_group_label(category_id, phrase), score, phrase]
117
+ end
118
+ end
119
+ results
120
+ end
121
+
122
+ # Recommend possible labels for a linkage
123
+ def recommend_labels
124
+ existing_labels = Category.get_all_group_labels(category_id, phrase_id)
125
+ other_linkages = Category.get_group_popular_linkages(category_id, phrase_id)
126
+ existing_labels + other_linkages
127
+ end
128
+
129
+ def merge_with_phrase(phrase_id)
130
+ Category.merge_phrase_groups(category_id, self.phrase_id, phrase_id)
131
+ end
132
+
133
+ def set_corrected_label(label)
134
+ Category.set_group_label(category_id, phrase_id, label)
135
+ end
136
+
137
+ ### Helpers
138
+ def is_category_changed?
139
+ unless is_saved?
140
+ raise "Linkage must be saved first"
141
+ end
142
+ Category.changed?(self.category_id)
143
+ end
144
+
145
+ def is_saved?
146
+ self.class.is_linkage_saved?(@id)
147
+ end
148
+
149
+ def is_calculated?
150
+ if is_category_changed? || !is_saved?
151
+ false
152
+ else
153
+ Category.is_phrase_calculated?(category_id, phrase_id)
154
+ end
155
+ end
156
+
157
+ class << self
158
+ def find(linkage_id)
159
+ unless is_linkage_saved?(linkage_id)
160
+ return nil
161
+ end
162
+ linkage = self.new
163
+ linkage.instance_variable_set('@id', linkage_id)
164
+ linkage
165
+ end
166
+
167
+ def is_linkage_saved?(linkage_id)
168
+ Redis.current.sismember("#{self}:set", "#{linkage_id}")
169
+ end
170
+
171
+ def get_category_id(linkage_id)
172
+ Redis.current.get("#{self}:#{linkage_id}:category_id")
173
+ end
174
+
175
+ def get_phrase_id(linkage_id)
176
+ Redis.current.get("#{self}:#{linkage_id}:phrase")
177
+ end
178
+
179
+ def get_content(linkage_id)
180
+ Redis.current.get("#{self}:#{linkage_id}:content")
181
+ end
182
+
183
+ def get_content_normalized(linkage_id)
184
+ Redis.current.get("#{self}:#{linkage_id}:content_normalized")
185
+ end
186
+ end
187
+ end
188
+ end
@@ -0,0 +1,87 @@
1
module HarmonizerRedis
  # A phrase record persisted through BaseObject and indexed in Redis by its
  # content. The class methods operate on phrase ids and keep per-phrase
  # Redis structures: a sorted set of linkage contents and a set of
  # category ids.
  class Phrase < BaseObject
    attr_accessor :content

    # content - the phrase text held by this instance.
    def initialize(content)
      @content = content
    end

    # Saves via BaseObject#save, registers the phrase id with IdfScorer, and
    # writes a "<class>:[<content>]" => id lookup key so the phrase can be
    # found by content later (see find_by_content).
    def save
      super()
      HarmonizerRedis::IdfScorer.add_document(@id)
      Redis.current.set("#{self.class}:[#{@content}]", @id.to_s)
    end

    class << self
      # Returns the id stored for the given content, or nil if there is none.
      def find_by_content(content)
        Redis.current.get("#{self}:[#{content}]")
      end

      # Returns the value stored under "<class>:<phrase_id>:content".
      def get_content(phrase_id)
        Redis.current.get("#{self}:#{phrase_id}:content")
      end

      # Setup: records both the linkage and the category against the phrase.
      def add_linkage(phrase_id, linkage_id, category_id)
        add_linkage_id(phrase_id, linkage_id)
        add_category_id(phrase_id, category_id)
      end

      # Linkages

      # Increments by 1 the score of the linkage's *content* (not its id) in
      # the phrase's linkage sorted set.
      def add_linkage_id(phrase_id, linkage_id)
        Redis.current.zincrby(linkage_set_key(phrase_id), 1, Linkage.get_content(linkage_id))
      end

      # Returns the cardinality of the phrase's linkage sorted set.
      def get_linkage_count(phrase_id)
        Redis.current.zcard(linkage_set_key(phrase_id))
      end

      # Returns up to `number` highest-scored entries of the phrase's linkage
      # sorted set, with their scores.
      #
      # Raises RuntimeError when number is zero or negative.
      def get_popular_linkages(phrase_id, number = 5)
        # The guard rejects 0 as well, so the message must say "> 0".
        raise "number must be > 0" if number <= 0

        Redis.current.zrevrange(linkage_set_key(phrase_id), 0, number - 1, with_scores: true)
      end

      # Categories

      # Adds the category id to the phrase's category set.
      def add_category_id(phrase_id, category_id)
        Redis.current.sadd(category_set_key(phrase_id), category_id)
      end

      # Returns all members of the phrase's category set.
      def get_categories(phrase_id)
        Redis.current.smembers(category_set_key(phrase_id))
      end

      # Returns the serialized matrix stored for the phrase, or nil when the
      # key is absent.
      def get_matrix(phrase_id)
        Redis.current.get("#{self}:#{phrase_id}:matrix")
      end

      # Combines IdfScorer.cos_similarity of the two matrices with
      # WhiteSimilarity.score of the two phrases and returns the negated
      # average of both scores.
      # NOTE(review): the negation presumably lets callers sort ascending to
      # get the most similar pairs first - confirm against callers.
      def calc_pair_similarity(phrase_a, phrase_b, phrase_a_matrix, phrase_b_matrix)
        idf_similarity = IdfScorer.cos_similarity(phrase_a_matrix, phrase_b_matrix)
        white_similarity = WhiteSimilarity.score(phrase_a, phrase_b)
        (idf_similarity + white_similarity) * -0.5
      end

      # Delegates to WhiteSimilarity.soft_cos_similarity for the two matrices.
      def calc_soft_pair_similarity(phrase_a_matrix, phrase_b_matrix)
        WhiteSimilarity.soft_cos_similarity(phrase_a_matrix, phrase_b_matrix)
      end

      ### Helpers ### (consider refactoring with metaprogramming)

      # Redis key of the phrase's linkage sorted set.
      def linkage_set_key(phrase_id)
        "#{self}:#{phrase_id}:linkage_set"
      end

      # Redis key of the phrase's category set.
      def category_set_key(phrase_id)
        "#{self}:#{phrase_id}:category_set"
      end
    end
    private_class_method :linkage_set_key, :category_set_key
  end
end
@@ -0,0 +1,3 @@
1
module HarmonizerRedis
  # Release version string of the harmonizer_redis gem.
  VERSION = '0.1.0'
end
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: harmonizer_redis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Tian Wang
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-09-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: redis
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: hiredis
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.10'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.10'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ description: Harmonizes records based on fuzzy string/phrase matching. Built on redis
98
+ for speed
99
+ email:
100
+ - twang95@stanford.edu
101
+ executables: []
102
+ extensions:
103
+ - ext/white_similarity/extconf.rb
104
+ extra_rdoc_files: []
105
+ files:
106
+ - ".gitignore"
107
+ - ".rspec"
108
+ - ".travis.yml"
109
+ - Gemfile
110
+ - LICENSE.txt
111
+ - README.md
112
+ - Rakefile
113
+ - bin/console
114
+ - bin/setup
115
+ - ext/white_similarity/extconf.rb
116
+ - ext/white_similarity/white_similarity.c
117
+ - ext/white_similarity/white_similarity.h
118
+ - harmonizer_redis.gemspec
119
+ - lib/harmonizer_redis.rb
120
+ - lib/harmonizer_redis/base_object.rb
121
+ - lib/harmonizer_redis/category.rb
122
+ - lib/harmonizer_redis/idf_scorer.rb
123
+ - lib/harmonizer_redis/linkage.rb
124
+ - lib/harmonizer_redis/phrase.rb
125
+ - lib/harmonizer_redis/version.rb
126
+ homepage: https://github.com/POSpulse/harmonizer_redis
127
+ licenses:
128
+ - MIT
129
+ metadata: {}
130
+ post_install_message:
131
+ rdoc_options: []
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ requirements: []
145
+ rubyforge_project:
146
+ rubygems_version: 2.4.8
147
+ signing_key:
148
+ specification_version: 4
149
+ summary: Harmonizes records
150
+ test_files: []