RubyGems - string_similarity - Versions diffs - 0.0.1 → 1.0.0 - Mend

string_similarity 0.0.1 → 1.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +3 -0
data/README.md +13 -2
data/lib/string_similarity.rb +16 -10
data/lib/string_similarity/version.rb +1 -1
data/spec/lib/string_similarity_spec.rb +122 -89
data/spec/spec_helper.rb +1 -2
data/string_similarity.gemspec +1 -0
metadata +17 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f57c49e8e84c841cbe135db15f28e25f62f9097d
-  data.tar.gz: a85ee9d70fa101ce803cb98689c3a2c7ffc46766
+  metadata.gz: 802acbb7fe320b6fe87fd865c7da83b234a1429c
+  data.tar.gz: e8c74c00831ad99587e4c5514cccfea67ad4462e
 SHA512:
-  metadata.gz: 5dcb000ba65f93a1a23b6aae169ba418fe95a46a92b3fe3772d220f4d78987a58dfae617bd3e6c380cac6c26cc4cc9fc2bd81aa34da968d456da0b8eddf719ca
-  data.tar.gz: 8e35b84d3cdb7e4326f06f2ed2dc651067e1854639fc757a7c49cee5ffd1e542eb52df6815ece8dc6ad2e4ae715a92929c3baf68bf0a82db7f64ce254a4db66a
+  metadata.gz: cd3e23c7a0acb53400f31df84f67c8d47ff4435c74dde0c17af76730dcf5b6890e70caff67c925dcfe02242a257a619d78b8d49db1147682ae74fef05e3f2abb
+  data.tar.gz: 06d566f18d1e8ce43dfce31ae8e9111c3644e11b1365d5e8f2c804345310d0b7f131bff848cf110275a02df8ff9368c88640a9a4b18449a83506f61141a1f53d

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,3 @@
+## 1.0.0
+* deprecate `score` method in favor of `bigram_score` and `ngram_score`

data/README.md CHANGED Viewed

@@ -18,10 +18,21 @@ Or install it yourself as:
 ## Usage
+StringSimilarity has two methods, `bigram_score` and `ngram_score`.
+`bigram_score` is most useful for strings that do not have spaces in them, e.g. Japanese or Simplified Chinese text.
+`ngram_score` is most useful for all other languages.
 ```ruby
-StringSimilarity.score('hello', 'hello')
-#=> 100
+string_1 = 'Translation company in Boulder'
+string_2 = 'Translation company in New York'
+StringSimilarity.bigram_score(string_1, string_2)
+#=> 73
+StringSimilarity.ngram_score(string_1, string_2)
+#=> 74
 ```

data/lib/string_similarity.rb CHANGED Viewed

@@ -2,21 +2,29 @@ require "string_similarity/version"
 class StringSimilarity
-  MIN_BIGRAM_LENGTH = 2
-  VERIFIED_BONUS = 5
+  MIN_NGRAM_LENGTH = 2
-  def self.score(string_1, string_2)
+  def self.ngram_score(string_1, string_2)
+    ngram_length = (string_1.length.to_f * 0.1).round
+    ngram_length = [MIN_NGRAM_LENGTH, ngram_length].max
+    score(string_1, string_2, ngram_length)
+  end
+  def self.bigram_score(string_1, string_2)
+    score(string_1, string_2, MIN_NGRAM_LENGTH)
+  end
+  private
+  def self.score(string_1, string_2, ngram_length)
     string_1 = String.new(string_1)
     string_2 = String.new(string_2)
     cleaned_string_1 = remove_special_characters(string_1)
     cleaned_string_2 = remove_special_characters(string_2)
-    bigram_length = (string_1.length.to_f * 0.1).round
-    bigram_length = [MIN_BIGRAM_LENGTH, bigram_length].max
-    string_1_ngrams = cleaned_string_1.each_char.each_cons(bigram_length).to_set
-    string_2_ngrams = cleaned_string_2.each_char.each_cons(bigram_length).to_set
+    string_1_ngrams = cleaned_string_1.each_char.each_cons(ngram_length).to_set
+    string_2_ngrams = cleaned_string_2.each_char.each_cons(ngram_length).to_set
     overlap = (string_1_ngrams & string_2_ngrams).size
     total = string_1_ngrams.size + string_2_ngrams.size
@@ -28,8 +36,6 @@ class StringSimilarity
     normalize_score(score, string_1, string_2)
   end
-  private
   def self.remove_special_characters(phrase)
     phrase.gsub!(/[[:punct:]<>]+/, "") # remove all punctuation
     phrase.strip! # remove trailing and leading white space

data/lib/string_similarity/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class StringSimilarity
-  VERSION = "0.0.1"
+  VERSION = "1.0.0"
 end

data/spec/lib/string_similarity_spec.rb CHANGED Viewed

@@ -2,142 +2,175 @@ require 'spec_helper'
 describe StringSimilarity do
-  describe 'short phrases' do
+  describe 'ngram_score' do
-    it 'returns 100 if there is no difference' do
-      string = 'hello'
-      expect(StringSimilarity.score(string, string)).to eq(100)
-    end
+    describe 'short phrases' do
-    it 'returns 99 if the original is lowercase and found is uppercase' do
-      string_1 = 'test'
-      string_2 = 'TEST'
+      it 'returns 100 if there is no difference' do
+        string = 'hello'
+        expect(StringSimilarity.ngram_score(string, string)).to eq(100)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(99)
-    end
+      it 'returns 99 if the original is lowercase and found is uppercase' do
+        string_1 = 'test'
+        string_2 = 'TEST'
-    it 'returns 99 when found is uppercase and original is capitalized' do
-      string_1 = 'Email'
-      string_2 = 'EMAIL'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(99)
-    end
+      it 'returns 99 when found is uppercase and original is capitalized' do
+        string_1 = 'Email'
+        string_2 = 'EMAIL'
-    it 'returns 99 if original is uppercase and found is capitalized' do
-      string_1 = 'EMAIL'
-      string_2 = 'Email'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(99)
-    end
+      it 'returns 99 if original is uppercase and found is capitalized' do
+        string_1 = 'EMAIL'
+        string_2 = 'Email'
-    it 'returns 99 if original is uppercase and found is lowercase' do
-      string_1 = 'EMAIL'
-      string_2 = 'email'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(99)
-    end
+      it 'returns 99 if original is uppercase and found is lowercase' do
+        string_1 = 'EMAIL'
+        string_2 = 'email'
-    it 'returns 86' do
-      string_1 = 'test'
-      string_2 = 'tests'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(86)
-    end
+      it 'returns 86' do
+        string_1 = 'test'
+        string_2 = 'tests'
-    it 'returns an 86 with capitalization differences' do
-      string_1 = 'test'
-      string_2 = 'Tests'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(86)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(86)
-    end
+      it 'returns an 86 with capitalization differences' do
+        string_1 = 'test'
+        string_2 = 'Tests'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(86)
+      end
-    it 'returns 6' do
-      string_1 = 'test'
-      string_2 = 'no way is this like the original string'
+      it 'returns 6' do
+        string_1 = 'test'
+        string_2 = 'no way is this like the original string'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(6)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(6)
     end
-  end
+    describe 'long phrases' do
-  describe 'long phrases' do
+      it 'returns 0 when strings are of much different length' do
+        string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
+        string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.  Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
-    it 'returns 0 when strings are of much different length' do
-      string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
-      string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.  Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(0)
-    end
+      it 'returns 0 when the phrase is too long to come up with an accurate percentage' do
+        string_1 = 'A variety of shopping centers housing popular merchants are just a few miles from the oceanfront, offering varied dining options, large retailers and a number of specialty stores. From classic fashions at Talbots to gourmet foods and cookware at Williams-Sonoma, your refined tastes are sure to be rewarded.'
+        string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
-    it 'returns 0 when the phrase is too long to come up with an accurate percentage' do
-      string_1 = 'A variety of shopping centers housing popular merchants are just a few miles from the oceanfront, offering varied dining options, large retailers and a number of specialty stores. From classic fashions at Talbots to gourmet foods and cookware at Williams-Sonoma, your refined tastes are sure to be rewarded.'
-      string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(0)
-    end
+      it 'returns 0 when the phrases dont match enough' do
+        string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
+        string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
-    it 'returns 0 when the phrases dont match enough' do
-      string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
-      string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(0)
-    end
+      it 'returns a higher match when the phrases are close' do
+        string_1 = 'A taxi alternative in Prague'
+        string_2 = 'A taxi alternative in Munich'
-    it 'returns a higher match when the phrases are close' do
-      string_1 = 'A taxi alternative in Prague'
-      string_2 = 'A taxi alternative in Munich'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(77)
+      end
-      expect(StringSimilarity.score(string_1, string_2) > 75).to eq(true)
     end
-  end
+    describe 'special characters' do
-  describe 'special characters' do
+      it 'strips out carats to mark text to not translate' do
+        string_1 = 'Back to Home'
+        string_2 = '<<Back to Main>>'
-    it 'strips out carats to mark text to not translate' do
-      string_1 = 'Back to Home'
-      string_2 = '<<Back to Main>>'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(64)
-    end
+      it 'strips out punctuation' do
+        string_1 = 'Back to Home!?'
+        string_2 = 'Back to Main...'
-    it 'strips out punctuation' do
-      string_1 = 'Back to Home!?'
-      string_2 = 'Back to Main...'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(64)
-    end
+      it 'converts extended whitespace to one space' do
+        string_1 = 'Back to Home'
+        string_2 = 'Back to     Main'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
+      end
-    it 'converts extended whitespace to one space' do
-      string_1 = 'Back to Home'
-      string_2 = 'Back to     Main'
+      it 'strips beginning and trailing whitespace' do
+        string_1 = 'Back to Home '
+        string_2 = ' Back to Main'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(64)
     end
-    it 'strips beginning and trailing whitespace' do
-      string_1 = 'Back to Home '
-      string_2 = ' Back to Main'
+    describe 'phrases with numbers' do
+      it 'does not raise FloatDomainError' do
+        string_1 = '1 '
+        string_2 = '< 1'
+        expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
+      end
+      it 'compares strings that have a number within them' do
+        string_1 = 'I work at 1035 Pearl St.'
+        string_2 = 'I work at 1045 Pearl St.'
+        expect(StringSimilarity.ngram_score(string_1, string_2) > 90).to eq(true)
+      end
-      expect(StringSimilarity.score(string_1, string_2)).to eq(64)
     end
   end
-  describe 'phrases with numbers' do
+  describe 'bigram_score' do
-    it 'does not raise FloatDomainError' do
-      string_1 = '1 '
-      string_2 = '< 1'
+    it 'returns a score for a japanese phrase' do
+      # hello world
+      string_1 = 'こんにちは世界'
+      # hello earth
+      string_2 = 'こんにちは地球'
-      expect(StringSimilarity.score(string_1, string_2)).to eq(0)
+      expect(StringSimilarity.bigram_score(string_1, string_2) > 65).to eq(true)
     end
-    it 'compares strings that have a number within them' do
-      string_1 = 'I work at 1035 Pearl St.'
-      string_2 = 'I work at 1045 Pearl St.'
+    it 'returns a score for a simplified chinese phrase' do
+      # hello world
+      string_1 = '你好世界'
+      # hello earth
+      string_2 = '你好地球'
+      expect(StringSimilarity.bigram_score(string_1, string_2) > 30).to eq(true)
+    end
+    it 'returns a slightly higher score than the ngram score for the same strings' do
+      string_1 = 'A taxi alternative in Prague'
+      string_2 = 'A taxi alternative in Munich'
-      expect(StringSimilarity.score(string_1, string_2) > 90).to eq(true)
+      expect(StringSimilarity.bigram_score(string_1, string_2)).to eq(78)
     end
   end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'string_similarity'
 require 'rspec'
+require 'pry'
 RSpec.configure do |config|
   config.expect_with :rspec do |expectations|
@@ -20,8 +21,6 @@ RSpec.configure do |config|
     mocks.verify_partial_doubles = true
   end
-  config.warnings = true
   # Many RSpec users commonly either run the entire suite or an individual
   # file, and it's useful to allow more verbose output when running an
   # individual spec file.

data/string_similarity.gemspec CHANGED Viewed

@@ -21,4 +21,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bundler", "~> 1.6"
   spec.add_development_dependency "rake"
   spec.add_development_dependency "rspec", "~> 3.3.0"
+  spec.add_development_dependency "pry", "~> 0.10.1"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: string_similarity
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 1.0.0
 platform: ruby
 authors:
 - Nathanael Burt
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08-06 00:00:00.000000000 Z
+date: 2015-08-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -52,6 +52,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 3.3.0
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.10.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.10.1
 description: ''
 email:
 - nathanael.burt@gmail.com
@@ -62,6 +76,7 @@ files:
 - ".gitignore"
 - ".rspec"
 - ".ruby-version"
+- CHANGELOG.md
 - Gemfile
 - LICENSE.txt
 - README.md