string_similarity 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f57c49e8e84c841cbe135db15f28e25f62f9097d
4
- data.tar.gz: a85ee9d70fa101ce803cb98689c3a2c7ffc46766
3
+ metadata.gz: 802acbb7fe320b6fe87fd865c7da83b234a1429c
4
+ data.tar.gz: e8c74c00831ad99587e4c5514cccfea67ad4462e
5
5
  SHA512:
6
- metadata.gz: 5dcb000ba65f93a1a23b6aae169ba418fe95a46a92b3fe3772d220f4d78987a58dfae617bd3e6c380cac6c26cc4cc9fc2bd81aa34da968d456da0b8eddf719ca
7
- data.tar.gz: 8e35b84d3cdb7e4326f06f2ed2dc651067e1854639fc757a7c49cee5ffd1e542eb52df6815ece8dc6ad2e4ae715a92929c3baf68bf0a82db7f64ce254a4db66a
6
+ metadata.gz: cd3e23c7a0acb53400f31df84f67c8d47ff4435c74dde0c17af76730dcf5b6890e70caff67c925dcfe02242a257a619d78b8d49db1147682ae74fef05e3f2abb
7
+ data.tar.gz: 06d566f18d1e8ce43dfce31ae8e9111c3644e11b1365d5e8f2c804345310d0b7f131bff848cf110275a02df8ff9368c88640a9a4b18449a83506f61141a1f53d
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 1.0.0
2
+
3
+ * deprecate `score` method in favor of `bigram_score` and `ngram_score`
data/README.md CHANGED
@@ -18,10 +18,21 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
+ StringSimilarity has two methods, `bigram_score` and `ngram_score`.
22
+
23
+ `bigram_score` is most useful for strings that do not have spaces in them, e.g. Japanese or Simplified Chinese text.
24
+
25
+ `ngram_score` is most useful for all other languages.
26
+
21
27
  ```ruby
22
28
 
23
- StringSimilarity.score('hello', 'hello')
24
- #=> 100
29
+ string_1 = 'Translation company in Boulder'
30
+ string_2 = 'Translation company in New York'
31
+
32
+ StringSimilarity.bigram_score(string_1, string_2)
33
+ #=> 73
34
+ StringSimilarity.ngram_score(string_1, string_2)
35
+ #=> 74
25
36
 
26
37
  ```
27
38
 
@@ -2,21 +2,29 @@ require "string_similarity/version"
2
2
 
3
3
  class StringSimilarity
4
4
 
5
- MIN_BIGRAM_LENGTH = 2
6
- VERIFIED_BONUS = 5
5
+ MIN_NGRAM_LENGTH = 2
7
6
 
8
- def self.score(string_1, string_2)
7
+ def self.ngram_score(string_1, string_2)
8
+ ngram_length = (string_1.length.to_f * 0.1).round
9
+ ngram_length = [MIN_NGRAM_LENGTH, ngram_length].max
10
+ score(string_1, string_2, ngram_length)
11
+ end
12
+
13
+ def self.bigram_score(string_1, string_2)
14
+ score(string_1, string_2, MIN_NGRAM_LENGTH)
15
+ end
16
+
17
+ private
18
+
19
+ def self.score(string_1, string_2, ngram_length)
9
20
  string_1 = String.new(string_1)
10
21
  string_2 = String.new(string_2)
11
22
 
12
23
  cleaned_string_1 = remove_special_characters(string_1)
13
24
  cleaned_string_2 = remove_special_characters(string_2)
14
25
 
15
- bigram_length = (string_1.length.to_f * 0.1).round
16
- bigram_length = [MIN_BIGRAM_LENGTH, bigram_length].max
17
-
18
- string_1_ngrams = cleaned_string_1.each_char.each_cons(bigram_length).to_set
19
- string_2_ngrams = cleaned_string_2.each_char.each_cons(bigram_length).to_set
26
+ string_1_ngrams = cleaned_string_1.each_char.each_cons(ngram_length).to_set
27
+ string_2_ngrams = cleaned_string_2.each_char.each_cons(ngram_length).to_set
20
28
 
21
29
  overlap = (string_1_ngrams & string_2_ngrams).size
22
30
  total = string_1_ngrams.size + string_2_ngrams.size
@@ -28,8 +36,6 @@ class StringSimilarity
28
36
  normalize_score(score, string_1, string_2)
29
37
  end
30
38
 
31
- private
32
-
33
39
  def self.remove_special_characters(phrase)
34
40
  phrase.gsub!(/[[:punct:]<>]+/, "") # remove all punctuation
35
41
  phrase.strip! # remove trailing and leading white space
@@ -1,3 +1,3 @@
1
1
  class StringSimilarity
2
- VERSION = "0.0.1"
2
+ VERSION = "1.0.0"
3
3
  end
@@ -2,142 +2,175 @@ require 'spec_helper'
2
2
 
3
3
  describe StringSimilarity do
4
4
 
5
- describe 'short phrases' do
5
+ describe 'ngram_score' do
6
6
 
7
- it 'returns 100 if there is no difference' do
8
- string = 'hello'
9
- expect(StringSimilarity.score(string, string)).to eq(100)
10
- end
7
+ describe 'short phrases' do
11
8
 
12
- it 'returns 99 if the original is lowercase and found is uppercase' do
13
- string_1 = 'test'
14
- string_2 = 'TEST'
9
+ it 'returns 100 if there is no difference' do
10
+ string = 'hello'
11
+ expect(StringSimilarity.ngram_score(string, string)).to eq(100)
12
+ end
15
13
 
16
- expect(StringSimilarity.score(string_1, string_2)).to eq(99)
17
- end
14
+ it 'returns 99 if the original is lowercase and found is uppercase' do
15
+ string_1 = 'test'
16
+ string_2 = 'TEST'
18
17
 
19
- it 'returns 99 when found is uppercase and original is capitalized' do
20
- string_1 = 'Email'
21
- string_2 = 'EMAIL'
18
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
19
+ end
22
20
 
23
- expect(StringSimilarity.score(string_1, string_2)).to eq(99)
24
- end
21
+ it 'returns 99 when found is uppercase and original is capitalized' do
22
+ string_1 = 'Email'
23
+ string_2 = 'EMAIL'
25
24
 
26
- it 'returns 99 if original is uppercase and found is capitalized' do
27
- string_1 = 'EMAIL'
28
- string_2 = 'Email'
25
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
26
+ end
29
27
 
30
- expect(StringSimilarity.score(string_1, string_2)).to eq(99)
31
- end
28
+ it 'returns 99 if original is uppercase and found is capitalized' do
29
+ string_1 = 'EMAIL'
30
+ string_2 = 'Email'
32
31
 
33
- it 'returns 99 if original is uppercase and found is lowercase' do
34
- string_1 = 'EMAIL'
35
- string_2 = 'email'
32
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
33
+ end
36
34
 
37
- expect(StringSimilarity.score(string_1, string_2)).to eq(99)
38
- end
35
+ it 'returns 99 if original is uppercase and found is lowercase' do
36
+ string_1 = 'EMAIL'
37
+ string_2 = 'email'
39
38
 
40
- it 'returns 86' do
41
- string_1 = 'test'
42
- string_2 = 'tests'
39
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
40
+ end
43
41
 
44
- expect(StringSimilarity.score(string_1, string_2)).to eq(86)
45
- end
42
+ it 'returns 86' do
43
+ string_1 = 'test'
44
+ string_2 = 'tests'
46
45
 
47
- it 'returns an 86 with capitalization differences' do
48
- string_1 = 'test'
49
- string_2 = 'Tests'
46
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(86)
47
+ end
50
48
 
51
- expect(StringSimilarity.score(string_1, string_2)).to eq(86)
52
- end
49
+ it 'returns an 86 with capitalization differences' do
50
+ string_1 = 'test'
51
+ string_2 = 'Tests'
52
+
53
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(86)
54
+ end
53
55
 
54
- it 'returns 6' do
55
- string_1 = 'test'
56
- string_2 = 'no way is this like the original string'
56
+ it 'returns 6' do
57
+ string_1 = 'test'
58
+ string_2 = 'no way is this like the original string'
59
+
60
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(6)
61
+ end
57
62
 
58
- expect(StringSimilarity.score(string_1, string_2)).to eq(6)
59
63
  end
60
64
 
61
- end
65
+ describe 'long phrases' do
62
66
 
63
- describe 'long phrases' do
67
+ it 'returns 0 when strings are of much different length' do
68
+ string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
69
+ string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.

Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
64
70
 
65
- it 'returns 0 when strings are of much different length' do
66
- string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
67
- string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.

Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
71
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
72
+ end
68
73
 
69
- expect(StringSimilarity.score(string_1, string_2)).to eq(0)
70
- end
74
+ it 'returns 0 when the phrase is too long to come up with an accurate percentage' do
75
+ string_1 = 'A variety of shopping centers housing popular merchants are just a few miles from the oceanfront, offering varied dining options, large retailers and a number of specialty stores. From classic fashions at Talbots to gourmet foods and cookware at Williams-Sonoma, your refined tastes are sure to be rewarded.'
76
+ string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
71
77
 
72
- it 'returns 0 when the phrase is too long to come up with an accurate percentage' do
73
- string_1 = 'A variety of shopping centers housing popular merchants are just a few miles from the oceanfront, offering varied dining options, large retailers and a number of specialty stores. From classic fashions at Talbots to gourmet foods and cookware at Williams-Sonoma, your refined tastes are sure to be rewarded.'
74
- string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
78
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
79
+ end
75
80
 
76
- expect(StringSimilarity.score(string_1, string_2)).to eq(0)
77
- end
81
+ it 'returns 0 when the phrases dont match enough' do
82
+ string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
83
+ string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
78
84
 
79
- it 'returns 0 when the phrases dont match enough' do
80
- string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
81
- string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
85
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
86
+ end
82
87
 
83
- expect(StringSimilarity.score(string_1, string_2)).to eq(0)
84
- end
88
+ it 'returns a higher match when the phrases are close' do
89
+ string_1 = 'A taxi alternative in Prague'
90
+ string_2 = 'A taxi alternative in Munich'
85
91
 
86
- it 'returns a higher match when the phrases are close' do
87
- string_1 = 'A taxi alternative in Prague'
88
- string_2 = 'A taxi alternative in Munich'
92
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(77)
93
+ end
89
94
 
90
- expect(StringSimilarity.score(string_1, string_2) > 75).to eq(true)
91
95
  end
92
96
 
93
- end
97
+ describe 'special characters' do
94
98
 
95
- describe 'special characters' do
99
+ it 'strips out carats to mark text to not translate' do
100
+ string_1 = 'Back to Home'
101
+ string_2 = '<<Back to Main>>'
96
102
 
97
- it 'strips out carats to mark text to not translate' do
98
- string_1 = 'Back to Home'
99
- string_2 = '<<Back to Main>>'
103
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
104
+ end
100
105
 
101
- expect(StringSimilarity.score(string_1, string_2)).to eq(64)
102
- end
106
+ it 'strips out punctuation' do
107
+ string_1 = 'Back to Home!?'
108
+ string_2 = 'Back to Main...'
103
109
 
104
- it 'strips out punctuation' do
105
- string_1 = 'Back to Home!?'
106
- string_2 = 'Back to Main...'
110
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
111
+ end
107
112
 
108
- expect(StringSimilarity.score(string_1, string_2)).to eq(64)
109
- end
113
+ it 'converts extended whitespace to one space' do
114
+ string_1 = 'Back to Home'
115
+ string_2 = 'Back to Main'
116
+
117
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
118
+ end
110
119
 
111
- it 'converts extended whitespace to one space' do
112
- string_1 = 'Back to Home'
113
- string_2 = 'Back to Main'
120
+ it 'strips beginning and trailing whitespace' do
121
+ string_1 = 'Back to Home '
122
+ string_2 = ' Back to Main'
123
+
124
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
125
+ end
114
126
 
115
- expect(StringSimilarity.score(string_1, string_2)).to eq(64)
116
127
  end
117
128
 
118
- it 'strips beginning and trailing whitespace' do
119
- string_1 = 'Back to Home '
120
- string_2 = ' Back to Main'
129
+ describe 'phrases with numbers' do
130
+
131
+ it 'does not raise FloatDomainError' do
132
+ string_1 = '1 '
133
+ string_2 = '< 1'
134
+
135
+ expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
136
+ end
137
+
138
+ it 'compares strings that have a number within them' do
139
+ string_1 = 'I work at 1035 Pearl St.'
140
+ string_2 = 'I work at 1045 Pearl St.'
141
+
142
+ expect(StringSimilarity.ngram_score(string_1, string_2) > 90).to eq(true)
143
+ end
121
144
 
122
- expect(StringSimilarity.score(string_1, string_2)).to eq(64)
123
145
  end
124
146
 
125
147
  end
126
148
 
127
- describe 'phrases with numbers' do
149
+ describe 'bigram_score' do
128
150
 
129
- it 'does not raise FloatDomainError' do
130
- string_1 = '1 '
131
- string_2 = '< 1'
151
+ it 'returns a score for a japanese phrase' do
152
+ # hello world
153
+ string_1 = 'こんにちは世界'
154
+ # hello earth
155
+ string_2 = 'こんにちは地球'
132
156
 
133
- expect(StringSimilarity.score(string_1, string_2)).to eq(0)
157
+ expect(StringSimilarity.bigram_score(string_1, string_2) > 65).to eq(true)
134
158
  end
135
159
 
136
- it 'compares strings that have a number within them' do
137
- string_1 = 'I work at 1035 Pearl St.'
138
- string_2 = 'I work at 1045 Pearl St.'
160
+ it 'returns a score for a simplified chinese phrase' do
161
+ # hello world
162
+ string_1 = '你好世界'
163
+ # hello earth
164
+ string_2 = '你好地球'
165
+
166
+ expect(StringSimilarity.bigram_score(string_1, string_2) > 30).to eq(true)
167
+ end
168
+
169
+ it 'returns a slightly higher score than the ngram score for the same strings' do
170
+ string_1 = 'A taxi alternative in Prague'
171
+ string_2 = 'A taxi alternative in Munich'
139
172
 
140
- expect(StringSimilarity.score(string_1, string_2) > 90).to eq(true)
173
+ expect(StringSimilarity.bigram_score(string_1, string_2)).to eq(78)
141
174
  end
142
175
 
143
176
  end
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'string_similarity'
2
2
  require 'rspec'
3
+ require 'pry'
3
4
 
4
5
  RSpec.configure do |config|
5
6
  config.expect_with :rspec do |expectations|
@@ -20,8 +21,6 @@ RSpec.configure do |config|
20
21
  mocks.verify_partial_doubles = true
21
22
  end
22
23
 
23
- config.warnings = true
24
-
25
24
  # Many RSpec users commonly either run the entire suite or an individual
26
25
  # file, and it's useful to allow more verbose output when running an
27
26
  # individual spec file.
@@ -21,4 +21,5 @@ Gem::Specification.new do |spec|
21
21
  spec.add_development_dependency "bundler", "~> 1.6"
22
22
  spec.add_development_dependency "rake"
23
23
  spec.add_development_dependency "rspec", "~> 3.3.0"
24
+ spec.add_development_dependency "pry", "~> 0.10.1"
24
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathanael Burt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-06 00:00:00.000000000 Z
11
+ date: 2015-08-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 3.3.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.10.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.10.1
55
69
  description: ''
56
70
  email:
57
71
  - nathanael.burt@gmail.com
@@ -62,6 +76,7 @@ files:
62
76
  - ".gitignore"
63
77
  - ".rspec"
64
78
  - ".ruby-version"
79
+ - CHANGELOG.md
65
80
  - Gemfile
66
81
  - LICENSE.txt
67
82
  - README.md