string_similarity 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/README.md +13 -2
- data/lib/string_similarity.rb +16 -10
- data/lib/string_similarity/version.rb +1 -1
- data/spec/lib/string_similarity_spec.rb +122 -89
- data/spec/spec_helper.rb +1 -2
- data/string_similarity.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 802acbb7fe320b6fe87fd865c7da83b234a1429c
|
4
|
+
data.tar.gz: e8c74c00831ad99587e4c5514cccfea67ad4462e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd3e23c7a0acb53400f31df84f67c8d47ff4435c74dde0c17af76730dcf5b6890e70caff67c925dcfe02242a257a619d78b8d49db1147682ae74fef05e3f2abb
|
7
|
+
data.tar.gz: 06d566f18d1e8ce43dfce31ae8e9111c3644e11b1365d5e8f2c804345310d0b7f131bff848cf110275a02df8ff9368c88640a9a4b18449a83506f61141a1f53d
|
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -18,10 +18,21 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
+
StringSimilarity has two methods, `bigram_score` and `ngram_score`.
|
22
|
+
|
23
|
+
`bigram_score` is most useful for strings that do not have spaces in them, e.g. Japanese or Simplified Chinese text.
|
24
|
+
|
25
|
+
`ngram_score` is most useful for all other languages.
|
26
|
+
|
21
27
|
```ruby
|
22
28
|
|
23
|
-
|
24
|
-
|
29
|
+
string_1 = 'Translation company in Boulder'
|
30
|
+
string_2 = 'Translation company in New York'
|
31
|
+
|
32
|
+
StringSimilarity.bigram_score(string_1, string_2)
|
33
|
+
#=> 73
|
34
|
+
StringSimilarity.ngram_score(string_1, string_2)
|
35
|
+
#=> 74
|
25
36
|
|
26
37
|
```
|
27
38
|
|
data/lib/string_similarity.rb
CHANGED
@@ -2,21 +2,29 @@ require "string_similarity/version"
|
|
2
2
|
|
3
3
|
class StringSimilarity
|
4
4
|
|
5
|
-
|
6
|
-
VERIFIED_BONUS = 5
|
5
|
+
MIN_NGRAM_LENGTH = 2
|
7
6
|
|
8
|
-
def self.
|
7
|
+
def self.ngram_score(string_1, string_2)
|
8
|
+
ngram_length = (string_1.length.to_f * 0.1).round
|
9
|
+
ngram_length = [MIN_NGRAM_LENGTH, ngram_length].max
|
10
|
+
score(string_1, string_2, ngram_length)
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.bigram_score(string_1, string_2)
|
14
|
+
score(string_1, string_2, MIN_NGRAM_LENGTH)
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def self.score(string_1, string_2, ngram_length)
|
9
20
|
string_1 = String.new(string_1)
|
10
21
|
string_2 = String.new(string_2)
|
11
22
|
|
12
23
|
cleaned_string_1 = remove_special_characters(string_1)
|
13
24
|
cleaned_string_2 = remove_special_characters(string_2)
|
14
25
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
string_1_ngrams = cleaned_string_1.each_char.each_cons(bigram_length).to_set
|
19
|
-
string_2_ngrams = cleaned_string_2.each_char.each_cons(bigram_length).to_set
|
26
|
+
string_1_ngrams = cleaned_string_1.each_char.each_cons(ngram_length).to_set
|
27
|
+
string_2_ngrams = cleaned_string_2.each_char.each_cons(ngram_length).to_set
|
20
28
|
|
21
29
|
overlap = (string_1_ngrams & string_2_ngrams).size
|
22
30
|
total = string_1_ngrams.size + string_2_ngrams.size
|
@@ -28,8 +36,6 @@ class StringSimilarity
|
|
28
36
|
normalize_score(score, string_1, string_2)
|
29
37
|
end
|
30
38
|
|
31
|
-
private
|
32
|
-
|
33
39
|
def self.remove_special_characters(phrase)
|
34
40
|
phrase.gsub!(/[[:punct:]<>]+/, "") # remove all punctuation
|
35
41
|
phrase.strip! # remove trailing and leading white space
|
data/spec/lib/string_similarity_spec.rb
CHANGED
@@ -2,142 +2,175 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe StringSimilarity do
|
4
4
|
|
5
|
-
describe '
|
5
|
+
describe 'ngram_score' do
|
6
6
|
|
7
|
-
|
8
|
-
string = 'hello'
|
9
|
-
expect(StringSimilarity.score(string, string)).to eq(100)
|
10
|
-
end
|
7
|
+
describe 'short phrases' do
|
11
8
|
|
12
|
-
|
13
|
-
|
14
|
-
|
9
|
+
it 'returns 100 if there is no difference' do
|
10
|
+
string = 'hello'
|
11
|
+
expect(StringSimilarity.ngram_score(string, string)).to eq(100)
|
12
|
+
end
|
15
13
|
|
16
|
-
|
17
|
-
|
14
|
+
it 'returns 99 if the original is lowercase and found is uppercase' do
|
15
|
+
string_1 = 'test'
|
16
|
+
string_2 = 'TEST'
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
string_2 = 'EMAIL'
|
18
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
|
19
|
+
end
|
22
20
|
|
23
|
-
|
24
|
-
|
21
|
+
it 'returns 99 when found is uppercase and original is capitalized' do
|
22
|
+
string_1 = 'Email'
|
23
|
+
string_2 = 'EMAIL'
|
25
24
|
|
26
|
-
|
27
|
-
|
28
|
-
string_2 = 'Email'
|
25
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
|
26
|
+
end
|
29
27
|
|
30
|
-
|
31
|
-
|
28
|
+
it 'returns 99 if original is uppercase and found is capitalized' do
|
29
|
+
string_1 = 'EMAIL'
|
30
|
+
string_2 = 'Email'
|
32
31
|
|
33
|
-
|
34
|
-
|
35
|
-
string_2 = 'email'
|
32
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
|
33
|
+
end
|
36
34
|
|
37
|
-
|
38
|
-
|
35
|
+
it 'returns 99 if original is uppercase and found is lowercase' do
|
36
|
+
string_1 = 'EMAIL'
|
37
|
+
string_2 = 'email'
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
string_2 = 'tests'
|
39
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(99)
|
40
|
+
end
|
43
41
|
|
44
|
-
|
45
|
-
|
42
|
+
it 'returns 86' do
|
43
|
+
string_1 = 'test'
|
44
|
+
string_2 = 'tests'
|
46
45
|
|
47
|
-
|
48
|
-
|
49
|
-
string_2 = 'Tests'
|
46
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(86)
|
47
|
+
end
|
50
48
|
|
51
|
-
|
52
|
-
|
49
|
+
it 'returns an 86 with capitalization differences' do
|
50
|
+
string_1 = 'test'
|
51
|
+
string_2 = 'Tests'
|
52
|
+
|
53
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(86)
|
54
|
+
end
|
53
55
|
|
54
|
-
|
55
|
-
|
56
|
-
|
56
|
+
it 'returns 6' do
|
57
|
+
string_1 = 'test'
|
58
|
+
string_2 = 'no way is this like the original string'
|
59
|
+
|
60
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(6)
|
61
|
+
end
|
57
62
|
|
58
|
-
expect(StringSimilarity.score(string_1, string_2)).to eq(6)
|
59
63
|
end
|
60
64
|
|
61
|
-
|
65
|
+
describe 'long phrases' do
|
62
66
|
|
63
|
-
|
67
|
+
it 'returns 0 when strings are of much different length' do
|
68
|
+
string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
|
69
|
+
string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.
Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
|
64
70
|
|
65
|
-
|
66
|
-
|
67
|
-
string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.
Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
|
71
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
|
72
|
+
end
|
68
73
|
|
69
|
-
|
70
|
-
|
74
|
+
it 'returns 0 when the phrase is too long to come up with an accurate percentage' do
|
75
|
+
string_1 = 'A variety of shopping centers housing popular merchants are just a few miles from the oceanfront, offering varied dining options, large retailers and a number of specialty stores. From classic fashions at Talbots to gourmet foods and cookware at Williams-Sonoma, your refined tastes are sure to be rewarded.'
|
76
|
+
string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
|
71
77
|
|
72
|
-
|
73
|
-
|
74
|
-
string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
|
78
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
|
79
|
+
end
|
75
80
|
|
76
|
-
|
77
|
-
|
81
|
+
it 'returns 0 when the phrases dont match enough' do
|
82
|
+
string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
|
83
|
+
string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
|
78
84
|
|
79
|
-
|
80
|
-
|
81
|
-
string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
|
85
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
|
86
|
+
end
|
82
87
|
|
83
|
-
|
84
|
-
|
88
|
+
it 'returns a higher match when the phrases are close' do
|
89
|
+
string_1 = 'A taxi alternative in Prague'
|
90
|
+
string_2 = 'A taxi alternative in Munich'
|
85
91
|
|
86
|
-
|
87
|
-
|
88
|
-
string_2 = 'A taxi alternative in Munich'
|
92
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(77)
|
93
|
+
end
|
89
94
|
|
90
|
-
expect(StringSimilarity.score(string_1, string_2) > 75).to eq(true)
|
91
95
|
end
|
92
96
|
|
93
|
-
|
97
|
+
describe 'special characters' do
|
94
98
|
|
95
|
-
|
99
|
+
it 'strips out carats to mark text to not translate' do
|
100
|
+
string_1 = 'Back to Home'
|
101
|
+
string_2 = '<<Back to Main>>'
|
96
102
|
|
97
|
-
|
98
|
-
|
99
|
-
string_2 = '<<Back to Main>>'
|
103
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
|
104
|
+
end
|
100
105
|
|
101
|
-
|
102
|
-
|
106
|
+
it 'strips out punctuation' do
|
107
|
+
string_1 = 'Back to Home!?'
|
108
|
+
string_2 = 'Back to Main...'
|
103
109
|
|
104
|
-
|
105
|
-
|
106
|
-
string_2 = 'Back to Main...'
|
110
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
|
111
|
+
end
|
107
112
|
|
108
|
-
|
109
|
-
|
113
|
+
it 'converts extended whitespace to one space' do
|
114
|
+
string_1 = 'Back to Home'
|
115
|
+
string_2 = 'Back to Main'
|
116
|
+
|
117
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
|
118
|
+
end
|
110
119
|
|
111
|
-
|
112
|
-
|
113
|
-
|
120
|
+
it 'strips beginning and trailing whitespace' do
|
121
|
+
string_1 = 'Back to Home '
|
122
|
+
string_2 = ' Back to Main'
|
123
|
+
|
124
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(64)
|
125
|
+
end
|
114
126
|
|
115
|
-
expect(StringSimilarity.score(string_1, string_2)).to eq(64)
|
116
127
|
end
|
117
128
|
|
118
|
-
|
119
|
-
|
120
|
-
|
129
|
+
describe 'phrases with numbers' do
|
130
|
+
|
131
|
+
it 'does not raise FloatDomainError' do
|
132
|
+
string_1 = '1 '
|
133
|
+
string_2 = '< 1'
|
134
|
+
|
135
|
+
expect(StringSimilarity.ngram_score(string_1, string_2)).to eq(0)
|
136
|
+
end
|
137
|
+
|
138
|
+
it 'compares strings that have a number within them' do
|
139
|
+
string_1 = 'I work at 1035 Pearl St.'
|
140
|
+
string_2 = 'I work at 1045 Pearl St.'
|
141
|
+
|
142
|
+
expect(StringSimilarity.ngram_score(string_1, string_2) > 90).to eq(true)
|
143
|
+
end
|
121
144
|
|
122
|
-
expect(StringSimilarity.score(string_1, string_2)).to eq(64)
|
123
145
|
end
|
124
146
|
|
125
147
|
end
|
126
148
|
|
127
|
-
describe '
|
149
|
+
describe 'bigram_score' do
|
128
150
|
|
129
|
-
it '
|
130
|
-
|
131
|
-
|
151
|
+
it 'returns a score for a japanese phrase' do
|
152
|
+
# hello world
|
153
|
+
string_1 = 'こんにちは世界'
|
154
|
+
# hello earth
|
155
|
+
string_2 = 'こんにちは地球'
|
132
156
|
|
133
|
-
expect(StringSimilarity.
|
157
|
+
expect(StringSimilarity.bigram_score(string_1, string_2) > 65).to eq(true)
|
134
158
|
end
|
135
159
|
|
136
|
-
it '
|
137
|
-
|
138
|
-
|
160
|
+
it 'returns a score for a simplified chinese phrase' do
|
161
|
+
# hello world
|
162
|
+
string_1 = '你好世界'
|
163
|
+
# hello earth
|
164
|
+
string_2 = '你好地球'
|
165
|
+
|
166
|
+
expect(StringSimilarity.bigram_score(string_1, string_2) > 30).to eq(true)
|
167
|
+
end
|
168
|
+
|
169
|
+
it 'returns a slightly higher score than the ngram score for the same strings' do
|
170
|
+
string_1 = 'A taxi alternative in Prague'
|
171
|
+
string_2 = 'A taxi alternative in Munich'
|
139
172
|
|
140
|
-
expect(StringSimilarity.
|
173
|
+
expect(StringSimilarity.bigram_score(string_1, string_2)).to eq(78)
|
141
174
|
end
|
142
175
|
|
143
176
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'string_similarity'
|
2
2
|
require 'rspec'
|
3
|
+
require 'pry'
|
3
4
|
|
4
5
|
RSpec.configure do |config|
|
5
6
|
config.expect_with :rspec do |expectations|
|
@@ -20,8 +21,6 @@ RSpec.configure do |config|
|
|
20
21
|
mocks.verify_partial_doubles = true
|
21
22
|
end
|
22
23
|
|
23
|
-
config.warnings = true
|
24
|
-
|
25
24
|
# Many RSpec users commonly either run the entire suite or an individual
|
26
25
|
# file, and it's useful to allow more verbose output when running an
|
27
26
|
# individual spec file.
|
data/string_similarity.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathanael Burt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 3.3.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.10.1
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.10.1
|
55
69
|
description: ''
|
56
70
|
email:
|
57
71
|
- nathanael.burt@gmail.com
|
@@ -62,6 +76,7 @@ files:
|
|
62
76
|
- ".gitignore"
|
63
77
|
- ".rspec"
|
64
78
|
- ".ruby-version"
|
79
|
+
- CHANGELOG.md
|
65
80
|
- Gemfile
|
66
81
|
- LICENSE.txt
|
67
82
|
- README.md
|