string_similarity 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f57c49e8e84c841cbe135db15f28e25f62f9097d
4
+ data.tar.gz: a85ee9d70fa101ce803cb98689c3a2c7ffc46766
5
+ SHA512:
6
+ metadata.gz: 5dcb000ba65f93a1a23b6aae169ba418fe95a46a92b3fe3772d220f4d78987a58dfae617bd3e6c380cac6c26cc4cc9fc2bd81aa34da968d456da0b8eddf719ca
7
+ data.tar.gz: 8e35b84d3cdb7e4326f06f2ed2dc651067e1854639fc757a7c49cee5ffd1e542eb52df6815ece8dc6ad2e4ae715a92929c3baf68bf0a82db7f64ce254a4db66a
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.2.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in string_similarity.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Nathanael Burt
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # StringSimilarity
2
+
3
+ StringSimilarity provides a very simple API for comparing two strings and returning their similarity as a percentage. It is based on the Sørensen–Dice coefficient: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'string_similarity'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install string_similarity
18
+
19
+ ## Usage
20
+
21
+ ```ruby
22
+
23
+ StringSimilarity.score('hello', 'hello')
24
+ #=> 100
25
+
26
+ ```
27
+
28
+ ## Contributing
29
+
30
+ 1. Fork it ( https://github.com/[my-github-username]/string_similarity/fork )
31
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
32
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
33
+ 4. Push to the branch (`git push origin my-new-feature`)
34
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,51 @@
1
+ require "string_similarity/version"
2
+
3
# Compares two strings and reports how alike they are as a whole-number
# percentage derived from the Sorensen-Dice coefficient over character n-grams.
class StringSimilarity
  # Lower bound for the n-gram size, whatever the length of the first string.
  MIN_BIGRAM_LENGTH = 2
  # Not referenced by any code in this class; kept so that existing references
  # to the constant continue to resolve.
  VERIFIED_BONUS = 5

  # Scores the similarity of two strings.
  #
  # Both inputs are copied (the caller's strings are never modified), stripped
  # of punctuation and of leading/trailing/repeated whitespace, and compared
  # case-insensitively as collections of distinct character n-grams. The
  # n-gram size is 10% of the cleaned length of +string_1+ (rounded, never
  # below MIN_BIGRAM_LENGTH), so the result depends on argument order.
  #
  # @param string_1 [String] the reference ("original") text
  # @param string_2 [String] the text compared against it ("found")
  # @return [Integer] 0 when the two texts share no n-gram (which includes the
  #   case where a cleaned text is shorter than the n-gram size); otherwise
  #   the rounded percentage, reduced by 1 when the cleaned texts differ only
  #   in letter case
  # @raise [TypeError] if an argument is not accepted by String.new
  def self.score(string_1, string_2)
    # Private copies: remove_special_characters cleans its argument in place.
    original = String.new(string_1)
    found = String.new(string_2)

    folded_original = remove_special_characters(original)
    folded_found = remove_special_characters(found)
    # From here on `original` and `found` hold the cleaned, case-preserved
    # text, while the `folded_*` values are their lower-cased counterparts.

    ngram_length = (original.length.to_f * 0.1).round
    ngram_length = [MIN_BIGRAM_LENGTH, ngram_length].max

    original_ngrams = ngrams(folded_original, ngram_length)
    found_ngrams = ngrams(folded_found, ngram_length)

    overlap = (original_ngrams & found_ngrams).size
    total = original_ngrams.size + found_ngrams.size
    # Guarding here also avoids 0 / 0 (NaN), whose #round would raise
    # FloatDomainError.
    return 0 unless overlap > 0 && total > 0

    sorensen_dice = overlap * 2.0 / total

    score = (sorensen_dice * 100).round
    normalize_score(score, original, found)
  end

  # NOTE: the helpers below were written under a bare `private`, which has no
  # effect on methods defined with `def self.`. They therefore have always
  # been publicly callable and are left that way for backward compatibility;
  # treat them as internal.

  # Cleans +phrase+ IN PLACE (punctuation and angle brackets removed,
  # surrounding whitespace stripped, whitespace runs collapsed to one space)
  # and returns a new lower-cased copy of the cleaned text.
  #
  # @param phrase [String] an unfrozen string; it is modified
  # @return [String] lower-cased copy of the cleaned phrase
  def self.remove_special_characters(phrase)
    phrase.gsub!(/[[:punct:]<>]+/, "") # remove all punctuation
    phrase.strip!                      # remove trailing and leading white space
    phrase.gsub!(/\s+/, " ")           # replace extended white space with one space
    phrase.downcase
  end

  # Tells whether two phrases are different strings that become equal once
  # letter case is ignored.
  #
  # Comparing the lower-cased forms also covers mixed-case pairs such as
  # "Email" / "eMail"; the two upcase comparisons are kept so that every pair
  # that was previously reported is still reported.
  #
  # @param original_phrase [String]
  # @param found_phrase [String]
  # @return [Boolean]
  def self.capitalization_differences(original_phrase, found_phrase)
    return false if found_phrase == original_phrase

    original_phrase.downcase == found_phrase.downcase ||
      original_phrase.upcase == found_phrase ||
      found_phrase.upcase == original_phrase
  end

  # Takes one point off +score+ when the phrases differ only in letter case,
  # so that a case-only difference never reports the same score as an exact
  # match.
  #
  # @param score [Integer]
  # @param original_phrase [String]
  # @param found_phrase [String]
  # @return [Integer]
  def self.normalize_score(score, original_phrase, found_phrase)
    capitalization_differences(original_phrase, found_phrase) ? score - 1 : score
  end

  # Lists the distinct runs of +length+ consecutive characters in +text+.
  # Plain arrays are used (Array#uniq here, Array#& at the call site) so the
  # class does not depend on the Set library being loaded.
  #
  # @param text [String]
  # @param length [Integer] n-gram size, at least 1
  # @return [Array<Array<String>>] distinct n-grams, each an array of characters
  def self.ngrams(text, length)
    text.each_char.each_cons(length).to_a.uniq
  end
  private_class_method :ngrams
end
@@ -0,0 +1,3 @@
1
class StringSimilarity
  # Release version of the gem. Frozen so the shared constant cannot be
  # altered in place at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,145 @@
1
+ require 'spec_helper'
2
+
3
+ describe StringSimilarity do
4
+
5
+ describe 'short phrases' do
6
+
7
+ it 'returns 100 if there is no difference' do
8
+ string = 'hello'
9
+ expect(StringSimilarity.score(string, string)).to eq(100)
10
+ end
11
+
12
+ it 'returns 99 if the original is lowercase and found is uppercase' do
13
+ string_1 = 'test'
14
+ string_2 = 'TEST'
15
+
16
+ expect(StringSimilarity.score(string_1, string_2)).to eq(99)
17
+ end
18
+
19
+ it 'returns 99 when found is uppercase and original is capitalized' do
20
+ string_1 = 'Email'
21
+ string_2 = 'EMAIL'
22
+
23
+ expect(StringSimilarity.score(string_1, string_2)).to eq(99)
24
+ end
25
+
26
+ it 'returns 99 if original is uppercase and found is capitalized' do
27
+ string_1 = 'EMAIL'
28
+ string_2 = 'Email'
29
+
30
+ expect(StringSimilarity.score(string_1, string_2)).to eq(99)
31
+ end
32
+
33
+ it 'returns 99 if original is uppercase and found is lowercase' do
34
+ string_1 = 'EMAIL'
35
+ string_2 = 'email'
36
+
37
+ expect(StringSimilarity.score(string_1, string_2)).to eq(99)
38
+ end
39
+
40
+ it 'returns 86' do
41
+ string_1 = 'test'
42
+ string_2 = 'tests'
43
+
44
+ expect(StringSimilarity.score(string_1, string_2)).to eq(86)
45
+ end
46
+
47
+ it 'returns an 86 with capitalization differences' do
48
+ string_1 = 'test'
49
+ string_2 = 'Tests'
50
+
51
+ expect(StringSimilarity.score(string_1, string_2)).to eq(86)
52
+ end
53
+
54
+ it 'returns 6' do
55
+ string_1 = 'test'
56
+ string_2 = 'no way is this like the original string'
57
+
58
+ expect(StringSimilarity.score(string_1, string_2)).to eq(6)
59
+ end
60
+
61
+ end
62
+
63
+ describe 'long phrases' do
64
+
65
+ it 'returns 0 when strings are of much different length' do
66
+ string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
67
+ string_2 = "INFORMATION WE COLLECT. Roomster collects information our users submit to us such as their name and e-mail address to allow us to identify users and notify them of changes or updates to our service. We also collect personal information submitted by our users in filling out their profile on the service such as gender, state of residence, occupation, interests, etc. We collect this personal information when our users: (a) sign up as a member; (b) make changes to their member profile information; and (c) send e-mail messages, forms, or other information to us via Roomster. Members may choose to provide additional information beyond their basic profile for their personal and professional profiles. Providing additional information beyond what is required at registration is entirely optional, but enables our users to better identify each other and more effectively connect and interact with their network.

Roomster also collects information from users that is unique, but cannot be linked to a specific individual, such as IP address and browser type. A user's session will be tracked, but each user will remain anonymous. We gather this information on all visitors to Roomster.com for systems administration purposes and to track user trends, such as our most popular features. We do not link IP addresses to any personally identifiable information."
68
+
69
+ expect(StringSimilarity.score(string_1, string_2)).to eq(0)
70
+ end
71
+
72
+ it 'returns 0 when the phrase is too long to come up with an accurate percentage' do
73
+ string_1 = 'A variety of shopping centers housing popular merchants are just a few miles from the oceanfront, offering varied dining options, large retailers and a number of specialty stores. From classic fashions at Talbots to gourmet foods and cookware at Williams-Sonoma, your refined tastes are sure to be rewarded.'
74
+ string_2 = "Nice is an attractive city situated on the well-reputed French Riviera – France’s Mediterranean coast. There are many beautiful beaches within driving distance as well as a varied array of shopping centers, historical attractions, and museums to satiate the appetite of the cultured traveler. Since it is one of France's largest cities, it is also a major business hub. With both tourists and business executives traveling to Nice, it pays for travelers to plan for their transportation needs in advance. To travel in comfort for an affordable cost, use Blacklane's limousine service for Nice."
75
+
76
+ expect(StringSimilarity.score(string_1, string_2)).to eq(0)
77
+ end
78
+
79
+ it 'returns 0 when the phrases dont match enough' do
80
+ string_1 = 'We are committed to providing the best products and friendliest customer service. If you should have any questions about ordering or a question about any of our great products, please feel free to contact us with the information provided below.'
81
+ string_2 = 'We strive to provide the best music experience for our listeners, and are passionate about providing the largest and broadest catalog of digital music.^^<br />^^'
82
+
83
+ expect(StringSimilarity.score(string_1, string_2)).to eq(0)
84
+ end
85
+
86
+ it 'returns a higher match when the phrases are close' do
87
+ string_1 = 'A taxi alternative in Prague'
88
+ string_2 = 'A taxi alternative in Munich'
89
+
90
+ expect(StringSimilarity.score(string_1, string_2) > 75).to eq(true)
91
+ end
92
+
93
+ end
94
+
95
+ describe 'special characters' do
96
+
97
+ it 'strips out carats to mark text to not translate' do
98
+ string_1 = 'Back to Home'
99
+ string_2 = '<<Back to Main>>'
100
+
101
+ expect(StringSimilarity.score(string_1, string_2)).to eq(64)
102
+ end
103
+
104
+ it 'strips out punctuation' do
105
+ string_1 = 'Back to Home!?'
106
+ string_2 = 'Back to Main...'
107
+
108
+ expect(StringSimilarity.score(string_1, string_2)).to eq(64)
109
+ end
110
+
111
+ it 'converts extended whitespace to one space' do
112
+ string_1 = 'Back to Home'
113
+ string_2 = 'Back to Main'
114
+
115
+ expect(StringSimilarity.score(string_1, string_2)).to eq(64)
116
+ end
117
+
118
+ it 'strips beginning and trailing whitespace' do
119
+ string_1 = 'Back to Home '
120
+ string_2 = ' Back to Main'
121
+
122
+ expect(StringSimilarity.score(string_1, string_2)).to eq(64)
123
+ end
124
+
125
+ end
126
+
127
+ describe 'phrases with numbers' do
128
+
129
+ it 'does not raise FloatDomainError' do
130
+ string_1 = '1 '
131
+ string_2 = '< 1'
132
+
133
+ expect(StringSimilarity.score(string_1, string_2)).to eq(0)
134
+ end
135
+
136
+ it 'compares strings that have a number within them' do
137
+ string_1 = 'I work at 1035 Pearl St.'
138
+ string_2 = 'I work at 1045 Pearl St.'
139
+
140
+ expect(StringSimilarity.score(string_1, string_2) > 90).to eq(true)
141
+ end
142
+
143
+ end
144
+
145
+ end
@@ -0,0 +1,51 @@
1
+ require 'string_similarity'
2
+ require 'rspec'
3
+
4
+ RSpec.configure do |config|
5
+ config.expect_with :rspec do |expectations|
6
+ # This option will default to `true` in RSpec 4. It makes the `description`
7
+ # and `failure_message` of custom matchers include text for helper methods
8
+ # defined using `chain`, e.g.:
9
+ # be_bigger_than(2).and_smaller_than(4).description
10
+ # # => "be bigger than 2 and smaller than 4"
11
+ # ...rather than:
12
+ # # => "be bigger than 2"
13
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
14
+ end
15
+
16
+ config.mock_with :rspec do |mocks|
17
+ # Prevents you from mocking or stubbing a method that does not exist on
18
+ # a real object. This is generally recommended, and will default to
19
+ # `true` in RSpec 4.
20
+ mocks.verify_partial_doubles = true
21
+ end
22
+
23
+ config.warnings = true
24
+
25
+ # Many RSpec users commonly either run the entire suite or an individual
26
+ # file, and it's useful to allow more verbose output when running an
27
+ # individual spec file.
28
+ if config.files_to_run.one?
29
+ # Use the documentation formatter for detailed output,
30
+ # unless a formatter has already been configured
31
+ # (e.g. via a command-line flag).
32
+ config.default_formatter = 'doc'
33
+ end
34
+
35
+ # Print the 10 slowest examples and example groups at the
36
+ # end of the spec run, to help surface which specs are running
37
+ # particularly slow.
38
+ config.profile_examples = 10
39
+
40
+ # Run specs in random order to surface order dependencies. If you find an
41
+ # order dependency and want to debug it, you can fix the order by providing
42
+ # the seed, which is printed after each run.
43
+ # --seed 1234
44
+ config.order = :random
45
+
46
+ # Seed global randomization in this process using the `--seed` CLI option.
47
+ # Setting this allows you to use `--seed` to deterministically reproduce
48
+ # test failures related to randomization by passing the same `--seed` value
49
+ # as the one that triggered the failure.
50
+ Kernel.srand config.seed
51
+ end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'string_similarity/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "string_similarity"
8
+ spec.version = StringSimilarity::VERSION
9
+ spec.authors = ["Nathanael Burt"]
10
+ spec.email = ["nathanael.burt@gmail.com"]
11
+ spec.summary = "Gem to find a percentage similarity between two strings"
12
+ spec.description = ""
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.6"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec", "~> 3.3.0"
24
+ end
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: string_similarity
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nathanael Burt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 3.3.0
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 3.3.0
55
+ description: ''
56
+ email:
57
+ - nathanael.burt@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".rspec"
64
+ - ".ruby-version"
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - lib/string_similarity.rb
70
+ - lib/string_similarity/version.rb
71
+ - spec/lib/string_similarity_spec.rb
72
+ - spec/spec_helper.rb
73
+ - string_similarity.gemspec
74
+ homepage: ''
75
+ licenses:
76
+ - MIT
77
+ metadata: {}
78
+ post_install_message:
79
+ rdoc_options: []
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ requirements: []
93
+ rubyforge_project:
94
+ rubygems_version: 2.4.8
95
+ signing_key:
96
+ specification_version: 4
97
+ summary: Gem to find a percentage similarity between two strings
98
+ test_files:
99
+ - spec/lib/string_similarity_spec.rb
100
+ - spec/spec_helper.rb