jaro_winkler 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a9890ac64633ae4e8b9c89c252fb13cfc6e855c1
4
+ data.tar.gz: c3732dec15dd6e499bf5450a9d355f5c965ba8bf
5
+ SHA512:
6
+ metadata.gz: d7618f1b877df22b450d121084d0405c9c019312f4453ba2df9053383dc50e1d8621abfec4372414f9e742f660b61307c52c80ea2a5fbf810fb6594c93eac1d8
7
+ data.tar.gz: b31ad35c86848d86107b536cb05fc14e14ee0d3b2da44b33289e98eed3158d631cd92f00698072c03abf2bcd336674928b0354d8aca9069c1eb207696c6dc0bd
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in jaro_winkler.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Jian Weihang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # About
2
+
3
+ A pure Ruby implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ gem install jaro_winkler
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```ruby
14
+ require 'jaro_winkler'
15
+ JaroWinkler.jaro_winkler_distance "MARTHA", "MARHTA"
16
+ # => 0.9611
17
+ JaroWinkler.jaro_winkler_distance "MARTHA", "marhta", case_match: true
18
+ # => 0.9611
19
+ JaroWinkler.jaro_winkler_distance "MARTHA", "MARHTA", weight: 0.2
20
+ # => 0.9778
21
+ ```
22
+
23
+ ## Options
24
+
25
+ Name | Type | Default | Note
26
+ ----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
27
+ case_match | boolean | false | When true, all upper case characters are converted to lower case prior to the comparison (i.e. the comparison ignores case).
28
+ weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
29
+ threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance at or above this value.
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,23 @@
1
# coding: utf-8
# Gem specification for jaro_winkler.
# Makes lib/ loadable so the version constant can be read before the gem is
# installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'jaro_winkler/version'

Gem::Specification.new do |spec|
  spec.name          = "jaro_winkler"
  spec.version       = JaroWinkler::VERSION
  spec.authors       = ["Jian Weihang"]
  spec.email         = ["tonytonyjan@gmail.com"]
  spec.summary       = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
  spec.description   = %q{Pure Ruby implementation of Jaro-Winkler distance algorithm.}
  spec.homepage      = "https://github.com/tonytonyjan/jaro_winkler"
  spec.license       = "MIT"

  # The library's public API uses keyword arguments, which need Ruby 2.0+.
  spec.required_ruby_version = ">= 2.0.0"

  # Package every file tracked by git; NUL-separated so odd file names survive.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  # The spec suite (spec/ and .rspec) is written against RSpec 3.
  spec.add_development_dependency "rspec", "~> 3.0"
end
@@ -0,0 +1,50 @@
1
# Pure Ruby implementation of the Jaro and Jaro-Winkler string similarity
# measures. Both methods are exposed as module functions, e.g.
# +JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA")+.
module JaroWinkler
  module_function

  # Computes the Jaro similarity of two strings.
  #
  # A character of +s1+ matches a character of +s2+ when they are equal, lie
  # within the search window of each other, and the +s2+ character has not
  # already been paired with an earlier +s1+ character.
  #
  # @param s1 [String] first string
  # @param s2 [String] second string
  # @return [Float] similarity in 0.0..1.0; 0.0 when either string is empty
  #   or when no characters match
  def jaro_distance(s1, s2)
    return 0.0 if s1.empty? || s2.empty?

    chars1 = s1.chars
    chars2 = s2.chars
    length1 = chars1.length
    length2 = chars2.length
    # Clamp at 0: for two single-character strings the raw formula gives -1,
    # which would make the window empty and hide an exact match.
    window_size = [[length1, length2].max / 2 - 1, 0].max

    flags1 = Array.new(length1, false)
    flags2 = Array.new(length2, false)
    chars1.each_with_index do |c1, i|
      left = [i - window_size, 0].max
      right = [i + window_size, length2 - 1].min
      # The range is empty when left > right (s1 much longer than s2), so
      # positions past the end of s2 are simply skipped.
      j = (left..right).find { |k| !flags2[k] && chars2[k] == c1 }
      next unless j

      # Mark both sides so a character is never counted twice.
      flags1[i] = flags2[j] = true
    end

    matches = flags1.count(true).to_f
    return 0.0 if matches.zero?

    # Matched characters of each string, in their original order.
    sequence1 = chars1.zip(flags1).select(&:last).map(&:first)
    sequence2 = chars2.zip(flags2).select(&:last).map(&:first)
    # A transposition is a pair of matched characters in swapped order, hence
    # half the number of positions where the two sequences disagree.
    transpositions = sequence1.zip(sequence2).count { |a, b| a != b } / 2

    1.0 / 3.0 * (matches / length1 + matches / length2 + (matches - transpositions) / matches)
  end

  # Computes the Jaro-Winkler similarity: the Jaro similarity boosted for
  # strings that share a common prefix of up to four characters.
  #
  # @param s1 [String] first string
  # @param s2 [String] second string
  # @param weight [Numeric] scaling factor for the prefix bonus (at most 0.25)
  # @param threshold [Numeric] the prefix bonus is applied only when the Jaro
  #   similarity is not below this value
  # @param case_match [Boolean] when true, both strings are downcased before
  #   comparison, i.e. the comparison ignores case
  # @return [Float] similarity in 0.0..1.0
  # @raise [RuntimeError] if +weight+ exceeds 0.25
  def jaro_winkler_distance(s1, s2, weight: 0.1, threshold: 0.7, case_match: false)
    raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25

    s1, s2 = s1.downcase, s2.downcase if case_match
    distance = jaro_distance(s1, s2)
    return distance if distance < threshold

    # Length of the common prefix, capped at four characters.
    max_prefix = [4, s1.length, s2.length].min
    prefix = (0...max_prefix).take_while { |i| s1[i] == s2[i] }.size
    distance + ((prefix * weight) * (1 - distance))
  end
end
@@ -0,0 +1,3 @@
1
+ module JaroWinkler
2
+ VERSION = "1.0.0"
3
+ end
@@ -0,0 +1,24 @@
1
+ require 'jaro_winkler'
2
+
3
+ describe JaroWinkler do
4
+ it 'works' do
5
+ expect(JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA")).to be_within(0.0001).of(0.9611)
6
+ expect(JaroWinkler.jaro_winkler_distance("DIXON", "DICKSONX")).to be_within(0.0001).of(0.8133)
7
+ expect(JaroWinkler.jaro_winkler_distance("abcvwxyz", "cabvwxyz")).to be_within(0.0001).of(0.9583)
8
+ expect(JaroWinkler.jaro_winkler_distance("DWAYNE", "DUANE")).to eq 0.84
9
+ expect(JaroWinkler.jaro_winkler_distance("tony", "tony")).to eq 1.0
10
+ expect(JaroWinkler.jaro_winkler_distance("tonytonyjan", "tonytonyjan")).to eq 1.0
11
+ expect(JaroWinkler.jaro_winkler_distance("", "")).to eq 0.0
12
+ expect(JaroWinkler.jaro_winkler_distance("tony", "")).to eq 0.0
13
+ expect(JaroWinkler.jaro_winkler_distance("", "tony")).to eq 0.0
14
+ end
15
+
16
+ it 'can ignore case' do
17
+ expect(JaroWinkler.jaro_winkler_distance("MARTHA", "marhta", case_match: true)).to be_within(0.0001).of(0.9611)
18
+ end
19
+
20
+ it 'can set weight' do
21
+ expect(JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA", weight: 0.2)).to be_within(0.0001).of(0.9778)
22
+ expect{ JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA", weight: 0.26) }.to raise_error
23
+ end
24
+ end
@@ -0,0 +1,89 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # The generated `.rspec` file contains `--require spec_helper` which will cause this
4
+ # file to always be loaded, without a need to explicitly require it in any files.
5
+ #
6
+ # Given that it is always loaded, you are encouraged to keep this file as
7
+ # light-weight as possible. Requiring heavyweight dependencies from this file
8
+ # will add to the boot time of your test suite on EVERY test run, even for an
9
+ # individual file that may not need all of that loaded. Instead, consider making
10
+ # a separate helper file that requires the additional dependencies and performs
11
+ # the additional setup, and require it from the spec files that actually need it.
12
+ #
13
+ # The `.rspec` file also contains a few flags that are not defaults but that
14
+ # users commonly want.
15
+ #
16
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
17
+ RSpec.configure do |config|
18
+ # rspec-expectations config goes here. You can use an alternate
19
+ # assertion/expectation library such as wrong or the stdlib/minitest
20
+ # assertions if you prefer.
21
+ config.expect_with :rspec do |expectations|
22
+ # This option will default to `true` in RSpec 4. It makes the `description`
23
+ # and `failure_message` of custom matchers include text for helper methods
24
+ # defined using `chain`, e.g.:
25
+ # be_bigger_than(2).and_smaller_than(4).description
26
+ # # => "be bigger than 2 and smaller than 4"
27
+ # ...rather than:
28
+ # # => "be bigger than 2"
29
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
30
+ end
31
+
32
+ # rspec-mocks config goes here. You can use an alternate test double
33
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
34
+ config.mock_with :rspec do |mocks|
35
+ # Prevents you from mocking or stubbing a method that does not exist on
36
+ # a real object. This is generally recommended, and will default to
37
+ # `true` in RSpec 4.
38
+ mocks.verify_partial_doubles = true
39
+ end
40
+
41
+ # The settings below are suggested to provide a good initial experience
42
+ # with RSpec, but feel free to customize to your heart's content.
43
+ =begin
44
+ # These two settings work together to allow you to limit a spec run
45
+ # to individual examples or groups you care about by tagging them with
46
+ # `:focus` metadata. When nothing is tagged with `:focus`, all examples
47
+ # get run.
48
+ config.filter_run :focus
49
+ config.run_all_when_everything_filtered = true
50
+
51
+ # Limits the available syntax to the non-monkey patched syntax that is recommended.
52
+ # For more details, see:
53
+ # - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
54
+ # - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
55
+ # - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
56
+ config.disable_monkey_patching!
57
+
58
+ # This setting enables warnings. It's recommended, but in some cases may
59
+ # be too noisy due to issues in dependencies.
60
+ config.warnings = true
61
+
62
+ # Many RSpec users commonly either run the entire suite or an individual
63
+ # file, and it's useful to allow more verbose output when running an
64
+ # individual spec file.
65
+ if config.files_to_run.one?
66
+ # Use the documentation formatter for detailed output,
67
+ # unless a formatter has already been configured
68
+ # (e.g. via a command-line flag).
69
+ config.default_formatter = 'doc'
70
+ end
71
+
72
+ # Print the 10 slowest examples and example groups at the
73
+ # end of the spec run, to help surface which specs are running
74
+ # particularly slow.
75
+ config.profile_examples = 10
76
+
77
+ # Run specs in random order to surface order dependencies. If you find an
78
+ # order dependency and want to debug it, you can fix the order by providing
79
+ # the seed, which is printed after each run.
80
+ # --seed 1234
81
+ config.order = :random
82
+
83
+ # Seed global randomization in this process using the `--seed` CLI option.
84
+ # Setting this allows you to use `--seed` to deterministically reproduce
85
+ # test failures related to randomization by passing the same `--seed` value
86
+ # as the one that triggered the failure.
87
+ Kernel.srand config.seed
88
+ =end
89
+ end
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jaro_winkler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Jian Weihang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Pure Ruby implementation of Jaro-Winkler distance algorithm.
42
+ email:
43
+ - tonytonyjan@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - ".rspec"
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - jaro_winkler.gemspec
55
+ - lib/jaro_winkler.rb
56
+ - lib/jaro_winkler/version.rb
57
+ - spec/jaro_winkler_spec.rb
58
+ - spec/spec_helper.rb
59
+ homepage: https://github.com/tonytonyjan/jaro_winkler
60
+ licenses:
61
+ - MIT
62
+ metadata: {}
63
+ post_install_message:
64
+ rdoc_options: []
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ requirements: []
78
+ rubyforge_project:
79
+ rubygems_version: 2.4.1
80
+ signing_key:
81
+ specification_version: 4
82
+ summary: Pure Ruby implementation of Jaro-Winkler distance algorithm.
83
+ test_files:
84
+ - spec/jaro_winkler_spec.rb
85
+ - spec/spec_helper.rb