jaro_winkler 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/jaro_winkler.gemspec +23 -0
- data/lib/jaro_winkler.rb +50 -0
- data/lib/jaro_winkler/version.rb +3 -0
- data/spec/jaro_winkler_spec.rb +24 -0
- data/spec/spec_helper.rb +89 -0
- metadata +85 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a9890ac64633ae4e8b9c89c252fb13cfc6e855c1
|
4
|
+
data.tar.gz: c3732dec15dd6e499bf5450a9d355f5c965ba8bf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d7618f1b877df22b450d121084d0405c9c019312f4453ba2df9053383dc50e1d8621abfec4372414f9e742f660b61307c52c80ea2a5fbf810fb6594c93eac1d8
|
7
|
+
data.tar.gz: b31ad35c86848d86107b536cb05fc14e14ee0d3b2da44b33289e98eed3158d631cd92f00698072c03abf2bcd336674928b0354d8aca9069c1eb207696c6dc0bd
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Jian Weihang
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# About
|
2
|
+
|
3
|
+
It's a pure Ruby implementation of the [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
```
|
8
|
+
gem install jaro_winkler
|
9
|
+
```
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
require 'jaro_winkler'
|
15
|
+
JaroWinkler.jaro_winkler_distance "MARTHA", "MARHTA"
|
16
|
+
# => 0.9611
|
17
|
+
JaroWinkler.jaro_winkler_distance "MARTHA", "marhta", case_match: true
|
18
|
+
# => 0.9611
|
19
|
+
JaroWinkler.jaro_winkler_distance "MARTHA", "MARHTA", weight: 0.2
|
20
|
+
# => 0.9778
|
21
|
+
```
|
22
|
+
|
23
|
+
## Options
|
24
|
+
|
25
|
+
Name | Type | Default | Note
|
26
|
+
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
|
27
|
+
case_match | boolean | false | All upper case characters are converted to lower case prior to the comparison.
|
28
|
+
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
|
29
|
+
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance at or above this value.
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for jaro_winkler. The version constant is loaded from
# lib/jaro_winkler/version, so lib/ is put on the load path first.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'jaro_winkler/version'

Gem::Specification.new do |gem|
  gem.name        = "jaro_winkler"
  gem.version     = JaroWinkler::VERSION
  gem.authors     = ["Jian Weihang"]
  gem.email       = ["tonytonyjan@gmail.com"]
  gem.summary     = "Pure Ruby implementation of Jaro-Winkler distance algorithm."
  gem.description = "Pure Ruby implementation of Jaro-Winkler distance algorithm."
  gem.homepage    = "https://github.com/tonytonyjan/jaro_winkler"
  gem.license     = "MIT"

  # Package every file tracked by git (NUL-separated listing).
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files
  # Anything under bin/ is exposed as an executable, by base name.
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
end
|
data/lib/jaro_winkler.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Pure Ruby implementation of the Jaro and Jaro-Winkler string similarity
# measures. Both methods are exposed as module functions, e.g.
# JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA").
module JaroWinkler
  module_function

  # Computes the Jaro similarity of two strings.
  #
  # Two characters match when they are equal and their positions differ by at
  # most the window size; each character of s2 can be matched at most once.
  #
  # @param s1 [String] first string
  # @param s2 [String] second string
  # @return [Float] similarity between 0.0 (nothing in common, or either string
  #   empty) and 1.0 (identical)
  def jaro_distance s1, s2
    return 0.0 if s1.empty? || s2.empty?

    chars1 = s1.chars
    chars2 = s2.chars
    length1 = chars1.length
    length2 = chars2.length
    # Clamp at zero: for two one-character strings (max / 2) - 1 is -1, which
    # would produce an empty search window and report identical strings as 0.
    window_size = [([length1, length2].max / 2) - 1, 0].max

    taken = Array.new(length2, false) # s2 positions already paired with an s1 character
    matched_from_s1 = []              # matched characters, in s1 order
    chars1.each_with_index do |c1, i|
      left = [i - window_size, 0].max
      right = [i + window_size, length2 - 1].min
      # When left > right the range is empty and nothing is searched.
      (left..right).each do |j|
        next if taken[j] || chars2[j] != c1
        taken[j] = true
        matched_from_s1 << c1
        break
      end
    end
    return 0.0 if matched_from_s1.empty?

    # The same matched characters, but in s2 order.
    matched_from_s2 = chars2.each_index.select { |j| taken[j] }.map { |j| chars2[j] }
    # Every position where the two sequences disagree is half a transposition.
    half_transpositions = matched_from_s1.zip(matched_from_s2).count { |a, b| a != b }
    transpositions = half_transpositions / 2

    matches = matched_from_s1.length.to_f
    1.0 / 3.0 * (matches / length1 + matches / length2 + (matches - transpositions) / matches)
  end

  # Computes the Jaro-Winkler similarity: the Jaro similarity plus a bonus for
  # a common prefix of up to four characters.
  #
  # @param s1 [String] first string
  # @param s2 [String] second string
  # @param weight [Numeric] scaling factor for the prefix bonus; must not exceed 0.25
  # @param threshold [Numeric] the prefix bonus is only applied when the Jaro
  #   similarity is not below this value
  # @param case_match [Boolean] when true, both strings are downcased before
  #   being compared
  # @return [Float] similarity between 0.0 and 1.0
  # @raise [RuntimeError] if weight is greater than 0.25
  def jaro_winkler_distance s1, s2, weight: 0.1, threshold: 0.7, case_match: false
    raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25
    s1, s2 = s1.downcase, s2.downcase if case_match
    distance = jaro_distance(s1, s2)
    return distance if distance < threshold

    # Length of the common prefix, capped at four characters.
    max_length = [4, s1.length, s2.length].min
    prefix = (0...max_length).take_while { |i| s1[i] == s2[i] }.length
    distance + ((prefix * weight) * (1 - distance))
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'jaro_winkler'

# Specs for JaroWinkler.jaro_winkler_distance. Non-trivial float results are
# compared with a tolerance rather than exact equality.
describe JaroWinkler do
  it 'works' do
    expect(JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA")).to be_within(0.0001).of(0.9611)
    expect(JaroWinkler.jaro_winkler_distance("DIXON", "DICKSONX")).to be_within(0.0001).of(0.8133)
    expect(JaroWinkler.jaro_winkler_distance("abcvwxyz", "cabvwxyz")).to be_within(0.0001).of(0.9583)
    # 0.84 is the result of several float operations, so allow for rounding.
    expect(JaroWinkler.jaro_winkler_distance("DWAYNE", "DUANE")).to be_within(0.0001).of(0.84)
    expect(JaroWinkler.jaro_winkler_distance("tony", "tony")).to eq 1.0
    expect(JaroWinkler.jaro_winkler_distance("tonytonyjan", "tonytonyjan")).to eq 1.0
    expect(JaroWinkler.jaro_winkler_distance("", "")).to eq 0.0
    expect(JaroWinkler.jaro_winkler_distance("tony", "")).to eq 0.0
    expect(JaroWinkler.jaro_winkler_distance("", "tony")).to eq 0.0
  end

  it 'can ignore case' do
    expect(JaroWinkler.jaro_winkler_distance("MARTHA", "marhta", case_match: true)).to be_within(0.0001).of(0.9611)
  end

  it 'can set weight' do
    expect(JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA", weight: 0.2)).to be_within(0.0001).of(0.9778)
    # Name the error class so an unrelated exception (e.g. NoMethodError) cannot
    # make this expectation pass by accident.
    expect{ JaroWinkler.jaro_winkler_distance("MARTHA", "MARHTA", weight: 0.26) }.to raise_error(RuntimeError)
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause this
|
4
|
+
# file to always be loaded, without a need to explicitly require it in any files.
|
5
|
+
#
|
6
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
7
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
8
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
9
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
10
|
+
# a separate helper file that requires the additional dependencies and performs
|
11
|
+
# the additional setup, and require it from the spec files that actually need it.
|
12
|
+
#
|
13
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
14
|
+
# users commonly want.
|
15
|
+
#
|
16
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
17
|
+
RSpec.configure do |rspec|
  # rspec-expectations settings. Another assertion/expectation library
  # (such as wrong or the stdlib/minitest assertions) could be used instead.
  rspec.expect_with :rspec do |expectation_config|
    # Makes the `description` and `failure_message` of custom matchers include
    # text for helper methods defined using `chain`, e.g.
    #   be_bigger_than(2).and_smaller_than(4).description
    #   # => "be bigger than 2 and smaller than 4"
    # rather than
    #   # => "be bigger than 2"
    # This option will default to `true` in RSpec 4.
    expectation_config.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks settings. Another test double library (such as bogus or mocha)
  # could be selected by changing the `mock_with` option.
  rspec.mock_with :rspec do |mock_config|
    # Prevents mocking or stubbing a method that does not exist on a real
    # object. Generally recommended; will default to `true` in RSpec 4.
    mock_config.verify_partial_doubles = true
  end

  # The settings below are suggested to provide a good initial experience with
  # RSpec. They are left disabled inside the block comment that follows.
=begin
  # These two settings work together to allow you to limit a spec run
  # to individual examples or groups you care about by tagging them with
  # `:focus` metadata. When nothing is tagged with `:focus`, all examples
  # get run.
  config.filter_run :focus
  config.run_all_when_everything_filtered = true

  # Limits the available syntax to the non-monkey patched syntax that is recommended.
  # For more details, see:
  #   - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
  #   - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
  config.disable_monkey_patching!

  # This setting enables warnings. It's recommended, but in some cases may
  # be too noisy due to issues in dependencies.
  config.warnings = true

  # Many RSpec users commonly either run the entire suite or an individual
  # file, and it's useful to allow more verbose output when running an
  # individual spec file.
  if config.files_to_run.one?
    # Use the documentation formatter for detailed output,
    # unless a formatter has already been configured
    # (e.g. via a command-line flag).
    config.default_formatter = 'doc'
  end

  # Print the 10 slowest examples and example groups at the
  # end of the spec run, to help surface which specs are running
  # particularly slow.
  config.profile_examples = 10

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
=end
end
|
metadata
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jaro_winkler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jian Weihang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-09-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
description: Pure Ruby implementation of Jaro-Winkler distance algorithm.
|
42
|
+
email:
|
43
|
+
- tonytonyjan@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".gitignore"
|
49
|
+
- ".rspec"
|
50
|
+
- Gemfile
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- jaro_winkler.gemspec
|
55
|
+
- lib/jaro_winkler.rb
|
56
|
+
- lib/jaro_winkler/version.rb
|
57
|
+
- spec/jaro_winkler_spec.rb
|
58
|
+
- spec/spec_helper.rb
|
59
|
+
homepage: https://github.com/tonytonyjan/jaro_winkler
|
60
|
+
licenses:
|
61
|
+
- MIT
|
62
|
+
metadata: {}
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options: []
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
requirements: []
|
78
|
+
rubyforge_project:
|
79
|
+
rubygems_version: 2.4.1
|
80
|
+
signing_key:
|
81
|
+
specification_version: 4
|
82
|
+
summary: Pure Ruby implementation of Jaro-Winkler distance algorithm.
|
83
|
+
test_files:
|
84
|
+
- spec/jaro_winkler_spec.rb
|
85
|
+
- spec/spec_helper.rb
|