jaro 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +16 -0
- data/jaro.rb +50 -0
- metadata +47 -0
data/README.md
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# jaro
|
2
|
+
|
3
|
+
Yet another implementation of the Jaro-Winkler string distance (reported as a similarity score, where 1.0 means the strings are identical). Examples taken from Wikipedia:
|
4
|
+
```ruby
|
5
|
+
puts "MARTHA" ^ "MARHTA"
|
6
|
+
puts "DWAYNE" ^ "DUANE"
|
7
|
+
puts "DIXON" ^ "DICKSONX"
|
8
|
+
```
|
9
|
+
|
10
|
+
outputs:
|
11
|
+
|
12
|
+
```
|
13
|
+
0.9611111111111111
|
14
|
+
0.8400000000000001
|
15
|
+
0.8133333333333332
|
16
|
+
```
|
data/jaro.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
class String
  # Jaro-Winkler similarity between this string and +other+.
  #
  # Both strings are compared codepoint by codepoint. The result is a Float
  # between 0.0 (nothing in common) and 1.0 (identical strings).
  #
  # @param other [String] the string to compare against
  # @return [Float] 0.0 when either string is empty or when no characters
  #   match, otherwise the Winkler-adjusted Jaro score
  def ^(other)
    return 0.0 if empty? || other.empty?

    s1 = codepoints.to_a
    s2 = other.codepoints.to_a
    # Keep s1 as the shorter sequence so the outer loop is the small one.
    s1, s2 = s2, s1 if s1.size > s2.size
    s1s = s1.size
    s2s = s2.size

    # Characters only count as matching when they are at most max_dist
    # positions apart. Clamp at zero: for one-character strings the raw
    # formula yields -1, which would make the search window empty.
    max_dist = [s2s / 2 - 1, 0].max

    m1 = Array.new(s1s, false) # which positions of s1 found a partner
    m2 = Array.new(s2s, false) # which positions of s2 are already taken
    m = 0.0                    # match count, kept as Float for the divisions

    s1.each_with_index do |a, ia|
      lower = [ia - max_dist, 0].max
      upper = [ia + max_dist, s2s - 1].min
      (lower..upper).each do |ib|
        next if m2[ib] || s2[ib] != a

        m1[ia] = true
        m2[ib] = true
        m += 1
        break
      end
    end

    # Without any match the Jaro formula would divide zero by zero (NaN).
    return 0.0 if m.zero?

    # Transpositions: walk the matched characters of both strings in their
    # own order and count positions that disagree; two disagreeing positions
    # make one transposition.
    seq1 = s1.zip(m1).select { |_, hit| hit }.map(&:first)
    seq2 = s2.zip(m2).select { |_, hit| hit }.map(&:first)
    t = seq1.zip(seq2).count { |x, y| x != y } / 2

    dj = (m / s1s + m / s2s + (m - t) / m) / 3

    # winkler adjustment: length of the common prefix, capped at four
    # characters and at the length of the shorter string
    prefix_cap = [4, s1s].min
    l = 0
    l += 1 while l < prefix_cap && s1[l] == s2[l]

    # standard weight (p) for winkler == 0.1
    dj + l * 0.1 * (1 - dj)
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jaro
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- moshee
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-28 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Adds String#^ for measuring string similarity
|
15
|
+
email: moshee@displaynone.us
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- jaro.rb
|
22
|
+
homepage: http://github.com/moshee/jaro
|
23
|
+
licenses: []
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 1.8.11
|
43
|
+
signing_key:
|
44
|
+
specification_version: 3
|
45
|
+
summary: Implements Jaro-Winkler string distance
|
46
|
+
test_files: []
|
47
|
+
has_rdoc:
|