jaro 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/README.md +16 -0
  2. data/jaro.rb +50 -0
  3. metadata +47 -0
data/README.md ADDED
@@ -0,0 +1,16 @@
1
+ # jaro
2
+
3
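+ Yet another implementation of the Jaro-Winkler string metric, exposed as `String#^`. The result is a similarity score: 1.0 means the strings are identical and lower values mean less similar. Examples taken from Wikipedia: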
4
+ ```ruby
5
+ puts "MARTHA" ^ "MARHTA"
6
+ puts "DWAYNE" ^ "DUANE"
7
+ puts "DIXON" ^ "DICKSONX"
8
+ ```
9
+
10
+ outputs:
11
+
12
+ ```
13
+ 0.9611111111111111
14
+ 0.8400000000000001
15
+ 0.8133333333333332
16
+ ```
data/jaro.rb ADDED
@@ -0,0 +1,50 @@
1
class String
  # Jaro-Winkler similarity between this string and +other+.
  #
  # Although often called a "distance", the value is a similarity score:
  # identical strings score 1.0 and strings with no matching characters
  # score 0.0. Strings are compared codepoint by codepoint, and the shorter
  # of the two is always scanned against the longer one.
  #
  # The Winkler prefix bonus uses the standard scaling factor of 0.1 and
  # looks at no more than the first four characters.
  #
  #   "MARTHA" ^ "MARHTA"   # => 0.9611111111111111
  #   "DWAYNE" ^ "DUANE"    # => 0.8400000000000001
  #
  # other:: the String to compare against.
  #
  # Returns a Float; 0.0 when either string is empty or nothing matches.
  def ^(other)
    return 0.0 if empty? || other.empty?

    s1 = codepoints.to_a
    s2 = other.codepoints.to_a
    s1, s2 = s2, s1 if s1.size > s2.size

    # Two characters only count as a match when their positions differ by at
    # most this much. The formula gives -1 when the longer string has a
    # single character, which would leave nothing to search, so clamp to 0.
    window = [s2.size / 2 - 1, 0].max

    # Greedy matching: each character of s1 claims the first unclaimed equal
    # character of s2 inside its window.
    taken = Array.new(s2.size, false)
    seq1 = []
    s1.each_with_index do |char, i|
      lower = [i - window, 0].max
      upper = [i + window, s2.size - 1].min
      j = (lower..upper).find { |k| !taken[k] && s2[k] == char }
      next unless j

      taken[j] = true
      seq1 << char
    end

    # Without this guard the transposition term below would be 0.0 / 0.0.
    return 0.0 if seq1.empty?

    # Matched characters of s2 in s2's own order. Comparing the two matched
    # sequences position by position counts every out-of-order match, even
    # when unmatched characters sit between them; two such mismatches make
    # one transposition.
    seq2 = (0...s2.size).select { |j| taken[j] }.map { |j| s2[j] }
    t = seq1.zip(seq2).count { |a, b| a != b } / 2

    m = seq1.size.to_f
    dj = (m / s1.size + m / s2.size + (m - t) / m) / 3

    # Winkler adjustment: length of the common prefix, capped at four and at
    # the length of the shorter string (s1), so no nil is ever compared.
    l = s1.first(4).zip(s2).take_while { |a, b| a == b }.size

    # standard weight (p) for winkler == 0.1
    dj + l * 0.1 * (1 - dj)
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jaro
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - moshee
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-28 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Adds String#^ for measuring string similarity
15
+ email: moshee@displaynone.us
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - jaro.rb
22
+ homepage: http://github.com/moshee/jaro
23
+ licenses: []
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.11
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Implements Jaro-Winkler string distance
46
+ test_files: []
47
+ has_rdoc: