jaro 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/README.md +16 -0
  2. data/jaro.rb +50 -0
  3. metadata +47 -0
data/README.md ADDED
@@ -0,0 +1,16 @@
1
+ # jaro
2
+
3
+ Yet another implementation of the Jaro-Winkler string distance. Examples taken from Wikipedia:
4
+ ```ruby
5
+ puts "MARTHA" ^ "MARHTA"
6
+ puts "DWAYNE" ^ "DUANE"
7
+ puts "DIXON" ^ "DICKSONX"
8
+ ```
9
+
10
+ outputs:
11
+
12
+ ```
13
+ 0.9611111111111111
14
+ 0.8400000000000001
15
+ 0.8133333333333332
16
+ ```
data/jaro.rb ADDED
@@ -0,0 +1,50 @@
1
class String
  # Jaro-Winkler similarity between this string and +other+.
  #
  # Strings are compared codepoint by codepoint. The result is symmetric
  # (<tt>a ^ b == b ^ a</tt>) because the shorter string is always treated
  # as the first operand internally.
  #
  # @param other [String] the string to compare against
  # @return [Float] similarity in 0.0..1.0; 1.0 for identical non-empty
  #   strings, 0.0 when either string is empty or no characters match
  def ^(other)
    return 0.0 if empty? || other.empty?

    shorter = codepoints.to_a
    longer = other.codepoints.to_a
    shorter, longer = longer, shorter if shorter.size > longer.size
    short_len = shorter.size
    long_len = longer.size

    # Two equal characters count as matching only if their positions differ
    # by at most this much. Clamped at 0 so that one-character strings can
    # still match themselves (1/2 - 1 would otherwise be -1).
    max_dist = [long_len / 2 - 1, 0].max

    matched = Array.new(short_len, false) # matched[i]: shorter[i] found a partner
    taken = Array.new(long_len, false)    # taken[j]: longer[j] is already paired

    shorter.each_with_index do |cp, i|
      lower = [i - max_dist, 0].max
      upper = [i + max_dist, long_len - 1].min
      (lower..upper).each do |j|
        next if taken[j] || longer[j] != cp

        matched[i] = true
        taken[j] = true
        break
      end
    end

    # Matched characters of each string, each in its own original order.
    matched_short = shorter.zip(matched).select { |_, hit| hit }.map(&:first)
    matched_long = longer.zip(taken).select { |_, hit| hit }.map(&:first)

    m = matched_short.size.to_f
    # No common characters: the similarity is 0 by definition (and the
    # formula below would otherwise evaluate 0.0 / 0.0 => NaN).
    return 0.0 if m.zero?

    # Transpositions: half the number of positions at which the two matched
    # sequences disagree (integer division, as in the original formula).
    out_of_order = matched_short.zip(matched_long).count { |a, b| a != b }
    t = out_of_order / 2

    dj = (m / short_len + m / long_len + (m - t) / m) / 3

    # Winkler adjustment: reward a common prefix of up to 4 characters.
    # +shorter.first(4)+ keeps the comparison inside both strings, so short
    # inputs never compare nil against nil.
    prefix = shorter.first(4).zip(longer).take_while { |a, b| a == b }.size

    # standard weight (p) for winkler == 0.1
    dj + prefix * 0.1 * (1 - dj)
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jaro
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - moshee
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-28 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Adds String#^ for measuring string similarity
15
+ email: moshee@displaynone.us
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - jaro.rb
22
+ homepage: http://github.com/moshee/jaro
23
+ licenses: []
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.11
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Implements Jaro-Winkler string distance
46
+ test_files: []
47
+ has_rdoc: