rubyfish 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +0 -0
- data/LICENSE +27 -0
- data/README.md +39 -0
- data/ROADMAP.md +3 -0
- data/lib/rubyfish.rb +12 -0
- data/lib/rubyfish/damerau_levenshtein.rb +41 -0
- data/lib/rubyfish/hamming.rb +14 -0
- data/lib/rubyfish/jaro.rb +11 -0
- data/lib/rubyfish/jaro_winkler.rb +107 -0
- data/lib/rubyfish/levenshtein.rb +36 -0
- data/lib/rubyfish/longest_subsequence.rb +34 -0
- data/lib/rubyfish/longest_substring.rb +34 -0
- data/lib/rubyfish/mmatrix.rb +29 -0
- data/lib/rubyfish/mra.rb +3 -0
- data/lib/rubyfish/version.rb +3 -0
- metadata +82 -0
data/CHANGELOG.md
ADDED
File without changes
|
data/LICENSE
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
Copyright (c) 2010, AnjLab
|
2
|
+
|
3
|
+
All rights reserved.
|
4
|
+
|
5
|
+
Redistribution and use in source and binary forms, with or without modification,
|
6
|
+
are permitted provided that the following conditions are met:
|
7
|
+
|
8
|
+
* Redistributions of source code must retain the above copyright notice,
|
9
|
+
this list of conditions and the following disclaimer.
|
10
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer in the documentation
|
12
|
+
and/or other materials provided with the distribution.
|
13
|
+
* Neither the name of Sunlight Labs nor the names of its contributors may be
|
14
|
+
used to endorse or promote products derived from this software without
|
15
|
+
specific prior written permission.
|
16
|
+
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
18
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
19
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
20
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
21
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
22
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
23
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
24
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
25
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
26
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
27
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
=========
|
2
|
+
rubyfish
|
3
|
+
=========
|
4
|
+
|
5
|
+
RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for doing approximate and phonetic matching of strings.
|
6
|
+
|
7
|
+
RubyFish is a project of AnjLab (c) 2010.
|
8
|
+
All code is released under a BSD-style license, see LICENSE for details.
|
9
|
+
|
10
|
+
Originally written by
|
11
|
+
Michael Stephens <mstephens@sunlightfoundation.com> and James Turk
|
12
|
+
<jturk@sunlightfoundation.com>.
|
13
|
+
|
14
|
+
Ported by Yury Korolev <yury.korolev@gmail.com>
|
15
|
+
Source is available at http://github.com/anjlab/rubyfish
|
16
|
+
|
17
|
+
Included Algorithms
|
18
|
+
===================
|
19
|
+
|
20
|
+
String comparison:
|
21
|
+
|
22
|
+
* Levenshtein Distance
|
23
|
+
* Damerau-Levenshtein Distance
|
24
|
+
* Jaro Distance
|
25
|
+
* Jaro-Winkler Distance
|
26
|
+
* Hamming Distance
|
27
|
+
* Longest Substring
|
28
|
+
* Longest Subsequence
|
29
|
+
|
30
|
+
Example Usage
|
31
|
+
=============
|
32
|
+
|
33
|
+
ruby-1.9.2-p0 > require 'rubyfish'
|
34
|
+
ruby-1.9.2-p0 > RubyFish::Levenshtein.distance("jellyfish", "rubyfish")
|
35
|
+
=> 4
|
36
|
+
ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
|
37
|
+
=> 0.7268518518518519
|
38
|
+
ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
|
39
|
+
=> 1
|
data/ROADMAP.md
ADDED
data/lib/rubyfish.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#require "rubyfish/awesome"
|
2
|
+
|
3
|
+
# Top-level namespace of the gem. Every algorithm lives in its own file and is
# registered for lazy loading, so a file is only read the first time its
# constant is referenced.
module RubyFish
  # [constant name, feature path] pairs handed to Module#autoload.
  [
    [:Hamming,            'rubyfish/hamming'],
    [:Levenshtein,        'rubyfish/levenshtein'],
    [:DamerauLevenshtein, 'rubyfish/damerau_levenshtein'],
    [:LongestSubstring,   'rubyfish/longest_substring'],
    [:LongestSubsequence, 'rubyfish/longest_subsequence'],
    [:Jaro,               'rubyfish/jaro'],
    [:JaroWinkler,        'rubyfish/jaro_winkler'],
    [:MMatrix,            'rubyfish/mmatrix']
  ].each do |const_name, feature|
    autoload const_name, feature
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
|
3
|
+
module RubyFish
  # Damerau-Levenshtein distance in its "optimal string alignment" form:
  # the minimum number of insertions, deletions, substitutions and
  # transpositions of two adjacent characters needed to turn one string
  # into the other.
  module DamerauLevenshtein

    # Returns the edit distance between +a+ and +b+ as an Integer.
    # Both arguments are converted with +to_s+ before comparison.
    def distance a, b
      as = a.to_s
      bs = b.to_s

      rows = as.size + 1
      cols = bs.size + 1

      # dist[i, j] holds the distance between the first i characters of
      # +as+ and the first j characters of +bs+.
      dist = ::RubyFish::MMatrix.new(rows, cols)

      # Turning a prefix into the empty string (or back) costs its length.
      (0...rows).each { |i| dist[i, 0] = i }
      (0...cols).each { |j| dist[0, j] = j }

      (1...rows).each do |i|
        (1...cols).each do |j|
          cost = as[i - 1] == bs[j - 1] ? 0 : 1

          # minimum of deletion, insertion, substitution
          best = [
            dist[i - 1, j] + 1,
            dist[i, j - 1] + 1,
            dist[i - 1, j - 1] + cost
          ].min

          # Transposition of the two most recent characters. Needs at least
          # two characters consumed on each side, i.e. i > 1 and j > 1, so a
          # swap at the very start of the strings is also recognised.
          if i > 1 && j > 1 && as[i - 1] == bs[j - 2] && as[i - 2] == bs[j - 1]
            best = [best, dist[i - 2, j - 2] + cost].min
          end

          dist[i, j] = best
        end
      end

      dist[as.size, bs.size]
    end

    module_function :distance
  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module RubyFish
  # Jaro and Jaro-Winkler string similarity. Results are 1 for identical
  # strings and 0 for strings with nothing in common; despite the method
  # name, higher values mean the strings are more alike.
  module JaroWinkler

    # Shared implementation for the Jaro and Jaro-Winkler measures.
    #
    # a, b  - values to compare; both are converted with +to_s+.
    # opts  - Hash of options:
    #         :winklerize     - when truthy, boost the score for a common
    #                           prefix of up to four non-digit characters.
    #         :long_tolerance - when truthy (and :winklerize is set), apply
    #                           the additional adjustment for long strings.
    #
    # Returns the Integer 1 when both strings are empty, the Integer 0 when
    # exactly one is empty or no characters match, and a Float otherwise.
    def _distance a, b, opts = {}
      long_tolerance = opts[:long_tolerance]
      winklerize = opts[:winklerize]

      as = a.to_s
      bs = b.to_s

      as_length = as.size
      bs_length = bs.size

      return 1 if as_length == 0 && bs_length == 0
      return 0 if as_length == 0 || bs_length == 0

      if as_length > bs_length
        max_len = as_length
        min_len = bs_length
      else
        max_len = bs_length
        min_len = as_length
      end

      # Two characters only count as matching when their positions differ
      # by at most this much.
      search_range = [(max_len / 2) - 1, 0].max

      as_flag = Array.new(as_length, false)
      bs_flag = Array.new(bs_length, false)

      # Looking only within the search range, count and flag the matched pairs.
      common_chars = 0
      (0...as_length).each do |i|
        low_lim = [i - search_range, 0].max
        hi_lim = [i + search_range, bs_length - 1].min
        (low_lim..hi_lim).each do |j|
          next if bs_flag[j] || bs[j] != as[i]

          as_flag[i] = bs_flag[j] = true
          common_chars += 1
          break
        end
      end

      # If no characters in common - return
      return 0 if common_chars == 0

      # Count the number of transpositions: walk the matched characters of
      # both strings in order and count the positions where they disagree.
      as_matched = (0...as_length).select { |i| as_flag[i] }.map { |i| as[i] }
      bs_matched = (0...bs_length).select { |j| bs_flag[j] }.map { |j| bs[j] }
      mismatches = 0
      as_matched.each_with_index do |ch, idx|
        mismatches += 1 if ch != bs_matched[idx]
      end
      trans_count = mismatches / 2

      one_third = 1.0 / 3
      # Main weight computation.
      weight = ( one_third * common_chars / as_length +
                 one_third * common_chars / bs_length +
                 one_third * (common_chars - trans_count) / common_chars )

      # Continue to boost the weight if the strings are similar
      if winklerize && weight > 0.7
        # Character codes 48..57 are the ASCII digits '0'..'9'; digits never
        # count towards the agreeing prefix.
        non_digit = lambda { |ch| ch.ord > 57 || ch.ord < 48 }

        # Adjust for having up to the first 4 characters in common
        prefix_max = (min_len >= 4) ? 4 : min_len
        prefix = 0
        while prefix < prefix_max && as[prefix] == bs[prefix] && non_digit.call(as[prefix])
          prefix += 1
        end

        weight += prefix * 0.1 * (1.0 - weight) if prefix > 0

        # Optionally adjust for long strings.
        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if long_tolerance && min_len > 4 && common_chars > prefix + 1 &&
           2 * common_chars >= min_len + prefix && non_digit.call(as[0])
          weight += (1.0 - weight) * (common_chars - prefix - 1) /
                    (as_length + bs_length - prefix * 2 + 2).to_f
        end
      end

      weight
    end

    # Jaro-Winkler similarity of +a+ and +b+.
    #
    # opts is forwarded to +_distance+ (so :long_tolerance can be enabled);
    # :winklerize is always forced on.
    def distance a, b, opts = {}
      _distance(a, b, opts.merge(:winklerize => true))
    end

    module_function :distance
    module_function :_distance

  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module RubyFish::Levenshtein

  # Levenshtein edit distance between +a+ and +b+ (both converted with
  # +to_s+): the minimum number of single-character insertions, deletions
  # and substitutions that turn one string into the other.
  #
  # See http://en.wikipedia.org/wiki/Levenshtein_distance
  def distance a, b
    source = a.to_s
    target = b.to_s

    source_len = source.size
    target_len = target.size

    # table[r, c] is the distance between the first r characters of
    # +source+ and the first c characters of +target+.
    table = ::RubyFish::MMatrix.new source_len + 1, target_len + 1
    0.upto(source_len) { |r| table[r, 0] = r }
    0.upto(target_len) { |c| table[0, c] = c }

    1.upto(target_len) do |c|
      1.upto(source_len) do |r|
        table[r, c] =
          if source[r - 1] == target[c - 1]
            # Same character: nothing to pay on top of the shorter prefixes.
            table[r - 1, c - 1]
          else
            # Cheapest of deletion, insertion and substitution, plus one.
            [
              table[r - 1, c] + 1,
              table[r, c - 1] + 1,
              table[r - 1, c - 1] + 1
            ].min
          end
      end
    end

    table[source_len, target_len]
  end

  module_function :distance

end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module RubyFish::LongestSubsequence

  # Length of the longest common subsequence of +a+ and +b+ (both converted
  # with +to_s+). The characters of a subsequence keep their order but need
  # not be adjacent. Returns 0 when either string is empty.
  #
  # See http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_subsequence#Ruby
  def distance a, b
    left = a.to_s
    right = b.to_s

    left_len = left.size
    right_len = right.size

    return 0 if left_len == 0 || right_len == 0

    # table[r, c] is the LCS length of the first r characters of +left+ and
    # the first c characters of +right+; row 0 and column 0 keep the
    # matrix's initial value.
    table = ::RubyFish::MMatrix.new left_len + 1, right_len + 1

    1.upto(left_len) do |r|
      1.upto(right_len) do |c|
        table[r, c] =
          if left[r - 1] == right[c - 1]
            table[r - 1, c - 1] + 1
          else
            [table[r, c - 1], table[r - 1, c]].max
          end
      end
    end

    table[left_len, right_len]
  end

  module_function :distance
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module RubyFish
  # Longest common substring: the longest run of adjacent characters that
  # occurs in both strings.
  module LongestSubstring

    # Returns the length (Integer) of the longest common substring of +a+
    # and +b+, both converted with +to_s+. Returns 0 when either string is
    # empty or when the strings share no character at all.
    #
    # See http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring#Ruby
    def distance a, b
      as = a.to_s
      bs = b.to_s

      rows = as.size
      cols = bs.size

      return 0 if rows == 0 || cols == 0

      # num[i, j] is the length of the common run that ends at as[i] and
      # bs[j]; it is 0 when the two characters differ.
      num = ::RubyFish::MMatrix.new rows, cols
      longest = 0

      as.each_char.with_index do |ac, i|
        bs.each_char.with_index do |bc, j|
          if ac == bc
            # Extend the run that ended on the previous diagonal cell, or
            # start a new one on the first row / column.
            run = (i == 0 || j == 0) ? 1 : num[i - 1, j - 1] + 1
            num[i, j] = run
            longest = run if run > longest
          else
            num[i, j] = 0
          end
        end
      end

      longest
    end

    module_function :distance

  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module RubyFish
  # Minimal mutable two-dimensional matrix backed by an Array of row Arrays.
  # Every cell starts out as 0. Used as the scratch table of the
  # dynamic-programming algorithms in this gem.
  class MMatrix
    # nrows    - number of rows.
    # ncolumns - number of columns in each row.
    def initialize nrows, ncolumns
      # The block form gives each row its own Array, so rows never share
      # storage.
      @rows = Array.new(nrows) { Array.new(ncolumns, 0) }
    end

    # Returns the value stored at row +i+, column +j+.
    def [](i, j)
      @rows[i][j]
    end

    # Stores +v+ at row +i+, column +j+.
    def []=(i, j, v)
      @rows[i][j] = v
    end

    # Yields the (row, column) index pair of every cell, row by row.
    # Returns an Enumerator when called without a block.
    def each_index
      return enum_for(:each_index) unless block_given?

      @rows.each_with_index do |row, i|
        row.each_index do |j|
          yield i, j
        end
      end
    end

    # Yields (value, row, column) for every cell, row by row.
    # Returns an Enumerator when called without a block.
    def each_with_index
      return enum_for(:each_with_index) unless block_given?

      each_index do |i, j|
        yield self[i, j], i, j
      end
    end
  end
end
|
data/lib/rubyfish/mra.rb
ADDED
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rubyfish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: 0.0.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Yury Korolev
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-09-20 00:00:00 +04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Port of http://github.com/sunlightlabs/jellyfish
|
22
|
+
email:
|
23
|
+
- yury.korolev@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- lib/rubyfish/damerau_levenshtein.rb
|
32
|
+
- lib/rubyfish/hamming.rb
|
33
|
+
- lib/rubyfish/jaro.rb
|
34
|
+
- lib/rubyfish/jaro_winkler.rb
|
35
|
+
- lib/rubyfish/levenshtein.rb
|
36
|
+
- lib/rubyfish/longest_subsequence.rb
|
37
|
+
- lib/rubyfish/longest_substring.rb
|
38
|
+
- lib/rubyfish/mmatrix.rb
|
39
|
+
- lib/rubyfish/mra.rb
|
40
|
+
- lib/rubyfish/version.rb
|
41
|
+
- lib/rubyfish.rb
|
42
|
+
- LICENSE
|
43
|
+
- CHANGELOG.md
|
44
|
+
- README.md
|
45
|
+
- ROADMAP.md
|
46
|
+
has_rdoc: true
|
47
|
+
homepage: http://github.com/anjlab/rubyfish
|
48
|
+
licenses: []
|
49
|
+
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
|
53
|
+
require_paths:
|
54
|
+
- lib
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 663674839144737507
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 1
|
71
|
+
- 3
|
72
|
+
- 6
|
73
|
+
version: 1.3.6
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project: rubyfish
|
77
|
+
rubygems_version: 1.3.7
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: A new gem templates
|
81
|
+
test_files: []
|
82
|
+
|