rubyfish 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +0 -0
- data/LICENSE +27 -0
- data/README.md +39 -0
- data/ROADMAP.md +3 -0
- data/lib/rubyfish.rb +12 -0
- data/lib/rubyfish/damerau_levenshtein.rb +41 -0
- data/lib/rubyfish/hamming.rb +14 -0
- data/lib/rubyfish/jaro.rb +11 -0
- data/lib/rubyfish/jaro_winkler.rb +107 -0
- data/lib/rubyfish/levenshtein.rb +36 -0
- data/lib/rubyfish/longest_subsequence.rb +34 -0
- data/lib/rubyfish/longest_substring.rb +34 -0
- data/lib/rubyfish/mmatrix.rb +29 -0
- data/lib/rubyfish/mra.rb +3 -0
- data/lib/rubyfish/version.rb +3 -0
- metadata +82 -0
data/CHANGELOG.md
ADDED
File without changes
|
data/LICENSE
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
Copyright (c) 2010, AnjLab
|
2
|
+
|
3
|
+
All rights reserved.
|
4
|
+
|
5
|
+
Redistribution and use in source and binary forms, with or without modification,
|
6
|
+
are permitted provided that the following conditions are met:
|
7
|
+
|
8
|
+
* Redistributions of source code must retain the above copyright notice,
|
9
|
+
this list of conditions and the following disclaimer.
|
10
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer in the documentation
|
12
|
+
and/or other materials provided with the distribution.
|
13
|
+
* Neither the name of Sunlight Labs nor the names of its contributors may be
|
14
|
+
used to endorse or promote products derived from this software without
|
15
|
+
specific prior written permission.
|
16
|
+
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
18
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
19
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
20
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
21
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
22
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
23
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
24
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
25
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
26
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
27
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
=========
|
2
|
+
rubyfish
|
3
|
+
=========
|
4
|
+
|
5
|
+
RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for approximate and phonetic matching of strings.
|
6
|
+
|
7
|
+
RubyFish is a project of AnjLab (c) 2010.
|
8
|
+
All code is released under a BSD-style license, see LICENSE for details.
|
9
|
+
|
10
|
+
Originally written by
|
11
|
+
Michael Stephens <mstephens@sunlightfoundation.com> and James Turk
|
12
|
+
<jturk@sunlightfoundation.com>.
|
13
|
+
|
14
|
+
Ported by Yury Korolev <yury.korolev@gmail.com>
|
15
|
+
Source is available at http://github.com/anjlab/rubyfish
|
16
|
+
|
17
|
+
Included Algorithms
|
18
|
+
===================
|
19
|
+
|
20
|
+
String comparison:
|
21
|
+
|
22
|
+
* Levenshtein Distance
|
23
|
+
* Damerau-Levenshtein Distance
|
24
|
+
* Jaro Distance
|
25
|
+
* Jaro-Winkler Distance
|
26
|
+
* Hamming Distance
|
27
|
+
* Longest Substring
|
28
|
+
* Longest Subsequence
|
29
|
+
|
30
|
+
Example Usage
|
31
|
+
=============
|
32
|
+
|
33
|
+
ruby-1.9.2-p0 > require 'rubyfish'
|
34
|
+
ruby-1.9.2-p0 > RubyFish::Levenshtein.distance("jellyfish", "rubyfish")
|
35
|
+
=> 4
|
36
|
+
ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
|
37
|
+
=> 0.7268518518518519
|
38
|
+
ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
|
39
|
+
=> 1
|
data/ROADMAP.md
ADDED
data/lib/rubyfish.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#require "rubyfish/awesome"
|
2
|
+
|
3
|
+
# Top-level namespace of the gem. Each algorithm lives in its own file and is
# registered for lazy loading, so a file is only read the first time its
# constant is referenced.
module RubyFish
  # Constant name => feature path handed to Module#autoload.
  {
    :Hamming            => 'rubyfish/hamming',
    :Levenshtein        => 'rubyfish/levenshtein',
    :DamerauLevenshtein => 'rubyfish/damerau_levenshtein',
    :LongestSubstring   => 'rubyfish/longest_substring',
    :LongestSubsequence => 'rubyfish/longest_subsequence',
    :Jaro               => 'rubyfish/jaro',
    :JaroWinkler        => 'rubyfish/jaro_winkler',
    :MMatrix            => 'rubyfish/mmatrix'
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
|
3
|
+
module RubyFish
  # Edit distance that counts insertions, deletions, substitutions and swaps
  # of two adjacent characters.
  module DamerauLevenshtein
    # Computes the edit distance between the string forms of +a+ and +b+.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @return [Integer] number of edits needed to turn +a+ into +b+
    def distance(a, b)
      as = a.to_s
      bs = b.to_s

      rows = as.size + 1
      cols = bs.size + 1

      # dist[i, j] holds the distance between the first i characters of +as+
      # and the first j characters of +bs+.
      dist = ::RubyFish::MMatrix.new(rows, cols)

      # Turning a prefix into the empty string (or back) costs its length.
      (0...rows).each { |i| dist[i, 0] = i }
      (0...cols).each { |j| dist[0, j] = j }

      (1...rows).each do |i|
        (1...cols).each do |j|
          cost = as[i - 1] == bs[j - 1] ? 0 : 1

          # Cheapest of deletion, insertion and substitution.
          best = [
            dist[i - 1, j] + 1,
            dist[i, j - 1] + 1,
            dist[i - 1, j - 1] + cost
          ].min

          # Adjacent transposition: the last two characters of both prefixes
          # are swapped. Needs two characters on each side, i.e. i > 1 and
          # j > 1, so a swap at the very start of the strings is seen too.
          if i > 1 && j > 1 && as[i - 1] == bs[j - 2] && as[i - 2] == bs[j - 1]
            best = [best, dist[i - 2, j - 2] + cost].min
          end

          dist[i, j] = best
        end
      end

      dist[as.size, bs.size]
    end

    module_function :distance
  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module RubyFish
  # Jaro and Jaro-Winkler string similarity. Despite the method names the
  # result is a similarity score: higher means more alike.
  module JaroWinkler
    # Shared implementation behind the Jaro and Jaro-Winkler scores.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @param opts [Hash] options
    # @option opts [Boolean] :winklerize boost the score for a shared prefix
    #   of up to four non-digit characters (only when the base score > 0.7)
    # @option opts [Boolean] :long_tolerance additional boost for longer
    #   strings; only considered together with +:winklerize+
    # @return [Float, Integer] similarity score; the Integer 1 when both
    #   strings are empty, the Integer 0 when exactly one is empty or no
    #   characters match
    def _distance(a, b, opts = {})
      long_tolerance = opts[:long_tolerance]
      winklerize = opts[:winklerize]

      as = a.to_s
      bs = b.to_s

      as_length = as.size
      bs_length = bs.size

      return 1 if as_length == 0 && bs_length == 0
      return 0 if as_length == 0 || bs_length == 0

      min_len = [as_length, bs_length].min

      # Characters only match if their positions differ by at most this much.
      search_range = [as_length, bs_length].max / 2 - 1
      search_range = 0 if search_range < 0

      as_flag = Array.new(as_length, false)
      bs_flag = Array.new(bs_length, false)

      # Looking only within the search range, count and flag the matched pairs.
      common_chars = 0
      (0...as_length).each do |i|
        low_lim = [i - search_range, 0].max
        hi_lim = [i + search_range, bs_length - 1].min
        (low_lim..hi_lim).each do |j|
          next if bs_flag[j] || bs[j] != as[i]

          as_flag[i] = bs_flag[j] = true
          common_chars += 1
          break
        end
      end

      # No characters in common.
      return 0 if common_chars == 0

      # Count transpositions: walk the matched characters of both strings in
      # order and count the positions where they disagree.
      k = 0
      trans_count = 0
      (0...as_length).each do |i|
        next unless as_flag[i]

        # Both strings have the same number of flagged characters, so a next
        # flagged position in +bs+ always exists here.
        j = (k...bs_length).find { |idx| bs_flag[idx] }
        k = j + 1
        trans_count += 1 if as[i] != bs[j]
      end
      trans_count /= 2

      # Main weight computation (expression order kept so results stay
      # bit-for-bit stable).
      one_third = 1.0 / 3
      weight = (one_third * common_chars / as_length +
                one_third * common_chars / bs_length +
                one_third * (common_chars - trans_count) / common_chars)

      # Continue to boost the weight if the strings are similar.
      if winklerize && weight > 0.7
        # 48..57 are the code points of '0'..'9'; digits never count towards
        # the shared prefix.
        non_digit = lambda { |ch| ch.ord > 57 || ch.ord < 48 }

        # Adjust for having up to the first 4 characters in common.
        prefix_limit = [min_len, 4].min
        i = 0
        i += 1 while i < prefix_limit && as[i] == bs[i] && non_digit.call(as[i])

        weight += i * 0.1 * (1.0 - weight) if i > 0

        # Optionally adjust for long strings.
        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if long_tolerance && min_len > 4 && common_chars > i + 1 &&
           2 * common_chars >= min_len + i && non_digit.call(as[0])
          weight += (1.0 - weight) * (common_chars - i - 1) /
                    (as_length + bs_length - i * 2 + 2).to_f
        end
      end

      weight
    end

    # Jaro-Winkler similarity of the string forms of +a+ and +b+.
    #
    # +:winklerize+ is always forced on; any other option (for example
    # +:long_tolerance+) is passed through to {_distance}.
    #
    # @param a [#to_s] first value
    # @param b [#to_s] second value
    # @param opts [Hash] options forwarded to {_distance}
    # @return [Float, Integer] see {_distance}
    def distance(a, b, opts = {})
      _distance(a, b, opts.merge(:winklerize => true))
    end

    module_function :distance, :_distance
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module RubyFish
  # Levenshtein edit distance.
  # http://en.wikipedia.org/wiki/Levenshtein_distance
  module Levenshtein
    # Counts the single-character insertions, deletions and substitutions
    # needed to turn the string form of +a+ into the string form of +b+.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @return [Integer] the edit distance
    def distance(a, b)
      as = a.to_s
      bs = b.to_s

      rows = as.size + 1
      cols = bs.size + 1

      # dist[i, j] is the distance between the first i characters of +as+
      # and the first j characters of +bs+.
      dist = ::RubyFish::MMatrix.new(rows, cols)
      (0...rows).each { |i| dist[i, 0] = i }
      (0...cols).each { |j| dist[0, j] = j }

      (1...cols).each do |j|
        (1...rows).each do |i|
          dist[i, j] =
            if as[i - 1] == bs[j - 1]
              # Same character: nothing to pay on top of the shorter prefixes.
              dist[i - 1, j - 1]
            else
              # One edit on top of the cheapest of deletion, insertion and
              # substitution.
              [dist[i - 1, j], dist[i, j - 1], dist[i - 1, j - 1]].min + 1
            end
        end
      end

      dist[as.size, bs.size]
    end

    module_function :distance
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module RubyFish
  # Longest common subsequence.
  # http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_subsequence#Ruby
  module LongestSubsequence
    # Length of the longest sequence of characters that appears, in order but
    # not necessarily contiguously, in the string forms of both +a+ and +b+.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @return [Integer] length of the longest common subsequence (0 if either
    #   string is empty)
    def distance(a, b)
      as = a.to_s
      bs = b.to_s

      rows = as.size
      cols = bs.size
      return 0 if rows == 0 || cols == 0

      # num[i, j] is the answer for the first i characters of +as+ and the
      # first j characters of +bs+; row 0 and column 0 stay at 0.
      num = ::RubyFish::MMatrix.new(rows + 1, cols + 1)

      (1..rows).each do |i|
        (1..cols).each do |j|
          num[i, j] =
            if as[i - 1] == bs[j - 1]
              # Matching characters extend the subsequence of both prefixes.
              num[i - 1, j - 1] + 1
            else
              # Otherwise carry over the better result of dropping one
              # character from either string.
              [num[i, j - 1], num[i - 1, j]].max
            end
        end
      end

      num[rows, cols]
    end

    module_function :distance
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module RubyFish::LongestSubstring
|
2
|
+
|
3
|
+
# http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring#Ruby
|
4
|
+
def distance a, b
|
5
|
+
as = a.to_s
|
6
|
+
bs = b.to_s
|
7
|
+
|
8
|
+
rows = as.size
|
9
|
+
cols = bs.size
|
10
|
+
|
11
|
+
if rows == 0 || cols == 0
|
12
|
+
return 0
|
13
|
+
end
|
14
|
+
|
15
|
+
num= ::RubyFish::MMatrix.new rows, cols
|
16
|
+
len,ans=0
|
17
|
+
|
18
|
+
as.each_char.with_index do |ac, i|
|
19
|
+
bs.each_char.with_index do |bc, j|
|
20
|
+
unless ac == bc
|
21
|
+
num[i, j]=0
|
22
|
+
else
|
23
|
+
(i==0 || j==0)? num[i, j] = 1 : num[i, j] = 1 + num[i-1, j-1]
|
24
|
+
len = ans = num[i, j] if num[i, j] > len
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
ans
|
30
|
+
end
|
31
|
+
|
32
|
+
module_function :distance
|
33
|
+
|
34
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module RubyFish
  # Minimal mutable two-dimensional grid, stored as an array of row arrays.
  # Every cell starts out as 0.
  class MMatrix
    # @param nrows [Integer] number of rows
    # @param ncolumns [Integer] number of columns
    def initialize(nrows, ncolumns)
      @rows = Array.new(nrows) { Array.new(ncolumns, 0) }
    end

    # Reads the cell at row +i+, column +j+.
    def [](i, j)
      @rows[i][j]
    end

    # Stores +v+ in the cell at row +i+, column +j+.
    def []=(i, j, v)
      @rows[i][j] = v
    end

    # Yields the (row, column) index pair of every cell, row by row.
    #
    # @yieldparam i [Integer] row index
    # @yieldparam j [Integer] column index
    # @return [Enumerator] when called without a block
    def each_index
      return enum_for(:each_index) unless block_given?

      @rows.each_with_index do |row, i|
        row.each_index { |j| yield i, j }
      end
    end

    # Yields every cell value together with its row and column index.
    #
    # @yieldparam value [Object] the cell content
    # @yieldparam i [Integer] row index
    # @yieldparam j [Integer] column index
    # @return [Enumerator] when called without a block
    def each_with_index
      return enum_for(:each_with_index) unless block_given?

      each_index { |i, j| yield self[i, j], i, j }
    end
  end
end
|
data/lib/rubyfish/mra.rb
ADDED
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rubyfish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: 0.0.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Yury Korolev
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-09-20 00:00:00 +04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Port of http://github.com/sunlightlabs/jellyfish
|
22
|
+
email:
|
23
|
+
- yury.korolev@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- lib/rubyfish/damerau_levenshtein.rb
|
32
|
+
- lib/rubyfish/hamming.rb
|
33
|
+
- lib/rubyfish/jaro.rb
|
34
|
+
- lib/rubyfish/jaro_winkler.rb
|
35
|
+
- lib/rubyfish/levenshtein.rb
|
36
|
+
- lib/rubyfish/longest_subsequence.rb
|
37
|
+
- lib/rubyfish/longest_substring.rb
|
38
|
+
- lib/rubyfish/mmatrix.rb
|
39
|
+
- lib/rubyfish/mra.rb
|
40
|
+
- lib/rubyfish/version.rb
|
41
|
+
- lib/rubyfish.rb
|
42
|
+
- LICENSE
|
43
|
+
- CHANGELOG.md
|
44
|
+
- README.md
|
45
|
+
- ROADMAP.md
|
46
|
+
has_rdoc: true
|
47
|
+
homepage: http://github.com/anjlab/rubyfish
|
48
|
+
licenses: []
|
49
|
+
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
|
53
|
+
require_paths:
|
54
|
+
- lib
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 663674839144737507
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 1
|
71
|
+
- 3
|
72
|
+
- 6
|
73
|
+
version: 1.3.6
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project: rubyfish
|
77
|
+
rubygems_version: 1.3.7
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: A new gem templates
|
81
|
+
test_files: []
|
82
|
+
|