rubyfish 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
File without changes
data/LICENSE ADDED
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2010, AnjLab
2
+
3
+ All rights reserved.
4
+
5
+ Redistribution and use in source and binary forms, with or without modification,
6
+ are permitted provided that the following conditions are met:
7
+
8
+ * Redistributions of source code must retain the above copyright notice,
9
+ this list of conditions and the following disclaimer.
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+ * Neither the name of Sunlight Labs nor the names of its contributors may be
14
+ used to endorse or promote products derived from this software without
15
+ specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,39 @@
1
+ =========
2
+ rubyfish
3
+ =========
4
+
5
+ RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for approximate and phonetic matching of strings.
6
+
7
+ RubyFish is a project of AnjLab (c) 2010.
8
+ All code is released under a BSD-style license, see LICENSE for details.
9
+
10
+ Originally written by
11
+ Michael Stephens <mstephens@sunlightfoundation.com> and James Turk
12
+ <jturk@sunlightfoundation.com>.
13
+
14
+ Ported by Yury Korolev <yury.korolev@gmail.com>
15
+ Source is available at http://github.com/anjlab/rubyfish
16
+
17
+ Included Algorithms
18
+ ===================
19
+
20
+ String comparison:
21
+
22
+ * Levenshtein Distance
23
+ * Damerau-Levenshtein Distance
24
+ * Jaro Distance
25
+ * Jaro-Winkler Distance
26
+ * Hamming Distance
27
+ * Longest Substring
28
+ * Longest Subsequence
29
+
30
+ Example Usage
31
+ =============
32
+
33
+ ruby-1.9.2-p0 > require 'rubyfish'
34
+ ruby-1.9.2-p0 > RubyFish::Levenshtein.distance("jellyfish", "rubyfish")
35
+ => 4
36
+ ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
37
+ => 0.7268518518518519
38
+ ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
39
+ => 1
@@ -0,0 +1,3 @@
1
+ - Port MRA
2
+ - Port NYSIIS
3
+ - Add Double Metaphone
@@ -0,0 +1,12 @@
1
+ #require "rubyfish/awesome"
2
+
3
# Top-level namespace of the RubyFish string-matching library.
#
# Each constant below is registered with +autoload+, so its file is only read
# the first time the constant is referenced. The paths are resolved against
# the load path, exactly as +require+ would resolve them.
module RubyFish
  autoload :Hamming,            'rubyfish/hamming'
  autoload :Levenshtein,        'rubyfish/levenshtein'
  autoload :DamerauLevenshtein, 'rubyfish/damerau_levenshtein'
  autoload :LongestSubstring,   'rubyfish/longest_substring'
  autoload :LongestSubsequence, 'rubyfish/longest_subsequence'
  autoload :Jaro,               'rubyfish/jaro'
  autoload :JaroWinkler,        'rubyfish/jaro_winkler'
  autoload :MMatrix,            'rubyfish/mmatrix'
  # Makes RubyFish::VERSION reachable after a plain `require 'rubyfish'`.
  autoload :VERSION,            'rubyfish/version'
end
@@ -0,0 +1,41 @@
1
+ require 'matrix'
2
+
3
module RubyFish
  # Edit distance that, besides insertion, deletion and substitution, counts
  # the swap of two adjacent characters as a single edit.
  module DamerauLevenshtein
    # Computes the edit distance between two values.
    #
    # a, b - values to compare; each is converted with #to_s.
    #
    # Returns the Integer number of edits.
    def distance(a, b)
      # Character arrays keep every lookup in the inner loop a plain array
      # index, independent of the string encoding.
      as = a.to_s.chars.to_a
      bs = b.to_s.chars.to_a

      rows = as.size + 1
      cols = bs.size + 1

      # dist[i, j] is the distance between the first i characters of +as+
      # and the first j characters of +bs+.
      dist = ::RubyFish::MMatrix.new(rows, cols)
      (0...rows).each { |i| dist[i, 0] = i }
      (0...cols).each { |j| dist[0, j] = j }

      (1...rows).each do |i|
        (1...cols).each do |j|
          cost = as[i - 1] == bs[j - 1] ? 0 : 1

          # Minimum of deletion, insertion and substitution.
          best = [
            dist[i - 1, j] + 1,
            dist[i, j - 1] + 1,
            dist[i - 1, j - 1] + cost
          ].min

          # Adjacent transposition. It needs two characters on each side,
          # i.e. i >= 2 and j >= 2; this also keeps i - 2 and j - 2 from
          # turning into negative (wrap-around) indexes.
          if i > 1 && j > 1 && as[i - 1] == bs[j - 2] && as[i - 2] == bs[j - 1]
            best = [best, dist[i - 2, j - 2] + cost].min
          end

          dist[i, j] = best
        end
      end

      dist[as.size, bs.size]
    end

    module_function :distance
  end
end
@@ -0,0 +1,14 @@
1
module RubyFish
  # Hamming distance extended to strings of unequal length.
  module Hamming
    # Counts the positions at which two values differ.
    #
    # a, b - values to compare; each is converted with #to_s.
    #
    # Returns the Integer number of differing positions. Every position the
    # longer string has beyond the end of the shorter one counts as a
    # difference.
    def distance(a, b)
      # Order by length, not lexicographically: zip must run over the longer
      # string so the surplus characters pair with nil and are counted.
      shorter, longer = [a.to_s, b.to_s].sort_by(&:size)

      longer.chars.zip(shorter.chars).count { |lc, sc| lc != sc }
    end

    module_function :distance
  end
end
@@ -0,0 +1,11 @@
1
module RubyFish::Jaro
  # JaroWinkler is mixed in both ways so that its shared `_distance`
  # implementation is callable from instances and from the module itself.
  include ::RubyFish::JaroWinkler
  extend ::RubyFish::JaroWinkler

  module_function

  # Plain Jaro score of two values: delegates to `_distance` with the
  # Winkler adjustment switched off.
  #
  # a, b - values to compare.
  #
  # Returns whatever `_distance` returns for these arguments.
  def distance(a, b)
    options = { :winklerize => false }
    _distance(a, b, options)
  end
end
@@ -0,0 +1,107 @@
1
module RubyFish
  # Jaro and Jaro-Winkler string similarity. Higher scores mean more similar
  # strings.
  module JaroWinkler
    # Shared implementation behind the Jaro and Jaro-Winkler scores.
    #
    # a, b - values to compare; each is converted with #to_s.
    # opts - Hash of options:
    #        :winklerize     - when truthy, a score above 0.7 is boosted for
    #                          a shared prefix of up to four non-digit
    #                          characters.
    #        :long_tolerance - when truthy (and :winklerize is set), applies
    #                          a further boost for longer strings.
    #
    # Returns the Integer 1 when both strings are empty, the Integer 0 when
    # exactly one is empty or no characters match, and a Float otherwise.
    def _distance(a, b, opts = {})
      long_tolerance = opts[:long_tolerance]
      winklerize = opts[:winklerize]

      as = a.to_s.chars.to_a
      bs = b.to_s.chars.to_a
      as_length = as.size
      bs_length = bs.size

      return 1 if as_length == 0 && bs_length == 0
      return 0 if as_length == 0 || bs_length == 0

      min_len = [as_length, bs_length].min

      # Characters only match when their positions are at most this far apart.
      search_range = [as_length, bs_length].max / 2 - 1
      search_range = 0 if search_range < 0

      as_flag = Array.new(as_length, false)
      bs_flag = Array.new(bs_length, false)

      # Looking only within the search range, count and flag the matched pairs.
      common_chars = 0
      as.each_with_index do |ch, i|
        low_lim = [i - search_range, 0].max
        hi_lim = [i + search_range, bs_length - 1].min
        (low_lim..hi_lim).each do |j|
          next if bs_flag[j] || bs[j] != ch

          as_flag[i] = bs_flag[j] = true
          common_chars += 1
          break
        end
      end

      # If no characters in common - return
      return 0 if common_chars == 0

      # Count the transpositions: walk the matched characters of both strings
      # in order and count the positions where they disagree. Each swapped
      # pair shows up twice, hence the halving.
      a_matched = as.select.with_index { |_, i| as_flag[i] }
      b_matched = bs.select.with_index { |_, j| bs_flag[j] }
      trans_count = a_matched.zip(b_matched).count { |x, y| x != y } / 2

      # Main weight computation.
      one_third = 1.0 / 3
      weight = (one_third * common_chars / as_length +
                one_third * common_chars / bs_length +
                one_third * (common_chars - trans_count) / common_chars)

      # Continue to boost the weight if the strings are similar
      if winklerize && weight > 0.7
        # Character codes 48..57 are the ASCII digits '0'..'9'.
        non_digit = lambda do |ch|
          code = ch.ord
          code > 57 || code < 48
        end

        # Adjust for having up to the first 4 characters in common
        prefix_cap = [min_len, 4].min
        prefix = (0...prefix_cap).take_while do |k|
          as[k] == bs[k] && non_digit.call(as[k])
        end.size

        weight += prefix * 0.1 * (1.0 - weight) if prefix > 0

        # Optionally adjust for long strings.
        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if long_tolerance && min_len > 4 && common_chars > prefix + 1 &&
           2 * common_chars >= min_len + prefix && non_digit.call(as[0])
          weight += (1.0 - weight) * (common_chars - prefix - 1) /
                    (as_length + bs_length - prefix * 2 + 2).to_f
        end
      end

      weight
    end

    # Jaro-Winkler score of two values.
    #
    # a, b - values to compare; each is converted with #to_s.
    # opts - Hash of options forwarded to `_distance` (e.g. :long_tolerance).
    #        :winklerize is always forced on.
    #
    # Returns the same kind of value as `_distance`.
    def distance(a, b, opts = {})
      _distance(a, b, (opts || {}).merge(:winklerize => true))
    end

    module_function :distance
    module_function :_distance
  end
end
@@ -0,0 +1,36 @@
1
module RubyFish
  # Levenshtein edit distance.
  # http://en.wikipedia.org/wiki/Levenshtein_distance
  module Levenshtein
    # Minimum number of single-character insertions, deletions and
    # substitutions needed to turn one value into the other.
    #
    # a, b - values to compare; each is converted with #to_s.
    #
    # Returns the Integer edit distance.
    def distance(a, b)
      # Character arrays keep every lookup in the inner loop a plain array
      # index, independent of the string encoding.
      as = a.to_s.chars.to_a
      bs = b.to_s.chars.to_a

      rows = as.size + 1
      cols = bs.size + 1

      # dist[i, j] is the distance between the first i characters of +as+
      # and the first j characters of +bs+.
      dist = ::RubyFish::MMatrix.new(rows, cols)
      (0...rows).each { |i| dist[i, 0] = i }
      (0...cols).each { |j| dist[0, j] = j }

      (1...cols).each do |j|
        (1...rows).each do |i|
          dist[i, j] =
            if as[i - 1] == bs[j - 1]
              # Same character: nothing to pay on top of the prefixes.
              dist[i - 1, j - 1]
            else
              # Cheapest of deletion, insertion and substitution.
              [dist[i - 1, j], dist[i, j - 1], dist[i - 1, j - 1]].min + 1
            end
        end
      end

      dist[as.size, bs.size]
    end

    module_function :distance
  end
end
@@ -0,0 +1,34 @@
1
module RubyFish
  # Longest common subsequence.
  # http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_subsequence#Ruby
  module LongestSubsequence
    # Length of the longest sequence of characters that appears, in order but
    # not necessarily contiguously, in both values.
    #
    # a, b - values to compare; each is converted with #to_s.
    #
    # Returns the Integer length (0 when either string is empty).
    def distance(a, b)
      # Character arrays keep every lookup in the inner loop a plain array
      # index, independent of the string encoding.
      as = a.to_s.chars.to_a
      bs = b.to_s.chars.to_a

      rows = as.size
      cols = bs.size
      return 0 if rows == 0 || cols == 0

      # num[i, j] is the answer for the first i characters of +as+ and the
      # first j characters of +bs+; row 0 and column 0 stay at their initial
      # value.
      num = ::RubyFish::MMatrix.new(rows + 1, cols + 1)

      (1..rows).each do |i|
        (1..cols).each do |j|
          num[i, j] =
            if as[i - 1] == bs[j - 1]
              num[i - 1, j - 1] + 1
            else
              [num[i, j - 1], num[i - 1, j]].max
            end
        end
      end

      num[rows, cols]
    end

    module_function :distance
  end
end
@@ -0,0 +1,34 @@
1
module RubyFish
  # Longest common substring.
  # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring#Ruby
  module LongestSubstring
    # Length of the longest run of consecutive characters that appears in
    # both values.
    #
    # a, b - values to compare; each is converted with #to_s.
    #
    # Returns the Integer length; 0 when either string is empty or when the
    # strings share no character.
    def distance(a, b)
      as = a.to_s.chars.to_a
      bs = b.to_s.chars.to_a
      return 0 if as.empty? || bs.empty?

      # run[i, j] is the length of the common run that ends at as[i] and
      # bs[j].
      run = ::RubyFish::MMatrix.new(as.size, bs.size)
      # Starts at 0 so that inputs without any common character yield 0.
      longest = 0

      as.each_with_index do |ac, i|
        bs.each_with_index do |bc, j|
          if ac != bc
            run[i, j] = 0
          else
            # On the first row/column there is no previous cell to extend.
            run[i, j] = (i.zero? || j.zero?) ? 1 : run[i - 1, j - 1] + 1
            longest = run[i, j] if run[i, j] > longest
          end
        end
      end

      longest
    end

    module_function :distance
  end
end
@@ -0,0 +1,29 @@
1
module RubyFish
  # Minimal mutable two-dimensional table addressed as matrix[i, j].
  # Every cell starts out as 0.
  class MMatrix
    # nrows    - Integer number of rows.
    # ncolumns - Integer number of columns.
    def initialize(nrows, ncolumns)
      # The outer block builds a separate inner array for every row.
      @rows = Array.new(nrows) { Array.new(ncolumns, 0) }
    end

    # Returns the value stored at row i, column j.
    def [](i, j)
      @rows[i][j]
    end

    # Stores v at row i, column j.
    def []=(i, j, v)
      @rows[i][j] = v
    end

    # Yields the (i, j) index pair of every cell, row by row.
    # Returns an Enumerator when called without a block.
    def each_index
      return enum_for(:each_index) unless block_given?

      @rows.each_with_index do |row, i|
        row.each_index { |j| yield i, j }
      end
    end

    # Yields (value, i, j) for every cell, row by row.
    # Returns an Enumerator when called without a block.
    def each_with_index
      return enum_for(:each_with_index) unless block_given?

      each_index { |i, j| yield self[i, j], i, j }
    end
  end
end
@@ -0,0 +1,3 @@
1
module RubyFish
  # Placeholder namespace; it defines no methods yet.
  module MRA
  end
end
@@ -0,0 +1,3 @@
1
module RubyFish
  # Version of the library. Frozen so the shared constant string cannot be
  # mutated in place.
  VERSION = "0.0.2".freeze
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rubyfish
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 2
9
+ version: 0.0.2
10
+ platform: ruby
11
+ authors:
12
+ - Yury Korolev
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-09-20 00:00:00 +04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Port of http://github.com/sunlightlabs/jellyfish
22
+ email:
23
+ - yury.korolev@gmail.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - lib/rubyfish/damerau_levenshtein.rb
32
+ - lib/rubyfish/hamming.rb
33
+ - lib/rubyfish/jaro.rb
34
+ - lib/rubyfish/jaro_winkler.rb
35
+ - lib/rubyfish/levenshtein.rb
36
+ - lib/rubyfish/longest_subsequence.rb
37
+ - lib/rubyfish/longest_substring.rb
38
+ - lib/rubyfish/mmatrix.rb
39
+ - lib/rubyfish/mra.rb
40
+ - lib/rubyfish/version.rb
41
+ - lib/rubyfish.rb
42
+ - LICENSE
43
+ - CHANGELOG.md
44
+ - README.md
45
+ - ROADMAP.md
46
+ has_rdoc: true
47
+ homepage: http://github.com/anjlab/rubyfish
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options: []
52
+
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 663674839144737507
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 1
71
+ - 3
72
+ - 6
73
+ version: 1.3.6
74
+ requirements: []
75
+
76
+ rubyforge_project: rubyfish
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: A new gem templates
81
+ test_files: []
82
+