rubyfish 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
data/LICENSE ADDED
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2010, AnjLab
2
+
3
+ All rights reserved.
4
+
5
+ Redistribution and use in source and binary forms, with or without modification,
6
+ are permitted provided that the following conditions are met:
7
+
8
+ * Redistributions of source code must retain the above copyright notice,
9
+ this list of conditions and the following disclaimer.
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+ * Neither the name of Sunlight Labs nor the names of its contributors may be
14
+ used to endorse or promote products derived from this software without
15
+ specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,39 @@
1
+ =========
2
+ RubyFish
3
+ =========
4
+
5
+ RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for approximate and phonetic matching of strings.
6
+
7
+ RubyFish is a project of AnjLab (c) 2010.
8
+ All code is released under a BSD-style license, see LICENSE for details.
9
+
10
+ jellyfish was originally written by
11
+ Michael Stephens <mstephens@sunlightfoundation.com> and James Turk
12
+ <jturk@sunlightfoundation.com>.
13
+
14
+ Ported by Yury Korolev <yury.korolev@gmail.com>
15
+ Source is available at http://github.com/anjlab/rubyfish
16
+
17
+ Included Algorithms
18
+ ===================
19
+
20
+ String comparison:
21
+
22
+ * Levenshtein Distance
23
+ * Damerau-Levenshtein Distance
24
+ * Jaro Distance
25
+ * Jaro-Winkler Distance
26
+ * Hamming Distance
27
+ * Longest Substring
28
+ * Longest Subsequence
29
+
30
+ Example Usage
31
+ =============
32
+
33
+ ruby-1.9.2-p0 > require 'rubyfish'
34
+ ruby-1.9.2-p0 > RubyFish::Levenshtein.distance("jellyfish", "rubyfish")
35
+ => 4
36
+ ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
37
+ => 0.7268518518518519
38
+ ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
39
+ => 1
@@ -0,0 +1,3 @@
1
+ - Port MRA
2
+ - Port NYSIIS
3
+ - Add Double Metaphone
@@ -0,0 +1,12 @@
1
+ #require "rubyfish/awesome"
2
+
3
# Top-level namespace of the gem.
#
# Every constant below is registered with +autoload+, so the file that
# defines it is only required the first time the constant is referenced.
module RubyFish
  # The version constant lives in its own file; registering it here makes
  # RubyFish::VERSION reachable after a plain `require 'rubyfish'`.
  autoload :VERSION, 'rubyfish/version'

  autoload :Hamming, 'rubyfish/hamming'
  autoload :Levenshtein, 'rubyfish/levenshtein'
  autoload :DamerauLevenshtein, 'rubyfish/damerau_levenshtein'
  autoload :LongestSubstring, 'rubyfish/longest_substring'
  autoload :LongestSubsequence, 'rubyfish/longest_subsequence'
  autoload :Jaro, 'rubyfish/jaro'
  autoload :JaroWinkler, 'rubyfish/jaro_winkler'
  autoload :MMatrix, 'rubyfish/mmatrix'
end
@@ -0,0 +1,41 @@
1
+ require 'matrix'
2
+
3
module RubyFish
  # Edit distance that counts insertions, deletions, substitutions and
  # swaps of two adjacent characters as one edit each (the restricted,
  # "optimal string alignment" form of Damerau-Levenshtein).
  module DamerauLevenshtein

    # Computes the distance between the string forms of +a+ and +b+.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @return [Integer] number of edits needed to turn +a+ into +b+
    def distance(a, b)
      as = a.to_s
      bs = b.to_s

      rows = as.size + 1
      cols = bs.size + 1

      # dist[i, j] holds the distance between the first i characters of
      # +as+ and the first j characters of +bs+.
      dist = ::RubyFish::MMatrix.new(rows, cols)

      (0...rows).each { |i| dist[i, 0] = i }
      (0...cols).each { |j| dist[0, j] = j }

      (1...rows).each do |i|
        (1...cols).each do |j|
          cost = as[i - 1] == bs[j - 1] ? 0 : 1

          # minimum of deletion, insertion, substitution
          best = [
            dist[i - 1, j] + 1,
            dist[i, j - 1] + 1,
            dist[i - 1, j - 1] + cost
          ].min

          # A transposition needs two characters on each side, i.e. i and j
          # of at least 2. The guard is "> 1" so that a swap of the very
          # first two characters ("ab" vs "ba") is recognised as well.
          if i > 1 && j > 1 && as[i - 1] == bs[j - 2] && as[i - 2] == bs[j - 1]
            best = [best, dist[i - 2, j - 2] + cost].min
          end

          dist[i, j] = best
        end
      end

      dist[as.size, bs.size]
    end

    module_function :distance
  end
end
@@ -0,0 +1,14 @@
1
module RubyFish
  # Hamming distance extended to strings of unequal length.
  module Hamming

    # Counts the positions at which the string forms of +a+ and +b+ differ.
    # Positions that exist only in the longer string count as differences.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @return [Integer] number of differing positions
    def distance(a, b)
      # Order by LENGTH, not lexicographically: zip must be driven by the
      # longer string so that its surplus characters are paired with nil
      # and therefore counted as mismatches.
      shorter, longer = [a.to_s, b.to_s].sort_by(&:size)

      longer.chars.to_a.zip(shorter.chars.to_a).count { |lc, sc| lc != sc }
    end

    module_function :distance
  end
end
@@ -0,0 +1,11 @@
1
module RubyFish::Jaro
  # JaroWinkler is mixed in both ways so that its _distance routine can be
  # called without a receiver from the module-level copy of +distance+ as
  # well as from the instance-level one.
  extend ::RubyFish::JaroWinkler
  include ::RubyFish::JaroWinkler

  # Jaro similarity of +a+ and +b+: delegates to the shared _distance
  # routine with the Winkler adjustment switched off.
  def distance(a, b)
    options = { :winklerize => false }
    _distance(a, b, options)
  end

  module_function :distance

end
@@ -0,0 +1,107 @@
1
module RubyFish
  # Jaro and Jaro-Winkler string similarity.
  module JaroWinkler

    # Shared implementation behind the Jaro and Jaro-Winkler measures.
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @param opts [Hash] options:
    #   :winklerize     - when truthy and the base weight exceeds 0.7, boost
    #                     the weight for up to 4 agreeing leading characters
    #   :long_tolerance - when truthy (and winklerizing), apply the extra
    #                     adjustment for long strings
    # @return [Numeric] 1 when both strings are empty, 0 when exactly one is
    #   empty or no characters match, otherwise a Float weight
    def _distance(a, b, opts = {})
      long_tolerance = opts[:long_tolerance]
      winklerize = opts[:winklerize]

      as = a.to_s
      bs = b.to_s

      as_length = as.size
      bs_length = bs.size

      return 1 if as_length == 0 && bs_length == 0
      return 0 if as_length == 0 || bs_length == 0

      if as_length > bs_length
        max_len = as_length
        min_len = bs_length
      else
        max_len = bs_length
        min_len = as_length
      end

      # Characters only match when their positions are at most this far apart.
      search_range = [(max_len / 2) - 1, 0].max

      as_flag = Array.new(as_length, false)
      bs_flag = Array.new(bs_length, false)

      # Looking only within the search range, count and flag the matched pairs.
      common_chars = 0
      (0...as_length).each do |i|
        low_lim = [i - search_range, 0].max
        hi_lim = [i + search_range, bs_length - 1].min
        (low_lim..hi_lim).each do |j|
          next if bs_flag[j] || bs[j] != as[i]

          as_flag[i] = bs_flag[j] = true
          common_chars += 1
          break
        end
      end

      # If no characters in common - return
      return 0 if common_chars == 0

      # Count the number of transpositions: walk the flagged characters of
      # both strings in order and compare them pairwise.
      k = 0
      trans_count = 0
      (0...as_length).each do |i|
        next unless as_flag[i]

        # Both strings carry the same number of flags, so a flagged
        # position at or after k always exists here.
        j = (k...bs_length).find { |idx| bs_flag[idx] }
        k = j + 1
        trans_count += 1 if as[i] != bs[j]
      end
      trans_count /= 2

      one_third = 1.0 / 3
      # Main weight computation.
      weight = ( one_third * common_chars / as_length +
                 one_third * common_chars / bs_length +
                 one_third * (common_chars - trans_count) / common_chars )

      # Continue to boost the weight if the strings are similar
      if winklerize && weight > 0.7
        # Character codes 48..57 are the ASCII digits '0'..'9'.
        non_digit = lambda { |ch| ch.ord > 57 || ch.ord < 48 }

        # Adjust for having up to the first 4 characters in common
        # (the run stops at the first disagreement or digit).
        limit = [min_len, 4].min
        prefix = 0
        prefix += 1 while prefix < limit && as[prefix] == bs[prefix] && non_digit.call(as[prefix])

        weight += prefix * 0.1 * (1.0 - weight) if prefix > 0

        # Optionally adjust for long strings.
        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if long_tolerance && min_len > 4 && common_chars > prefix + 1 &&
           2 * common_chars >= min_len + prefix && non_digit.call(as[0])
          weight += (1.0 - weight) * (common_chars - prefix - 1) /
                    (as_length + bs_length - prefix * 2 + 2).to_f
        end
      end

      weight
    end

    # Jaro-Winkler similarity of +a+ and +b+.
    #
    # @param a [#to_s] first value
    # @param b [#to_s] second value
    # @param opts [Hash] forwarded to _distance (e.g. :long_tolerance);
    #   :winklerize is always forced to true
    # @return [Numeric] see _distance
    def distance(a, b, opts = {})
      _distance(a, b, opts.merge(:winklerize => true))
    end

    module_function :distance
    module_function :_distance

  end
end
@@ -0,0 +1,36 @@
1
module RubyFish::Levenshtein

  # Levenshtein edit distance between the string forms of +a+ and +b+:
  # the number of single-character insertions, deletions and
  # substitutions needed to turn one into the other.
  # See http://en.wikipedia.org/wiki/Levenshtein_distance
  def distance(a, b)
    source = a.to_s
    target = b.to_s

    height = source.size
    width = target.size

    # table[row, col] is the distance between the first +row+ characters
    # of source and the first +col+ characters of target.
    table = ::RubyFish::MMatrix.new(height + 1, width + 1)
    0.upto(height) { |row| table[row, 0] = row }
    0.upto(width) { |col| table[0, col] = col }

    1.upto(width) do |col|
      1.upto(height) do |row|
        table[row, col] =
          if source[row - 1] == target[col - 1]
            # Equal characters cost nothing extra.
            table[row - 1, col - 1]
          else
            # Cheapest of deletion, insertion and substitution.
            1 + [table[row - 1, col], table[row, col - 1], table[row - 1, col - 1]].min
          end
      end
    end

    table[height, width]
  end

  module_function :distance

end
@@ -0,0 +1,34 @@
1
module RubyFish::LongestSubsequence

  # Length of the longest common subsequence of the string forms of +a+
  # and +b+ (characters in the same order, not necessarily adjacent).
  # Returns 0 when either string is empty.
  # See http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_subsequence#Ruby
  def distance(a, b)
    left = a.to_s
    right = b.to_s

    left_len = left.size
    right_len = right.size

    return 0 if left_len == 0 || right_len == 0

    # table[i, j] is the answer for the first i characters of left and the
    # first j characters of right; row 0 and column 0 keep their initial 0.
    table = ::RubyFish::MMatrix.new(left_len + 1, right_len + 1)

    1.upto(left_len) do |i|
      1.upto(right_len) do |j|
        table[i, j] =
          if left[i - 1] == right[j - 1]
            table[i - 1, j - 1] + 1
          else
            above = table[i - 1, j]
            beside = table[i, j - 1]
            beside > above ? beside : above
          end
      end
    end

    table[left_len, right_len]
  end

  module_function :distance
end
@@ -0,0 +1,34 @@
1
module RubyFish
  # Longest common substring measure.
  module LongestSubstring

    # Length of the longest run of consecutive characters that occurs in
    # the string forms of both +a+ and +b+.
    # See http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring#Ruby
    #
    # @param a [#to_s] first value; converted with +to_s+
    # @param b [#to_s] second value; converted with +to_s+
    # @return [Integer] length of the longest shared substring; 0 when
    #   either string is empty or the strings share no character
    def distance(a, b)
      as = a.to_s
      bs = b.to_s

      return 0 if as.empty? || bs.empty?

      # num[i, j] is the length of the common run that ends at as[i] / bs[j].
      num = ::RubyFish::MMatrix.new(as.size, bs.size)

      # Start at 0 (not nil) so that strings without any common character
      # yield 0 rather than nil.
      longest = 0

      as.each_char.with_index do |ac, i|
        bs.each_char.with_index do |bc, j|
          if ac == bc
            # On the first row/column there is no diagonal neighbour to extend.
            run = (i == 0 || j == 0) ? 1 : num[i - 1, j - 1] + 1
            num[i, j] = run
            longest = run if run > longest
          else
            num[i, j] = 0
          end
        end
      end

      longest
    end

    module_function :distance

  end
end
@@ -0,0 +1,29 @@
1
module RubyFish
  # Minimal mutable two-dimensional table, stored as an array of row
  # arrays, with every cell initialised to 0.
  class MMatrix
    # @param nrows [Integer] number of rows
    # @param ncolumns [Integer] number of columns
    def initialize(nrows, ncolumns)
      @rows = Array.new(nrows) { Array.new(ncolumns, 0) }
    end

    # @return the value stored at row +i+, column +j+
    def [](i, j)
      @rows[i][j]
    end

    # Stores +v+ at row +i+, column +j+.
    def []=(i, j, v)
      @rows[i][j] = v
    end

    # Yields every (row, column) index pair, row by row.
    # Returns an Enumerator when called without a block.
    def each_index
      return to_enum(:each_index) unless block_given?

      @rows.each_with_index do |row, i|
        row.each_index do |j|
          yield i, j
        end
      end
    end

    # Yields every cell as (value, row, column), row by row.
    # Returns an Enumerator when called without a block.
    def each_with_index
      return to_enum(:each_with_index) unless block_given?

      each_index do |i, j|
        yield self[i, j], i, j
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module RubyFish
  # Empty placeholder module; it defines no methods yet.
  #
  # Written in nested form so that requiring this file on its own does not
  # fail with an uninitialized-constant error for RubyFish.
  module MRA
  end
end
@@ -0,0 +1,3 @@
1
module RubyFish
  # Version string of the gem. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "0.0.2".freeze
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rubyfish
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 2
9
+ version: 0.0.2
10
+ platform: ruby
11
+ authors:
12
+ - Yury Korolev
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-09-20 00:00:00 +04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Port of http://github.com/sunlightlabs/jellyfish
22
+ email:
23
+ - yury.korolev@gmail.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - lib/rubyfish/damerau_levenshtein.rb
32
+ - lib/rubyfish/hamming.rb
33
+ - lib/rubyfish/jaro.rb
34
+ - lib/rubyfish/jaro_winkler.rb
35
+ - lib/rubyfish/levenshtein.rb
36
+ - lib/rubyfish/longest_subsequence.rb
37
+ - lib/rubyfish/longest_substring.rb
38
+ - lib/rubyfish/mmatrix.rb
39
+ - lib/rubyfish/mra.rb
40
+ - lib/rubyfish/version.rb
41
+ - lib/rubyfish.rb
42
+ - LICENSE
43
+ - CHANGELOG.md
44
+ - README.md
45
+ - ROADMAP.md
46
+ has_rdoc: true
47
+ homepage: http://github.com/anjlab/rubyfish
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options: []
52
+
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 663674839144737507
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 1
71
+ - 3
72
+ - 6
73
+ version: 1.3.6
74
+ requirements: []
75
+
76
+ rubyforge_project: rubyfish
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: A new gem templates
81
+ test_files: []
82
+