amatch 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/CHANGES +3 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +5 -12
- data/VERSION +1 -1
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +298 -74
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +27 -8
- data/README.rdoc +0 -128
data/README.md
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, and the Jaro-Winkler metric.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
To install this extension as a gem, type
|
14
|
+
|
15
|
+
# gem install amatch
|
16
|
+
|
17
|
+
into the shell.
|
18
|
+
|
19
|
+
## Download
|
20
|
+
|
21
|
+
The homepage of this library is located at
|
22
|
+
|
23
|
+
* https://github.com/flori/amatch
|
24
|
+
|
25
|
+
## Examples
|
26
|
+
|
27
|
+
require 'amatch'
|
28
|
+
# => true
|
29
|
+
include Amatch
|
30
|
+
# => Object
|
31
|
+
|
32
|
+
m = Sellers.new("pattern")
|
33
|
+
# => #<Amatch::Sellers:0x40366324>
|
34
|
+
m.match("pattren")
|
35
|
+
# => 2.0
|
36
|
+
m.substitution = m.insertion = 3
|
37
|
+
# => 3
|
38
|
+
m.match("pattren")
|
39
|
+
# => 4.0
|
40
|
+
m.reset_weights
|
41
|
+
# => #<Amatch::Sellers:0x40366324>
|
42
|
+
m.match(["pattren","parent"])
|
43
|
+
# => [2.0, 4.0]
|
44
|
+
m.search("abcpattrendef")
|
45
|
+
# => 2.0
|
46
|
+
|
47
|
+
m = Levenshtein.new("pattern")
|
48
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
49
|
+
m.match("pattren")
|
50
|
+
# => 2
|
51
|
+
m.search("abcpattrendef")
|
52
|
+
# => 2
|
53
|
+
"pattern language".levenshtein_similar("language of patterns")
|
54
|
+
# => 0.2
|
55
|
+
|
56
|
+
m = Amatch::DamerauLevenshtein.new("pattern")
|
57
|
+
# => #<Amatch::DamerauLevenshtein:0x007fc3483dd278>
|
58
|
+
m.match("pattren")
|
59
|
+
# => 1
|
60
|
+
"pattern language".damerau_levenshtein_similar("language of patterns")
|
61
|
+
# => 0.19999999999999996
|
62
|
+
|
63
|
+
m = Hamming.new("pattern")
|
64
|
+
# => #<Amatch::Hamming:0x40350858>
|
65
|
+
m.match("pattren")
|
66
|
+
# => 2
|
67
|
+
"pattern language".hamming_similar("language of patterns")
|
68
|
+
# => 0.1
|
69
|
+
|
70
|
+
m = PairDistance.new("pattern")
|
71
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
72
|
+
m.match("pattr en")
|
73
|
+
# => 0.545454545454545
|
74
|
+
m.match("pattr en", nil)
|
75
|
+
# => 0.461538461538462
|
76
|
+
m.match("pattr en", /t+/)
|
77
|
+
# => 0.285714285714286
|
78
|
+
"pattern language".pair_distance_similar("language of patterns")
|
79
|
+
# => 0.928571428571429
|
80
|
+
|
81
|
+
m = LongestSubsequence.new("pattern")
|
82
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
83
|
+
m.match("pattren")
|
84
|
+
# => 6
|
85
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
86
|
+
# => 0.4
|
87
|
+
|
88
|
+
m = LongestSubstring.new("pattern")
|
89
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
90
|
+
m.match("pattren")
|
91
|
+
# => 4
|
92
|
+
"pattern language".longest_substring_similar("language of patterns")
|
93
|
+
# => 0.4
|
94
|
+
|
95
|
+
m = Jaro.new("pattern")
|
96
|
+
# => #<Amatch::Jaro:0x363b70>
|
97
|
+
m.match("paTTren")
|
98
|
+
# => 0.952380952380952
|
99
|
+
m.ignore_case = false
|
100
|
+
m.match("paTTren")
|
101
|
+
# => 0.742857142857143
|
102
|
+
"pattern language".jaro_similar("language of patterns")
|
103
|
+
# => 0.672222222222222
|
104
|
+
|
105
|
+
m = JaroWinkler.new("pattern")
|
106
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
107
|
+
m.match("paTTren")
|
108
|
+
# => 0.971428571712403
|
109
|
+
m.ignore_case = false
|
110
|
+
m.match("paTTren")
|
111
|
+
# => 0.79428571505206
|
112
|
+
m.scaling_factor = 0.05
|
113
|
+
m.match("pattren")
|
114
|
+
# => 0.961904762046678
|
115
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
116
|
+
# => 0.672222222222222
|
117
|
+
|
118
|
+
## Author
|
119
|
+
|
120
|
+
[Florian Frank](mailto:flori@ping.de)
|
121
|
+
|
122
|
+
## License
|
123
|
+
|
124
|
+
Apache License, Version 2.0 – See the COPYING file in the source archive.
|
data/Rakefile
CHANGED
@@ -13,22 +13,15 @@ Amatch is a library for approximate string matching and searching in strings.
|
|
13
13
|
Several algorithms can be used to do this, and it's also possible to compute a
|
14
14
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
15
15
|
EOT
|
16
|
-
executables << 'agrep
|
16
|
+
executables << 'agrep' << 'dupfind'
|
17
17
|
bindir 'bin'
|
18
18
|
test_dir 'tests'
|
19
|
-
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', '
|
19
|
+
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', 'Makefile'
|
20
20
|
title "#{name.camelize} - Approximate Matching"
|
21
|
-
readme 'README.
|
21
|
+
readme 'README.md'
|
22
22
|
require_paths %w[lib ext]
|
23
23
|
dependency 'tins', '~>1.0'
|
24
|
+
dependency 'mize'
|
24
25
|
development_dependency 'test-unit', '~>3.0'
|
25
|
-
licenses << '
|
26
|
-
|
27
|
-
install_library do
|
28
|
-
libdir = CONFIG["sitelibdir"]
|
29
|
-
src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
|
30
|
-
install(src, File.join(libdir, File.basename(src)), :verbose => true)
|
31
|
-
mkdir_p dst = File.join(libdir, 'amatch')
|
32
|
-
install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
|
33
|
-
end
|
26
|
+
licenses << 'Apache-2.0'
|
34
27
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.0
|
data/amatch.gemspec
CHANGED
Binary file
|
data/bin/{agrep.rb → agrep}
RENAMED
@@ -15,15 +15,21 @@ end
|
|
15
15
|
|
16
16
|
class Amatch::Levenshtein
|
17
17
|
def search_relative(strings)
|
18
|
-
|
18
|
+
if Array === strings
|
19
|
+
search(strings).map { |s| s.to_f / pattern.size }
|
20
|
+
else
|
21
|
+
search(strings).to_f / pattern.size
|
22
|
+
end
|
19
23
|
end
|
20
24
|
end
|
21
25
|
|
26
|
+
$algorithm = 'Levenshtein'
|
22
27
|
$distance = 1
|
23
28
|
$mode = :search
|
24
29
|
begin
|
25
30
|
parser = GetoptLong.new
|
26
31
|
options = [
|
32
|
+
[ '--algorithm', '-a', GetoptLong::REQUIRED_ARGUMENT ],
|
27
33
|
[ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
|
28
34
|
[ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
|
29
35
|
[ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
|
@@ -33,6 +39,8 @@ begin
|
|
33
39
|
parser.each_option do |name, arg|
|
34
40
|
name = name.sub(/^--/, '')
|
35
41
|
case name
|
42
|
+
when 'algorithm'
|
43
|
+
$algorithm = arg
|
36
44
|
when 'distance'
|
37
45
|
$distance = arg.to_f
|
38
46
|
when 'relative'
|
@@ -48,7 +56,7 @@ rescue
|
|
48
56
|
end
|
49
57
|
pattern = ARGV.shift or usage('Pattern needed!', options)
|
50
58
|
|
51
|
-
matcher = Amatch
|
59
|
+
matcher = Amatch.const_get($algorithm).new(pattern)
|
52
60
|
size = 0
|
53
61
|
start = Time.new
|
54
62
|
if ARGV.size > 0 then
|
@@ -56,9 +64,12 @@ if ARGV.size > 0 then
|
|
56
64
|
File.stat(filename).file? or next
|
57
65
|
size += File.size(filename)
|
58
66
|
begin
|
59
|
-
File.open(filename, 'r').each_line do |
|
60
|
-
|
61
|
-
|
67
|
+
File.open(filename, 'r').each_line.each_slice(1000) do |lines|
|
68
|
+
results = matcher.__send__($mode, lines)
|
69
|
+
lines.zip(results) do |line, r|
|
70
|
+
if r <= $distance
|
71
|
+
puts "#{filename}:#{line}"
|
72
|
+
end
|
62
73
|
end
|
63
74
|
end
|
64
75
|
rescue
|
@@ -66,10 +77,13 @@ if ARGV.size > 0 then
|
|
66
77
|
end
|
67
78
|
end
|
68
79
|
else
|
69
|
-
STDIN.each_line do |
|
70
|
-
size +=
|
71
|
-
|
72
|
-
|
80
|
+
STDIN.each_line.each_slice(1000) do |lines|
|
81
|
+
size += lines.size
|
82
|
+
results = matcher.__send__($mode, lines)
|
83
|
+
lines.zip(results) do |line, r|
|
84
|
+
if r <= $distance
|
85
|
+
puts line
|
86
|
+
end
|
73
87
|
end
|
74
88
|
end
|
75
89
|
end
|
data/bin/dupfind
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'tins/go'
|
4
|
+
include Tins::GO
|
5
|
+
require 'tins/minimize'
|
6
|
+
class Array
|
7
|
+
include Tins::Minimize
|
8
|
+
end
|
9
|
+
require 'amatch'
|
10
|
+
begin
|
11
|
+
require 'infobar'
|
12
|
+
rescue LoadError
|
13
|
+
warn "Please install gem infobar to run this executable!"
|
14
|
+
exit 1
|
15
|
+
end
|
16
|
+
|
17
|
+
def usage
|
18
|
+
puts <<EOT
|
19
|
+
Usage: #{File.basename($0)} [OPTIONS] FILE
|
20
|
+
|
21
|
+
-a ALGO Amatch matching algorithm
|
22
|
+
-p LIMIT more than p similarity to be a match
|
23
|
+
-R NUMBER skip NUMBER mismatch for building ranges
|
24
|
+
-r NUMBER minimum length to be counted as a range
|
25
|
+
-i compute a PNG per file
|
26
|
+
|
27
|
+
Repor$ bugs to <flori@ping.de>.
|
28
|
+
EOT
|
29
|
+
exit 0
|
30
|
+
end
|
31
|
+
|
32
|
+
class FindDuplicates
|
33
|
+
def initialize(algo, p_lim, filename)
|
34
|
+
@algo, @p_lim, @filename = algo, p_lim, filename
|
35
|
+
end
|
36
|
+
|
37
|
+
attr_reader :filename
|
38
|
+
|
39
|
+
attr_reader :algo
|
40
|
+
|
41
|
+
attr_reader :p_lim
|
42
|
+
|
43
|
+
memoize method:
|
44
|
+
def lines
|
45
|
+
File.readlines(filename)
|
46
|
+
end
|
47
|
+
|
48
|
+
memoize method:
|
49
|
+
def matrix
|
50
|
+
result = lines.with_infobar(label: filename, output: STDERR).map do |l1|
|
51
|
+
+infobar
|
52
|
+
a = algo.new(l1)
|
53
|
+
r = a.similar(lines)
|
54
|
+
r.map! { |s| s >= p_lim ? ?1 : ?0 }
|
55
|
+
r.join
|
56
|
+
end
|
57
|
+
infobar.finish
|
58
|
+
infobar.newline
|
59
|
+
result
|
60
|
+
end
|
61
|
+
|
62
|
+
def pbm(output: $>)
|
63
|
+
output << <<HEADER
|
64
|
+
P1
|
65
|
+
#{matrix.size} #{matrix.size}
|
66
|
+
HEADER
|
67
|
+
output << matrix.map { |line| line.each_char.to_a * ' ' } * ?\n
|
68
|
+
self
|
69
|
+
end
|
70
|
+
|
71
|
+
def png(output: $>)
|
72
|
+
IO.popen("pnmtopng", 'w+') do |conv|
|
73
|
+
pbm(output: conv)
|
74
|
+
conv.close_write
|
75
|
+
output.write(conv.read)
|
76
|
+
end
|
77
|
+
self
|
78
|
+
end
|
79
|
+
|
80
|
+
def create_image
|
81
|
+
suffix = Regexp.quote(File.extname(filename))
|
82
|
+
f = filename.sub(/(#{suffix}|)\z/, '.png')
|
83
|
+
File.open(f, 'wb') do |output|
|
84
|
+
png(output: output)
|
85
|
+
infobar.puts "Writing output to #{f.inspect}."
|
86
|
+
end
|
87
|
+
self
|
88
|
+
end
|
89
|
+
|
90
|
+
def similar_ranges(min_range: 3, skip_range: 0)
|
91
|
+
set = 0
|
92
|
+
ranges = { set => [] }
|
93
|
+
m = matrix
|
94
|
+
n = m.size
|
95
|
+
skip_count = 0
|
96
|
+
n.downto(1) do |h|
|
97
|
+
(n - h + 1).upto(n - 1) do |k|
|
98
|
+
i = k
|
99
|
+
j = k - (n - h + 1)
|
100
|
+
if m[i][j] == ?1
|
101
|
+
skip_count = 0
|
102
|
+
ranges[set] << [ i, j ]
|
103
|
+
elsif !ranges[set].empty? && skip_count < skip_range
|
104
|
+
skip_count += 1
|
105
|
+
else
|
106
|
+
skip_count = 0
|
107
|
+
ranges[set].empty? or ranges[set += 1] = []
|
108
|
+
end
|
109
|
+
end
|
110
|
+
skip_count = 0
|
111
|
+
ranges[set].empty? or ranges[set += 1] = []
|
112
|
+
end
|
113
|
+
ranges.each { |_, r|
|
114
|
+
r.flatten!
|
115
|
+
r.sort!
|
116
|
+
r.map! { |x| x + 1 }
|
117
|
+
r.minimize!
|
118
|
+
r.reject! { |s| s.size < min_range }
|
119
|
+
}.reject! { |_, r| r.empty? }
|
120
|
+
unions = []
|
121
|
+
while !ranges.empty?
|
122
|
+
_, r = ranges.first
|
123
|
+
equivalent = ranges.reject { |_, v| (v & r).empty? }
|
124
|
+
unions << equivalent.values.flatten.uniq
|
125
|
+
ranges.delete_if { |k, _| equivalent.keys.include?(k) }
|
126
|
+
end
|
127
|
+
unions.each do |r|
|
128
|
+
r.map! do |x|
|
129
|
+
"#{filename}:#{x.begin}-#{x.end}"
|
130
|
+
end
|
131
|
+
end
|
132
|
+
unions
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
opts = go 'a:p:R:r:ih'
|
137
|
+
|
138
|
+
usage if opts[?h]
|
139
|
+
algo = Amatch.const_get(opts[?a] || 'Levenshtein')
|
140
|
+
p_lim = (opts[?p] || 0.95).to_f
|
141
|
+
min_range = (opts[?r] || 3).to_i
|
142
|
+
skip_range = opts[?R].to_i
|
143
|
+
ARGV.empty? and usage
|
144
|
+
|
145
|
+
filenames = ARGV.inject([]) { |s, f| s.concat(Dir[f]) }
|
146
|
+
for filename in filenames
|
147
|
+
finder = FindDuplicates.new(algo, p_lim, filename)
|
148
|
+
opts[?i] and finder.create_image
|
149
|
+
for s in finder.similar_ranges(min_range: min_range, skip_range: skip_range)
|
150
|
+
infobar.reset
|
151
|
+
puts s, ?\n
|
152
|
+
end
|
153
|
+
end
|
data/ext/amatch_ext.c
CHANGED
@@ -3,24 +3,8 @@
|
|
3
3
|
#include <ctype.h>
|
4
4
|
#include "common.h"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
*
|
9
|
-
* call-seq: pattern -> pattern string
|
10
|
-
*
|
11
|
-
* Returns the current pattern string of this instance.
|
12
|
-
*/
|
13
|
-
|
14
|
-
/*
|
15
|
-
* Document-method: pattern=
|
16
|
-
*
|
17
|
-
* call-seq: pattern=(pattern)
|
18
|
-
*
|
19
|
-
* Sets the current pattern string of this instance to <code>pattern</code>.
|
20
|
-
*/
|
21
|
-
|
22
|
-
|
23
|
-
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
6
|
+
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein,
|
7
|
+
rb_cDamerauLevenshtein, rb_cSellers, rb_cHamming,
|
24
8
|
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
9
|
rb_cJaro, rb_cJaroWinkler;
|
26
10
|
|
@@ -230,9 +214,11 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
230
214
|
*/
|
231
215
|
|
232
216
|
#define COMPUTE_LEVENSHTEIN_DISTANCE \
|
233
|
-
|
217
|
+
c = 0; \
|
218
|
+
p = 0; \
|
219
|
+
for (i = 1; i <= a_len; i++) { \
|
234
220
|
c = i % 2; /* current row */ \
|
235
|
-
p = (i
|
221
|
+
p = (i - 1) % 2; /* previous row */ \
|
236
222
|
v[c][0] = i; /* first column */ \
|
237
223
|
for (j = 1; j <= b_len; j++) { \
|
238
224
|
/* Bellman's principle of optimality: */ \
|
@@ -245,8 +231,6 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
245
231
|
} \
|
246
232
|
v[c][j] = weight; \
|
247
233
|
} \
|
248
|
-
p = c; \
|
249
|
-
c = (c + 1) % 2; \
|
250
234
|
}
|
251
235
|
|
252
236
|
static VALUE Levenshtein_match(General *amatch, VALUE string)
|
@@ -269,7 +253,7 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
|
|
269
253
|
|
270
254
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
271
255
|
|
272
|
-
result = INT2FIX(v[
|
256
|
+
result = INT2FIX(v[c][b_len]);
|
273
257
|
|
274
258
|
xfree(v[0]);
|
275
259
|
xfree(v[1]);
|
@@ -287,6 +271,7 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
287
271
|
|
288
272
|
Check_Type(string, T_STRING);
|
289
273
|
DONT_OPTIMIZE
|
274
|
+
|
290
275
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
291
276
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
292
277
|
v[0] = ALLOC_N(int, b_len + 1);
|
@@ -299,12 +284,14 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
299
284
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
300
285
|
|
301
286
|
if (b_len > a_len) {
|
302
|
-
result = rb_float_new(1.0 - ((double) v[
|
287
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
303
288
|
} else {
|
304
|
-
result = rb_float_new(1.0 - ((double) v[
|
289
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
305
290
|
}
|
291
|
+
|
306
292
|
xfree(v[0]);
|
307
293
|
xfree(v[1]);
|
294
|
+
|
308
295
|
return result;
|
309
296
|
}
|
310
297
|
|
@@ -327,26 +314,159 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
327
314
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
328
315
|
|
329
316
|
for (i = 0, min = a_len; i <= b_len; i++) {
|
330
|
-
if (v[
|
317
|
+
if (v[c][i] < min) min = v[c][i];
|
331
318
|
}
|
332
319
|
|
333
320
|
result = INT2FIX(min);
|
334
321
|
|
335
322
|
xfree(v[0]);
|
336
323
|
xfree(v[1]);
|
337
|
-
|
324
|
+
|
325
|
+
return result;
|
326
|
+
}
|
327
|
+
|
328
|
+
/*
|
329
|
+
* DamerauLevenshtein edit distances are computed here:
|
330
|
+
*/
|
331
|
+
|
332
|
+
#define COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE \
|
333
|
+
c = 0; \
|
334
|
+
p = 0; \
|
335
|
+
pp = 0; \
|
336
|
+
for (i = 1; i <= a_len; i++) { \
|
337
|
+
c = i % 3; /* current row */ \
|
338
|
+
p = (i - 1) % 3; /* previous row */ \
|
339
|
+
pp = (i - 2) % 3; /* previous previous row */ \
|
340
|
+
v[c][0] = i; /* first column */ \
|
341
|
+
for (j = 1; j <= b_len; j++) { \
|
342
|
+
/* Bellman's principle of optimality: */ \
|
343
|
+
weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
344
|
+
if (weight > v[p][j] + 1) { \
|
345
|
+
weight = v[p][j] + 1; \
|
346
|
+
} \
|
347
|
+
if (weight > v[c][j - 1] + 1) { \
|
348
|
+
weight = v[c][j - 1] + 1; \
|
349
|
+
} \
|
350
|
+
if (i > 2 && j > 2 && a_ptr[i - 1] == b_ptr[j - 2] && a_ptr[i - 2] == b_ptr[j - 1]) {\
|
351
|
+
if (weight > v[pp][j - 2]) { \
|
352
|
+
weight = v[pp][j - 2] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
353
|
+
} \
|
354
|
+
} \
|
355
|
+
v[c][j] = weight; \
|
356
|
+
} \
|
357
|
+
}
|
358
|
+
|
359
|
+
static VALUE DamerauLevenshtein_match(General *amatch, VALUE string)
|
360
|
+
{
|
361
|
+
VALUE result;
|
362
|
+
char *a_ptr, *b_ptr;
|
363
|
+
int a_len, b_len;
|
364
|
+
int *v[3], weight;
|
365
|
+
int i, j, c, p, pp;
|
366
|
+
|
367
|
+
Check_Type(string, T_STRING);
|
368
|
+
DONT_OPTIMIZE
|
369
|
+
|
370
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
371
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
372
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
373
|
+
for (i = 0; i <= b_len; i++) {
|
374
|
+
v[0][i] = i;
|
375
|
+
v[1][i] = i;
|
376
|
+
v[2][i] = i;
|
377
|
+
}
|
378
|
+
|
379
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
380
|
+
|
381
|
+
result = INT2FIX(v[c][b_len]);
|
382
|
+
|
383
|
+
xfree(v[0]);
|
384
|
+
xfree(v[1]);
|
385
|
+
xfree(v[2]);
|
386
|
+
|
338
387
|
return result;
|
339
388
|
}
|
340
389
|
|
390
|
+
static VALUE DamerauLevenshtein_similar(General *amatch, VALUE string)
|
391
|
+
{
|
392
|
+
VALUE result;
|
393
|
+
char *a_ptr, *b_ptr;
|
394
|
+
int a_len, b_len;
|
395
|
+
int *v[3], weight;
|
396
|
+
int i, j, c, p, pp;
|
397
|
+
|
398
|
+
Check_Type(string, T_STRING);
|
399
|
+
DONT_OPTIMIZE
|
400
|
+
|
401
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
402
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
403
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
404
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
405
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
406
|
+
for (i = 0; i <= b_len; i++) {
|
407
|
+
v[0][i] = i;
|
408
|
+
v[1][i] = i;
|
409
|
+
v[2][i] = i;
|
410
|
+
}
|
411
|
+
|
412
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
413
|
+
|
414
|
+
if (b_len > a_len) {
|
415
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
416
|
+
} else {
|
417
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
418
|
+
}
|
419
|
+
|
420
|
+
xfree(v[0]);
|
421
|
+
xfree(v[1]);
|
422
|
+
xfree(v[2]);
|
423
|
+
|
424
|
+
return result;
|
425
|
+
}
|
426
|
+
|
427
|
+
static VALUE DamerauLevenshtein_search(General *amatch, VALUE string)
|
428
|
+
{
|
429
|
+
VALUE result;
|
430
|
+
char *a_ptr, *b_ptr;
|
431
|
+
int a_len, b_len;
|
432
|
+
int *v[3], weight, min;
|
433
|
+
int i, j, c, p, pp;
|
434
|
+
|
435
|
+
Check_Type(string, T_STRING);
|
436
|
+
DONT_OPTIMIZE
|
437
|
+
|
438
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
439
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
440
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
441
|
+
MEMZERO(v[0], int, b_len + 1);
|
442
|
+
MEMZERO(v[1], int, b_len + 1);
|
443
|
+
MEMZERO(v[2], int, b_len + 1);
|
444
|
+
|
445
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
446
|
+
|
447
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
448
|
+
if (v[c][i] < min) min = v[c][i];
|
449
|
+
}
|
450
|
+
|
451
|
+
result = INT2FIX(min);
|
452
|
+
|
453
|
+
xfree(v[0]);
|
454
|
+
xfree(v[1]);
|
455
|
+
xfree(v[2]);
|
456
|
+
|
457
|
+
return result;
|
458
|
+
}
|
341
459
|
|
342
460
|
/*
|
343
461
|
* Sellers edit distances are computed here:
|
344
462
|
*/
|
345
463
|
|
346
464
|
#define COMPUTE_SELLERS_DISTANCE \
|
347
|
-
|
465
|
+
c = 0; \
|
466
|
+
p = 0; \
|
467
|
+
for (i = 1; i <= a_len; i++) { \
|
348
468
|
c = i % 2; /* current row */ \
|
349
|
-
p = (i
|
469
|
+
p = (i - 1) % 2; /* previous row */ \
|
350
470
|
v[c][0] = i * amatch->deletion; /* first column */ \
|
351
471
|
for (j = 1; j <= b_len; j++) { \
|
352
472
|
/* Bellman's principle of optimality: */ \
|
@@ -361,7 +481,6 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
361
481
|
v[c][j] = weight; \
|
362
482
|
} \
|
363
483
|
p = c; \
|
364
|
-
c = (c + 1) % 2; \
|
365
484
|
}
|
366
485
|
|
367
486
|
static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
@@ -411,9 +530,10 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
|
411
530
|
max_weight = amatch->deletion;
|
412
531
|
}
|
413
532
|
}
|
414
|
-
|
533
|
+
|
415
534
|
Check_Type(string, T_STRING);
|
416
535
|
DONT_OPTIMIZE
|
536
|
+
|
417
537
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
418
538
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
419
539
|
v[0] = ALLOC_N(double, b_len + 1);
|
@@ -459,7 +579,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
459
579
|
result = rb_float_new(min);
|
460
580
|
xfree(v[0]);
|
461
581
|
xfree(v[1]);
|
462
|
-
|
582
|
+
|
463
583
|
return result;
|
464
584
|
}
|
465
585
|
|
@@ -470,7 +590,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
470
590
|
static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
471
591
|
{
|
472
592
|
double result;
|
473
|
-
VALUE
|
593
|
+
VALUE string_tokens, tokens;
|
474
594
|
PairArray *pattern_pair_array, *pair_array;
|
475
595
|
|
476
596
|
Check_Type(string, T_STRING);
|
@@ -518,7 +638,7 @@ static VALUE Hamming_match(General *amatch, VALUE string)
|
|
518
638
|
char *a_ptr, *b_ptr;
|
519
639
|
int a_len, b_len;
|
520
640
|
int i, result;
|
521
|
-
|
641
|
+
|
522
642
|
Check_Type(string, T_STRING);
|
523
643
|
OPTIMIZE_TIME
|
524
644
|
COMPUTE_HAMMING_DISTANCE
|
@@ -530,7 +650,7 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
|
|
530
650
|
char *a_ptr, *b_ptr;
|
531
651
|
int a_len, b_len;
|
532
652
|
int i, result;
|
533
|
-
|
653
|
+
|
534
654
|
Check_Type(string, T_STRING);
|
535
655
|
OPTIMIZE_TIME
|
536
656
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -570,7 +690,7 @@ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
|
570
690
|
char *a_ptr, *b_ptr;
|
571
691
|
int a_len, b_len;
|
572
692
|
int result, c, p, i, j, *l[2];
|
573
|
-
|
693
|
+
|
574
694
|
Check_Type(string, T_STRING);
|
575
695
|
OPTIMIZE_TIME
|
576
696
|
|
@@ -584,7 +704,7 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
|
584
704
|
char *a_ptr, *b_ptr;
|
585
705
|
int a_len, b_len;
|
586
706
|
int result, c, p, i, j, *l[2];
|
587
|
-
|
707
|
+
|
588
708
|
Check_Type(string, T_STRING);
|
589
709
|
OPTIMIZE_TIME
|
590
710
|
|
@@ -624,7 +744,7 @@ static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
|
624
744
|
char *a_ptr, *b_ptr;
|
625
745
|
int a_len, b_len;
|
626
746
|
int result, c, p, i, j, *l[2];
|
627
|
-
|
747
|
+
|
628
748
|
Check_Type(string, T_STRING);
|
629
749
|
OPTIMIZE_TIME
|
630
750
|
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
@@ -637,7 +757,7 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
637
757
|
char *a_ptr, *b_ptr;
|
638
758
|
int a_len, b_len;
|
639
759
|
int result, c, p, i, j, *l[2];
|
640
|
-
|
760
|
+
|
641
761
|
Check_Type(string, T_STRING);
|
642
762
|
OPTIMIZE_TIME
|
643
763
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -769,7 +889,7 @@ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
|
|
769
889
|
* Ruby API
|
770
890
|
*/
|
771
891
|
|
772
|
-
/*
|
892
|
+
/*
|
773
893
|
* Document-class: Amatch::Levenshtein
|
774
894
|
*
|
775
895
|
* The Levenshtein edit distance is defined as the minimal costs involved to
|
@@ -802,7 +922,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
802
922
|
|
803
923
|
/*
|
804
924
|
* call-seq: match(strings) -> results
|
805
|
-
*
|
925
|
+
*
|
806
926
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
807
927
|
* against <code>strings</code>. It returns the number operations, the Sellers
|
808
928
|
* distance. <code>strings</code> has to be either a String or an Array of
|
@@ -810,14 +930,14 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
810
930
|
* Floats respectively.
|
811
931
|
*/
|
812
932
|
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
813
|
-
{
|
933
|
+
{
|
814
934
|
GET_STRUCT(General)
|
815
935
|
return General_iterate_strings(amatch, strings, Levenshtein_match);
|
816
936
|
}
|
817
937
|
|
818
938
|
/*
|
819
939
|
* call-seq: similar(strings) -> results
|
820
|
-
*
|
940
|
+
*
|
821
941
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
822
942
|
* against <code>strings</code>, and compute a Levenshtein distance metric
|
823
943
|
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
@@ -826,14 +946,14 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
|
826
946
|
* respectively.
|
827
947
|
*/
|
828
948
|
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
829
|
-
{
|
949
|
+
{
|
830
950
|
GET_STRUCT(General)
|
831
951
|
return General_iterate_strings(amatch, strings, Levenshtein_similar);
|
832
952
|
}
|
833
953
|
|
834
954
|
/*
|
835
955
|
* call-seq: levenshtein_similar(strings) -> results
|
836
|
-
*
|
956
|
+
*
|
837
957
|
* If called on a String, this string is used as a Amatch::Levenshtein#pattern
|
838
958
|
* to match against <code>strings</code>. It returns a Levenshtein distance
|
839
959
|
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
@@ -849,7 +969,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
849
969
|
|
850
970
|
/*
|
851
971
|
* call-seq: search(strings) -> results
|
852
|
-
*
|
972
|
+
*
|
853
973
|
* searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
|
854
974
|
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
855
975
|
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
@@ -857,12 +977,105 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
857
977
|
* <code>results</code> is either a Float or an Array of Floats respectively.
|
858
978
|
*/
|
859
979
|
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
|
860
|
-
{
|
980
|
+
{
|
861
981
|
GET_STRUCT(General)
|
862
982
|
return General_iterate_strings(amatch, strings, Levenshtein_search);
|
863
983
|
}
|
864
984
|
|
865
|
-
/*
|
985
|
+
/*
|
986
|
+
* Document-class: Amatch::DamerauLevenshtein
|
987
|
+
* XXX
|
988
|
+
* The DamerauLevenshtein edit distance is defined as the minimal costs
|
989
|
+
* involved to transform one string into another by using four elementary
|
990
|
+
* operations: deletion, insertion and substitution of a character, and transposition of two adjacent characters. To
|
991
|
+
* transform "water" into "wine", for instance, you have to substitute "a" ->
|
992
|
+
* "i": "witer", "t" -> "n": "winer" and delete "r": "wine". The edit distance
|
993
|
+
* between "water" and "wine" is 3, because you have to apply three
|
994
|
+
* operations. The edit distance between "wine" and "wine" is 0 of course: no
|
995
|
+
* operation is necessary for the transformation -- they're already the same
|
996
|
+
* string. It's easy to see that more similar strings have smaller edit
|
997
|
+
* distances than strings that differ a lot.
|
998
|
+
*/
|
999
|
+
|
1000
|
+
DEF_RB_FREE(DamerauLevenshtein, General)
|
1001
|
+
|
1002
|
+
/*
|
1003
|
+
* call-seq: new(pattern)
|
1004
|
+
* XXX
|
1005
|
+
* Creates a new Amatch::DamerauLevenshtein instance from <code>pattern</code>.
|
1006
|
+
*/
|
1007
|
+
static VALUE rb_DamerauLevenshtein_initialize(VALUE self, VALUE pattern)
|
1008
|
+
{
|
1009
|
+
GET_STRUCT(General)
|
1010
|
+
General_pattern_set(amatch, pattern);
|
1011
|
+
return self;
|
1012
|
+
}
|
1013
|
+
|
1014
|
+
DEF_CONSTRUCTOR(DamerauLevenshtein, General)
|
1015
|
+
|
1016
|
+
/*
|
1017
|
+
* call-seq: match(strings) -> results
|
1018
|
+
* XXX
|
1019
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1020
|
+
* against <code>strings</code>. It returns the number of operations, the
|
1021
|
+
* DamerauLevenshtein distance. <code>strings</code> has to be either a String or an Array of
|
1022
|
+
* Strings. The returned <code>results</code> is either a Fixnum or an Array of
|
1023
|
+
* Fixnums respectively.
|
1024
|
+
*/
|
1025
|
+
static VALUE rb_DamerauLevenshtein_match(VALUE self, VALUE strings)
|
1026
|
+
{
|
1027
|
+
GET_STRUCT(General)
|
1028
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_match);
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
/*
|
1032
|
+
* call-seq: similar(strings) -> results
|
1033
|
+
* XXX
|
1034
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1035
|
+
* against <code>strings</code>, and compute a DamerauLevenshtein distance metric
|
1036
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1037
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1038
|
+
* returned <code>results</code> is either a Float or an Array of Floats
|
1039
|
+
* respectively.
|
1040
|
+
*/
|
1041
|
+
static VALUE rb_DamerauLevenshtein_similar(VALUE self, VALUE strings)
|
1042
|
+
{
|
1043
|
+
GET_STRUCT(General)
|
1044
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_similar);
|
1045
|
+
}
|
1046
|
+
|
1047
|
+
/*
|
1048
|
+
* call-seq: damerau_levenshtein_similar(strings) -> results
|
1049
|
+
* XXX
|
1050
|
+
* If called on a String, this string is used as a Amatch::DamerauLevenshtein#pattern
|
1051
|
+
* to match against <code>strings</code>. It returns a DamerauLevenshtein distance
|
1052
|
+
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
1053
|
+
* match. <code>strings</code> has to be either a String or an Array of
|
1054
|
+
* Strings. The returned <code>results</code> is either a Float or an Array of
|
1055
|
+
* Floats respectively.
|
1056
|
+
*/
|
1057
|
+
static VALUE rb_str_damerau_levenshtein_similar(VALUE self, VALUE strings)
|
1058
|
+
{
|
1059
|
+
VALUE amatch = rb_DamerauLevenshtein_new(rb_cDamerauLevenshtein, self);
|
1060
|
+
return rb_DamerauLevenshtein_similar(amatch, strings);
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
/*
|
1064
|
+
* call-seq: search(strings) -> results
|
1065
|
+
* XXX
|
1066
|
+
* searches Amatch::DamerauLevenshtein#pattern in <code>strings</code> and returns the
|
1067
|
+
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
1068
|
+
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
1069
|
+
* to be either a String or an Array of Strings. The returned
|
1070
|
+
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
1071
|
+
*/
|
1072
|
+
static VALUE rb_DamerauLevenshtein_search(VALUE self, VALUE strings)
|
1073
|
+
{
|
1074
|
+
GET_STRUCT(General)
|
1075
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_search);
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
/*
|
866
1079
|
* Document-class: Amatch::Sellers
|
867
1080
|
*
|
868
1081
|
* The Sellers edit distance is very similar to the Levenshtein edit distance.
|
@@ -981,14 +1194,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
981
1194
|
* Document-method: pattern=
|
982
1195
|
*
|
983
1196
|
* call-seq: pattern=(pattern)
|
984
|
-
*
|
1197
|
+
*
|
985
1198
|
* Sets the current pattern string of this Amatch::Sellers instance to
|
986
1199
|
* <code>pattern</code>.
|
987
1200
|
*/
|
988
1201
|
|
989
1202
|
/*
|
990
1203
|
* call-seq: match(strings) -> results
|
991
|
-
*
|
1204
|
+
*
|
992
1205
|
* Uses this Amatch::Sellers instance to match Sellers#pattern against
|
993
1206
|
* <code>strings</code>, while taking into account the given weights. It
|
994
1207
|
* returns the number of weighted character operations, the Sellers distance.
|
@@ -997,14 +1210,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
997
1210
|
* respectively.
|
998
1211
|
*/
|
999
1212
|
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
1000
|
-
{
|
1213
|
+
{
|
1001
1214
|
GET_STRUCT(Sellers)
|
1002
1215
|
return Sellers_iterate_strings(amatch, strings, Sellers_match);
|
1003
1216
|
}
|
1004
1217
|
|
1005
1218
|
/*
|
1006
1219
|
* call-seq: similar(strings) -> results
|
1007
|
-
*
|
1220
|
+
*
|
1008
1221
|
* Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
|
1009
1222
|
* against <code>strings</code> (taking into account the given weights), and
|
1010
1223
|
* compute a Sellers distance metric number between 0.0 for very dissimilar
|
@@ -1014,7 +1227,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
|
1014
1227
|
* respectively.
|
1015
1228
|
*/
|
1016
1229
|
static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
1017
|
-
{
|
1230
|
+
{
|
1018
1231
|
GET_STRUCT(Sellers)
|
1019
1232
|
return Sellers_iterate_strings(amatch, strings, Sellers_similar);
|
1020
1233
|
}
|
@@ -1029,12 +1242,12 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
|
1029
1242
|
* <code>results</code> is either a Float or an Array of Floats respectively.
|
1030
1243
|
*/
|
1031
1244
|
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
1032
|
-
{
|
1245
|
+
{
|
1033
1246
|
GET_STRUCT(Sellers)
|
1034
1247
|
return Sellers_iterate_strings(amatch, strings, Sellers_search);
|
1035
1248
|
}
|
1036
1249
|
|
1037
|
-
/*
|
1250
|
+
/*
|
1038
1251
|
* Document-class: Amatch::PairDistance
|
1039
1252
|
*
|
1040
1253
|
* The pair distance between two strings is based on the number of adjacent
|
@@ -1045,7 +1258,7 @@ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
|
1045
1258
|
* are more dissimilar. The advantage of considering adjacent characters is to
|
1046
1259
|
* take account not only of the characters, but also of the character ordering
|
1047
1260
|
* in the original strings.
|
1048
|
-
*
|
1261
|
+
*
|
1049
1262
|
* This metric is very capable of finding similarities in natural languages.
|
1050
1263
|
* It is explained in more detail in Simon White's article "How to Strike a
|
1051
1264
|
* Match", located at this url:
|
@@ -1072,7 +1285,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1072
1285
|
|
1073
1286
|
/*
|
1074
1287
|
* call-seq: match(strings, regexp = /\s+/) -> results
|
1075
|
-
*
|
1288
|
+
*
|
1076
1289
|
* Uses this Amatch::PairDistance instance to match PairDistance#pattern against
|
1077
1290
|
* <code>strings</code>. It returns the pair distance measure, that is a
|
1078
1291
|
* returned value of 1.0 is an exact match, partial matches are lower
|
@@ -1088,7 +1301,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1088
1301
|
* Array of Floats respectively.
|
1089
1302
|
*/
|
1090
1303
|
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
1091
|
-
{
|
1304
|
+
{
|
1092
1305
|
VALUE result, strings, regexp = Qnil;
|
1093
1306
|
int use_regexp;
|
1094
1307
|
GET_STRUCT(PairDistance)
|
@@ -1146,7 +1359,7 @@ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
|
|
1146
1359
|
}
|
1147
1360
|
}
|
1148
1361
|
|
1149
|
-
/*
|
1362
|
+
/*
|
1150
1363
|
* Document-class: Amatch::Hamming
|
1151
1364
|
*
|
1152
1365
|
* This class computes the Hamming distance between two strings.
|
@@ -1176,7 +1389,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1176
1389
|
|
1177
1390
|
/*
|
1178
1391
|
* call-seq: match(strings) -> results
|
1179
|
-
*
|
1392
|
+
*
|
1180
1393
|
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1181
1394
|
* <code>strings</code>, that is compute the hamming distance between
|
1182
1395
|
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
@@ -1184,7 +1397,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1184
1397
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1185
1398
|
*/
|
1186
1399
|
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
1187
|
-
{
|
1400
|
+
{
|
1188
1401
|
GET_STRUCT(General)
|
1189
1402
|
return General_iterate_strings(amatch, strings, Hamming_match);
|
1190
1403
|
}
|
@@ -1200,7 +1413,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
|
1200
1413
|
* respectively.
|
1201
1414
|
*/
|
1202
1415
|
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
1203
|
-
{
|
1416
|
+
{
|
1204
1417
|
GET_STRUCT(General)
|
1205
1418
|
return General_iterate_strings(amatch, strings, Hamming_similar);
|
1206
1419
|
}
|
@@ -1222,7 +1435,7 @@ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
|
|
1222
1435
|
}
|
1223
1436
|
|
1224
1437
|
|
1225
|
-
/*
|
1438
|
+
/*
|
1226
1439
|
* Document-class: Amatch::LongestSubsequence
|
1227
1440
|
*
|
1228
1441
|
* This class computes the length of the longest subsequence common to two
|
@@ -1252,7 +1465,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1252
1465
|
|
1253
1466
|
/*
|
1254
1467
|
* call-seq: match(strings) -> results
|
1255
|
-
*
|
1468
|
+
*
|
1256
1469
|
* Uses this Amatch::LongestSubsequence instance to match
|
1257
1470
|
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1258
1471
|
* length of the longest common subsequence. <code>strings</code> has to be
|
@@ -1260,14 +1473,14 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1260
1473
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1261
1474
|
*/
|
1262
1475
|
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
1263
|
-
{
|
1476
|
+
{
|
1264
1477
|
GET_STRUCT(General)
|
1265
1478
|
return General_iterate_strings(amatch, strings, LongestSubsequence_match);
|
1266
1479
|
}
|
1267
1480
|
|
1268
1481
|
/*
|
1269
1482
|
* call-seq: similar(strings) -> results
|
1270
|
-
*
|
1483
|
+
*
|
1271
1484
|
* Uses this Amatch::LongestSubsequence instance to match
|
1272
1485
|
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1273
1486
|
* a longest subsequence distance metric number between 0.0 for very dissimilar
|
@@ -1276,7 +1489,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
|
1276
1489
|
* a Fixnum or an Array of Fixnums
|
1277
1490
|
*/
|
1278
1491
|
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
1279
|
-
{
|
1492
|
+
{
|
1280
1493
|
GET_STRUCT(General)
|
1281
1494
|
return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
|
1282
1495
|
}
|
@@ -1292,12 +1505,12 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
|
1292
1505
|
* is either a Float or an Array of Floats respectively.
|
1293
1506
|
*/
|
1294
1507
|
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
1295
|
-
{
|
1508
|
+
{
|
1296
1509
|
VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
|
1297
1510
|
return rb_LongestSubsequence_similar(amatch, strings);
|
1298
1511
|
}
|
1299
1512
|
|
1300
|
-
/*
|
1513
|
+
/*
|
1301
1514
|
* Document-class: Amatch::LongestSubstring
|
1302
1515
|
*
|
1303
1516
|
* The longest common substring is the longest substring that is part of
|
@@ -1308,7 +1521,7 @@ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
|
1308
1521
|
* The longest common substring between 'string' and 'string' is 'string'
|
1309
1522
|
* again, thus the longest common substring length is 6. The longest common
|
1310
1523
|
* substring between 'string' and 'storing' is 'ring', thus the longest common
|
1311
|
-
* substring length is 4.
|
1524
|
+
* substring length is 4.
|
1312
1525
|
*/
|
1313
1526
|
|
1314
1527
|
DEF_RB_FREE(LongestSubstring, General)
|
@@ -1329,7 +1542,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
|
|
1329
1542
|
|
1330
1543
|
/*
|
1331
1544
|
* call-seq: match(strings) -> results
|
1332
|
-
*
|
1545
|
+
*
|
1333
1546
|
* Uses this Amatch::LongestSubstring instance to match
|
1334
1547
|
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1335
1548
|
* length of the longest common substring. <code>strings</code> has to be
|
@@ -1344,7 +1557,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
|
1344
1557
|
|
1345
1558
|
/*
|
1346
1559
|
* call-seq: similar(strings) -> results
|
1347
|
-
*
|
1560
|
+
*
|
1348
1561
|
* Uses this Amatch::LongestSubstring instance to match
|
1349
1562
|
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1350
1563
|
* longest substring distance metric number between 0.0 for very dissimilar
|
@@ -1370,11 +1583,11 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
|
|
1370
1583
|
* is either a Float or an Array of Floats respectively.
|
1371
1584
|
*/
|
1372
1585
|
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
1373
|
-
{
|
1586
|
+
{
|
1374
1587
|
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
|
1375
1588
|
return rb_LongestSubstring_similar(amatch, strings);
|
1376
1589
|
}
|
1377
|
-
|
1590
|
+
|
1378
1591
|
/*
|
1379
1592
|
* Document-class: Amatch::Jaro
|
1380
1593
|
*
|
@@ -1571,6 +1784,17 @@ void Init_amatch_ext()
|
|
1571
1784
|
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1572
1785
|
rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1573
1786
|
|
1787
|
+
/* DamerauLevenshtein */
|
1788
|
+
rb_cDamerauLevenshtein = rb_define_class_under(rb_mAmatch, "DamerauLevenshtein", rb_cObject);
|
1789
|
+
rb_define_alloc_func(rb_cDamerauLevenshtein, rb_DamerauLevenshtein_s_allocate);
|
1790
|
+
rb_define_method(rb_cDamerauLevenshtein, "initialize", rb_DamerauLevenshtein_initialize, 1);
|
1791
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern", rb_General_pattern, 0);
|
1792
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern=", rb_General_pattern_set, 1);
|
1793
|
+
rb_define_method(rb_cDamerauLevenshtein, "match", rb_DamerauLevenshtein_match, 1);
|
1794
|
+
rb_define_method(rb_cDamerauLevenshtein, "search", rb_DamerauLevenshtein_search, 1);
|
1795
|
+
rb_define_method(rb_cDamerauLevenshtein, "similar", rb_DamerauLevenshtein_similar, 1);
|
1796
|
+
rb_define_method(rb_mAmatchStringMethods, "damerau_levenshtein_similar", rb_str_damerau_levenshtein_similar, 1);
|
1797
|
+
|
1574
1798
|
/* Sellers */
|
1575
1799
|
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
1576
1800
|
rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
|