amatch 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/CHANGES +3 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +5 -12
- data/VERSION +1 -1
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +298 -74
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +27 -8
- data/README.rdoc +0 -128
data/README.md
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for Approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, and the Jaro-Winkler metric.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
To install this extension as a gem type
|
14
|
+
|
15
|
+
# gem install amatch
|
16
|
+
|
17
|
+
into the shell.
|
18
|
+
|
19
|
+
## Download
|
20
|
+
|
21
|
+
The homepage of this library is located at
|
22
|
+
|
23
|
+
* https://github.com/flori/amatch
|
24
|
+
|
25
|
+
## Examples
|
26
|
+
|
27
|
+
require 'amatch'
|
28
|
+
# => true
|
29
|
+
include Amatch
|
30
|
+
# => Object
|
31
|
+
|
32
|
+
m = Sellers.new("pattern")
|
33
|
+
# => #<Amatch::Sellers:0x40366324>
|
34
|
+
m.match("pattren")
|
35
|
+
# => 2.0
|
36
|
+
m.substitution = m.insertion = 3
|
37
|
+
# => 3
|
38
|
+
m.match("pattren")
|
39
|
+
# => 4.0
|
40
|
+
m.reset_weights
|
41
|
+
# => #<Amatch::Sellers:0x40366324>
|
42
|
+
m.match(["pattren","parent"])
|
43
|
+
# => [2.0, 4.0]
|
44
|
+
m.search("abcpattrendef")
|
45
|
+
# => 2.0
|
46
|
+
|
47
|
+
m = Levenshtein.new("pattern")
|
48
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
49
|
+
m.match("pattren")
|
50
|
+
# => 2
|
51
|
+
m.search("abcpattrendef")
|
52
|
+
# => 2
|
53
|
+
"pattern language".levenshtein_similar("language of patterns")
|
54
|
+
# => 0.2
|
55
|
+
|
56
|
+
m = Amatch::DamerauLevenshtein.new("pattern")
|
57
|
+
# => #<Amatch::DamerauLevenshtein:0x007fc3483dd278>
|
58
|
+
m.match("pattren")
|
59
|
+
# => 1
|
60
|
+
"pattern language".damerau_levenshtein_similar("language of patterns")
|
61
|
+
# => 0.19999999999999996
|
62
|
+
|
63
|
+
m = Hamming.new("pattern")
|
64
|
+
# => #<Amatch::Hamming:0x40350858>
|
65
|
+
m.match("pattren")
|
66
|
+
# => 2
|
67
|
+
"pattern language".hamming_similar("language of patterns")
|
68
|
+
# => 0.1
|
69
|
+
|
70
|
+
m = PairDistance.new("pattern")
|
71
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
72
|
+
m.match("pattr en")
|
73
|
+
# => 0.545454545454545
|
74
|
+
m.match("pattr en", nil)
|
75
|
+
# => 0.461538461538462
|
76
|
+
m.match("pattr en", /t+/)
|
77
|
+
# => 0.285714285714286
|
78
|
+
"pattern language".pair_distance_similar("language of patterns")
|
79
|
+
# => 0.928571428571429
|
80
|
+
|
81
|
+
m = LongestSubsequence.new("pattern")
|
82
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
83
|
+
m.match("pattren")
|
84
|
+
# => 6
|
85
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
86
|
+
# => 0.4
|
87
|
+
|
88
|
+
m = LongestSubstring.new("pattern")
|
89
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
90
|
+
m.match("pattren")
|
91
|
+
# => 4
|
92
|
+
"pattern language".longest_substring_similar("language of patterns")
|
93
|
+
# => 0.4
|
94
|
+
|
95
|
+
m = Jaro.new("pattern")
|
96
|
+
# => #<Amatch::Jaro:0x363b70>
|
97
|
+
m.match("paTTren")
|
98
|
+
# => 0.952380952380952
|
99
|
+
m.ignore_case = false
|
100
|
+
m.match("paTTren")
|
101
|
+
# => 0.742857142857143
|
102
|
+
"pattern language".jaro_similar("language of patterns")
|
103
|
+
# => 0.672222222222222
|
104
|
+
|
105
|
+
m = JaroWinkler.new("pattern")
|
106
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
107
|
+
m.match("paTTren")
|
108
|
+
# => 0.971428571712403
|
109
|
+
m.ignore_case = false
|
110
|
+
m.match("paTTren")
|
111
|
+
# => 0.79428571505206
|
112
|
+
m.scaling_factor = 0.05
|
113
|
+
m.match("pattren")
|
114
|
+
# => 0.961904762046678
|
115
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
116
|
+
# => 0.672222222222222
|
117
|
+
|
118
|
+
## Author
|
119
|
+
|
120
|
+
[Florian Frank](mailto:flori@ping.de)
|
121
|
+
|
122
|
+
## License
|
123
|
+
|
124
|
+
Apache License, Version 2.0 – See the COPYING file in the source archive.
|
data/Rakefile
CHANGED
@@ -13,22 +13,15 @@ Amatch is a library for approximate string matching and searching in strings.
|
|
13
13
|
Several algorithms can be used to do this, and it's also possible to compute a
|
14
14
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
15
15
|
EOT
|
16
|
-
executables << 'agrep
|
16
|
+
executables << 'agrep' << 'dupfind'
|
17
17
|
bindir 'bin'
|
18
18
|
test_dir 'tests'
|
19
|
-
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', '
|
19
|
+
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', 'Makefile'
|
20
20
|
title "#{name.camelize} - Approximate Matching"
|
21
|
-
readme 'README.
|
21
|
+
readme 'README.md'
|
22
22
|
require_paths %w[lib ext]
|
23
23
|
dependency 'tins', '~>1.0'
|
24
|
+
dependency 'mize'
|
24
25
|
development_dependency 'test-unit', '~>3.0'
|
25
|
-
licenses << '
|
26
|
-
|
27
|
-
install_library do
|
28
|
-
libdir = CONFIG["sitelibdir"]
|
29
|
-
src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
|
30
|
-
install(src, File.join(libdir, File.basename(src)), :verbose => true)
|
31
|
-
mkdir_p dst = File.join(libdir, 'amatch')
|
32
|
-
install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
|
33
|
-
end
|
26
|
+
licenses << 'Apache-2.0'
|
34
27
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.0
|
data/amatch.gemspec
CHANGED
Binary file
|
data/bin/{agrep.rb → agrep}
RENAMED
@@ -15,15 +15,21 @@ end
|
|
15
15
|
|
16
16
|
class Amatch::Levenshtein
|
17
17
|
def search_relative(strings)
|
18
|
-
|
18
|
+
if Array === strings
|
19
|
+
search(strings).map { |s| s.to_f / pattern.size }
|
20
|
+
else
|
21
|
+
search(strings).to_f / pattern.size
|
22
|
+
end
|
19
23
|
end
|
20
24
|
end
|
21
25
|
|
26
|
+
$algorithm = 'Levenshtein'
|
22
27
|
$distance = 1
|
23
28
|
$mode = :search
|
24
29
|
begin
|
25
30
|
parser = GetoptLong.new
|
26
31
|
options = [
|
32
|
+
[ '--algorithm', '-a', GetoptLong::REQUIRED_ARGUMENT ],
|
27
33
|
[ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
|
28
34
|
[ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
|
29
35
|
[ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
|
@@ -33,6 +39,8 @@ begin
|
|
33
39
|
parser.each_option do |name, arg|
|
34
40
|
name = name.sub(/^--/, '')
|
35
41
|
case name
|
42
|
+
when 'algorithm'
|
43
|
+
$algorithm = arg
|
36
44
|
when 'distance'
|
37
45
|
$distance = arg.to_f
|
38
46
|
when 'relative'
|
@@ -48,7 +56,7 @@ rescue
|
|
48
56
|
end
|
49
57
|
pattern = ARGV.shift or usage('Pattern needed!', options)
|
50
58
|
|
51
|
-
matcher = Amatch
|
59
|
+
matcher = Amatch.const_get($algorithm).new(pattern)
|
52
60
|
size = 0
|
53
61
|
start = Time.new
|
54
62
|
if ARGV.size > 0 then
|
@@ -56,9 +64,12 @@ if ARGV.size > 0 then
|
|
56
64
|
File.stat(filename).file? or next
|
57
65
|
size += File.size(filename)
|
58
66
|
begin
|
59
|
-
File.open(filename, 'r').each_line do |
|
60
|
-
|
61
|
-
|
67
|
+
File.open(filename, 'r').each_line.each_slice(1000) do |lines|
|
68
|
+
results = matcher.__send__($mode, lines)
|
69
|
+
lines.zip(results) do |line, r|
|
70
|
+
if r <= $distance
|
71
|
+
puts "#{filename}:#{line}"
|
72
|
+
end
|
62
73
|
end
|
63
74
|
end
|
64
75
|
rescue
|
@@ -66,10 +77,13 @@ if ARGV.size > 0 then
|
|
66
77
|
end
|
67
78
|
end
|
68
79
|
else
|
69
|
-
STDIN.each_line do |
|
70
|
-
size +=
|
71
|
-
|
72
|
-
|
80
|
+
STDIN.each_line.each_slice(1000) do |lines|
|
81
|
+
size += lines.size
|
82
|
+
results = matcher.__send__($mode, lines)
|
83
|
+
lines.zip(results) do |line, r|
|
84
|
+
if r <= $distance
|
85
|
+
puts line
|
86
|
+
end
|
73
87
|
end
|
74
88
|
end
|
75
89
|
end
|
data/bin/dupfind
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'tins/go'
|
4
|
+
include Tins::GO
|
5
|
+
require 'tins/minimize'
|
6
|
+
class Array
|
7
|
+
include Tins::Minimize
|
8
|
+
end
|
9
|
+
require 'amatch'
|
10
|
+
begin
|
11
|
+
require 'infobar'
|
12
|
+
rescue LoadError
|
13
|
+
warn "Please install gem infobar to run this executable!"
|
14
|
+
exit 1
|
15
|
+
end
|
16
|
+
|
17
|
+
def usage
|
18
|
+
puts <<EOT
|
19
|
+
Usage: #{File.basename($0)} [OPTIONS] FILE
|
20
|
+
|
21
|
+
-a ALGO Amatch matching algorithm
|
22
|
+
-p LIMIT more than p similarity to be a match
|
23
|
+
-R NUMBER skip NUMBER mismatch for building ranges
|
24
|
+
-r NUMBER minimum length to be counted as a range
|
25
|
+
-i compute a PNG per file
|
26
|
+
|
27
|
+
Report bugs to <flori@ping.de>.
|
28
|
+
EOT
|
29
|
+
exit 0
|
30
|
+
end
|
31
|
+
|
32
|
+
class FindDuplicates
|
33
|
+
def initialize(algo, p_lim, filename)
|
34
|
+
@algo, @p_lim, @filename = algo, p_lim, filename
|
35
|
+
end
|
36
|
+
|
37
|
+
attr_reader :filename
|
38
|
+
|
39
|
+
attr_reader :algo
|
40
|
+
|
41
|
+
attr_reader :p_lim
|
42
|
+
|
43
|
+
memoize method:
|
44
|
+
def lines
|
45
|
+
File.readlines(filename)
|
46
|
+
end
|
47
|
+
|
48
|
+
memoize method:
|
49
|
+
def matrix
|
50
|
+
result = lines.with_infobar(label: filename, output: STDERR).map do |l1|
|
51
|
+
+infobar
|
52
|
+
a = algo.new(l1)
|
53
|
+
r = a.similar(lines)
|
54
|
+
r.map! { |s| s >= p_lim ? ?1 : ?0 }
|
55
|
+
r.join
|
56
|
+
end
|
57
|
+
infobar.finish
|
58
|
+
infobar.newline
|
59
|
+
result
|
60
|
+
end
|
61
|
+
|
62
|
+
def pbm(output: $>)
|
63
|
+
output << <<HEADER
|
64
|
+
P1
|
65
|
+
#{matrix.size} #{matrix.size}
|
66
|
+
HEADER
|
67
|
+
output << matrix.map { |line| line.each_char.to_a * ' ' } * ?\n
|
68
|
+
self
|
69
|
+
end
|
70
|
+
|
71
|
+
def png(output: $>)
|
72
|
+
IO.popen("pnmtopng", 'w+') do |conv|
|
73
|
+
pbm(output: conv)
|
74
|
+
conv.close_write
|
75
|
+
output.write(conv.read)
|
76
|
+
end
|
77
|
+
self
|
78
|
+
end
|
79
|
+
|
80
|
+
def create_image
|
81
|
+
suffix = Regexp.quote(File.extname(filename))
|
82
|
+
f = filename.sub(/(#{suffix}|)\z/, '.png')
|
83
|
+
File.open(f, 'wb') do |output|
|
84
|
+
png(output: output)
|
85
|
+
infobar.puts "Writing output to #{f.inspect}."
|
86
|
+
end
|
87
|
+
self
|
88
|
+
end
|
89
|
+
|
90
|
+
def similar_ranges(min_range: 3, skip_range: 0)
|
91
|
+
set = 0
|
92
|
+
ranges = { set => [] }
|
93
|
+
m = matrix
|
94
|
+
n = m.size
|
95
|
+
skip_count = 0
|
96
|
+
n.downto(1) do |h|
|
97
|
+
(n - h + 1).upto(n - 1) do |k|
|
98
|
+
i = k
|
99
|
+
j = k - (n - h + 1)
|
100
|
+
if m[i][j] == ?1
|
101
|
+
skip_count = 0
|
102
|
+
ranges[set] << [ i, j ]
|
103
|
+
elsif !ranges[set].empty? && skip_count < skip_range
|
104
|
+
skip_count += 1
|
105
|
+
else
|
106
|
+
skip_count = 0
|
107
|
+
ranges[set].empty? or ranges[set += 1] = []
|
108
|
+
end
|
109
|
+
end
|
110
|
+
skip_count = 0
|
111
|
+
ranges[set].empty? or ranges[set += 1] = []
|
112
|
+
end
|
113
|
+
ranges.each { |_, r|
|
114
|
+
r.flatten!
|
115
|
+
r.sort!
|
116
|
+
r.map! { |x| x + 1 }
|
117
|
+
r.minimize!
|
118
|
+
r.reject! { |s| s.size < min_range }
|
119
|
+
}.reject! { |_, r| r.empty? }
|
120
|
+
unions = []
|
121
|
+
while !ranges.empty?
|
122
|
+
_, r = ranges.first
|
123
|
+
equivalent = ranges.reject { |_, v| (v & r).empty? }
|
124
|
+
unions << equivalent.values.flatten.uniq
|
125
|
+
ranges.delete_if { |k, _| equivalent.keys.include?(k) }
|
126
|
+
end
|
127
|
+
unions.each do |r|
|
128
|
+
r.map! do |x|
|
129
|
+
"#{filename}:#{x.begin}-#{x.end}"
|
130
|
+
end
|
131
|
+
end
|
132
|
+
unions
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
opts = go 'a:p:R:r:ih'
|
137
|
+
|
138
|
+
usage if opts[?h]
|
139
|
+
algo = Amatch.const_get(opts[?a] || 'Levenshtein')
|
140
|
+
p_lim = (opts[?p] || 0.95).to_f
|
141
|
+
min_range = (opts[?r] || 3).to_i
|
142
|
+
skip_range = opts[?R].to_i
|
143
|
+
ARGV.empty? and usage
|
144
|
+
|
145
|
+
filenames = ARGV.inject([]) { |s, f| s.concat(Dir[f]) }
|
146
|
+
for filename in filenames
|
147
|
+
finder = FindDuplicates.new(algo, p_lim, filename)
|
148
|
+
opts[?i] and finder.create_image
|
149
|
+
for s in finder.similar_ranges(min_range: min_range, skip_range: skip_range)
|
150
|
+
infobar.reset
|
151
|
+
puts s, ?\n
|
152
|
+
end
|
153
|
+
end
|
data/ext/amatch_ext.c
CHANGED
@@ -3,24 +3,8 @@
|
|
3
3
|
#include <ctype.h>
|
4
4
|
#include "common.h"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
*
|
9
|
-
* call-seq: pattern -> pattern string
|
10
|
-
*
|
11
|
-
* Returns the current pattern string of this instance.
|
12
|
-
*/
|
13
|
-
|
14
|
-
/*
|
15
|
-
* Document-method: pattern=
|
16
|
-
*
|
17
|
-
* call-seq: pattern=(pattern)
|
18
|
-
*
|
19
|
-
* Sets the current pattern string of this instance to <code>pattern</code>.
|
20
|
-
*/
|
21
|
-
|
22
|
-
|
23
|
-
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
6
|
+
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein,
|
7
|
+
rb_cDamerauLevenshtein, rb_cSellers, rb_cHamming,
|
24
8
|
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
9
|
rb_cJaro, rb_cJaroWinkler;
|
26
10
|
|
@@ -230,9 +214,11 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
230
214
|
*/
|
231
215
|
|
232
216
|
#define COMPUTE_LEVENSHTEIN_DISTANCE \
|
233
|
-
|
217
|
+
c = 0; \
|
218
|
+
p = 0; \
|
219
|
+
for (i = 1; i <= a_len; i++) { \
|
234
220
|
c = i % 2; /* current row */ \
|
235
|
-
p = (i
|
221
|
+
p = (i - 1) % 2; /* previous row */ \
|
236
222
|
v[c][0] = i; /* first column */ \
|
237
223
|
for (j = 1; j <= b_len; j++) { \
|
238
224
|
/* Bellman's principle of optimality: */ \
|
@@ -245,8 +231,6 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
245
231
|
} \
|
246
232
|
v[c][j] = weight; \
|
247
233
|
} \
|
248
|
-
p = c; \
|
249
|
-
c = (c + 1) % 2; \
|
250
234
|
}
|
251
235
|
|
252
236
|
static VALUE Levenshtein_match(General *amatch, VALUE string)
|
@@ -269,7 +253,7 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
|
|
269
253
|
|
270
254
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
271
255
|
|
272
|
-
result = INT2FIX(v[
|
256
|
+
result = INT2FIX(v[c][b_len]);
|
273
257
|
|
274
258
|
xfree(v[0]);
|
275
259
|
xfree(v[1]);
|
@@ -287,6 +271,7 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
287
271
|
|
288
272
|
Check_Type(string, T_STRING);
|
289
273
|
DONT_OPTIMIZE
|
274
|
+
|
290
275
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
291
276
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
292
277
|
v[0] = ALLOC_N(int, b_len + 1);
|
@@ -299,12 +284,14 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
299
284
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
300
285
|
|
301
286
|
if (b_len > a_len) {
|
302
|
-
result = rb_float_new(1.0 - ((double) v[
|
287
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
303
288
|
} else {
|
304
|
-
result = rb_float_new(1.0 - ((double) v[
|
289
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
305
290
|
}
|
291
|
+
|
306
292
|
xfree(v[0]);
|
307
293
|
xfree(v[1]);
|
294
|
+
|
308
295
|
return result;
|
309
296
|
}
|
310
297
|
|
@@ -327,26 +314,159 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
327
314
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
328
315
|
|
329
316
|
for (i = 0, min = a_len; i <= b_len; i++) {
|
330
|
-
if (v[
|
317
|
+
if (v[c][i] < min) min = v[c][i];
|
331
318
|
}
|
332
319
|
|
333
320
|
result = INT2FIX(min);
|
334
321
|
|
335
322
|
xfree(v[0]);
|
336
323
|
xfree(v[1]);
|
337
|
-
|
324
|
+
|
325
|
+
return result;
|
326
|
+
}
|
327
|
+
|
328
|
+
/*
|
329
|
+
* DamerauLevenshtein edit distances are computed here:
|
330
|
+
*/
|
331
|
+
|
332
|
+
#define COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE \
|
333
|
+
c = 0; \
|
334
|
+
p = 0; \
|
335
|
+
pp = 0; \
|
336
|
+
for (i = 1; i <= a_len; i++) { \
|
337
|
+
c = i % 3; /* current row */ \
|
338
|
+
p = (i - 1) % 3; /* previous row */ \
|
339
|
+
pp = (i - 2) % 3; /* previous previous row */ \
|
340
|
+
v[c][0] = i; /* first column */ \
|
341
|
+
for (j = 1; j <= b_len; j++) { \
|
342
|
+
/* Bellman's principle of optimality: */ \
|
343
|
+
weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
344
|
+
if (weight > v[p][j] + 1) { \
|
345
|
+
weight = v[p][j] + 1; \
|
346
|
+
} \
|
347
|
+
if (weight > v[c][j - 1] + 1) { \
|
348
|
+
weight = v[c][j - 1] + 1; \
|
349
|
+
} \
|
350
|
+
if (i > 2 && j > 2 && a_ptr[i - 1] == b_ptr[j - 2] && a_ptr[i - 2] == b_ptr[j - 1]) {\
|
351
|
+
if (weight > v[pp][j - 2]) { \
|
352
|
+
weight = v[pp][j - 2] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
353
|
+
} \
|
354
|
+
} \
|
355
|
+
v[c][j] = weight; \
|
356
|
+
} \
|
357
|
+
}
|
358
|
+
|
359
|
+
static VALUE DamerauLevenshtein_match(General *amatch, VALUE string)
|
360
|
+
{
|
361
|
+
VALUE result;
|
362
|
+
char *a_ptr, *b_ptr;
|
363
|
+
int a_len, b_len;
|
364
|
+
int *v[3], weight;
|
365
|
+
int i, j, c, p, pp;
|
366
|
+
|
367
|
+
Check_Type(string, T_STRING);
|
368
|
+
DONT_OPTIMIZE
|
369
|
+
|
370
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
371
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
372
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
373
|
+
for (i = 0; i <= b_len; i++) {
|
374
|
+
v[0][i] = i;
|
375
|
+
v[1][i] = i;
|
376
|
+
v[2][i] = i;
|
377
|
+
}
|
378
|
+
|
379
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
380
|
+
|
381
|
+
result = INT2FIX(v[c][b_len]);
|
382
|
+
|
383
|
+
xfree(v[0]);
|
384
|
+
xfree(v[1]);
|
385
|
+
xfree(v[2]);
|
386
|
+
|
338
387
|
return result;
|
339
388
|
}
|
340
389
|
|
390
|
+
static VALUE DamerauLevenshtein_similar(General *amatch, VALUE string)
|
391
|
+
{
|
392
|
+
VALUE result;
|
393
|
+
char *a_ptr, *b_ptr;
|
394
|
+
int a_len, b_len;
|
395
|
+
int *v[3], weight;
|
396
|
+
int i, j, c, p, pp;
|
397
|
+
|
398
|
+
Check_Type(string, T_STRING);
|
399
|
+
DONT_OPTIMIZE
|
400
|
+
|
401
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
402
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
403
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
404
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
405
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
406
|
+
for (i = 0; i <= b_len; i++) {
|
407
|
+
v[0][i] = i;
|
408
|
+
v[1][i] = i;
|
409
|
+
v[2][i] = i;
|
410
|
+
}
|
411
|
+
|
412
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
413
|
+
|
414
|
+
if (b_len > a_len) {
|
415
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
416
|
+
} else {
|
417
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
418
|
+
}
|
419
|
+
|
420
|
+
xfree(v[0]);
|
421
|
+
xfree(v[1]);
|
422
|
+
xfree(v[2]);
|
423
|
+
|
424
|
+
return result;
|
425
|
+
}
|
426
|
+
|
427
|
+
static VALUE DamerauLevenshtein_search(General *amatch, VALUE string)
|
428
|
+
{
|
429
|
+
VALUE result;
|
430
|
+
char *a_ptr, *b_ptr;
|
431
|
+
int a_len, b_len;
|
432
|
+
int *v[3], weight, min;
|
433
|
+
int i, j, c, p, pp;
|
434
|
+
|
435
|
+
Check_Type(string, T_STRING);
|
436
|
+
DONT_OPTIMIZE
|
437
|
+
|
438
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
439
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
440
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
441
|
+
MEMZERO(v[0], int, b_len + 1);
|
442
|
+
MEMZERO(v[1], int, b_len + 1);
|
443
|
+
MEMZERO(v[2], int, b_len + 1);
|
444
|
+
|
445
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
446
|
+
|
447
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
448
|
+
if (v[c][i] < min) min = v[c][i];
|
449
|
+
}
|
450
|
+
|
451
|
+
result = INT2FIX(min);
|
452
|
+
|
453
|
+
xfree(v[0]);
|
454
|
+
xfree(v[1]);
|
455
|
+
xfree(v[2]);
|
456
|
+
|
457
|
+
return result;
|
458
|
+
}
|
341
459
|
|
342
460
|
/*
|
343
461
|
* Sellers edit distances are computed here:
|
344
462
|
*/
|
345
463
|
|
346
464
|
#define COMPUTE_SELLERS_DISTANCE \
|
347
|
-
|
465
|
+
c = 0; \
|
466
|
+
p = 0; \
|
467
|
+
for (i = 1; i <= a_len; i++) { \
|
348
468
|
c = i % 2; /* current row */ \
|
349
|
-
p = (i
|
469
|
+
p = (i - 1) % 2; /* previous row */ \
|
350
470
|
v[c][0] = i * amatch->deletion; /* first column */ \
|
351
471
|
for (j = 1; j <= b_len; j++) { \
|
352
472
|
/* Bellman's principle of optimality: */ \
|
@@ -361,7 +481,6 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
361
481
|
v[c][j] = weight; \
|
362
482
|
} \
|
363
483
|
p = c; \
|
364
|
-
c = (c + 1) % 2; \
|
365
484
|
}
|
366
485
|
|
367
486
|
static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
@@ -411,9 +530,10 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
|
411
530
|
max_weight = amatch->deletion;
|
412
531
|
}
|
413
532
|
}
|
414
|
-
|
533
|
+
|
415
534
|
Check_Type(string, T_STRING);
|
416
535
|
DONT_OPTIMIZE
|
536
|
+
|
417
537
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
418
538
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
419
539
|
v[0] = ALLOC_N(double, b_len + 1);
|
@@ -459,7 +579,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
459
579
|
result = rb_float_new(min);
|
460
580
|
xfree(v[0]);
|
461
581
|
xfree(v[1]);
|
462
|
-
|
582
|
+
|
463
583
|
return result;
|
464
584
|
}
|
465
585
|
|
@@ -470,7 +590,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
470
590
|
static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
471
591
|
{
|
472
592
|
double result;
|
473
|
-
VALUE
|
593
|
+
VALUE string_tokens, tokens;
|
474
594
|
PairArray *pattern_pair_array, *pair_array;
|
475
595
|
|
476
596
|
Check_Type(string, T_STRING);
|
@@ -518,7 +638,7 @@ static VALUE Hamming_match(General *amatch, VALUE string)
|
|
518
638
|
char *a_ptr, *b_ptr;
|
519
639
|
int a_len, b_len;
|
520
640
|
int i, result;
|
521
|
-
|
641
|
+
|
522
642
|
Check_Type(string, T_STRING);
|
523
643
|
OPTIMIZE_TIME
|
524
644
|
COMPUTE_HAMMING_DISTANCE
|
@@ -530,7 +650,7 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
|
|
530
650
|
char *a_ptr, *b_ptr;
|
531
651
|
int a_len, b_len;
|
532
652
|
int i, result;
|
533
|
-
|
653
|
+
|
534
654
|
Check_Type(string, T_STRING);
|
535
655
|
OPTIMIZE_TIME
|
536
656
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -570,7 +690,7 @@ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
|
570
690
|
char *a_ptr, *b_ptr;
|
571
691
|
int a_len, b_len;
|
572
692
|
int result, c, p, i, j, *l[2];
|
573
|
-
|
693
|
+
|
574
694
|
Check_Type(string, T_STRING);
|
575
695
|
OPTIMIZE_TIME
|
576
696
|
|
@@ -584,7 +704,7 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
|
584
704
|
char *a_ptr, *b_ptr;
|
585
705
|
int a_len, b_len;
|
586
706
|
int result, c, p, i, j, *l[2];
|
587
|
-
|
707
|
+
|
588
708
|
Check_Type(string, T_STRING);
|
589
709
|
OPTIMIZE_TIME
|
590
710
|
|
@@ -624,7 +744,7 @@ static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
|
624
744
|
char *a_ptr, *b_ptr;
|
625
745
|
int a_len, b_len;
|
626
746
|
int result, c, p, i, j, *l[2];
|
627
|
-
|
747
|
+
|
628
748
|
Check_Type(string, T_STRING);
|
629
749
|
OPTIMIZE_TIME
|
630
750
|
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
@@ -637,7 +757,7 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
637
757
|
char *a_ptr, *b_ptr;
|
638
758
|
int a_len, b_len;
|
639
759
|
int result, c, p, i, j, *l[2];
|
640
|
-
|
760
|
+
|
641
761
|
Check_Type(string, T_STRING);
|
642
762
|
OPTIMIZE_TIME
|
643
763
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -769,7 +889,7 @@ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
|
|
769
889
|
* Ruby API
|
770
890
|
*/
|
771
891
|
|
772
|
-
/*
|
892
|
+
/*
|
773
893
|
* Document-class: Amatch::Levenshtein
|
774
894
|
*
|
775
895
|
* The Levenshtein edit distance is defined as the minimal costs involved to
|
@@ -802,7 +922,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
802
922
|
|
803
923
|
/*
|
804
924
|
* call-seq: match(strings) -> results
|
805
|
-
*
|
925
|
+
*
|
806
926
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
807
927
|
* against <code>strings</code>. It returns the number of operations, the edit
|
808
928
|
* distance. <code>strings</code> has to be either a String or an Array of
|
@@ -810,14 +930,14 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
810
930
|
* Floats respectively.
|
811
931
|
*/
|
812
932
|
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
813
|
-
{
|
933
|
+
{
|
814
934
|
GET_STRUCT(General)
|
815
935
|
return General_iterate_strings(amatch, strings, Levenshtein_match);
|
816
936
|
}
|
817
937
|
|
818
938
|
/*
|
819
939
|
* call-seq: similar(strings) -> results
|
820
|
-
*
|
940
|
+
*
|
821
941
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
822
942
|
* against <code>strings</code>, and compute a Levenshtein distance metric
|
823
943
|
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
@@ -826,14 +946,14 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
|
826
946
|
* respectively.
|
827
947
|
*/
|
828
948
|
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
829
|
-
{
|
949
|
+
{
|
830
950
|
GET_STRUCT(General)
|
831
951
|
return General_iterate_strings(amatch, strings, Levenshtein_similar);
|
832
952
|
}
|
833
953
|
|
834
954
|
/*
|
835
955
|
* call-seq: levenshtein_similar(strings) -> results
|
836
|
-
*
|
956
|
+
*
|
837
957
|
* If called on a String, this string is used as a Amatch::Levenshtein#pattern
|
838
958
|
* to match against <code>strings</code>. It returns a Levenshtein distance
|
839
959
|
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
@@ -849,7 +969,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
849
969
|
|
850
970
|
/*
|
851
971
|
* call-seq: search(strings) -> results
|
852
|
-
*
|
972
|
+
*
|
853
973
|
* searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
|
854
974
|
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
855
975
|
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
@@ -857,12 +977,105 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
857
977
|
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
858
978
|
*/
|
859
979
|
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
|
860
|
-
{
|
980
|
+
{
|
861
981
|
GET_STRUCT(General)
|
862
982
|
return General_iterate_strings(amatch, strings, Levenshtein_search);
|
863
983
|
}
|
864
984
|
|
865
|
-
/*
|
985
|
+
/*
|
986
|
+
* Document-class: Amatch::DamerauLevenshtein
|
987
|
+
*
|
988
|
+
* The DamerauLevenshtein edit distance is defined as the minimal costs
|
989
|
+
* involved to transform one string into another by using four elementary
|
990
|
+
* operations: deletion, insertion, substitution and adjacent transposition. To
|
991
|
+
* transform "water" into "wine", for instance, you have to substitute "a" ->
|
992
|
+
* "i": "witer", "t" -> "n": "winer" and delete "r": "wine". The edit distance
|
993
|
+
* between "water" and "wine" is 3, because you have to apply three
|
994
|
+
* operations. The edit distance between "wine" and "wine" is 0 of course: no
|
995
|
+
* operation is necessary for the transformation -- they're already the same
|
996
|
+
* string. It's easy to see that more similar strings have smaller edit
|
997
|
+
* distances than strings that differ a lot.
|
998
|
+
*/
|
999
|
+
|
1000
|
+
DEF_RB_FREE(DamerauLevenshtein, General)
|
1001
|
+
|
1002
|
+
/*
|
1003
|
+
* call-seq: new(pattern)
|
1004
|
+
*
|
1005
|
+
* Creates a new Amatch::DamerauLevenshtein instance from <code>pattern</code>.
|
1006
|
+
*/
|
1007
|
+
static VALUE rb_DamerauLevenshtein_initialize(VALUE self, VALUE pattern)
|
1008
|
+
{
|
1009
|
+
GET_STRUCT(General)
|
1010
|
+
General_pattern_set(amatch, pattern);
|
1011
|
+
return self;
|
1012
|
+
}
|
1013
|
+
|
1014
|
+
DEF_CONSTRUCTOR(DamerauLevenshtein, General)
|
1015
|
+
|
1016
|
+
/*
|
1017
|
+
* call-seq: match(strings) -> results
|
1018
|
+
*
|
1019
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1020
|
+
* against <code>strings</code>. It returns the number of operations, the edit
|
1021
|
+
* distance. <code>strings</code> has to be either a String or an Array of
|
1022
|
+
* Strings. The returned <code>results</code> is either a Fixnum or an Array of
|
1023
|
+
* Fixnums respectively.
|
1024
|
+
*/
|
1025
|
+
static VALUE rb_DamerauLevenshtein_match(VALUE self, VALUE strings)
|
1026
|
+
{
|
1027
|
+
GET_STRUCT(General)
|
1028
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_match);
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
/*
|
1032
|
+
* call-seq: similar(strings) -> results
|
1033
|
+
*
|
1034
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1035
|
+
* against <code>strings</code>, and compute a DamerauLevenshtein distance metric
|
1036
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1037
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1038
|
+
* returned <code>results</code> is either a Float or an Array of Floats
|
1039
|
+
* respectively.
|
1040
|
+
*/
|
1041
|
+
static VALUE rb_DamerauLevenshtein_similar(VALUE self, VALUE strings)
|
1042
|
+
{
|
1043
|
+
GET_STRUCT(General)
|
1044
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_similar);
|
1045
|
+
}
|
1046
|
+
|
1047
|
+
/*
|
1048
|
+
* call-seq: damerau_levenshtein_similar(strings) -> results
|
1049
|
+
*
|
1050
|
+
* If called on a String, this string is used as a Amatch::DamerauLevenshtein#pattern
|
1051
|
+
* to match against <code>strings</code>. It returns a DamerauLevenshtein distance
|
1052
|
+
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
1053
|
+
* match. <code>strings</code> has to be either a String or an Array of
|
1054
|
+
* Strings. The returned <code>results</code> is either a Float or an Array of
|
1055
|
+
* Floats respectively.
|
1056
|
+
*/
|
1057
|
+
static VALUE rb_str_damerau_levenshtein_similar(VALUE self, VALUE strings)
|
1058
|
+
{
|
1059
|
+
VALUE amatch = rb_DamerauLevenshtein_new(rb_cDamerauLevenshtein, self);
|
1060
|
+
return rb_DamerauLevenshtein_similar(amatch, strings);
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
/*
|
1064
|
+
* call-seq: search(strings) -> results
|
1065
|
+
*
|
1066
|
+
* searches Amatch::DamerauLevenshtein#pattern in <code>strings</code> and returns the
|
1067
|
+
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
1068
|
+
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
1069
|
+
* to be either a String or an Array of Strings. The returned
|
1070
|
+
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
1071
|
+
*/
|
1072
|
+
static VALUE rb_DamerauLevenshtein_search(VALUE self, VALUE strings)
|
1073
|
+
{
|
1074
|
+
GET_STRUCT(General)
|
1075
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_search);
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
/*
|
866
1079
|
* Document-class: Amatch::Sellers
|
867
1080
|
*
|
868
1081
|
* The Sellers edit distance is very similar to the Levenshtein edit distance.
|
@@ -981,14 +1194,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
981
1194
|
* Document-method: pattern=
|
982
1195
|
*
|
983
1196
|
* call-seq: pattern=(pattern)
|
984
|
-
*
|
1197
|
+
*
|
985
1198
|
* Sets the current pattern string of this Amatch::Sellers instance to
|
986
1199
|
* <code>pattern</code>.
|
987
1200
|
*/
|
988
1201
|
|
989
1202
|
/*
|
990
1203
|
* call-seq: match(strings) -> results
|
991
|
-
*
|
1204
|
+
*
|
992
1205
|
* Uses this Amatch::Sellers instance to match Sellers#pattern against
|
993
1206
|
* <code>strings</code>, while taking into account the given weights. It
|
994
1207
|
* returns the number of weighted character operations, the Sellers distance.
|
@@ -997,14 +1210,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
997
1210
|
* respectively.
|
998
1211
|
*/
|
999
1212
|
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
1000
|
-
{
|
1213
|
+
{
|
1001
1214
|
GET_STRUCT(Sellers)
|
1002
1215
|
return Sellers_iterate_strings(amatch, strings, Sellers_match);
|
1003
1216
|
}
|
1004
1217
|
|
1005
1218
|
/*
|
1006
1219
|
* call-seq: similar(strings) -> results
|
1007
|
-
*
|
1220
|
+
*
|
1008
1221
|
* Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
|
1009
1222
|
* against <code>strings</code> (taking into account the given weights), and
|
1010
1223
|
* compute a Sellers distance metric number between 0.0 for very unsimilar
|
@@ -1014,7 +1227,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
|
1014
1227
|
* respectively.
|
1015
1228
|
*/
|
1016
1229
|
static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
1017
|
-
{
|
1230
|
+
{
|
1018
1231
|
GET_STRUCT(Sellers)
|
1019
1232
|
return Sellers_iterate_strings(amatch, strings, Sellers_similar);
|
1020
1233
|
}
|
@@ -1029,12 +1242,12 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
|
1029
1242
|
* <code>results</code> is either a Float or an Array of Floats respectively.
|
1030
1243
|
*/
|
1031
1244
|
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
1032
|
-
{
|
1245
|
+
{
|
1033
1246
|
GET_STRUCT(Sellers)
|
1034
1247
|
return Sellers_iterate_strings(amatch, strings, Sellers_search);
|
1035
1248
|
}
|
1036
1249
|
|
1037
|
-
/*
|
1250
|
+
/*
|
1038
1251
|
* Document-class: Amatch::PairDistance
|
1039
1252
|
*
|
1040
1253
|
* The pair distance between two strings is based on the number of adjacent
|
@@ -1045,7 +1258,7 @@ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
|
1045
1258
|
* are more dissimilar. The advantage of considering adjacent characters is to
|
1046
1259
|
* take account not only of the characters, but also of the character ordering
|
1047
1260
|
* in the original strings.
|
1048
|
-
*
|
1261
|
+
*
|
1049
1262
|
* This metric is very capable of finding similarities in natural languages.
|
1050
1263
|
* It is explained in more detail in Simon White's article "How to Strike a
|
1051
1264
|
* Match", located at this url:
|
@@ -1072,7 +1285,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1072
1285
|
|
1073
1286
|
/*
|
1074
1287
|
* call-seq: match(strings, regexp = /\s+/) -> results
|
1075
|
-
*
|
1288
|
+
*
|
1076
1289
|
* Uses this Amatch::PairDistance instance to match PairDistance#pattern against
|
1077
1290
|
* <code>strings</code>. It returns the pair distance measure, that is a
|
1078
1291
|
* returned value of 1.0 is an exact match, partial matches are lower
|
@@ -1088,7 +1301,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1088
1301
|
* Array of Floats respectively.
|
1089
1302
|
*/
|
1090
1303
|
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
1091
|
-
{
|
1304
|
+
{
|
1092
1305
|
VALUE result, strings, regexp = Qnil;
|
1093
1306
|
int use_regexp;
|
1094
1307
|
GET_STRUCT(PairDistance)
|
@@ -1146,7 +1359,7 @@ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
|
|
1146
1359
|
}
|
1147
1360
|
}
|
1148
1361
|
|
1149
|
-
/*
|
1362
|
+
/*
|
1150
1363
|
* Document-class: Amatch::Hamming
|
1151
1364
|
*
|
1152
1365
|
* This class computes the Hamming distance between two strings.
|
@@ -1176,7 +1389,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1176
1389
|
|
1177
1390
|
/*
|
1178
1391
|
* call-seq: match(strings) -> results
|
1179
|
-
*
|
1392
|
+
*
|
1180
1393
|
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1181
1394
|
* <code>strings</code>, that is compute the hamming distance between
|
1182
1395
|
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
@@ -1184,7 +1397,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1184
1397
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1185
1398
|
*/
|
1186
1399
|
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
1187
|
-
{
|
1400
|
+
{
|
1188
1401
|
GET_STRUCT(General)
|
1189
1402
|
return General_iterate_strings(amatch, strings, Hamming_match);
|
1190
1403
|
}
|
@@ -1200,7 +1413,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
|
1200
1413
|
* respectively.
|
1201
1414
|
*/
|
1202
1415
|
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
1203
|
-
{
|
1416
|
+
{
|
1204
1417
|
GET_STRUCT(General)
|
1205
1418
|
return General_iterate_strings(amatch, strings, Hamming_similar);
|
1206
1419
|
}
|
@@ -1222,7 +1435,7 @@ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
|
|
1222
1435
|
}
|
1223
1436
|
|
1224
1437
|
|
1225
|
-
/*
|
1438
|
+
/*
|
1226
1439
|
* Document-class: Amatch::LongestSubsequence
|
1227
1440
|
*
|
1228
1441
|
* This class computes the length of the longest subsequence common to two
|
@@ -1252,7 +1465,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1252
1465
|
|
1253
1466
|
/*
|
1254
1467
|
* call-seq: match(strings) -> results
|
1255
|
-
*
|
1468
|
+
*
|
1256
1469
|
* Uses this Amatch::LongestSubsequence instance to match
|
1257
1470
|
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1258
1471
|
* length of the longest common subsequence. <code>strings</code> has to be
|
@@ -1260,14 +1473,14 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1260
1473
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1261
1474
|
*/
|
1262
1475
|
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
1263
|
-
{
|
1476
|
+
{
|
1264
1477
|
GET_STRUCT(General)
|
1265
1478
|
return General_iterate_strings(amatch, strings, LongestSubsequence_match);
|
1266
1479
|
}
|
1267
1480
|
|
1268
1481
|
/*
|
1269
1482
|
* call-seq: similar(strings) -> results
|
1270
|
-
*
|
1483
|
+
*
|
1271
1484
|
* Uses this Amatch::LongestSubsequence instance to match
|
1272
1485
|
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1273
1486
|
* a longest subsequence distance metric number between 0.0 for very unsimilar
|
@@ -1276,7 +1489,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
|
1276
1489
|
* a Float or an Array of Floats
|
1277
1490
|
*/
|
1278
1491
|
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
1279
|
-
{
|
1492
|
+
{
|
1280
1493
|
GET_STRUCT(General)
|
1281
1494
|
return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
|
1282
1495
|
}
|
@@ -1292,12 +1505,12 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
|
1292
1505
|
* is either a Float or an Array of Floats respectively.
|
1293
1506
|
*/
|
1294
1507
|
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
1295
|
-
{
|
1508
|
+
{
|
1296
1509
|
VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
|
1297
1510
|
return rb_LongestSubsequence_similar(amatch, strings);
|
1298
1511
|
}
|
1299
1512
|
|
1300
|
-
/*
|
1513
|
+
/*
|
1301
1514
|
* Document-class: Amatch::LongestSubstring
|
1302
1515
|
*
|
1303
1516
|
* The longest common substring is the longest substring, that is part of
|
@@ -1308,7 +1521,7 @@ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
|
1308
1521
|
* The longest common substring between 'string' and 'string' is 'string'
|
1309
1522
|
* again, thus the longest common substring length is 6. The longest common
|
1310
1523
|
* substring between 'string' and 'storing' is 'ring', thus the longest common
|
1311
|
-
* substring length is 4.
|
1524
|
+
* substring length is 4.
|
1312
1525
|
*/
|
1313
1526
|
|
1314
1527
|
DEF_RB_FREE(LongestSubstring, General)
|
@@ -1329,7 +1542,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
|
|
1329
1542
|
|
1330
1543
|
/*
|
1331
1544
|
* call-seq: match(strings) -> results
|
1332
|
-
*
|
1545
|
+
*
|
1333
1546
|
* Uses this Amatch::LongestSubstring instance to match
|
1334
1547
|
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1335
1548
|
* length of the longest common substring. <code>strings</code> has to be
|
@@ -1344,7 +1557,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
|
1344
1557
|
|
1345
1558
|
/*
|
1346
1559
|
* call-seq: similar(strings) -> results
|
1347
|
-
*
|
1560
|
+
*
|
1348
1561
|
* Uses this Amatch::LongestSubstring instance to match
|
1349
1562
|
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1350
1563
|
* longest substring distance metric number between 0.0 for very unsimilar
|
@@ -1370,11 +1583,11 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
|
|
1370
1583
|
* is either a Float or an Array of Floats respectively.
|
1371
1584
|
*/
|
1372
1585
|
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
1373
|
-
{
|
1586
|
+
{
|
1374
1587
|
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
|
1375
1588
|
return rb_LongestSubstring_similar(amatch, strings);
|
1376
1589
|
}
|
1377
|
-
|
1590
|
+
|
1378
1591
|
/*
|
1379
1592
|
* Document-class: Amatch::Jaro
|
1380
1593
|
*
|
@@ -1571,6 +1784,17 @@ void Init_amatch_ext()
|
|
1571
1784
|
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1572
1785
|
rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1573
1786
|
|
1787
|
+
/* DamerauLevenshtein */
|
1788
|
+
rb_cDamerauLevenshtein = rb_define_class_under(rb_mAmatch, "DamerauLevenshtein", rb_cObject);
|
1789
|
+
rb_define_alloc_func(rb_cDamerauLevenshtein, rb_DamerauLevenshtein_s_allocate);
|
1790
|
+
rb_define_method(rb_cDamerauLevenshtein, "initialize", rb_DamerauLevenshtein_initialize, 1);
|
1791
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern", rb_General_pattern, 0);
|
1792
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern=", rb_General_pattern_set, 1);
|
1793
|
+
rb_define_method(rb_cDamerauLevenshtein, "match", rb_DamerauLevenshtein_match, 1);
|
1794
|
+
rb_define_method(rb_cDamerauLevenshtein, "search", rb_DamerauLevenshtein_search, 1);
|
1795
|
+
rb_define_method(rb_cDamerauLevenshtein, "similar", rb_DamerauLevenshtein_similar, 1);
|
1796
|
+
rb_define_method(rb_mAmatchStringMethods, "damerau_levenshtein_similar", rb_str_damerau_levenshtein_similar, 1);
|
1797
|
+
|
1574
1798
|
/* Sellers */
|
1575
1799
|
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
1576
1800
|
rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
|