amatch 0.3.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGES +7 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +9 -14
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +313 -91
- data/ext/pair.c +3 -1
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +33 -37
- data/.gitignore +0 -6
- data/.travis.yml +0 -10
- data/README.rdoc +0 -128
- data/VERSION +0 -1
data/README.md
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for Approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, and the Jaro-Winkler metric.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
To install this extension as a gem type
|
14
|
+
|
15
|
+
# gem install amatch
|
16
|
+
|
17
|
+
into the shell.
|
18
|
+
|
19
|
+
## Download
|
20
|
+
|
21
|
+
The homepage of this library is located at
|
22
|
+
|
23
|
+
* https://github.com/flori/amatch
|
24
|
+
|
25
|
+
## Examples
|
26
|
+
|
27
|
+
require 'amatch'
|
28
|
+
# => true
|
29
|
+
include Amatch
|
30
|
+
# => Object
|
31
|
+
|
32
|
+
m = Sellers.new("pattern")
|
33
|
+
# => #<Amatch::Sellers:0x40366324>
|
34
|
+
m.match("pattren")
|
35
|
+
# => 2.0
|
36
|
+
m.substitution = m.insertion = 3
|
37
|
+
# => 3
|
38
|
+
m.match("pattren")
|
39
|
+
# => 4.0
|
40
|
+
m.reset_weights
|
41
|
+
# => #<Amatch::Sellers:0x40366324>
|
42
|
+
m.match(["pattren","parent"])
|
43
|
+
# => [2.0, 4.0]
|
44
|
+
m.search("abcpattrendef")
|
45
|
+
# => 2.0
|
46
|
+
|
47
|
+
m = Levenshtein.new("pattern")
|
48
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
49
|
+
m.match("pattren")
|
50
|
+
# => 2
|
51
|
+
m.search("abcpattrendef")
|
52
|
+
# => 2
|
53
|
+
"pattern language".levenshtein_similar("language of patterns")
|
54
|
+
# => 0.2
|
55
|
+
|
56
|
+
m = Amatch::DamerauLevenshtein.new("pattern")
|
57
|
+
# => #<Amatch::DamerauLevenshtein:0x007fc3483dd278>
|
58
|
+
m.match("pattren")
|
59
|
+
# => 1
|
60
|
+
"pattern language".damerau_levenshtein_similar("language of patterns")
|
61
|
+
# => 0.19999999999999996
|
62
|
+
|
63
|
+
m = Hamming.new("pattern")
|
64
|
+
# => #<Amatch::Hamming:0x40350858>
|
65
|
+
m.match("pattren")
|
66
|
+
# => 2
|
67
|
+
"pattern language".hamming_similar("language of patterns")
|
68
|
+
# => 0.1
|
69
|
+
|
70
|
+
m = PairDistance.new("pattern")
|
71
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
72
|
+
m.match("pattr en")
|
73
|
+
# => 0.545454545454545
|
74
|
+
m.match("pattr en", nil)
|
75
|
+
# => 0.461538461538462
|
76
|
+
m.match("pattr en", /t+/)
|
77
|
+
# => 0.285714285714286
|
78
|
+
"pattern language".pair_distance_similar("language of patterns")
|
79
|
+
# => 0.928571428571429
|
80
|
+
|
81
|
+
m = LongestSubsequence.new("pattern")
|
82
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
83
|
+
m.match("pattren")
|
84
|
+
# => 6
|
85
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
86
|
+
# => 0.4
|
87
|
+
|
88
|
+
m = LongestSubstring.new("pattern")
|
89
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
90
|
+
m.match("pattren")
|
91
|
+
# => 4
|
92
|
+
"pattern language".longest_substring_similar("language of patterns")
|
93
|
+
# => 0.4
|
94
|
+
|
95
|
+
m = Jaro.new("pattern")
|
96
|
+
# => #<Amatch::Jaro:0x363b70>
|
97
|
+
m.match("paTTren")
|
98
|
+
# => 0.952380952380952
|
99
|
+
m.ignore_case = false
|
100
|
+
m.match("paTTren")
|
101
|
+
# => 0.742857142857143
|
102
|
+
"pattern language".jaro_similar("language of patterns")
|
103
|
+
# => 0.672222222222222
|
104
|
+
|
105
|
+
m = JaroWinkler.new("pattern")
|
106
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
107
|
+
m.match("paTTren")
|
108
|
+
# => 0.971428571712403
|
109
|
+
m.ignore_case = false
|
110
|
+
m.match("paTTren")
|
111
|
+
# => 0.79428571505206
|
112
|
+
m.scaling_factor = 0.05
|
113
|
+
m.match("pattren")
|
114
|
+
# => 0.961904762046678
|
115
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
116
|
+
# => 0.672222222222222
|
117
|
+
|
118
|
+
## Author
|
119
|
+
|
120
|
+
[Florian Frank](mailto:flori@ping.de)
|
121
|
+
|
122
|
+
## License
|
123
|
+
|
124
|
+
Apache License, Version 2.0 – See the COPYING file in the source archive.
|
data/Rakefile
CHANGED
@@ -13,23 +13,18 @@ Amatch is a library for approximate string matching and searching in strings.
|
|
13
13
|
Several algorithms can be used to do this, and it's also possible to compute a
|
14
14
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
15
15
|
EOT
|
16
|
-
executables << 'agrep.rb'
|
16
|
+
executables << 'agrep' << 'dupfind'
|
17
17
|
bindir 'bin'
|
18
18
|
test_dir 'tests'
|
19
|
-
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', '
|
19
|
+
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', 'Makefile'
|
20
|
+
package_ignore '.all_images.yml', '.gitignore', 'VERSION'
|
20
21
|
title "#{name.camelize} - Approximate Matching"
|
21
|
-
readme 'README.rdoc'
|
22
|
+
readme 'README.md'
|
22
23
|
require_paths %w[lib ext]
|
23
24
|
dependency 'tins', '~>1.0'
|
24
|
-
|
25
|
-
development_dependency '
|
26
|
-
development_dependency '
|
27
|
-
|
28
|
-
|
29
|
-
libdir = CONFIG["sitelibdir"]
|
30
|
-
src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
|
31
|
-
install(src, File.join(libdir, File.basename(src)), :verbose => true)
|
32
|
-
mkdir_p dst = File.join(libdir, 'amatch')
|
33
|
-
install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
|
34
|
-
end
|
25
|
+
dependency 'mize'
|
26
|
+
development_dependency 'test-unit', '~>3.0'
|
27
|
+
development_dependency 'all_images'
|
28
|
+
required_ruby_version '>=2.4'
|
29
|
+
licenses << 'Apache-2.0'
|
35
30
|
end
|
data/amatch.gemspec
CHANGED
Binary file
|
data/bin/{agrep.rb → agrep}
RENAMED
@@ -15,15 +15,21 @@ end
|
|
15
15
|
|
16
16
|
class Amatch::Levenshtein
|
17
17
|
def search_relative(strings)
|
18
|
-
|
18
|
+
if Array === strings
|
19
|
+
search(strings).map { |s| s.to_f / pattern.size }
|
20
|
+
else
|
21
|
+
search(strings).to_f / pattern.size
|
22
|
+
end
|
19
23
|
end
|
20
24
|
end
|
21
25
|
|
26
|
+
$algorithm = 'Levenshtein'
|
22
27
|
$distance = 1
|
23
28
|
$mode = :search
|
24
29
|
begin
|
25
30
|
parser = GetoptLong.new
|
26
31
|
options = [
|
32
|
+
[ '--algorithm', '-a', GetoptLong::REQUIRED_ARGUMENT ],
|
27
33
|
[ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
|
28
34
|
[ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
|
29
35
|
[ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
|
@@ -33,6 +39,8 @@ begin
|
|
33
39
|
parser.each_option do |name, arg|
|
34
40
|
name = name.sub(/^--/, '')
|
35
41
|
case name
|
42
|
+
when 'algorithm'
|
43
|
+
$algorithm = arg
|
36
44
|
when 'distance'
|
37
45
|
$distance = arg.to_f
|
38
46
|
when 'relative'
|
@@ -48,7 +56,7 @@ rescue
|
|
48
56
|
end
|
49
57
|
pattern = ARGV.shift or usage('Pattern needed!', options)
|
50
58
|
|
51
|
-
matcher = Amatch
|
59
|
+
matcher = Amatch.const_get($algorithm).new(pattern)
|
52
60
|
size = 0
|
53
61
|
start = Time.new
|
54
62
|
if ARGV.size > 0 then
|
@@ -56,9 +64,12 @@ if ARGV.size > 0 then
|
|
56
64
|
File.stat(filename).file? or next
|
57
65
|
size += File.size(filename)
|
58
66
|
begin
|
59
|
-
File.open(filename, 'r').each_line do |
|
60
|
-
|
61
|
-
|
67
|
+
File.open(filename, 'r').each_line.each_slice(1000) do |lines|
|
68
|
+
results = matcher.__send__($mode, lines)
|
69
|
+
lines.zip(results) do |line, r|
|
70
|
+
if r <= $distance
|
71
|
+
puts "#{filename}:#{line}"
|
72
|
+
end
|
62
73
|
end
|
63
74
|
end
|
64
75
|
rescue
|
@@ -66,10 +77,13 @@ if ARGV.size > 0 then
|
|
66
77
|
end
|
67
78
|
end
|
68
79
|
else
|
69
|
-
STDIN.each_line do |
|
70
|
-
size +=
|
71
|
-
|
72
|
-
|
80
|
+
STDIN.each_line.each_slice(1000) do |lines|
|
81
|
+
size += lines.size
|
82
|
+
results = matcher.__send__($mode, lines)
|
83
|
+
lines.zip(results) do |line, r|
|
84
|
+
if r <= $distance
|
85
|
+
puts line
|
86
|
+
end
|
73
87
|
end
|
74
88
|
end
|
75
89
|
end
|
data/bin/dupfind
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'tins/go'
|
4
|
+
include Tins::GO
|
5
|
+
require 'tins/minimize'
|
6
|
+
class Array
|
7
|
+
include Tins::Minimize
|
8
|
+
end
|
9
|
+
require 'amatch'
|
10
|
+
begin
|
11
|
+
require 'infobar'
|
12
|
+
rescue LoadError
|
13
|
+
warn "Please install gem infobar to run this executable!"
|
14
|
+
exit 1
|
15
|
+
end
|
16
|
+
|
17
|
+
def usage
|
18
|
+
puts <<EOT
|
19
|
+
Usage: #{File.basename($0)} [OPTIONS] FILE
|
20
|
+
|
21
|
+
-a ALGO Amatch matching algorithm
|
22
|
+
-p LIMIT at least LIMIT similarity to be a match
|
23
|
+
-R NUMBER skip NUMBER mismatches for building ranges
|
24
|
+
-r NUMBER minimum length to be counted as a range
|
25
|
+
-i compute a PNG per file
|
26
|
+
|
27
|
+
Report bugs to <flori@ping.de>.
|
28
|
+
EOT
|
29
|
+
exit 0
|
30
|
+
end
|
31
|
+
|
32
|
+
class FindDuplicates
|
33
|
+
def initialize(algo, p_lim, filename)
|
34
|
+
@algo, @p_lim, @filename = algo, p_lim, filename
|
35
|
+
end
|
36
|
+
|
37
|
+
attr_reader :filename
|
38
|
+
|
39
|
+
attr_reader :algo
|
40
|
+
|
41
|
+
attr_reader :p_lim
|
42
|
+
|
43
|
+
memoize method:
|
44
|
+
def lines
|
45
|
+
File.readlines(filename)
|
46
|
+
end
|
47
|
+
|
48
|
+
memoize method:
|
49
|
+
def matrix
|
50
|
+
result = lines.with_infobar(label: filename, output: STDERR).map do |l1|
|
51
|
+
+infobar
|
52
|
+
a = algo.new(l1)
|
53
|
+
r = a.similar(lines)
|
54
|
+
r.map! { |s| s >= p_lim ? ?1 : ?0 }
|
55
|
+
r.join
|
56
|
+
end
|
57
|
+
infobar.finish
|
58
|
+
infobar.newline
|
59
|
+
result
|
60
|
+
end
|
61
|
+
|
62
|
+
def pbm(output: $>)
|
63
|
+
output << <<HEADER
|
64
|
+
P1
|
65
|
+
#{matrix.size} #{matrix.size}
|
66
|
+
HEADER
|
67
|
+
output << matrix.map { |line| line.each_char.to_a * ' ' } * ?\n
|
68
|
+
self
|
69
|
+
end
|
70
|
+
|
71
|
+
def png(output: $>)
|
72
|
+
IO.popen("pnmtopng", 'w+') do |conv|
|
73
|
+
pbm(output: conv)
|
74
|
+
conv.close_write
|
75
|
+
output.write(conv.read)
|
76
|
+
end
|
77
|
+
self
|
78
|
+
end
|
79
|
+
|
80
|
+
def create_image
|
81
|
+
suffix = Regexp.quote(File.extname(filename))
|
82
|
+
f = filename.sub(/(#{suffix}|)\z/, '.png')
|
83
|
+
File.open(f, 'wb') do |output|
|
84
|
+
png(output: output)
|
85
|
+
infobar.puts "Writing output to #{f.inspect}."
|
86
|
+
end
|
87
|
+
self
|
88
|
+
end
|
89
|
+
|
90
|
+
def similar_ranges(min_range: 3, skip_range: 0)
|
91
|
+
set = 0
|
92
|
+
ranges = { set => [] }
|
93
|
+
m = matrix
|
94
|
+
n = m.size
|
95
|
+
skip_count = 0
|
96
|
+
n.downto(1) do |h|
|
97
|
+
(n - h + 1).upto(n - 1) do |k|
|
98
|
+
i = k
|
99
|
+
j = k - (n - h + 1)
|
100
|
+
if m[i][j] == ?1
|
101
|
+
skip_count = 0
|
102
|
+
ranges[set] << [ i, j ]
|
103
|
+
elsif !ranges[set].empty? && skip_count < skip_range
|
104
|
+
skip_count += 1
|
105
|
+
else
|
106
|
+
skip_count = 0
|
107
|
+
ranges[set].empty? or ranges[set += 1] = []
|
108
|
+
end
|
109
|
+
end
|
110
|
+
skip_count = 0
|
111
|
+
ranges[set].empty? or ranges[set += 1] = []
|
112
|
+
end
|
113
|
+
ranges.each { |_, r|
|
114
|
+
r.flatten!
|
115
|
+
r.sort!
|
116
|
+
r.map! { |x| x + 1 }
|
117
|
+
r.minimize!
|
118
|
+
r.reject! { |s| s.size < min_range }
|
119
|
+
}.reject! { |_, r| r.empty? }
|
120
|
+
unions = []
|
121
|
+
while !ranges.empty?
|
122
|
+
_, r = ranges.first
|
123
|
+
equivalent = ranges.reject { |_, v| (v & r).empty? }
|
124
|
+
unions << equivalent.values.flatten.uniq
|
125
|
+
ranges.delete_if { |k, _| equivalent.keys.include?(k) }
|
126
|
+
end
|
127
|
+
unions.each do |r|
|
128
|
+
r.map! do |x|
|
129
|
+
"#{filename}:#{x.begin}-#{x.end}"
|
130
|
+
end
|
131
|
+
end
|
132
|
+
unions
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
opts = go 'a:p:R:r:ih'
|
137
|
+
|
138
|
+
usage if opts[?h]
|
139
|
+
algo = Amatch.const_get(opts[?a] || 'Levenshtein')
|
140
|
+
p_lim = (opts[?p] || 0.95).to_f
|
141
|
+
min_range = (opts[?r] || 3).to_i
|
142
|
+
skip_range = opts[?R].to_i
|
143
|
+
ARGV.empty? and usage
|
144
|
+
|
145
|
+
filenames = ARGV.inject([]) { |s, f| s.concat(Dir[f]) }
|
146
|
+
for filename in filenames
|
147
|
+
finder = FindDuplicates.new(algo, p_lim, filename)
|
148
|
+
opts[?i] and finder.create_image
|
149
|
+
for s in finder.similar_ranges(min_range: min_range, skip_range: skip_range)
|
150
|
+
infobar.reset
|
151
|
+
puts s, ?\n
|
152
|
+
end
|
153
|
+
end
|