amatch 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGES +7 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +9 -14
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +313 -91
- data/ext/pair.c +3 -1
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +33 -37
- data/.gitignore +0 -6
- data/.travis.yml +0 -10
- data/README.rdoc +0 -128
- data/VERSION +0 -1
data/README.md
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# amatch - Approximate Matching Extension for Ruby
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
This is a collection of classes that can be used for Approximate
|
6
|
+
matching, searching, and comparing of Strings. They implement algorithms
|
7
|
+
that compute the Levenshtein edit distance, Sellers edit distance, the
|
8
|
+
Hamming distance, the longest common subsequence length, the longest common
|
9
|
+
substring length, the pair distance metric, and the Jaro-Winkler metric.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
To install this extension as a gem type
|
14
|
+
|
15
|
+
# gem install amatch
|
16
|
+
|
17
|
+
into the shell.
|
18
|
+
|
19
|
+
## Download
|
20
|
+
|
21
|
+
The homepage of this library is located at
|
22
|
+
|
23
|
+
* https://github.com/flori/amatch
|
24
|
+
|
25
|
+
## Examples
|
26
|
+
|
27
|
+
require 'amatch'
|
28
|
+
# => true
|
29
|
+
include Amatch
|
30
|
+
# => Object
|
31
|
+
|
32
|
+
m = Sellers.new("pattern")
|
33
|
+
# => #<Amatch::Sellers:0x40366324>
|
34
|
+
m.match("pattren")
|
35
|
+
# => 2.0
|
36
|
+
m.substitution = m.insertion = 3
|
37
|
+
# => 3
|
38
|
+
m.match("pattren")
|
39
|
+
# => 4.0
|
40
|
+
m.reset_weights
|
41
|
+
# => #<Amatch::Sellers:0x40366324>
|
42
|
+
m.match(["pattren","parent"])
|
43
|
+
# => [2.0, 4.0]
|
44
|
+
m.search("abcpattrendef")
|
45
|
+
# => 2.0
|
46
|
+
|
47
|
+
m = Levenshtein.new("pattern")
|
48
|
+
# => #<Amatch::Levenshtein:0x4035919c>
|
49
|
+
m.match("pattren")
|
50
|
+
# => 2
|
51
|
+
m.search("abcpattrendef")
|
52
|
+
# => 2
|
53
|
+
"pattern language".levenshtein_similar("language of patterns")
|
54
|
+
# => 0.2
|
55
|
+
|
56
|
+
m = Amatch::DamerauLevenshtein.new("pattern")
|
57
|
+
# => #<Amatch::DamerauLevenshtein:0x007fc3483dd278>
|
58
|
+
m.match("pattren")
|
59
|
+
# => 1
|
60
|
+
"pattern language".damerau_levenshtein_similar("language of patterns")
|
61
|
+
# => 0.19999999999999996
|
62
|
+
|
63
|
+
m = Hamming.new("pattern")
|
64
|
+
# => #<Amatch::Hamming:0x40350858>
|
65
|
+
m.match("pattren")
|
66
|
+
# => 2
|
67
|
+
"pattern language".hamming_similar("language of patterns")
|
68
|
+
# => 0.1
|
69
|
+
|
70
|
+
m = PairDistance.new("pattern")
|
71
|
+
# => #<Amatch::PairDistance:0x40349be8>
|
72
|
+
m.match("pattr en")
|
73
|
+
# => 0.545454545454545
|
74
|
+
m.match("pattr en", nil)
|
75
|
+
# => 0.461538461538462
|
76
|
+
m.match("pattr en", /t+/)
|
77
|
+
# => 0.285714285714286
|
78
|
+
"pattern language".pair_distance_similar("language of patterns")
|
79
|
+
# => 0.928571428571429
|
80
|
+
|
81
|
+
m = LongestSubsequence.new("pattern")
|
82
|
+
# => #<Amatch::LongestSubsequence:0x4033e900>
|
83
|
+
m.match("pattren")
|
84
|
+
# => 6
|
85
|
+
"pattern language".longest_subsequence_similar("language of patterns")
|
86
|
+
# => 0.4
|
87
|
+
|
88
|
+
m = LongestSubstring.new("pattern")
|
89
|
+
# => #<Amatch::LongestSubstring:0x403378d0>
|
90
|
+
m.match("pattren")
|
91
|
+
# => 4
|
92
|
+
"pattern language".longest_substring_similar("language of patterns")
|
93
|
+
# => 0.4
|
94
|
+
|
95
|
+
m = Jaro.new("pattern")
|
96
|
+
# => #<Amatch::Jaro:0x363b70>
|
97
|
+
m.match("paTTren")
|
98
|
+
# => 0.952380952380952
|
99
|
+
m.ignore_case = false
|
100
|
+
m.match("paTTren")
|
101
|
+
# => 0.742857142857143
|
102
|
+
"pattern language".jaro_similar("language of patterns")
|
103
|
+
# => 0.672222222222222
|
104
|
+
|
105
|
+
m = JaroWinkler.new("pattern")
|
106
|
+
# => #<Amatch::JaroWinkler:0x3530b8>
|
107
|
+
m.match("paTTren")
|
108
|
+
# => 0.971428571712403
|
109
|
+
m.ignore_case = false
|
110
|
+
m.match("paTTren")
|
111
|
+
# => 0.79428571505206
|
112
|
+
m.scaling_factor = 0.05
|
113
|
+
m.match("pattren")
|
114
|
+
# => 0.961904762046678
|
115
|
+
"pattern language".jarowinkler_similar("language of patterns")
|
116
|
+
# => 0.672222222222222
|
117
|
+
|
118
|
+
## Author
|
119
|
+
|
120
|
+
[Florian Frank](mailto:flori@ping.de)
|
121
|
+
|
122
|
+
## License
|
123
|
+
|
124
|
+
Apache License, Version 2.0 – See the COPYING file in the source archive.
|
data/Rakefile
CHANGED
@@ -13,23 +13,18 @@ Amatch is a library for approximate string matching and searching in strings.
|
|
13
13
|
Several algorithms can be used to do this, and it's also possible to compute a
|
14
14
|
similarity metric number between 0.0 and 1.0 for two given strings.
|
15
15
|
EOT
|
16
|
-
executables << 'agrep
|
16
|
+
executables << 'agrep' << 'dupfind'
|
17
17
|
bindir 'bin'
|
18
18
|
test_dir 'tests'
|
19
|
-
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', '
|
19
|
+
ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', 'Makefile'
|
20
|
+
package_ignore '.all_images.yml', '.gitignore', 'VERSION'
|
20
21
|
title "#{name.camelize} - Approximate Matching"
|
21
|
-
readme 'README.
|
22
|
+
readme 'README.md'
|
22
23
|
require_paths %w[lib ext]
|
23
24
|
dependency 'tins', '~>1.0'
|
24
|
-
|
25
|
-
development_dependency '
|
26
|
-
development_dependency '
|
27
|
-
|
28
|
-
|
29
|
-
libdir = CONFIG["sitelibdir"]
|
30
|
-
src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
|
31
|
-
install(src, File.join(libdir, File.basename(src)), :verbose => true)
|
32
|
-
mkdir_p dst = File.join(libdir, 'amatch')
|
33
|
-
install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
|
34
|
-
end
|
25
|
+
dependency 'mize'
|
26
|
+
development_dependency 'test-unit', '~>3.0'
|
27
|
+
development_dependency 'all_images'
|
28
|
+
required_ruby_version '>=2.4'
|
29
|
+
licenses << 'Apache-2.0'
|
35
30
|
end
|
data/amatch.gemspec
CHANGED
Binary file
|
data/bin/{agrep.rb → agrep}
RENAMED
@@ -15,15 +15,21 @@ end
|
|
15
15
|
|
16
16
|
# Reopens Amatch::Levenshtein to add a search variant whose result is scaled
# by the length of the pattern.
class Amatch::Levenshtein
  # Runs #search on +strings+ and divides every resulting distance by
  # pattern.size. Returns an Array of Floats when +strings+ is an Array,
  # otherwise a single Float.
  def search_relative(strings)
    distances = search(strings)
    case strings
    when Array
      distances.map { |distance| distance.to_f / pattern.size }
    else
      distances.to_f / pattern.size
    end
  end
end
|
21
25
|
|
26
|
+
$algorithm = 'Levenshtein'
|
22
27
|
$distance = 1
|
23
28
|
$mode = :search
|
24
29
|
begin
|
25
30
|
parser = GetoptLong.new
|
26
31
|
options = [
|
32
|
+
[ '--algorithm', '-a', GetoptLong::REQUIRED_ARGUMENT ],
|
27
33
|
[ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
|
28
34
|
[ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
|
29
35
|
[ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
|
@@ -33,6 +39,8 @@ begin
|
|
33
39
|
parser.each_option do |name, arg|
|
34
40
|
name = name.sub(/^--/, '')
|
35
41
|
case name
|
42
|
+
when 'algorithm'
|
43
|
+
$algorithm = arg
|
36
44
|
when 'distance'
|
37
45
|
$distance = arg.to_f
|
38
46
|
when 'relative'
|
@@ -48,7 +56,7 @@ rescue
|
|
48
56
|
end
|
49
57
|
pattern = ARGV.shift or usage('Pattern needed!', options)
|
50
58
|
|
51
|
-
matcher = Amatch
|
59
|
+
matcher = Amatch.const_get($algorithm).new(pattern)
|
52
60
|
size = 0
|
53
61
|
start = Time.new
|
54
62
|
if ARGV.size > 0 then
|
@@ -56,9 +64,12 @@ if ARGV.size > 0 then
|
|
56
64
|
File.stat(filename).file? or next
|
57
65
|
size += File.size(filename)
|
58
66
|
begin
|
59
|
-
File.open(filename, 'r').each_line do |
|
60
|
-
|
61
|
-
|
67
|
+
File.open(filename, 'r').each_line.each_slice(1000) do |lines|
|
68
|
+
results = matcher.__send__($mode, lines)
|
69
|
+
lines.zip(results) do |line, r|
|
70
|
+
if r <= $distance
|
71
|
+
puts "#{filename}:#{line}"
|
72
|
+
end
|
62
73
|
end
|
63
74
|
end
|
64
75
|
rescue
|
@@ -66,10 +77,13 @@ if ARGV.size > 0 then
|
|
66
77
|
end
|
67
78
|
end
|
68
79
|
else
|
69
|
-
STDIN.each_line do |
|
70
|
-
size +=
|
71
|
-
|
72
|
-
|
80
|
+
STDIN.each_line.each_slice(1000) do |lines|
|
81
|
+
size += lines.size
|
82
|
+
results = matcher.__send__($mode, lines)
|
83
|
+
lines.zip(results) do |line, r|
|
84
|
+
if r <= $distance
|
85
|
+
puts line
|
86
|
+
end
|
73
87
|
end
|
74
88
|
end
|
75
89
|
end
|
data/bin/dupfind
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'tins/go'
|
4
|
+
include Tins::GO
|
5
|
+
require 'tins/minimize'
|
6
|
+
class Array
|
7
|
+
include Tins::Minimize
|
8
|
+
end
|
9
|
+
require 'amatch'
|
10
|
+
begin
|
11
|
+
require 'infobar'
|
12
|
+
rescue LoadError
|
13
|
+
warn "Please install gem infobar to run this executable!"
|
14
|
+
exit 1
|
15
|
+
end
|
16
|
+
|
17
|
+
# Prints the command line help of this executable to standard output and
# terminates the process with exit status 0.
def usage
  puts <<EOT
Usage: #{File.basename($0)} [OPTIONS] FILE

  -a ALGO    Amatch matching algorithm
  -p LIMIT   more than p similarity to be a match
  -R NUMBER  skip NUMBER mismatch for building ranges
  -r NUMBER  minimum length to be counted as a range
  -i         compute a PNG per file
  -h         display this help

Report bugs to <flori@ping.de>.
EOT
  exit 0
end
|
31
|
+
|
32
|
+
# Detects groups of similar line ranges inside a single file.
#
# Every line of the file is compared with every line of the same file using
# the configured matcher class. The outcome is held as a square matrix of
# "1"/"0" flags (one String per line) which can be written out as a bitmap
# or scanned for diagonal runs of matches.
class FindDuplicates
  # algo::     matcher class; it is instantiated with algo.new(line) and must
  #            answer #similar(lines)
  # p_lim::    similarity threshold, values >= p_lim are counted as a match
  # filename:: path of the file to analyse
  def initialize(algo, p_lim, filename)
    @algo, @p_lim, @filename = algo, p_lim, filename
  end

  attr_reader :filename

  attr_reader :algo

  attr_reader :p_lim

  # All lines of the file, read with File.readlines.
  # NOTE(review): +memoize+ is defined outside this file; presumably it caches
  # the result per instance so the file is read only once - confirm.
  memoize method:
  def lines
    File.readlines(filename)
  end

  # Builds the comparison matrix: one String per line of the file, holding a
  # "1" for every line whose similarity is >= p_lim and a "0" otherwise.
  # Progress is reported through infobar on STDERR.
  memoize method:
  def matrix
    result = lines.with_infobar(label: filename, output: STDERR).map do |l1|
      +infobar # NOTE(review): presumably advances the progress display - confirm
      a = algo.new(l1)
      r = a.similar(lines)
      r.map! { |s| s >= p_lim ? ?1 : ?0 }
      r.join
    end
    infobar.finish
    infobar.newline
    result
  end

  # Appends the matrix to +output+ as a plain PBM ("P1") image: a header with
  # the width and height (both matrix.size) followed by one row per line with
  # the flags separated by blanks. Returns self.
  def pbm(output: $>)
    output << <<HEADER
P1
#{matrix.size} #{matrix.size}
HEADER
    output << matrix.map { |line| line.each_char.to_a * ' ' } * ?\n
    self
  end

  # Feeds the PBM image into the external +pnmtopng+ command and writes
  # whatever that command prints to +output+. Returns self.
  def png(output: $>)
    IO.popen("pnmtopng", 'w+') do |conv|
      pbm(output: conv)
      conv.close_write # signal end of input before reading the result
      output.write(conv.read)
    end
    self
  end

  # Writes the PNG rendering next to the analysed file. The file extension
  # (if there is one) is replaced by ".png", otherwise ".png" is appended.
  # Returns self.
  def create_image
    suffix = Regexp.quote(File.extname(filename))
    f = filename.sub(/(#{suffix}|)\z/, '.png')
    File.open(f, 'wb') do |output|
      png(output: output)
      infobar.puts "Writing output to #{f.inspect}."
    end
    self
  end

  # Scans the diagonals below the main diagonal of the matrix for runs of
  # matching cells and returns the groups of line ranges they cover.
  #
  # min_range::  ranges spanning fewer lines than this are dropped
  # skip_range:: number of consecutive mismatches tolerated inside a run
  #
  # Returns an Array of groups; each group is an Array of Strings of the form
  # "filename:first-last" with 1-based line numbers.
  def similar_ranges(min_range: 3, skip_range: 0)
    set = 0
    ranges = { set => [] }
    m = matrix
    n = m.size
    skip_count = 0
    # (n - h + 1) is the distance of the current diagonal from the main
    # diagonal; it grows from 1 to n while h counts down.
    n.downto(1) do |h|
      (n - h + 1).upto(n - 1) do |k|
        i = k
        j = k - (n - h + 1)
        if m[i][j] == ?1
          skip_count = 0
          ranges[set] << [ i, j ]
        elsif !ranges[set].empty? && skip_count < skip_range
          skip_count += 1
        else
          skip_count = 0
          # close the current run, but only open a new set if it has content
          ranges[set].empty? or ranges[set += 1] = []
        end
      end
      # a run never continues from one diagonal to the next
      skip_count = 0
      ranges[set].empty? or ranges[set += 1] = []
    end
    ranges.each { |_, r|
      r.flatten!
      r.sort!
      r.map! { |x| x + 1 } # 0-based matrix indices -> 1-based line numbers
      r.minimize!          # provided by Tins::Minimize (mixed into Array)
      r.reject! { |s| s.size < min_range }
    }.reject! { |_, r| r.empty? }
    # Merge all sets that share at least one range with the first remaining
    # set into a single group, until no set is left.
    unions = []
    until ranges.empty?
      _, r = ranges.first
      equivalent = ranges.reject { |_, v| (v & r).empty? }
      unions << equivalent.values.flatten.uniq
      # Hash#key? avoids rebuilding the key array for every entry
      ranges.delete_if { |k, _| equivalent.key?(k) }
    end
    unions.each do |r|
      r.map! do |x|
        "#{filename}:#{x.begin}-#{x.end}"
      end
    end
    unions
  end
end
|
135
|
+
|
136
|
+
# Command line driver: parse the options, expand the file patterns given as
# arguments and print the groups of similar line ranges for every file.
opts = go 'a:p:R:r:ih'

opts[?h] and usage

# Option values with their fallbacks when the option was not given.
algo_name  = opts[?a] || 'Levenshtein'
limit      = opts[?p] || 0.95
min_length = opts[?r] || 3

algo       = Amatch.const_get(algo_name)
p_lim      = limit.to_f
min_range  = min_length.to_i
skip_range = opts[?R].to_i

usage if ARGV.empty?

# Every argument is treated as a glob pattern.
filenames = ARGV.flat_map { |pattern| Dir[pattern] }

filenames.each do |path|
  duplicates = FindDuplicates.new(algo, p_lim, path)
  duplicates.create_image if opts[?i]
  groups = duplicates.similar_ranges(min_range: min_range, skip_range: skip_range)
  groups.each do |group|
    infobar.reset
    puts group, "\n"
  end
end
|