amatch 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,124 @@
1
+ # amatch - Approximate Matching Extension for Ruby
2
+
3
+ ## Description
4
+
5
+ This is a collection of classes that can be used for approximate
6
+ matching, searching, and comparing of Strings. They implement algorithms
7
+ that compute the Levenshtein edit distance, Sellers edit distance, the
8
+ Hamming distance, the longest common subsequence length, the longest common
9
+ substring length, the pair distance metric, and the Jaro-Winkler metric.
10
+
11
+ ## Installation
12
+
13
+ To install this extension as a gem type
14
+
15
+ # gem install amatch
16
+
17
+ into the shell.
18
+
19
+ ## Download
20
+
21
+ The homepage of this library is located at
22
+
23
+ * https://github.com/flori/amatch
24
+
25
+ ## Examples
26
+
27
+ require 'amatch'
28
+ # => true
29
+ include Amatch
30
+ # => Object
31
+
32
+ m = Sellers.new("pattern")
33
+ # => #<Amatch::Sellers:0x40366324>
34
+ m.match("pattren")
35
+ # => 2.0
36
+ m.substitution = m.insertion = 3
37
+ # => 3
38
+ m.match("pattren")
39
+ # => 4.0
40
+ m.reset_weights
41
+ # => #<Amatch::Sellers:0x40366324>
42
+ m.match(["pattren","parent"])
43
+ # => [2.0, 4.0]
44
+ m.search("abcpattrendef")
45
+ # => 2.0
46
+
47
+ m = Levenshtein.new("pattern")
48
+ # => #<Amatch::Levenshtein:0x4035919c>
49
+ m.match("pattren")
50
+ # => 2
51
+ m.search("abcpattrendef")
52
+ # => 2
53
+ "pattern language".levenshtein_similar("language of patterns")
54
+ # => 0.2
55
+
56
+ m = Amatch::DamerauLevenshtein.new("pattern")
57
+ # => #<Amatch::DamerauLevenshtein:0x007fc3483dd278>
58
+ m.match("pattren")
59
+ # => 1
60
+ "pattern language".damerau_levenshtein_similar("language of patterns")
61
+ # => 0.19999999999999996
62
+
63
+ m = Hamming.new("pattern")
64
+ # => #<Amatch::Hamming:0x40350858>
65
+ m.match("pattren")
66
+ # => 2
67
+ "pattern language".hamming_similar("language of patterns")
68
+ # => 0.1
69
+
70
+ m = PairDistance.new("pattern")
71
+ # => #<Amatch::PairDistance:0x40349be8>
72
+ m.match("pattr en")
73
+ # => 0.545454545454545
74
+ m.match("pattr en", nil)
75
+ # => 0.461538461538462
76
+ m.match("pattr en", /t+/)
77
+ # => 0.285714285714286
78
+ "pattern language".pair_distance_similar("language of patterns")
79
+ # => 0.928571428571429
80
+
81
+ m = LongestSubsequence.new("pattern")
82
+ # => #<Amatch::LongestSubsequence:0x4033e900>
83
+ m.match("pattren")
84
+ # => 6
85
+ "pattern language".longest_subsequence_similar("language of patterns")
86
+ # => 0.4
87
+
88
+ m = LongestSubstring.new("pattern")
89
+ # => #<Amatch::LongestSubstring:0x403378d0>
90
+ m.match("pattren")
91
+ # => 4
92
+ "pattern language".longest_substring_similar("language of patterns")
93
+ # => 0.4
94
+
95
+ m = Jaro.new("pattern")
96
+ # => #<Amatch::Jaro:0x363b70>
97
+ m.match("paTTren")
98
+ # => 0.952380952380952
99
+ m.ignore_case = false
100
+ m.match("paTTren")
101
+ # => 0.742857142857143
102
+ "pattern language".jaro_similar("language of patterns")
103
+ # => 0.672222222222222
104
+
105
+ m = JaroWinkler.new("pattern")
106
+ # => #<Amatch::JaroWinkler:0x3530b8>
107
+ m.match("paTTren")
108
+ # => 0.971428571712403
109
+ m.ignore_case = false
110
+ m.match("paTTren")
111
+ # => 0.79428571505206
112
+ m.scaling_factor = 0.05
113
+ m.match("pattren")
114
+ # => 0.961904762046678
115
+ "pattern language".jarowinkler_similar("language of patterns")
116
+ # => 0.672222222222222
117
+
118
+ ## Author
119
+
120
+ Florian Frank mailto:flori@ping.de
121
+
122
+ ## License
123
+
124
+ Apache License, Version 2.0 – See the COPYING file in the source archive.
data/Rakefile CHANGED
@@ -13,23 +13,18 @@ Amatch is a library for approximate string matching and searching in strings.
13
13
  Several algorithms can be used to do this, and it's also possible to compute a
14
14
  similarity metric number between 0.0 and 1.0 for two given strings.
15
15
  EOT
16
- executables << 'agrep.rb'
16
+ executables << 'agrep' << 'dupfind'
17
17
  bindir 'bin'
18
18
  test_dir 'tests'
19
- ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', '.bundle'
19
+ ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', 'Makefile'
20
+ package_ignore '.all_images.yml', '.gitignore', 'VERSION'
20
21
  title "#{name.camelize} - Approximate Matching"
21
- readme 'README.rdoc'
22
+ readme 'README.md'
22
23
  require_paths %w[lib ext]
23
24
  dependency 'tins', '~>1.0'
24
- development_dependency 'test-unit', '~>2.3'
25
- development_dependency 'utils'
26
- development_dependency 'rake', '~>10', '<11.0'
27
-
28
- install_library do
29
- libdir = CONFIG["sitelibdir"]
30
- src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
31
- install(src, File.join(libdir, File.basename(src)), :verbose => true)
32
- mkdir_p dst = File.join(libdir, 'amatch')
33
- install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
34
- end
25
+ dependency 'mize'
26
+ development_dependency 'test-unit', '~>3.0'
27
+ development_dependency 'all_images'
28
+ required_ruby_version '>=2.4'
29
+ licenses << 'Apache-2.0'
35
30
  end
data/amatch.gemspec CHANGED
Binary file
@@ -15,15 +15,21 @@ end
15
15
 
16
16
  class Amatch::Levenshtein
17
17
  def search_relative(strings)
18
- search(strings).to_f / pattern.size
18
+ if Array === strings
19
+ search(strings).map { |s| s.to_f / pattern.size }
20
+ else
21
+ search(strings).to_f / pattern.size
22
+ end
19
23
  end
20
24
  end
21
25
 
26
+ $algorithm = 'Levenshtein'
22
27
  $distance = 1
23
28
  $mode = :search
24
29
  begin
25
30
  parser = GetoptLong.new
26
31
  options = [
32
+ [ '--algorithm', '-a', GetoptLong::REQUIRED_ARGUMENT ],
27
33
  [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
28
34
  [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
29
35
  [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
@@ -33,6 +39,8 @@ begin
33
39
  parser.each_option do |name, arg|
34
40
  name = name.sub(/^--/, '')
35
41
  case name
42
+ when 'algorithm'
43
+ $algorithm = arg
36
44
  when 'distance'
37
45
  $distance = arg.to_f
38
46
  when 'relative'
@@ -48,7 +56,7 @@ rescue
48
56
  end
49
57
  pattern = ARGV.shift or usage('Pattern needed!', options)
50
58
 
51
- matcher = Amatch::Levenshtein.new(pattern)
59
+ matcher = Amatch.const_get($algorithm).new(pattern)
52
60
  size = 0
53
61
  start = Time.new
54
62
  if ARGV.size > 0 then
@@ -56,9 +64,12 @@ if ARGV.size > 0 then
56
64
  File.stat(filename).file? or next
57
65
  size += File.size(filename)
58
66
  begin
59
- File.open(filename, 'r').each_line do |line|
60
- if matcher.__send__($mode, line) <= $distance
61
- puts "#{filename}:#{line}"
67
+ File.open(filename, 'r').each_line.each_slice(1000) do |lines|
68
+ results = matcher.__send__($mode, lines)
69
+ lines.zip(results) do |line, r|
70
+ if r <= $distance
71
+ puts "#{filename}:#{line}"
72
+ end
62
73
  end
63
74
  end
64
75
  rescue
@@ -66,10 +77,13 @@ if ARGV.size > 0 then
66
77
  end
67
78
  end
68
79
  else
69
- STDIN.each_line do |line|
70
- size += line.size
71
- if matcher.__send__($mode, line) <= $distance
72
- puts line
80
+ STDIN.each_line.each_slice(1000) do |lines|
81
+ size += lines.size
82
+ results = matcher.__send__($mode, lines)
83
+ lines.zip(results) do |line, r|
84
+ if r <= $distance
85
+ puts line
86
+ end
73
87
  end
74
88
  end
75
89
  end
data/bin/dupfind ADDED
@@ -0,0 +1,153 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'tins/go'
4
+ include Tins::GO
5
+ require 'tins/minimize'
6
+ class Array
7
+ include Tins::Minimize
8
+ end
9
+ require 'amatch'
10
+ begin
11
+ require 'infobar'
12
+ rescue LoadError
13
+ warn "Please install gem infobar to run this executable!"
14
+ exit 1
15
+ end
16
+
17
+ def usage
18
+ puts <<EOT
19
+ Usage: #{File.basename($0)} [OPTIONS] FILE
20
+
21
+ -a ALGO Amatch matching algorithm
22
+ -p LIMIT more than p similarity to be a match
23
+ -R NUMBER skip NUMBER mismatch for building ranges
24
+ -r NUMBER minimum length to be counted as a range
25
+ -i compute a PNG per file
26
+
27
+ Report bugs to <flori@ping.de>.
28
+ EOT
29
+ exit 0
30
+ end
31
+
32
+ class FindDuplicates
33
+ def initialize(algo, p_lim, filename)
34
+ @algo, @p_lim, @filename = algo, p_lim, filename
35
+ end
36
+
37
+ attr_reader :filename
38
+
39
+ attr_reader :algo
40
+
41
+ attr_reader :p_lim
42
+
43
+ memoize method:
44
+ def lines
45
+ File.readlines(filename)
46
+ end
47
+
48
+ memoize method:
49
+ def matrix
50
+ result = lines.with_infobar(label: filename, output: STDERR).map do |l1|
51
+ +infobar
52
+ a = algo.new(l1)
53
+ r = a.similar(lines)
54
+ r.map! { |s| s >= p_lim ? ?1 : ?0 }
55
+ r.join
56
+ end
57
+ infobar.finish
58
+ infobar.newline
59
+ result
60
+ end
61
+
62
+ def pbm(output: $>)
63
+ output << <<HEADER
64
+ P1
65
+ #{matrix.size} #{matrix.size}
66
+ HEADER
67
+ output << matrix.map { |line| line.each_char.to_a * ' ' } * ?\n
68
+ self
69
+ end
70
+
71
+ def png(output: $>)
72
+ IO.popen("pnmtopng", 'w+') do |conv|
73
+ pbm(output: conv)
74
+ conv.close_write
75
+ output.write(conv.read)
76
+ end
77
+ self
78
+ end
79
+
80
+ def create_image
81
+ suffix = Regexp.quote(File.extname(filename))
82
+ f = filename.sub(/(#{suffix}|)\z/, '.png')
83
+ File.open(f, 'wb') do |output|
84
+ png(output: output)
85
+ infobar.puts "Writing output to #{f.inspect}."
86
+ end
87
+ self
88
+ end
89
+
90
+ def similar_ranges(min_range: 3, skip_range: 0)
91
+ set = 0
92
+ ranges = { set => [] }
93
+ m = matrix
94
+ n = m.size
95
+ skip_count = 0
96
+ n.downto(1) do |h|
97
+ (n - h + 1).upto(n - 1) do |k|
98
+ i = k
99
+ j = k - (n - h + 1)
100
+ if m[i][j] == ?1
101
+ skip_count = 0
102
+ ranges[set] << [ i, j ]
103
+ elsif !ranges[set].empty? && skip_count < skip_range
104
+ skip_count += 1
105
+ else
106
+ skip_count = 0
107
+ ranges[set].empty? or ranges[set += 1] = []
108
+ end
109
+ end
110
+ skip_count = 0
111
+ ranges[set].empty? or ranges[set += 1] = []
112
+ end
113
+ ranges.each { |_, r|
114
+ r.flatten!
115
+ r.sort!
116
+ r.map! { |x| x + 1 }
117
+ r.minimize!
118
+ r.reject! { |s| s.size < min_range }
119
+ }.reject! { |_, r| r.empty? }
120
+ unions = []
121
+ while !ranges.empty?
122
+ _, r = ranges.first
123
+ equivalent = ranges.reject { |_, v| (v & r).empty? }
124
+ unions << equivalent.values.flatten.uniq
125
+ ranges.delete_if { |k, _| equivalent.keys.include?(k) }
126
+ end
127
+ unions.each do |r|
128
+ r.map! do |x|
129
+ "#{filename}:#{x.begin}-#{x.end}"
130
+ end
131
+ end
132
+ unions
133
+ end
134
+ end
135
+
136
+ opts = go 'a:p:R:r:ih'
137
+
138
+ usage if opts[?h]
139
+ algo = Amatch.const_get(opts[?a] || 'Levenshtein')
140
+ p_lim = (opts[?p] || 0.95).to_f
141
+ min_range = (opts[?r] || 3).to_i
142
+ skip_range = opts[?R].to_i
143
+ ARGV.empty? and usage
144
+
145
+ filenames = ARGV.inject([]) { |s, f| s.concat(Dir[f]) }
146
+ for filename in filenames
147
+ finder = FindDuplicates.new(algo, p_lim, filename)
148
+ opts[?i] and finder.create_image
149
+ for s in finder.similar_ranges(min_range: min_range, skip_range: skip_range)
150
+ infobar.reset
151
+ puts s, ?\n
152
+ end
153
+ end