amatch 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,124 @@
1
+ # amatch - Approximate Matching Extension for Ruby
2
+
3
+ ## Description
4
+
5
+ This is a collection of classes that can be used for Approximate
6
+ matching, searching, and comparing of Strings. They implement algorithms
7
+ that compute the Levenshtein edit distance, Sellers edit distance, the
8
+ Hamming distance, the longest common subsequence length, the longest common
9
+ substring length, the pair distance metric, and the Jaro-Winkler metric.
10
+
11
+ ## Installation
12
+
13
+ To install this extension as a gem type
14
+
15
+ # gem install amatch
16
+
17
+ into the shell.
18
+
19
+ ## Download
20
+
21
+ The homepage of this library is located at
22
+
23
+ * https://github.com/flori/amatch
24
+
25
+ ## Examples
26
+
27
+ require 'amatch'
28
+ # => true
29
+ include Amatch
30
+ # => Object
31
+
32
+ m = Sellers.new("pattern")
33
+ # => #<Amatch::Sellers:0x40366324>
34
+ m.match("pattren")
35
+ # => 2.0
36
+ m.substitution = m.insertion = 3
37
+ # => 3
38
+ m.match("pattren")
39
+ # => 4.0
40
+ m.reset_weights
41
+ # => #<Amatch::Sellers:0x40366324>
42
+ m.match(["pattren","parent"])
43
+ # => [2.0, 4.0]
44
+ m.search("abcpattrendef")
45
+ # => 2.0
46
+
47
+ m = Levenshtein.new("pattern")
48
+ # => #<Amatch::Levenshtein:0x4035919c>
49
+ m.match("pattren")
50
+ # => 2
51
+ m.search("abcpattrendef")
52
+ # => 2
53
+ "pattern language".levenshtein_similar("language of patterns")
54
+ # => 0.2
55
+
56
+ m = Amatch::DamerauLevenshtein.new("pattern")
57
+ # => #<Amatch::DamerauLevenshtein:0x007fc3483dd278>
58
+ m.match("pattren")
59
+ # => 1
60
+ "pattern language".damerau_levenshtein_similar("language of patterns")
61
+ # => 0.19999999999999996
62
+
63
+ m = Hamming.new("pattern")
64
+ # => #<Amatch::Hamming:0x40350858>
65
+ m.match("pattren")
66
+ # => 2
67
+ "pattern language".hamming_similar("language of patterns")
68
+ # => 0.1
69
+
70
+ m = PairDistance.new("pattern")
71
+ # => #<Amatch::PairDistance:0x40349be8>
72
+ m.match("pattr en")
73
+ # => 0.545454545454545
74
+ m.match("pattr en", nil)
75
+ # => 0.461538461538462
76
+ m.match("pattr en", /t+/)
77
+ # => 0.285714285714286
78
+ "pattern language".pair_distance_similar("language of patterns")
79
+ # => 0.928571428571429
80
+
81
+ m = LongestSubsequence.new("pattern")
82
+ # => #<Amatch::LongestSubsequence:0x4033e900>
83
+ m.match("pattren")
84
+ # => 6
85
+ "pattern language".longest_subsequence_similar("language of patterns")
86
+ # => 0.4
87
+
88
+ m = LongestSubstring.new("pattern")
89
+ # => #<Amatch::LongestSubstring:0x403378d0>
90
+ m.match("pattren")
91
+ # => 4
92
+ "pattern language".longest_substring_similar("language of patterns")
93
+ # => 0.4
94
+
95
+ m = Jaro.new("pattern")
96
+ # => #<Amatch::Jaro:0x363b70>
97
+ m.match("paTTren")
98
+ # => 0.952380952380952
99
+ m.ignore_case = false
100
+ m.match("paTTren")
101
+ # => 0.742857142857143
102
+ "pattern language".jaro_similar("language of patterns")
103
+ # => 0.672222222222222
104
+
105
+ m = JaroWinkler.new("pattern")
106
+ # => #<Amatch::JaroWinkler:0x3530b8>
107
+ m.match("paTTren")
108
+ # => 0.971428571712403
109
+ m.ignore_case = false
110
+ m.match("paTTren")
111
+ # => 0.79428571505206
112
+ m.scaling_factor = 0.05
113
+ m.match("pattren")
114
+ # => 0.961904762046678
115
+ "pattern language".jarowinkler_similar("language of patterns")
116
+ # => 0.672222222222222
117
+
118
+ ## Author
119
+
120
+ [Florian Frank](mailto:flori@ping.de)
121
+
122
+ ## License
123
+
124
+ Apache License, Version 2.0 – See the COPYING file in the source archive.
data/Rakefile CHANGED
@@ -13,23 +13,18 @@ Amatch is a library for approximate string matching and searching in strings.
13
13
  Several algorithms can be used to do this, and it's also possible to compute a
14
14
  similarity metric number between 0.0 and 1.0 for two given strings.
15
15
  EOT
16
- executables << 'agrep.rb'
16
+ executables << 'agrep' << 'dupfind'
17
17
  bindir 'bin'
18
18
  test_dir 'tests'
19
- ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', '.bundle'
19
+ ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx', 'Makefile'
20
+ package_ignore '.all_images.yml', '.gitignore', 'VERSION'
20
21
  title "#{name.camelize} - Approximate Matching"
21
- readme 'README.rdoc'
22
+ readme 'README.md'
22
23
  require_paths %w[lib ext]
23
24
  dependency 'tins', '~>1.0'
24
- development_dependency 'test-unit', '~>2.3'
25
- development_dependency 'utils'
26
- development_dependency 'rake', '~>10', '<11.0'
27
-
28
- install_library do
29
- libdir = CONFIG["sitelibdir"]
30
- src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
31
- install(src, File.join(libdir, File.basename(src)), :verbose => true)
32
- mkdir_p dst = File.join(libdir, 'amatch')
33
- install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
34
- end
25
+ dependency 'mize'
26
+ development_dependency 'test-unit', '~>3.0'
27
+ development_dependency 'all_images'
28
+ required_ruby_version '>=2.4'
29
+ licenses << 'Apache-2.0'
35
30
  end
data/amatch.gemspec CHANGED
Binary file
@@ -15,15 +15,21 @@ end
15
15
 
16
16
  class Amatch::Levenshtein
17
17
  def search_relative(strings)
18
- search(strings).to_f / pattern.size
18
+ if Array === strings
19
+ search(strings).map { |s| s.to_f / pattern.size }
20
+ else
21
+ search(strings).to_f / pattern.size
22
+ end
19
23
  end
20
24
  end
21
25
 
26
+ $algorithm = 'Levenshtein'
22
27
  $distance = 1
23
28
  $mode = :search
24
29
  begin
25
30
  parser = GetoptLong.new
26
31
  options = [
32
+ [ '--algorithm', '-a', GetoptLong::REQUIRED_ARGUMENT ],
27
33
  [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
28
34
  [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
29
35
  [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
@@ -33,6 +39,8 @@ begin
33
39
  parser.each_option do |name, arg|
34
40
  name = name.sub(/^--/, '')
35
41
  case name
42
+ when 'algorithm'
43
+ $algorithm = arg
36
44
  when 'distance'
37
45
  $distance = arg.to_f
38
46
  when 'relative'
@@ -48,7 +56,7 @@ rescue
48
56
  end
49
57
  pattern = ARGV.shift or usage('Pattern needed!', options)
50
58
 
51
- matcher = Amatch::Levenshtein.new(pattern)
59
+ matcher = Amatch.const_get($algorithm).new(pattern)
52
60
  size = 0
53
61
  start = Time.new
54
62
  if ARGV.size > 0 then
@@ -56,9 +64,12 @@ if ARGV.size > 0 then
56
64
  File.stat(filename).file? or next
57
65
  size += File.size(filename)
58
66
  begin
59
- File.open(filename, 'r').each_line do |line|
60
- if matcher.__send__($mode, line) <= $distance
61
- puts "#{filename}:#{line}"
67
+ File.open(filename, 'r').each_line.each_slice(1000) do |lines|
68
+ results = matcher.__send__($mode, lines)
69
+ lines.zip(results) do |line, r|
70
+ if r <= $distance
71
+ puts "#{filename}:#{line}"
72
+ end
62
73
  end
63
74
  end
64
75
  rescue
@@ -66,10 +77,13 @@ if ARGV.size > 0 then
66
77
  end
67
78
  end
68
79
  else
69
- STDIN.each_line do |line|
70
- size += line.size
71
- if matcher.__send__($mode, line) <= $distance
72
- puts line
80
+ STDIN.each_line.each_slice(1000) do |lines|
81
+ size += lines.size
82
+ results = matcher.__send__($mode, lines)
83
+ lines.zip(results) do |line, r|
84
+ if r <= $distance
85
+ puts line
86
+ end
73
87
  end
74
88
  end
75
89
  end
data/bin/dupfind ADDED
@@ -0,0 +1,153 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'tins/go'
4
+ include Tins::GO
5
+ require 'tins/minimize'
6
+ class Array
7
+ include Tins::Minimize
8
+ end
9
+ require 'amatch'
10
+ begin
11
+ require 'infobar'
12
+ rescue LoadError
13
+ warn "Please install gem infobar to run this executable!"
14
+ exit 1
15
+ end
16
+
17
+ def usage
18
+ puts <<EOT
19
+ Usage: #{File.basename($0)} [OPTIONS] FILE
20
+
21
+ -a ALGO Amatch matching algorithm
22
+ -p LIMIT more than p similarity to be a match
23
+ -R NUMBER skip NUMBER mismatch for building ranges
24
+ -r NUMBER minimum length to be counted as a range
25
+ -i compute a PNG per file
26
+
27
+ Report bugs to <flori@ping.de>.
28
+ EOT
29
+ exit 0
30
+ end
31
+
32
+ class FindDuplicates
33
+ def initialize(algo, p_lim, filename)
34
+ @algo, @p_lim, @filename = algo, p_lim, filename
35
+ end
36
+
37
+ attr_reader :filename
38
+
39
+ attr_reader :algo
40
+
41
+ attr_reader :p_lim
42
+
43
+ memoize method:
44
+ def lines
45
+ File.readlines(filename)
46
+ end
47
+
48
+ memoize method:
49
+ def matrix
50
+ result = lines.with_infobar(label: filename, output: STDERR).map do |l1|
51
+ +infobar
52
+ a = algo.new(l1)
53
+ r = a.similar(lines)
54
+ r.map! { |s| s >= p_lim ? ?1 : ?0 }
55
+ r.join
56
+ end
57
+ infobar.finish
58
+ infobar.newline
59
+ result
60
+ end
61
+
62
+ def pbm(output: $>)
63
+ output << <<HEADER
64
+ P1
65
+ #{matrix.size} #{matrix.size}
66
+ HEADER
67
+ output << matrix.map { |line| line.each_char.to_a * ' ' } * ?\n
68
+ self
69
+ end
70
+
71
+ def png(output: $>)
72
+ IO.popen("pnmtopng", 'w+') do |conv|
73
+ pbm(output: conv)
74
+ conv.close_write
75
+ output.write(conv.read)
76
+ end
77
+ self
78
+ end
79
+
80
+ def create_image
81
+ suffix = Regexp.quote(File.extname(filename))
82
+ f = filename.sub(/(#{suffix}|)\z/, '.png')
83
+ File.open(f, 'wb') do |output|
84
+ png(output: output)
85
+ infobar.puts "Writing output to #{f.inspect}."
86
+ end
87
+ self
88
+ end
89
+
90
+ def similar_ranges(min_range: 3, skip_range: 0)
91
+ set = 0
92
+ ranges = { set => [] }
93
+ m = matrix
94
+ n = m.size
95
+ skip_count = 0
96
+ n.downto(1) do |h|
97
+ (n - h + 1).upto(n - 1) do |k|
98
+ i = k
99
+ j = k - (n - h + 1)
100
+ if m[i][j] == ?1
101
+ skip_count = 0
102
+ ranges[set] << [ i, j ]
103
+ elsif !ranges[set].empty? && skip_count < skip_range
104
+ skip_count += 1
105
+ else
106
+ skip_count = 0
107
+ ranges[set].empty? or ranges[set += 1] = []
108
+ end
109
+ end
110
+ skip_count = 0
111
+ ranges[set].empty? or ranges[set += 1] = []
112
+ end
113
+ ranges.each { |_, r|
114
+ r.flatten!
115
+ r.sort!
116
+ r.map! { |x| x + 1 }
117
+ r.minimize!
118
+ r.reject! { |s| s.size < min_range }
119
+ }.reject! { |_, r| r.empty? }
120
+ unions = []
121
+ while !ranges.empty?
122
+ _, r = ranges.first
123
+ equivalent = ranges.reject { |_, v| (v & r).empty? }
124
+ unions << equivalent.values.flatten.uniq
125
+ ranges.delete_if { |k, _| equivalent.keys.include?(k) }
126
+ end
127
+ unions.each do |r|
128
+ r.map! do |x|
129
+ "#{filename}:#{x.begin}-#{x.end}"
130
+ end
131
+ end
132
+ unions
133
+ end
134
+ end
135
+
136
+ opts = go 'a:p:R:r:ih'
137
+
138
+ usage if opts[?h]
139
+ algo = Amatch.const_get(opts[?a] || 'Levenshtein')
140
+ p_lim = (opts[?p] || 0.95).to_f
141
+ min_range = (opts[?r] || 3).to_i
142
+ skip_range = opts[?R].to_i
143
+ ARGV.empty? and usage
144
+
145
+ filenames = ARGV.inject([]) { |s, f| s.concat(Dir[f]) }
146
+ for filename in filenames
147
+ finder = FindDuplicates.new(algo, p_lim, filename)
148
+ opts[?i] and finder.create_image
149
+ for s in finder.similar_ranges(min_range: min_range, skip_range: skip_range)
150
+ infobar.reset
151
+ puts s, ?\n
152
+ end
153
+ end