perseus_match 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LINGO_BASE ADDED
@@ -0,0 +1 @@
1
+ /home/jw/devel/lingo/trunk
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.2
5
+ This documentation refers to perseus_match version 0.0.3
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -10,15 +10,17 @@ This documentation refers to perseus_match version 0.0.2
10
10
  Fuzzy string matching based on linguistic analysis.
11
11
 
12
12
 
13
- == AUTHORS
13
+ == LINKS
14
14
 
15
- * Jens Wille <mailto:jens.wille@uni-koeln.de>
15
+ <b></b>
16
+ Documentation:: <http://prometheus.rubyforge.org/perseus_match>
17
+ Source code:: <http://github.com/blackwinter/perseus_match>
18
+ Rubyforge project:: <http://rubyforge.org/projects/prometheus>
16
19
 
17
20
 
18
- == LINKS
21
+ == AUTHORS
19
22
 
20
- * <http://prometheus.rubyforge.org/perseus_match>
21
- * <http://github.com/blackwinter/perseus_match>
23
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
22
24
 
23
25
 
24
26
  == LICENSE AND COPYRIGHT
data/Rakefile CHANGED
@@ -13,8 +13,8 @@ begin
13
13
  :version => PerseusMatch::VERSION,
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
- :extra_files => FileList['[A-Z]*'].to_a,
17
- :dependencies => [['ruby-nuggets', '>= 0.3.0']]
16
+ :extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
17
+ :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
18
18
  }
19
19
  }}
20
20
  rescue LoadError
data/bin/perseus_match CHANGED
@@ -2,8 +2,10 @@
2
2
 
3
3
  require 'optparse'
4
4
  require 'benchmark'
5
+ require 'yaml'
5
6
 
6
7
  require 'rubygems'
8
+ require 'nuggets/enumerable/minmax'
7
9
  require 'nuggets/numeric/duration'
8
10
 
9
11
  $: << File.join(File.dirname(__FILE__), '..', 'lib')
@@ -14,19 +16,29 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
14
16
  abort USAGE if ARGV.empty?
15
17
 
16
18
  options = {
17
- :stats => false,
18
- :threshold => 0,
19
- :sort => false
19
+ :config => nil,
20
+ :threshold => 0,
21
+ :sort => false,
22
+ :stats => false,
23
+ :lingo => false,
24
+ :minimal => false,
25
+ :separate => false,
26
+ :check => false,
27
+ :failed_only => false,
28
+ :align => false,
29
+ :adjust_coeff => false
20
30
  }
21
31
 
22
32
  OptionParser.new { |opts|
23
33
  opts.banner = USAGE
24
34
 
25
- opts.separator ''
35
+ opts.separator ' '
26
36
  opts.separator 'Options:'
27
37
 
28
- opts.on('--stats', 'Output some statistics at the end') {
29
- options[:stats] = true
38
+ opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
39
+ abort "Can't find config file: #{f}." unless File.readable?(f)
40
+
41
+ options[:config] = f
30
42
  }
31
43
 
32
44
  opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
@@ -37,7 +49,51 @@ OptionParser.new { |opts|
37
49
  options[:sort] = true
38
50
  }
39
51
 
40
- opts.separator ''
52
+ opts.on('-S', '--stats', 'Output some statistics at the end') {
53
+ options[:stats] = true
54
+ }
55
+
56
+ opts.on('-v', '--verbose', 'Print additional information during processing') {
57
+ options[:verbose] = true
58
+ }
59
+
60
+ opts.separator ' '
61
+ opts.separator ' * Calculating similarities (default)'
62
+ opts.separator ' '
63
+
64
+ opts.on('-m', '--minimal', 'Produce minimal pairs only') {
65
+ options[:minimal] = true
66
+ }
67
+
68
+ opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
69
+ options[:separate] = p || ''
70
+ }
71
+
72
+ opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
73
+ options[:lingo] = true
74
+ }
75
+
76
+ opts.separator ' '
77
+ opts.separator ' * Checking pairs'
78
+ opts.separator ' '
79
+
80
+ opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
81
+ options[:check] = true
82
+ }
83
+
84
+ opts.on('-f', '--failed', 'Print only failed checks') {
85
+ options[:failed_only] = true
86
+ }
87
+
88
+ opts.on('-a', '--align', 'Align check results') {
89
+ options[:align] = true
90
+ }
91
+
92
+ opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
93
+ options[:adjust_coeff] = true
94
+ }
95
+
96
+ opts.separator ' '
41
97
  opts.separator 'Generic options:'
42
98
 
43
99
  opts.on('-h', '--help', 'Print this help message and exit') {
@@ -57,33 +113,179 @@ end
57
113
 
58
114
  PerseusMatch::TokenSet.tokenize(file)
59
115
 
60
- phrases = File.readlines(file).map { |line| line.chomp }
116
+ skip_re = %r{\A\s*(?:#|\z)}o
117
+ phrases = []
118
+
119
+ File.open(file).each { |line|
120
+ phrases << line.chomp unless line =~ skip_re
121
+ }.close
122
+
123
+ pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
124
+ pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
125
+ list_options = { :minimal => options[:minimal] }
61
126
 
62
127
  threshold, count, count_all = options[:threshold], 0, 0
63
128
 
64
- time = Benchmark.realtime {
65
- if options[:sort]
66
- require 'pp'
129
+ action = if options[:check]
130
+ require 'fastercsv'
67
131
 
68
- pp PerseusMatch::Cluster.new(phrases).sort { |pm|
69
- if pm.similarity >= threshold
70
- [pm.target, pm.distance, pm.similarity]
71
- count += 1
72
- end
73
- count_all += 1
74
- }.compact
132
+ format = if options[:align]
133
+ require 'jcode'
134
+
135
+ width = phrases.max(:jlength) + 3
136
+
137
+ lambda { |line, res|
138
+ "#{line} #{'.' * (width - line.jlength)} [#{res}]"
139
+ }
75
140
  else
76
- PerseusMatch::List.pair(phrases) { |pm|
77
- if pm.similarity >= threshold
78
- p [pm.phrase, pm.target, pm.distance, pm.similarity]
141
+ lambda { |line, res|
142
+ "#{line} [#{res}]"
143
+ }
144
+ end
145
+
146
+ phrases.sort! if options[:sort]
147
+ phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
148
+
149
+ global_threshold = options[:threshold]
150
+ failed_only = options[:failed_only]
151
+ collect_stats = options[:stats]
152
+ adjust_coeff = options[:adjust_coeff]
153
+
154
+ _action = lambda { |*args|
155
+ pm_options[:default_coeff] = args.first unless args.empty?
156
+
157
+ count, count_all = 0, 0
158
+ positives = negatives = false_positives = false_negatives = 0.0
159
+
160
+ phrases.each { |line, spec|
161
+ phrase, target, threshold, operator, _ = *spec
162
+
163
+ threshold ||= global_threshold
164
+ operator ||= '>'
165
+ assign = operator =~ />/
166
+
167
+ begin
168
+ PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
169
+
79
170
  count += 1
171
+ assign ? positives += 1 : negatives += 1
172
+
173
+ puts format[line, 'OK'] unless adjust_coeff || failed_only
174
+ rescue PerseusMatch::CheckFailedError => err
175
+ assign ? false_negatives += 1 : false_positives += 1
176
+
177
+ puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
80
178
  end
81
179
  count_all += 1
82
180
  }
181
+
182
+ divide = lambda { |numerator, denominator|
183
+ denominator == 0 ? 0 : numerator / denominator
184
+ }
185
+
186
+ if collect_stats || adjust_coeff
187
+ error = divide[
188
+ false_positives + false_negatives,
189
+ positives + negatives + false_positives + false_negatives
190
+ ]
191
+ end
192
+
193
+ if collect_stats
194
+ recall = divide[positives, positives + false_negatives]
195
+ precision = divide[positives, positives + false_positives]
196
+ f1 = divide[2 * recall * precision, recall + precision]
197
+
198
+ warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
199
+ recall * 100, precision * 100, f1, error
200
+ ]
201
+ end
202
+
203
+ error if adjust_coeff
204
+ }
205
+
206
+ if adjust_coeff
207
+ lambda {
208
+ step, max = 1, 100
209
+
210
+ start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
211
+ start_err = _action[start_coeff]
212
+
213
+ previous_coeff = next_coeff = start_coeff
214
+ previous_err = next_err = start_err
215
+
216
+ max.times { break if (previous_err = _action[previous_coeff -= step]) != start_err }
217
+ max.times { break if (next_err = _action[next_coeff += step]) != start_err }
218
+
219
+ best_err = [start_err, previous_err, next_err].min
220
+
221
+ if best_err == start_err
222
+ best_coeff = start_coeff
223
+ else
224
+ if best_err == previous_err
225
+ max.times {
226
+ break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
227
+ previous_err, previous_coeff = current_err, current_coeff
228
+ }
229
+
230
+ best_err, best_coeff = previous_err, previous_coeff
231
+ else
232
+ max.times {
233
+ break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
234
+ next_err, next_coeff = current_err, current_coeff
235
+ }
236
+
237
+ best_err, best_coeff = next_err, next_coeff
238
+ end
239
+ end
240
+
241
+ puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
242
+ }
243
+ else
244
+ _action
83
245
  end
84
- }
246
+ else
247
+ format =
248
+ options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
249
+ options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
250
+ lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
251
+
252
+ if options[:sort]
253
+ require 'pp'
254
+
255
+ lambda {
256
+ pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
257
+ if pm.similarity >= threshold
258
+ res = format[pm]
259
+ count += 1
260
+ end
261
+ count_all += 1
262
+ res
263
+ }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
264
+ }
265
+ else
266
+ lambda {
267
+ separator, previous_phrase = options[:separate], nil
268
+
269
+ PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
270
+ if separator && pm.phrase != previous_phrase ||= pm.phrase
271
+ puts separator
272
+ previous_phrase = pm.phrase
273
+ end
274
+
275
+ if pm.similarity >= threshold
276
+ puts format[pm]
277
+ count += 1
278
+ end
279
+
280
+ count_all += 1
281
+ }
282
+ }
283
+ end
284
+ end
85
285
 
86
286
  if options[:stats]
287
+ time = Benchmark.realtime(&action)
288
+
87
289
  hms, x, y = time.to_hms(2), time / count, time / count_all
88
290
 
89
291
  precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
@@ -91,4 +293,6 @@ if options[:stats]
91
293
  warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
92
294
  phrases.size, count, count_all, hms, x, y
93
295
  ]
296
+ else
297
+ action.call
94
298
  end
@@ -30,10 +30,10 @@ class PerseusMatch
30
30
 
31
31
  class Cluster < Hash
32
32
 
33
- def initialize(phrases = [])
33
+ def initialize(phrases = [], pm_options = {}, list_options = {})
34
34
  super() { |h, k| h[k] = [] }
35
35
 
36
- List.new(phrases).each { |pm| add(pm) }
36
+ List.pair(phrases, pm_options, list_options) { |pm| add(pm) }
37
37
  end
38
38
 
39
39
  def add(pm)
@@ -42,7 +42,7 @@ class PerseusMatch
42
42
 
43
43
  alias_method :<<, :add
44
44
 
45
- def sort_by(attribute, *args, &block)
45
+ def sort_by(attribute, *args)
46
46
  options = args.last.is_a?(Hash) ? args.pop : {}
47
47
 
48
48
  _ = map { |phrase, matches|
@@ -63,15 +63,15 @@ class PerseusMatch
63
63
  lambda { |match| res[match] < threshold } :
64
64
  lambda { |match| res[match] > threshold }
65
65
 
66
- matches.reject! { |match| condition[match] }
66
+ matches.reject!(&condition)
67
67
  end
68
68
 
69
69
  if limit = options[:limit]
70
- matches.slice!(limit..-1)
70
+ matches.slice!(limit..-1) if matches.size > limit
71
71
  end
72
72
 
73
73
  # transform entries if so requested
74
- matches.map!(&block) if block
74
+ matches.map! { |match| yield(match) } if block_given?
75
75
 
76
76
  [phrase, matches]
77
77
  }.sort
@@ -79,8 +79,9 @@ class PerseusMatch
79
79
  _ # rcov hack :-(
80
80
  end
81
81
 
82
- def sort(options = {}, &block)
83
- sort_by(:similarity, options.delete(:coeff), options, &block)
82
+ def sort(options = {})
83
+ args = [:similarity, options.delete(:coeff), options]
84
+ block_given? ? sort_by(*args) { |*a| yield(*a) } : sort_by(*args)
84
85
  end
85
86
 
86
87
  def rank(options = {})
@@ -32,24 +32,46 @@ class PerseusMatch
32
32
 
33
33
  class << self
34
34
 
35
- def pair(phrases)
35
+ def pair(phrases, pm_options = {}, list_options = {})
36
36
  phrases.uniq!
37
37
 
38
- phrases.each { |phrase|
39
- phrases.each { |target|
40
- yield PerseusMatch.new(phrase, target)
38
+ pairs = [] unless block_given?
39
+
40
+ unless list_options[:minimal]
41
+ # => pairs.size = phrases.size ** 2
42
+
43
+ phrases.each { |phrase|
44
+ phrases.each { |target|
45
+ pm = PerseusMatch.new(phrase, target, pm_options)
46
+ block_given? ? yield(pm) : pairs << pm
47
+ }
48
+ }
49
+ else
50
+ # => pairs.size = (phrases.size ** 2 - phrases.size) / 2
51
+
52
+ size = phrases.size
53
+
54
+ 1.upto(size) { |i|
55
+ phrase = phrases[i - 1]
56
+
57
+ i.upto(size - 1) { |j|
58
+ pm = PerseusMatch.new(phrase, phrases[j], pm_options)
59
+ block_given? ? yield(pm) : pairs << pm
60
+ }
41
61
  }
42
- }
62
+ end
63
+
64
+ pairs || phrases
43
65
  end
44
66
 
45
67
  end
46
68
 
47
- alias_method :add, :push
48
-
49
- def initialize(phrases = [])
50
- self.class.pair(phrases) { |pm| add(pm) }
69
+ def initialize(phrases = [], pm_options = {}, list_options = {})
70
+ self.class.pair(phrases, pm_options, list_options) { |pm| add(pm) }
51
71
  end
52
72
 
73
+ alias_method :add, :push
74
+
53
75
  end
54
76
 
55
77
  end
@@ -28,44 +28,52 @@
28
28
 
29
29
  $KCODE = 'u'
30
30
 
31
- LINGO_BASE = '/home/jw/devel/lingo/trunk'
32
-
33
- LINGO_CONFIG = {
34
- 'meeting' => {
35
- 'attendees' => [
36
- { 'textreader' => { 'files'=> 'STDIN' } },
37
- { 'tokenizer' => { } },
38
- { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
39
- { 'decomposer' => { 'source' => 'sys-dic' } },
40
- { 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
41
- { 'synonymer' => { 'source' => 'sys-syn', 'out' => 'syn', 'skip'=>'?,t' } },
42
- { 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } }
43
- ]
44
- }
45
- }
46
-
47
- require 'tempfile'
31
+ require 'pathname'
32
+ require 'rbconfig'
48
33
  require 'yaml'
49
34
 
50
- # use enhanced Tempfile#make_tmpname, as of r13631
51
- if RUBY_RELEASE_DATE < '2007-10-05'
52
- class Tempfile
53
-
54
- def make_tmpname(basename, n)
55
- case basename
56
- when Array
57
- prefix, suffix = *basename
58
- else
59
- prefix, suffix = basename, ''
60
- end
35
+ require 'rubygems'
36
+ require 'backports/tempfile'
37
+ require 'nuggets/tempfile/open'
38
+ require 'nuggets/util/i18n'
61
39
 
62
- t = Time.now.strftime("%Y%m%d")
63
- path = "#{prefix}#{t}-#{$$}-#{rand(0x100000000).to_s(36)}-#{n}#{suffix}"
64
- end
40
+ begin
41
+ require 'text/soundex'
42
+ rescue LoadError
43
+ warn "could not load the Text gem -- soundex functionality will not be available"
44
+ end
65
45
 
66
- end
46
+ LINGO_BASE = ENV['PM_LINGO_BASE'] || (
47
+ File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
48
+ )
49
+
50
+ LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
51
+ warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
52
+
53
+ lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
54
+ YAML.load_file(file)
55
+ else
56
+ warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
57
+
58
+ {
59
+ 'meeting' => {
60
+ 'attendees' => [
61
+ { 'tokenizer' => { } },
62
+ { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
63
+ { 'decomposer' => { 'source' => 'sys-dic' } },
64
+ { 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
65
+ { 'synonymer' => { 'source' => 'sys-syn', 'skip' => '?,t' } },
66
+ ]
67
+ }
68
+ }
67
69
  end
68
-
70
+
71
+ lingo_config['meeting']['attendees'].
72
+ unshift({ 'textreader' => { 'files'=> 'STDIN' } }).
73
+ push({ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } })
74
+
75
+ LINGO_CONFIG = lingo_config
76
+
69
77
  class PerseusMatch
70
78
 
71
79
  class TokenSet < Array
@@ -73,10 +81,8 @@ class PerseusMatch
73
81
  def self.tokenize(form)
74
82
  return @tokens[form] if @tokens
75
83
 
76
- @_tokens = {}
77
- @tokens = Hash.new { |h, k| h[k] = new(
78
- k, @_tokens.has_key?(k) ? @_tokens[k] :
79
- k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
84
+ @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
85
+ k, (@_tokens[k] || []) | k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
80
86
  )}
81
87
 
82
88
  parse = lambda { |x|
@@ -85,8 +91,11 @@ class PerseusMatch
85
91
  when /<(.*?)\s=\s\[(.*)\]>/
86
92
  a, b = $1, $2
87
93
  @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
88
- #when /<(.*)>/, /:(.*):/
89
- # # ignore
94
+ when /<(.*)>/, /:(.*):/
95
+ a, b = $1, $1.dup
96
+ @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
97
+
98
+ warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
90
99
  end
91
100
  }
92
101
  }
@@ -95,29 +104,32 @@ class PerseusMatch
95
104
  File.open(t) { |f| parse[f] }
96
105
  @tokens[form]
97
106
  else
98
- cfg = Tempfile.new(['perseus_match_lingo', '.cfg'])
99
- YAML.dump(LINGO_CONFIG, cfg)
100
- cfg.close
107
+ raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
108
+
109
+ cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
110
+ YAML.dump(LINGO_CONFIG, t)
111
+ }
101
112
 
102
- file = form[0] == ?/ ? form : File.join(Dir.pwd, form)
113
+ file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
103
114
 
104
115
  unless File.file?(file) && File.readable?(file)
105
- temp = Tempfile.new('perseus_match_temp')
106
- temp.puts form
107
- temp.close
116
+ temp = Tempfile.open('perseus_match_temp') { |t|
117
+ t.puts form
118
+ }
108
119
 
109
120
  file = temp.path
110
121
  end
111
122
 
112
- Dir.chdir(LINGO_BASE) { parse[%x{
113
- ./lingo.rb -c #{cfg.path} < #{file}
114
- }] }
115
-
116
- cfg.unlink
123
+ begin
124
+ Dir.chdir(LINGO_BASE) { parse[%x{
125
+ #{Config::CONFIG['ruby_install_name']} lingo.rb -c "#{cfg.path}" < "#{file}"
126
+ }] }
127
+ ensure
128
+ cfg.unlink
129
+ temp.unlink if temp
130
+ end
117
131
 
118
132
  if temp
119
- temp.unlink
120
-
121
133
  tokens, @tokens = @tokens[form], nil
122
134
  tokens
123
135
  else
@@ -138,42 +150,40 @@ class PerseusMatch
138
150
  end
139
151
 
140
152
  def distance(other)
141
- distance, index, max = xor(other).size, -1, size
153
+ tokens1, tokens2 = tokens, other.tokens
154
+ size1, size2 = tokens1.size, tokens2.size
142
155
 
143
- intersect(other).each { |token|
144
- while current = other.tokens[index += 1] and current != token
145
- distance += 1
156
+ return size2 if tokens1.empty?
157
+ return size1 if tokens2.empty?
146
158
 
147
- break if index > max
148
- end
149
- }
159
+ distance, costs = nil, (0..size2).to_a
150
160
 
151
- distance
152
- end
161
+ 0.upto(size1 - 1) { |index1|
162
+ token1, cost = tokens1[index1], index1 + 1
153
163
 
154
- def tokens(wc = true)
155
- wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
156
- token.sub(%r{[/|].*?\z}, '')
157
- }
158
- end
164
+ 0.upto(size2 - 1) { |index2|
165
+ penalty = token1 == tokens2[index2] ? 0 : 1
159
166
 
160
- def &(other)
161
- tokens & other.tokens
162
- end
167
+ # rcov hack :-(
168
+ _ = [
169
+ costs[index2 + 1] + 1, # insertion
170
+ cost + 1, # deletion
171
+ costs[index2] + penalty # substitution
172
+ ]
173
+ distance = _.min
163
174
 
164
- def |(other)
165
- tokens | other.tokens
166
- end
175
+ costs[index2], cost = cost, distance
176
+ }
167
177
 
168
- def intersect(other)
169
- (self & other).inject([]) { |memo, token|
170
- memo + [token] * [count(token), other.count(token)].max
178
+ costs[size2] = distance
171
179
  }
180
+
181
+ distance + 1 # > 0 !?!
172
182
  end
173
183
 
174
- def xor(other)
175
- ((self | other) - (self & other)).inject([]) { |memo, token|
176
- memo + [token] * (count(token) + other.count(token))
184
+ def tokens(wc = true)
185
+ wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
186
+ token.sub(%r{[/|].*?\z}, '')
177
187
  }
178
188
  end
179
189
 
@@ -186,26 +196,31 @@ class PerseusMatch
186
196
  end
187
197
 
188
198
  def incl(*wc)
189
- (@incl ||= {})[wc = [*wc].compact] ||= map { |tokens|
190
- tokens.reject { |token| !match?(token, wc) }
199
+ (@incl ||= {})[wc = [*wc].compact] ||= select { |token|
200
+ match?(token, wc)
191
201
  }.to_token_set(form)
192
202
  end
193
203
 
194
204
  def excl(*wc)
195
- (@excl ||= {})[wc = [*wc].compact] ||= map { |tokens|
196
- tokens.reject { |token| match?(token, wc) }
205
+ (@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
206
+ match?(token, wc)
197
207
  }.to_token_set(form)
198
208
  end
199
209
 
200
- def count(token)
201
- counts[token]
210
+ def soundex
211
+ raise "soundex functionality not available" unless defined?(Text::Soundex)
212
+
213
+ @soundex ||= map { |token|
214
+ token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
215
+ }.to_token_set(form)
202
216
  end
203
217
 
204
- def counts
205
- @counts ||= tokens.inject(Hash.new(0)) { |counts, token|
206
- counts[token] += 1
207
- counts
208
- }
218
+ def soundex!
219
+ replace soundex
220
+ end
221
+
222
+ def eql?(other)
223
+ tokens == other.tokens && form == other.form
209
224
  end
210
225
 
211
226
  def inspect
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 2
7
+ TINY = 3
8
8
 
9
9
  class << self
10
10
 
data/lib/perseus_match.rb CHANGED
@@ -38,35 +38,53 @@ class PerseusMatch
38
38
 
39
39
  DEFAULT_COEFF = 20
40
40
 
41
- DISTANCE_SPEC = {
42
- {} => 1,
43
- { :excl => %w[a t] } => 1,
44
- { :incl => 's' } => 2,
45
- { :incl => 'y' } => 4,
46
- { :sort => true } => 4
47
- }
41
+ DISTANCE_SPEC = [ # {
42
+ [{}, 1], # {} => 1,
43
+ [{ :excl => %w[a t] }, 2], # { :excl => %w[a t] } => 1,
44
+ [{ :incl => 's' }, 3], # { :incl => 's' } => 2,
45
+ [{ :incl => 'y' }, 4], # { :incl => 'y' } => 4,
46
+ [{ :sort => true }, 4], # { :sort => true } => 4,
47
+ [{ :soundex => true }, 4] # { :soundex => true } => 8
48
+ ] # }
48
49
 
49
50
  class << self
50
51
 
51
- def match(phrases)
52
- List.new(phrases)
52
+ def distance(*args)
53
+ new(*args).distance
53
54
  end
54
55
 
55
- def cluster(phrases, options = {})
56
- Cluster.new(phrases).rank(options)
56
+ def match(phrases, pm_options = {})
57
+ List.new(phrases, pm_options)
58
+ end
59
+
60
+ def cluster(phrases, options = {}, pm_options = {})
61
+ Cluster.new(phrases, pm_options).rank(options)
62
+ end
63
+
64
+ def check(*args)
65
+ check!(*args)
66
+ rescue CheckFailedError
67
+ false
68
+ end
69
+
70
+ def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
71
+ value = new(phrase, target, pm_options).send(attribute)
72
+ value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
57
73
  end
58
74
 
59
75
  end
60
76
 
61
- attr_reader :phrase, :target, :distance_spec, :default_coeff
77
+ attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
62
78
 
63
79
  def initialize(phrase, target, options = {})
64
- @phrase = phrase
65
- @target = target
80
+ @phrase = phrase.to_s
81
+ @target = target.to_s
66
82
 
67
83
  @default_coeff = options[:default_coeff] || DEFAULT_COEFF
68
84
  @distance_spec = options[:distance_spec] || DISTANCE_SPEC
69
85
 
86
+ @verbose = options[:verbose]
87
+
70
88
  @similarity = {}
71
89
  end
72
90
 
@@ -97,7 +115,7 @@ class PerseusMatch
97
115
 
98
116
  def calculate_distance
99
117
  return Infinity if phrase_tokens.disjoint?(target_tokens)
100
- return 0 if phrase_tokens == target_tokens
118
+ return 0 if phrase_tokens.eql?(target_tokens)
101
119
 
102
120
  distance_spec.inject(0) { |distance, (options, weight)|
103
121
  distance + token_distance(options) * weight
@@ -105,19 +123,47 @@ class PerseusMatch
105
123
  end
106
124
 
107
125
  def token_distance(options = {})
108
- phrase_tokens = self.phrase_tokens.inclexcl(options)
109
- target_tokens = self.target_tokens.inclexcl(options)
126
+ tokens1 = phrase_tokens.inclexcl(options)
127
+ tokens2 = target_tokens.inclexcl(options)
110
128
 
111
129
  if options[:sort]
112
- phrase_tokens.sort!
113
- target_tokens.sort!
130
+ tokens1 = tokens1.sort
131
+ tokens2 = tokens2.sort
114
132
  end
115
133
 
116
- (phrase_tokens.distance(target_tokens) + target_tokens.distance(phrase_tokens)) / 2.0
134
+ if options[:soundex]
135
+ tokens1 = tokens1.soundex
136
+ tokens2 = tokens2.soundex
137
+ end
138
+
139
+ distance = tokens1.distance(tokens2)
140
+
141
+ warn <<-EOT if verbose
142
+ #{options.inspect}:
143
+ #{tokens1.inspect}
144
+ #{tokens2.inspect}
145
+ => #{distance}
146
+ EOT
147
+
148
+ distance
117
149
  end
118
150
 
119
151
  def total_weight
120
- distance_spec.values.inject(0.0) { |total, weight| total + weight }
152
+ @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
153
+ end
154
+
155
+ class CheckFailedError < StandardError
156
+
157
+ attr_reader :value, :threshold, :operator
158
+
159
+ def initialize(value, threshold, operator)
160
+ @value, @threshold, @operator = value, threshold, operator
161
+ end
162
+
163
+ def to_s
164
+ "FAILED: #{value} #{operator} #{threshold}"
165
+ end
166
+
121
167
  end
122
168
 
123
169
  end
@@ -0,0 +1,45 @@
1
+ describe PerseusMatch::Cluster do
2
+
3
+ it 'should accept limit option in sort_by' do
4
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :limit => 1).all? { |phrase, matches|
5
+ matches.size.should == 1
6
+ matches.size.should == matches.nitems
7
+ }
8
+ end
9
+
10
+ it 'should accept threshold option in sort_by (1a)' do
11
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
12
+ matches.size.should be_zero
13
+ matches.size.should == matches.nitems
14
+ }
15
+ end
16
+
17
+ it 'should accept threshold option in sort_by (1b)' do
18
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0).all? { |phrase, matches|
19
+ matches.size.should == 2
20
+ matches.size.should == matches.nitems
21
+ }
22
+ end
23
+
24
+ it 'should accept threshold option in sort_by (2)' do
25
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'c').all? { |phrase, matches|
26
+ matches.size.should == 1
27
+ matches.size.should == matches.nitems
28
+ }
29
+ end
30
+
31
+ it 'should accept both limit and threshold options in sort_by (1)' do
32
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'z', :limit => 1).all? { |phrase, matches|
33
+ matches.size.should == 1
34
+ matches.size.should == matches.nitems
35
+ }
36
+ end
37
+
38
+ it 'should accept both limit and threshold options in sort_by (2)' do
39
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'a', :limit => 1).all? { |phrase, matches|
40
+ matches.size.should be_zero
41
+ matches.size.should == matches.nitems
42
+ }
43
+ end
44
+
45
+ end if LINGO_FOUND
@@ -0,0 +1,16 @@
1
+ describe PerseusMatch::List, '::pair' do
2
+
3
+ before :all do
4
+ @phrases = %w[foo bar baz]
5
+ @size = @phrases.size
6
+ end
7
+
8
+ it 'should produce full list of pairs with correct size' do
9
+ PerseusMatch::List.pair(@phrases).size.should == @size ** 2
10
+ end
11
+
12
+ it 'should produce minimal list of pairs with correct size' do
13
+ PerseusMatch::List.pair(@phrases, {}, :minimal => true).size.should == (@size ** 2 - @size) / 2
14
+ end
15
+
16
+ end
@@ -0,0 +1,65 @@
1
+ describe PerseusMatch::TokenSet, ' with lingo' do
2
+
3
+ before :each do
4
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
5
+ end
6
+
7
+ it 'should tokenize a string' do
8
+ PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
9
+ end
10
+
11
+ it 'should report strictly equal TokenSets as ==' do
12
+ PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
13
+ end
14
+
15
+ it 'should report strictly equal TokenSets as eql' do
16
+ PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
17
+ end
18
+
19
+ it 'should report slightly equal TokenSets as ==' do
20
+ PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
21
+ end
22
+
23
+ it 'should *not* report slightly equal TokenSets as eql' do
24
+ PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
25
+ end
26
+
27
+ it 'should include form in inspect' do
28
+ PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
29
+ end
30
+
31
+ end if LINGO_FOUND
32
+
33
+ describe PerseusMatch::TokenSet, ' without lingo' do
34
+
35
+ before :each do
36
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
37
+ end
38
+
39
+ it 'should take a prepared file for tokenization' do
40
+ # prevent lingo from being used
41
+ lingo_base = LINGO_BASE.dup
42
+ LINGO_BASE.replace('')
43
+
44
+ temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
45
+ t.puts *%w[<foo|?> <bar|?>]
46
+ }
47
+
48
+ path = temp.path
49
+ link = 'perseus.tokens'
50
+
51
+ Dir.chdir(File.dirname(path)) {
52
+ File.symlink(path, link)
53
+
54
+ PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
55
+
56
+ File.unlink(link)
57
+ }
58
+
59
+ temp.unlink
60
+
61
+ # reset lingo base
62
+ LINGO_BASE.replace(lingo_base)
63
+ end
64
+
65
+ end
@@ -0,0 +1,168 @@
1
+ require 'rubygems'
2
+ require 'nuggets/tempfile/open'
3
+ require 'nuggets/util/i18n'
4
+
5
# Examples for the top-level PerseusMatch API (match, check, check!,
# distance, cluster). The group is only registered when LINGO_FOUND is
# truthy (see the trailing modifier on the closing +end+).
describe PerseusMatch do

  # Builds the phrase fixtures once, pre-tokenizes them from a temp file and
  # stores the result of PerseusMatch.match over all phrases in @matchings.
  before :all do
    @highly_similar = [
      'Anbetung der Könige',
      'Die Anbetung der Könige'
    ] # ok

    @similar = [
      # @highly_similar + ...
      'Die Anbetung der Heiligen Drei Könige',
      'dIE AnBeTuNg der heILIGen dREI KÖniGE'
    ] # ok

    @unfortunately_similar = [
      # @similar + ...
      'Die Die Die Anbetung der Könige',
      'Die Könige der Anbetung',
      'Königsanbetung hoch drei'
    ] # *not* ok -- eventually try to drop these below the threshold

    @somewhat_similar = @highly_similar + @similar + @unfortunately_similar

    phrases = @somewhat_similar + [
      'Drei mal drei macht sechs',
      'Das Ende dieses Blödsinns',
      ''
    ]

    begin
      # NOTE(review): relies on Tempfile.open returning the Tempfile itself
      # when given a block (presumably via nuggets/tempfile/open) -- confirm.
      temp = Tempfile.open('perseus_match_spec_temp') { |t|
        t.puts(*phrases)
      }

      PerseusMatch::TokenSet.tokenize(temp.path)
    ensure
      # drop the temp file even if tokenization raised
      temp.unlink if temp
    end

    @matchings = PerseusMatch.match(phrases)
  end

  it 'should identify identical (non-empty) strings as identical' do
    @matchings.each do |matching|
      next if matching.phrase.empty?
      next unless matching.phrase == matching.target

      inform_on_error(matching) { matching.similarity.should == 1.0 }
    end
  end

  it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each do |matching|
      next if matching.phrase.empty?
      next unless matching.phrase.replace_diacritics.downcase == matching.target.replace_diacritics.downcase

      inform_on_error(matching) { matching.similarity.should > 0.95 }
    end
  end

  it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each do |matching|
      next if matching.phrase.empty?
      next unless matching.phrase.replace_diacritics.downcase != matching.target.replace_diacritics.downcase

      inform_on_error(matching) { matching.similarity.should < 0.98 }
    end
  end

  it 'should identify disjunct (non-empty) strings as disjunct' do
    @matchings.each do |matching|
      next if matching.phrase.empty?
      next unless matching.phrase_tokens.disjoint?(matching.target_tokens)

      inform_on_error(matching) { matching.similarity.should == 0.0 }
    end
  end

  it 'should identify empty string as disjunct with anything, even with itself' do
    @matchings.each do |matching|
      next unless matching.phrase.empty? || matching.target.empty?

      inform_on_error(matching) { matching.similarity.should == 0.0 }
    end
  end

  # (1) variants inspect the precomputed @matchings; (2) variants go through
  # PerseusMatch.check with the same threshold.
  it 'should identify certain strings as highly similar (1)' do
    @matchings.each do |matching|
      next unless @highly_similar.include?(matching.phrase) && @highly_similar.include?(matching.target)

      inform_on_error(matching) { matching.similarity.should > 0.9 }
    end
  end

  it 'should identify certain strings as highly similar (2)' do
    @highly_similar.each do |phrase|
      @highly_similar.each do |target|
        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.9).should be_true }
      end
    end
  end

  it 'should identify certain strings as similar (1)' do
    @matchings.each do |matching|
      next unless @similar.include?(matching.phrase) && @similar.include?(matching.target)

      inform_on_error(matching) { matching.similarity.should > 0.8 }
    end
  end

  it 'should identify certain strings as similar (2)' do
    @similar.each do |phrase|
      @similar.each do |target|
        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.8).should be_true }
      end
    end
  end

  it 'should *not* identify other strings as similar (1)' do
    @matchings.each do |matching|
      next unless @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)

      inform_on_error(matching) { matching.similarity.should_not > 0.8 }
    end
  end

  it 'should *not* identify other strings as similar (2)' do
    @matchings.each do |matching|
      next unless @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)

      inform_on_error(matching) { PerseusMatch.check(matching.phrase, matching.target, 0.8).should be_false }
    end
  end

  it 'should be symmetrical' do
    # similarity seen so far, keyed by [phrase, target]
    similarities = {}

    @matchings.each do |matching|
      # look for the mirrored pair; nil means it has not been seen yet
      # (a similarity of 0.0 is still truthy in Ruby, so it is compared too)
      similarity = similarities[[matching.target, matching.phrase]]

      if similarity
        inform_on_error(matching) { similarity.should == matching.similarity }
      else
        similarities[[matching.phrase, matching.target]] = matching.similarity
      end
    end
  end

  it 'should calculate pair distance' do
    # Class#< : the result's class must be a subclass of Numeric
    PerseusMatch.distance('foo', 'bar').class.should < Numeric
  end

  it 'should be clusterable' do
    PerseusMatch.cluster(@somewhat_similar).should be_an_instance_of(Array)
  end

  it 'should be checkable (1)' do
    PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
  end

  it 'should be checkable (2)' do
    lambda {
      begin
        PerseusMatch.check!('foo', 'bar', 0, :>)
      rescue PerseusMatch::CheckFailedError => err
        # the error message is expected to contain a "0"; re-raise so the
        # outer raise_error matcher sees the exception as well
        err.to_s.should =~ /0/
        raise err
      end
    }.should raise_error(PerseusMatch::CheckFailedError)
  end

end if LINGO_FOUND
@@ -0,0 +1,18 @@
1
# Load the library from the sibling lib/ directory, unless a PerseusMatch
# constant is already defined.
unless Object.const_defined?(:PerseusMatch)
  lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
  $LOAD_PATH.push(lib_dir)

  require 'perseus_match'
end
5
+
6
# Runs the given block and returns its value. If the block raises
# Spec::Expectations::ExpectationNotMetError, the supplied +args+ (if any)
# are printed with +p+, framed by blank lines, and the same exception is then
# re-raised so the failure is still reported. Any other exception passes
# through untouched and nothing is printed.
#
# args:: context objects to dump when the expectation fails
def inform_on_error(*args)
  yield
rescue Spec::Expectations::ExpectationNotMetError
  unless args.empty?
    puts
    p(*args)
    puts
  end

  # bare raise re-raises the exception currently being handled
  raise
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,9 +9,19 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-15 00:00:00 +02:00
12
+ date: 2008-12-09 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: ruby-backports
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
15
25
  - !ruby/object:Gem::Dependency
16
26
  name: ruby-nuggets
17
27
  type: :runtime
@@ -20,7 +30,7 @@ dependencies:
20
30
  requirements:
21
31
  - - ">="
22
32
  - !ruby/object:Gem::Version
23
- version: 0.3.0
33
+ version: 0.4.0
24
34
  version:
25
35
  description: Fuzzy string matching based on linguistic analysis
26
36
  email: jens.wille@uni-koeln.de
@@ -33,29 +43,35 @@ extra_rdoc_files:
33
43
  - ChangeLog
34
44
  - README
35
45
  files:
36
- - lib/perseus_match.rb
46
+ - lib/perseus_match/list.rb
37
47
  - lib/perseus_match/version.rb
38
48
  - lib/perseus_match/token_set.rb
39
- - lib/perseus_match/list.rb
40
49
  - lib/perseus_match/cluster.rb
50
+ - lib/perseus_match.rb
41
51
  - bin/perseus_match
52
+ - Rakefile
42
53
  - COPYING
43
- - README
44
54
  - ChangeLog
45
- - Rakefile
55
+ - LINGO_BASE
56
+ - README
57
+ - spec/spec_helper.rb
58
+ - spec/perseus_match/list_spec.rb
59
+ - spec/perseus_match/cluster_spec.rb
60
+ - spec/perseus_match/token_set_spec.rb
61
+ - spec/perseus_match_spec.rb
46
62
  has_rdoc: true
47
63
  homepage: http://prometheus.rubyforge.org/perseus_match
48
64
  post_install_message:
49
65
  rdoc_options:
50
- - --charset
51
- - UTF-8
66
+ - --line-numbers
67
+ - --inline-source
52
68
  - --title
53
69
  - perseus_match Application documentation
54
70
  - --main
55
71
  - README
72
+ - --charset
73
+ - UTF-8
56
74
  - --all
57
- - --line-numbers
58
- - --inline-source
59
75
  require_paths:
60
76
  - lib
61
77
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -73,7 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
73
89
  requirements: []
74
90
 
75
91
  rubyforge_project: prometheus
76
- rubygems_version: 1.2.0
92
+ rubygems_version: 1.3.1
77
93
  signing_key:
78
94
  specification_version: 2
79
95
  summary: Fuzzy string matching based on linguistic analysis