perseus_match 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/LINGO_BASE ADDED
@@ -0,0 +1 @@
1
+ /home/jw/devel/lingo/trunk
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.2
5
+ This documentation refers to perseus_match version 0.0.3
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -10,15 +10,17 @@ This documentation refers to perseus_match version 0.0.2
10
10
  Fuzzy string matching based on linguistic analysis.
11
11
 
12
12
 
13
- == AUTHORS
13
+ == LINKS
14
14
 
15
- * Jens Wille <mailto:jens.wille@uni-koeln.de>
15
+ <b></b>
16
+ Documentation:: <http://prometheus.rubyforge.org/perseus_match>
17
+ Source code:: <http://github.com/blackwinter/perseus_match>
18
+ Rubyforge project:: <http://rubyforge.org/projects/prometheus>
16
19
 
17
20
 
18
- == LINKS
21
+ == AUTHORS
19
22
 
20
- * <http://prometheus.rubyforge.org/perseus_match>
21
- * <http://github.com/blackwinter/perseus_match>
23
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
22
24
 
23
25
 
24
26
  == LICENSE AND COPYRIGHT
data/Rakefile CHANGED
@@ -13,8 +13,8 @@ begin
13
13
  :version => PerseusMatch::VERSION,
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
- :extra_files => FileList['[A-Z]*'].to_a,
17
- :dependencies => [['ruby-nuggets', '>= 0.3.0']]
16
+ :extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
17
+ :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
18
18
  }
19
19
  }}
20
20
  rescue LoadError
data/bin/perseus_match CHANGED
@@ -2,8 +2,10 @@
2
2
 
3
3
  require 'optparse'
4
4
  require 'benchmark'
5
+ require 'yaml'
5
6
 
6
7
  require 'rubygems'
8
+ require 'nuggets/enumerable/minmax'
7
9
  require 'nuggets/numeric/duration'
8
10
 
9
11
  $: << File.join(File.dirname(__FILE__), '..', 'lib')
@@ -14,19 +16,29 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
14
16
  abort USAGE if ARGV.empty?
15
17
 
16
18
  options = {
17
- :stats => false,
18
- :threshold => 0,
19
- :sort => false
19
+ :config => nil,
20
+ :threshold => 0,
21
+ :sort => false,
22
+ :stats => false,
23
+ :lingo => false,
24
+ :minimal => false,
25
+ :separate => false,
26
+ :check => false,
27
+ :failed_only => false,
28
+ :align => false,
29
+ :adjust_coeff => false
20
30
  }
21
31
 
22
32
  OptionParser.new { |opts|
23
33
  opts.banner = USAGE
24
34
 
25
- opts.separator ''
35
+ opts.separator ' '
26
36
  opts.separator 'Options:'
27
37
 
28
- opts.on('--stats', 'Output some statistics at the end') {
29
- options[:stats] = true
38
+ opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
39
+ abort "Can't find config file: #{f}." unless File.readable?(f)
40
+
41
+ options[:config] = f
30
42
  }
31
43
 
32
44
  opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
@@ -37,7 +49,51 @@ OptionParser.new { |opts|
37
49
  options[:sort] = true
38
50
  }
39
51
 
40
- opts.separator ''
52
+ opts.on('-S', '--stats', 'Output some statistics at the end') {
53
+ options[:stats] = true
54
+ }
55
+
56
+ opts.on('-v', '--verbose', 'Print additional information during processing') {
57
+ options[:verbose] = true
58
+ }
59
+
60
+ opts.separator ' '
61
+ opts.separator ' * Calculating similarities (default)'
62
+ opts.separator ' '
63
+
64
+ opts.on('-m', '--minimal', 'Produce minimal pairs only') {
65
+ options[:minimal] = true
66
+ }
67
+
68
+ opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
69
+ options[:separate] = p || ''
70
+ }
71
+
72
+ opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
73
+ options[:lingo] = true
74
+ }
75
+
76
+ opts.separator ' '
77
+ opts.separator ' * Checking pairs'
78
+ opts.separator ' '
79
+
80
+ opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
81
+ options[:check] = true
82
+ }
83
+
84
+ opts.on('-f', '--failed', 'Print only failed checks') {
85
+ options[:failed_only] = true
86
+ }
87
+
88
+ opts.on('-a', '--align', 'Align check results') {
89
+ options[:align] = true
90
+ }
91
+
92
+ opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
93
+ options[:adjust_coeff] = true
94
+ }
95
+
96
+ opts.separator ' '
41
97
  opts.separator 'Generic options:'
42
98
 
43
99
  opts.on('-h', '--help', 'Print this help message and exit') {
@@ -57,33 +113,179 @@ end
57
113
 
58
114
  PerseusMatch::TokenSet.tokenize(file)
59
115
 
60
- phrases = File.readlines(file).map { |line| line.chomp }
116
+ skip_re = %r{\A\s*(?:#|\z)}o
117
+ phrases = []
118
+
119
+ File.open(file).each { |line|
120
+ phrases << line.chomp unless line =~ skip_re
121
+ }.close
122
+
123
+ pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
124
+ pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
125
+ list_options = { :minimal => options[:minimal] }
61
126
 
62
127
  threshold, count, count_all = options[:threshold], 0, 0
63
128
 
64
- time = Benchmark.realtime {
65
- if options[:sort]
66
- require 'pp'
129
+ action = if options[:check]
130
+ require 'fastercsv'
67
131
 
68
- pp PerseusMatch::Cluster.new(phrases).sort { |pm|
69
- if pm.similarity >= threshold
70
- [pm.target, pm.distance, pm.similarity]
71
- count += 1
72
- end
73
- count_all += 1
74
- }.compact
132
+ format = if options[:align]
133
+ require 'jcode'
134
+
135
+ width = phrases.max(:jlength) + 3
136
+
137
+ lambda { |line, res|
138
+ "#{line} #{'.' * (width - line.jlength)} [#{res}]"
139
+ }
75
140
  else
76
- PerseusMatch::List.pair(phrases) { |pm|
77
- if pm.similarity >= threshold
78
- p [pm.phrase, pm.target, pm.distance, pm.similarity]
141
+ lambda { |line, res|
142
+ "#{line} [#{res}]"
143
+ }
144
+ end
145
+
146
+ phrases.sort! if options[:sort]
147
+ phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
148
+
149
+ global_threshold = options[:threshold]
150
+ failed_only = options[:failed_only]
151
+ collect_stats = options[:stats]
152
+ adjust_coeff = options[:adjust_coeff]
153
+
154
+ _action = lambda { |*args|
155
+ pm_options[:default_coeff] = args.first unless args.empty?
156
+
157
+ count, count_all = 0, 0
158
+ positives = negatives = false_positives = false_negatives = 0.0
159
+
160
+ phrases.each { |line, spec|
161
+ phrase, target, threshold, operator, _ = *spec
162
+
163
+ threshold ||= global_threshold
164
+ operator ||= '>'
165
+ assign = operator =~ />/
166
+
167
+ begin
168
+ PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
169
+
79
170
  count += 1
171
+ assign ? positives += 1 : negatives += 1
172
+
173
+ puts format[line, 'OK'] unless adjust_coeff || failed_only
174
+ rescue PerseusMatch::CheckFailedError => err
175
+ assign ? false_negatives += 1 : false_positives += 1
176
+
177
+ puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
80
178
  end
81
179
  count_all += 1
82
180
  }
181
+
182
+ divide = lambda { |numerator, denominator|
183
+ denominator == 0 ? 0 : numerator / denominator
184
+ }
185
+
186
+ if collect_stats || adjust_coeff
187
+ error = divide[
188
+ false_positives + false_negatives,
189
+ positives + negatives + false_positives + false_negatives
190
+ ]
191
+ end
192
+
193
+ if collect_stats
194
+ recall = divide[positives, positives + false_negatives]
195
+ precision = divide[positives, positives + false_positives]
196
+ f1 = divide[2 * recall * precision, recall + precision]
197
+
198
+ warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
199
+ recall * 100, precision * 100, f1, error
200
+ ]
201
+ end
202
+
203
+ error if adjust_coeff
204
+ }
205
+
206
+ if adjust_coeff
207
+ lambda {
208
+ step, max = 1, 100
209
+
210
+ start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
211
+ start_err = _action[start_coeff]
212
+
213
+ previous_coeff = next_coeff = start_coeff
214
+ previous_err = next_err = start_err
215
+
216
+ max.times { break if (previous_err = _action[previous_coeff -= step]) != start_err }
217
+ max.times { break if (next_err = _action[next_coeff += step]) != start_err }
218
+
219
+ best_err = [start_err, previous_err, next_err].min
220
+
221
+ if best_err == start_err
222
+ best_coeff = start_coeff
223
+ else
224
+ if best_err == previous_err
225
+ max.times {
226
+ break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
227
+ previous_err, previous_coeff = current_err, current_coeff
228
+ }
229
+
230
+ best_err, best_coeff = previous_err, previous_coeff
231
+ else
232
+ max.times {
233
+ break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
234
+ next_err, next_coeff = current_err, current_coeff
235
+ }
236
+
237
+ best_err, best_coeff = next_err, next_coeff
238
+ end
239
+ end
240
+
241
+ puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
242
+ }
243
+ else
244
+ _action
83
245
  end
84
- }
246
+ else
247
+ format =
248
+ options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
249
+ options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
250
+ lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
251
+
252
+ if options[:sort]
253
+ require 'pp'
254
+
255
+ lambda {
256
+ pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
257
+ if pm.similarity >= threshold
258
+ res = format[pm]
259
+ count += 1
260
+ end
261
+ count_all += 1
262
+ res
263
+ }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
264
+ }
265
+ else
266
+ lambda {
267
+ separator, previous_phrase = options[:separate], nil
268
+
269
+ PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
270
+ if separator && pm.phrase != previous_phrase ||= pm.phrase
271
+ puts separator
272
+ previous_phrase = pm.phrase
273
+ end
274
+
275
+ if pm.similarity >= threshold
276
+ puts format[pm]
277
+ count += 1
278
+ end
279
+
280
+ count_all += 1
281
+ }
282
+ }
283
+ end
284
+ end
85
285
 
86
286
  if options[:stats]
287
+ time = Benchmark.realtime(&action)
288
+
87
289
  hms, x, y = time.to_hms(2), time / count, time / count_all
88
290
 
89
291
  precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
@@ -91,4 +293,6 @@ if options[:stats]
91
293
  warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
92
294
  phrases.size, count, count_all, hms, x, y
93
295
  ]
296
+ else
297
+ action.call
94
298
  end
@@ -30,10 +30,10 @@ class PerseusMatch
30
30
 
31
31
  class Cluster < Hash
32
32
 
33
- def initialize(phrases = [])
33
+ def initialize(phrases = [], pm_options = {}, list_options = {})
34
34
  super() { |h, k| h[k] = [] }
35
35
 
36
- List.new(phrases).each { |pm| add(pm) }
36
+ List.pair(phrases, pm_options, list_options) { |pm| add(pm) }
37
37
  end
38
38
 
39
39
  def add(pm)
@@ -42,7 +42,7 @@ class PerseusMatch
42
42
 
43
43
  alias_method :<<, :add
44
44
 
45
- def sort_by(attribute, *args, &block)
45
+ def sort_by(attribute, *args)
46
46
  options = args.last.is_a?(Hash) ? args.pop : {}
47
47
 
48
48
  _ = map { |phrase, matches|
@@ -63,15 +63,15 @@ class PerseusMatch
63
63
  lambda { |match| res[match] < threshold } :
64
64
  lambda { |match| res[match] > threshold }
65
65
 
66
- matches.reject! { |match| condition[match] }
66
+ matches.reject!(&condition)
67
67
  end
68
68
 
69
69
  if limit = options[:limit]
70
- matches.slice!(limit..-1)
70
+ matches.slice!(limit..-1) if matches.size > limit
71
71
  end
72
72
 
73
73
  # transform entries if so requested
74
- matches.map!(&block) if block
74
+ matches.map! { |match| yield(match) } if block_given?
75
75
 
76
76
  [phrase, matches]
77
77
  }.sort
@@ -79,8 +79,9 @@ class PerseusMatch
79
79
  _ # rcov hack :-(
80
80
  end
81
81
 
82
- def sort(options = {}, &block)
83
- sort_by(:similarity, options.delete(:coeff), options, &block)
82
+ def sort(options = {})
83
+ args = [:similarity, options.delete(:coeff), options]
84
+ block_given? ? sort_by(*args) { |*a| yield(*a) } : sort_by(*args)
84
85
  end
85
86
 
86
87
  def rank(options = {})
@@ -32,24 +32,46 @@ class PerseusMatch
32
32
 
33
33
  class << self
34
34
 
35
- def pair(phrases)
35
+ def pair(phrases, pm_options = {}, list_options = {})
36
36
  phrases.uniq!
37
37
 
38
- phrases.each { |phrase|
39
- phrases.each { |target|
40
- yield PerseusMatch.new(phrase, target)
38
+ pairs = [] unless block_given?
39
+
40
+ unless list_options[:minimal]
41
+ # => pairs.size = phrases.size ** 2
42
+
43
+ phrases.each { |phrase|
44
+ phrases.each { |target|
45
+ pm = PerseusMatch.new(phrase, target, pm_options)
46
+ block_given? ? yield(pm) : pairs << pm
47
+ }
48
+ }
49
+ else
50
+ # => pairs.size = (phrases.size ** 2 - phrases.size) / 2
51
+
52
+ size = phrases.size
53
+
54
+ 1.upto(size) { |i|
55
+ phrase = phrases[i - 1]
56
+
57
+ i.upto(size - 1) { |j|
58
+ pm = PerseusMatch.new(phrase, phrases[j], pm_options)
59
+ block_given? ? yield(pm) : pairs << pm
60
+ }
41
61
  }
42
- }
62
+ end
63
+
64
+ pairs || phrases
43
65
  end
44
66
 
45
67
  end
46
68
 
47
- alias_method :add, :push
48
-
49
- def initialize(phrases = [])
50
- self.class.pair(phrases) { |pm| add(pm) }
69
+ def initialize(phrases = [], pm_options = {}, list_options = {})
70
+ self.class.pair(phrases, pm_options, list_options) { |pm| add(pm) }
51
71
  end
52
72
 
73
+ alias_method :add, :push
74
+
53
75
  end
54
76
 
55
77
  end
@@ -28,44 +28,52 @@
28
28
 
29
29
  $KCODE = 'u'
30
30
 
31
- LINGO_BASE = '/home/jw/devel/lingo/trunk'
32
-
33
- LINGO_CONFIG = {
34
- 'meeting' => {
35
- 'attendees' => [
36
- { 'textreader' => { 'files'=> 'STDIN' } },
37
- { 'tokenizer' => { } },
38
- { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
39
- { 'decomposer' => { 'source' => 'sys-dic' } },
40
- { 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
41
- { 'synonymer' => { 'source' => 'sys-syn', 'out' => 'syn', 'skip'=>'?,t' } },
42
- { 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } }
43
- ]
44
- }
45
- }
46
-
47
- require 'tempfile'
31
+ require 'pathname'
32
+ require 'rbconfig'
48
33
  require 'yaml'
49
34
 
50
- # use enhanced Tempfile#make_tmpname, as of r13631
51
- if RUBY_RELEASE_DATE < '2007-10-05'
52
- class Tempfile
53
-
54
- def make_tmpname(basename, n)
55
- case basename
56
- when Array
57
- prefix, suffix = *basename
58
- else
59
- prefix, suffix = basename, ''
60
- end
35
+ require 'rubygems'
36
+ require 'backports/tempfile'
37
+ require 'nuggets/tempfile/open'
38
+ require 'nuggets/util/i18n'
61
39
 
62
- t = Time.now.strftime("%Y%m%d")
63
- path = "#{prefix}#{t}-#{$$}-#{rand(0x100000000).to_s(36)}-#{n}#{suffix}"
64
- end
40
+ begin
41
+ require 'text/soundex'
42
+ rescue LoadError
43
+ warn "could not load the Text gem -- soundex functionality will not be available"
44
+ end
65
45
 
66
- end
46
+ LINGO_BASE = ENV['PM_LINGO_BASE'] || (
47
+ File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
48
+ )
49
+
50
+ LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
51
+ warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
52
+
53
+ lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
54
+ YAML.load_file(file)
55
+ else
56
+ warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
57
+
58
+ {
59
+ 'meeting' => {
60
+ 'attendees' => [
61
+ { 'tokenizer' => { } },
62
+ { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
63
+ { 'decomposer' => { 'source' => 'sys-dic' } },
64
+ { 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
65
+ { 'synonymer' => { 'source' => 'sys-syn', 'skip' => '?,t' } },
66
+ ]
67
+ }
68
+ }
67
69
  end
68
-
70
+
71
+ lingo_config['meeting']['attendees'].
72
+ unshift({ 'textreader' => { 'files'=> 'STDIN' } }).
73
+ push({ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } })
74
+
75
+ LINGO_CONFIG = lingo_config
76
+
69
77
  class PerseusMatch
70
78
 
71
79
  class TokenSet < Array
@@ -73,10 +81,8 @@ class PerseusMatch
73
81
  def self.tokenize(form)
74
82
  return @tokens[form] if @tokens
75
83
 
76
- @_tokens = {}
77
- @tokens = Hash.new { |h, k| h[k] = new(
78
- k, @_tokens.has_key?(k) ? @_tokens[k] :
79
- k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
84
+ @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
85
+ k, (@_tokens[k] || []) | k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
80
86
  )}
81
87
 
82
88
  parse = lambda { |x|
@@ -85,8 +91,11 @@ class PerseusMatch
85
91
  when /<(.*?)\s=\s\[(.*)\]>/
86
92
  a, b = $1, $2
87
93
  @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
88
- #when /<(.*)>/, /:(.*):/
89
- # # ignore
94
+ when /<(.*)>/, /:(.*):/
95
+ a, b = $1, $1.dup
96
+ @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
97
+
98
+ warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
90
99
  end
91
100
  }
92
101
  }
@@ -95,29 +104,32 @@ class PerseusMatch
95
104
  File.open(t) { |f| parse[f] }
96
105
  @tokens[form]
97
106
  else
98
- cfg = Tempfile.new(['perseus_match_lingo', '.cfg'])
99
- YAML.dump(LINGO_CONFIG, cfg)
100
- cfg.close
107
+ raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
108
+
109
+ cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
110
+ YAML.dump(LINGO_CONFIG, t)
111
+ }
101
112
 
102
- file = form[0] == ?/ ? form : File.join(Dir.pwd, form)
113
+ file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
103
114
 
104
115
  unless File.file?(file) && File.readable?(file)
105
- temp = Tempfile.new('perseus_match_temp')
106
- temp.puts form
107
- temp.close
116
+ temp = Tempfile.open('perseus_match_temp') { |t|
117
+ t.puts form
118
+ }
108
119
 
109
120
  file = temp.path
110
121
  end
111
122
 
112
- Dir.chdir(LINGO_BASE) { parse[%x{
113
- ./lingo.rb -c #{cfg.path} < #{file}
114
- }] }
115
-
116
- cfg.unlink
123
+ begin
124
+ Dir.chdir(LINGO_BASE) { parse[%x{
125
+ #{Config::CONFIG['ruby_install_name']} lingo.rb -c "#{cfg.path}" < "#{file}"
126
+ }] }
127
+ ensure
128
+ cfg.unlink
129
+ temp.unlink if temp
130
+ end
117
131
 
118
132
  if temp
119
- temp.unlink
120
-
121
133
  tokens, @tokens = @tokens[form], nil
122
134
  tokens
123
135
  else
@@ -138,42 +150,40 @@ class PerseusMatch
138
150
  end
139
151
 
140
152
  def distance(other)
141
- distance, index, max = xor(other).size, -1, size
153
+ tokens1, tokens2 = tokens, other.tokens
154
+ size1, size2 = tokens1.size, tokens2.size
142
155
 
143
- intersect(other).each { |token|
144
- while current = other.tokens[index += 1] and current != token
145
- distance += 1
156
+ return size2 if tokens1.empty?
157
+ return size1 if tokens2.empty?
146
158
 
147
- break if index > max
148
- end
149
- }
159
+ distance, costs = nil, (0..size2).to_a
150
160
 
151
- distance
152
- end
161
+ 0.upto(size1 - 1) { |index1|
162
+ token1, cost = tokens1[index1], index1 + 1
153
163
 
154
- def tokens(wc = true)
155
- wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
156
- token.sub(%r{[/|].*?\z}, '')
157
- }
158
- end
164
+ 0.upto(size2 - 1) { |index2|
165
+ penalty = token1 == tokens2[index2] ? 0 : 1
159
166
 
160
- def &(other)
161
- tokens & other.tokens
162
- end
167
+ # rcov hack :-(
168
+ _ = [
169
+ costs[index2 + 1] + 1, # insertion
170
+ cost + 1, # deletion
171
+ costs[index2] + penalty # substitution
172
+ ]
173
+ distance = _.min
163
174
 
164
- def |(other)
165
- tokens | other.tokens
166
- end
175
+ costs[index2], cost = cost, distance
176
+ }
167
177
 
168
- def intersect(other)
169
- (self & other).inject([]) { |memo, token|
170
- memo + [token] * [count(token), other.count(token)].max
178
+ costs[size2] = distance
171
179
  }
180
+
181
+ distance + 1 # > 0 !?!
172
182
  end
173
183
 
174
- def xor(other)
175
- ((self | other) - (self & other)).inject([]) { |memo, token|
176
- memo + [token] * (count(token) + other.count(token))
184
+ def tokens(wc = true)
185
+ wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
186
+ token.sub(%r{[/|].*?\z}, '')
177
187
  }
178
188
  end
179
189
 
@@ -186,26 +196,31 @@ class PerseusMatch
186
196
  end
187
197
 
188
198
  def incl(*wc)
189
- (@incl ||= {})[wc = [*wc].compact] ||= map { |tokens|
190
- tokens.reject { |token| !match?(token, wc) }
199
+ (@incl ||= {})[wc = [*wc].compact] ||= select { |token|
200
+ match?(token, wc)
191
201
  }.to_token_set(form)
192
202
  end
193
203
 
194
204
  def excl(*wc)
195
- (@excl ||= {})[wc = [*wc].compact] ||= map { |tokens|
196
- tokens.reject { |token| match?(token, wc) }
205
+ (@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
206
+ match?(token, wc)
197
207
  }.to_token_set(form)
198
208
  end
199
209
 
200
- def count(token)
201
- counts[token]
210
+ def soundex
211
+ raise "soundex functionality not available" unless defined?(Text::Soundex)
212
+
213
+ @soundex ||= map { |token|
214
+ token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
215
+ }.to_token_set(form)
202
216
  end
203
217
 
204
- def counts
205
- @counts ||= tokens.inject(Hash.new(0)) { |counts, token|
206
- counts[token] += 1
207
- counts
208
- }
218
+ def soundex!
219
+ replace soundex
220
+ end
221
+
222
+ def eql?(other)
223
+ tokens == other.tokens && form == other.form
209
224
  end
210
225
 
211
226
  def inspect
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 2
7
+ TINY = 3
8
8
 
9
9
  class << self
10
10
 
data/lib/perseus_match.rb CHANGED
@@ -38,35 +38,53 @@ class PerseusMatch
38
38
 
39
39
  DEFAULT_COEFF = 20
40
40
 
41
- DISTANCE_SPEC = {
42
- {} => 1,
43
- { :excl => %w[a t] } => 1,
44
- { :incl => 's' } => 2,
45
- { :incl => 'y' } => 4,
46
- { :sort => true } => 4
47
- }
41
+ DISTANCE_SPEC = [ # {
42
+ [{}, 1], # {} => 1,
43
+ [{ :excl => %w[a t] }, 2], # { :excl => %w[a t] } => 1,
44
+ [{ :incl => 's' }, 3], # { :incl => 's' } => 2,
45
+ [{ :incl => 'y' }, 4], # { :incl => 'y' } => 4,
46
+ [{ :sort => true }, 4], # { :sort => true } => 4,
47
+ [{ :soundex => true }, 4] # { :soundex => true } => 8
48
+ ] # }
48
49
 
49
50
  class << self
50
51
 
51
- def match(phrases)
52
- List.new(phrases)
52
+ def distance(*args)
53
+ new(*args).distance
53
54
  end
54
55
 
55
- def cluster(phrases, options = {})
56
- Cluster.new(phrases).rank(options)
56
+ def match(phrases, pm_options = {})
57
+ List.new(phrases, pm_options)
58
+ end
59
+
60
+ def cluster(phrases, options = {}, pm_options = {})
61
+ Cluster.new(phrases, pm_options).rank(options)
62
+ end
63
+
64
+ def check(*args)
65
+ check!(*args)
66
+ rescue CheckFailedError
67
+ false
68
+ end
69
+
70
+ def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
71
+ value = new(phrase, target, pm_options).send(attribute)
72
+ value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
57
73
  end
58
74
 
59
75
  end
60
76
 
61
- attr_reader :phrase, :target, :distance_spec, :default_coeff
77
+ attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
62
78
 
63
79
  def initialize(phrase, target, options = {})
64
- @phrase = phrase
65
- @target = target
80
+ @phrase = phrase.to_s
81
+ @target = target.to_s
66
82
 
67
83
  @default_coeff = options[:default_coeff] || DEFAULT_COEFF
68
84
  @distance_spec = options[:distance_spec] || DISTANCE_SPEC
69
85
 
86
+ @verbose = options[:verbose]
87
+
70
88
  @similarity = {}
71
89
  end
72
90
 
@@ -97,7 +115,7 @@ class PerseusMatch
97
115
 
98
116
  def calculate_distance
99
117
  return Infinity if phrase_tokens.disjoint?(target_tokens)
100
- return 0 if phrase_tokens == target_tokens
118
+ return 0 if phrase_tokens.eql?(target_tokens)
101
119
 
102
120
  distance_spec.inject(0) { |distance, (options, weight)|
103
121
  distance + token_distance(options) * weight
@@ -105,19 +123,47 @@ class PerseusMatch
105
123
  end
106
124
 
107
125
  def token_distance(options = {})
108
- phrase_tokens = self.phrase_tokens.inclexcl(options)
109
- target_tokens = self.target_tokens.inclexcl(options)
126
+ tokens1 = phrase_tokens.inclexcl(options)
127
+ tokens2 = target_tokens.inclexcl(options)
110
128
 
111
129
  if options[:sort]
112
- phrase_tokens.sort!
113
- target_tokens.sort!
130
+ tokens1 = tokens1.sort
131
+ tokens2 = tokens2.sort
114
132
  end
115
133
 
116
- (phrase_tokens.distance(target_tokens) + target_tokens.distance(phrase_tokens)) / 2.0
134
+ if options[:soundex]
135
+ tokens1 = tokens1.soundex
136
+ tokens2 = tokens2.soundex
137
+ end
138
+
139
+ distance = tokens1.distance(tokens2)
140
+
141
+ warn <<-EOT if verbose
142
+ #{options.inspect}:
143
+ #{tokens1.inspect}
144
+ #{tokens2.inspect}
145
+ => #{distance}
146
+ EOT
147
+
148
+ distance
117
149
  end
118
150
 
119
151
  def total_weight
120
- distance_spec.values.inject(0.0) { |total, weight| total + weight }
152
+ @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
153
+ end
154
+
155
+ class CheckFailedError < StandardError
156
+
157
+ attr_reader :value, :threshold, :operator
158
+
159
+ def initialize(value, threshold, operator)
160
+ @value, @threshold, @operator = value, threshold, operator
161
+ end
162
+
163
+ def to_s
164
+ "FAILED: #{value} #{operator} #{threshold}"
165
+ end
166
+
121
167
  end
122
168
 
123
169
  end
@@ -0,0 +1,45 @@
1
+ describe PerseusMatch::Cluster do
2
+
3
+ it 'should accept limit option in sort_by' do
4
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :limit => 1).all? { |phrase, matches|
5
+ matches.size.should == 1
6
+ matches.size.should == matches.nitems
7
+ }
8
+ end
9
+
10
+ it 'should accept threshold option in sort_by (1a)' do
11
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
12
+ matches.size.should be_zero
13
+ matches.size.should == matches.nitems
14
+ }
15
+ end
16
+
17
+ it 'should accept threshold option in sort_by (1b)' do
18
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0).all? { |phrase, matches|
19
+ matches.size.should == 2
20
+ matches.size.should == matches.nitems
21
+ }
22
+ end
23
+
24
+ it 'should accept threshold option in sort_by (2)' do
25
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'c').all? { |phrase, matches|
26
+ matches.size.should == 1
27
+ matches.size.should == matches.nitems
28
+ }
29
+ end
30
+
31
+ it 'should accept both limit and threshold options in sort_by (1)' do
32
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'z', :limit => 1).all? { |phrase, matches|
33
+ matches.size.should == 1
34
+ matches.size.should == matches.nitems
35
+ }
36
+ end
37
+
38
+ it 'should accept both limit and threshold options in sort_by (2)' do
39
+ PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'a', :limit => 1).all? { |phrase, matches|
40
+ matches.size.should be_zero
41
+ matches.size.should == matches.nitems
42
+ }
43
+ end
44
+
45
+ end if LINGO_FOUND
@@ -0,0 +1,16 @@
1
+ describe PerseusMatch::List, '::pair' do
2
+
3
+ before :all do
4
+ @phrases = %w[foo bar baz]
5
+ @size = @phrases.size
6
+ end
7
+
8
+ it 'should produce full list of pairs with correct size' do
9
+ PerseusMatch::List.pair(@phrases).size.should == @size ** 2
10
+ end
11
+
12
+ it 'should produce minimal list of pairs with correct size' do
13
+ PerseusMatch::List.pair(@phrases, {}, :minimal => true).size.should == (@size ** 2 - @size) / 2
14
+ end
15
+
16
+ end
@@ -0,0 +1,65 @@
1
+ describe PerseusMatch::TokenSet, ' with lingo' do
2
+
3
+ before :each do
4
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
5
+ end
6
+
7
+ it 'should tokenize a string' do
8
+ PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
9
+ end
10
+
11
+ it 'should report strictly equal TokenSets as ==' do
12
+ PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
13
+ end
14
+
15
+ it 'should report strictly equal TokenSets as eql' do
16
+ PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
17
+ end
18
+
19
+ it 'should report slightly equal TokenSets as ==' do
20
+ PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
21
+ end
22
+
23
+ it 'should *not* report slightly equal TokenSets as eql' do
24
+ PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
25
+ end
26
+
27
+ it 'should include form in inspect' do
28
+ PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
29
+ end
30
+
31
+ end if LINGO_FOUND
32
+
33
+ describe PerseusMatch::TokenSet, ' without lingo' do
34
+
35
+ before :each do
36
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
37
+ end
38
+
39
+ it 'should take a prepared file for tokenization' do
40
+ # prevent lingo from being used
41
+ lingo_base = LINGO_BASE.dup
42
+ LINGO_BASE.replace('')
43
+
44
+ temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
45
+ t.puts *%w[<foo|?> <bar|?>]
46
+ }
47
+
48
+ path = temp.path
49
+ link = 'perseus.tokens'
50
+
51
+ Dir.chdir(File.dirname(path)) {
52
+ File.symlink(path, link)
53
+
54
+ PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
55
+
56
+ File.unlink(link)
57
+ }
58
+
59
+ temp.unlink
60
+
61
+ # reset lingo base
62
+ LINGO_BASE.replace(lingo_base)
63
+ end
64
+
65
+ end
@@ -0,0 +1,168 @@
1
+ require 'rubygems'
2
+ require 'nuggets/tempfile/open'
3
+ require 'nuggets/util/i18n'
4
+
5
# Similarity examples for PerseusMatch; only registered when LINGO_FOUND is
# truthy. All examples share the matchings computed once in before(:all).
if LINGO_FOUND
  describe PerseusMatch do

    before(:all) do
      # Pairs within this group are expected to score above 0.9.
      @highly_similar = [
        'Anbetung der Könige',
        'Die Anbetung der Könige'
      ] # ok

      # Pairs within this group are expected to score above 0.8.
      @similar = [
        # @highly_similar + ...
        'Die Anbetung der Heiligen Drei Könige',
        'dIE AnBeTuNg der heILIGen dREI KÖniGE'
      ] # ok

      @unfortunately_similar = [
        # @similar + ...
        'Die Die Die Anbetung der Könige',
        'Die Könige der Anbetung',
        'Königsanbetung hoch drei'
      ] # *not* ok -- eventually try to drop these below the threshold

      @somewhat_similar = @highly_similar + @similar + @unfortunately_similar

      unrelated = [
        'Drei mal drei macht sechs',
        'Das Ende dieses Blödsinns',
        ''
      ]
      phrases = @somewhat_similar + unrelated

      # Write one phrase per line to a temp file and hand its path to the
      # tokenizer before computing the matchings.
      temp = Tempfile.open('perseus_match_spec_temp') { |io|
        io.puts(*phrases)
      }

      PerseusMatch::TokenSet.tokenize(temp.path)

      temp.unlink

      @matchings = PerseusMatch.match(phrases)
    end

    it 'should identify identical (non-empty) strings as identical' do
      @matchings.each do |matching|
        next if matching.phrase.empty?
        next unless matching.phrase == matching.target

        inform_on_error(matching) { matching.similarity.should == 1.0 }
      end
    end

    it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
      @matchings.each do |matching|
        next if matching.phrase.empty?
        next unless matching.phrase.replace_diacritics.downcase == matching.target.replace_diacritics.downcase

        inform_on_error(matching) { matching.similarity.should > 0.95 }
      end
    end

    it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
      @matchings.each do |matching|
        next if matching.phrase.empty?
        next unless matching.phrase.replace_diacritics.downcase != matching.target.replace_diacritics.downcase

        inform_on_error(matching) { matching.similarity.should < 0.98 }
      end
    end

    it 'should identify disjunct (non-empty) strings as disjunct' do
      @matchings.each do |matching|
        next if matching.phrase.empty?
        next unless matching.phrase_tokens.disjoint?(matching.target_tokens)

        inform_on_error(matching) { matching.similarity.should == 0.0 }
      end
    end

    it 'should identify empty string as disjunct with anything, even with itself' do
      @matchings.each do |matching|
        next unless matching.phrase.empty? || matching.target.empty?

        inform_on_error(matching) { matching.similarity.should == 0.0 }
      end
    end

    it 'should identify certain strings as highly similar (1)' do
      @matchings.each do |matching|
        next unless @highly_similar.include?(matching.phrase) && @highly_similar.include?(matching.target)

        inform_on_error(matching) { matching.similarity.should > 0.9 }
      end
    end

    it 'should identify certain strings as highly similar (2)' do
      @highly_similar.each do |phrase|
        @highly_similar.each do |target|
          pair = [phrase, target]
          inform_on_error(pair) { PerseusMatch.check(phrase, target, 0.9).should be_true }
        end
      end
    end

    it 'should identify certain strings as similar (1)' do
      @matchings.each do |matching|
        next unless @similar.include?(matching.phrase) && @similar.include?(matching.target)

        inform_on_error(matching) { matching.similarity.should > 0.8 }
      end
    end

    it 'should identify certain strings as similar (2)' do
      @similar.each do |phrase|
        @similar.each do |target|
          pair = [phrase, target]
          inform_on_error(pair) { PerseusMatch.check(phrase, target, 0.8).should be_true }
        end
      end
    end

    it 'should *not* identify other strings as similar (1)' do
      @matchings.each do |matching|
        next unless @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)

        inform_on_error(matching) { matching.similarity.should_not > 0.8 }
      end
    end

    it 'should *not* identify other strings as similar (2)' do
      @matchings.each do |matching|
        next unless @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)

        inform_on_error(matching) { PerseusMatch.check(matching.phrase, matching.target, 0.8).should be_false }
      end
    end

    it 'should be symmetrical' do
      # Similarity seen so far, keyed by [phrase, target]; when the mirrored
      # pair shows up later, both values are compared.
      seen = {}

      @matchings.each do |matching|
        mirrored = seen[[matching.target, matching.phrase]]

        if mirrored
          inform_on_error(matching) { mirrored.should == matching.similarity }
        else
          seen[[matching.phrase, matching.target]] = matching.similarity
        end
      end
    end

    it 'should calculate pair distance' do
      distance = PerseusMatch.distance('foo', 'bar')
      distance.class.should < Numeric
    end

    it 'should be clusterable' do
      clustered = PerseusMatch.cluster(@somewhat_similar)
      clustered.should be_an_instance_of(Array)
    end

    it 'should be checkable (1)' do
      PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
    end

    it 'should be checkable (2)' do
      # The inner rescue inspects the error message, then re-raises so the
      # outer matcher still sees the CheckFailedError.
      failing_check = lambda do
        begin
          PerseusMatch.check!('foo', 'bar', 0, :>)
        rescue PerseusMatch::CheckFailedError => error
          error.to_s.should =~ /0/
          raise error
        end
      end

      failing_check.should raise_error(PerseusMatch::CheckFailedError)
    end

  end
end
@@ -0,0 +1,18 @@
1
# Load the library under test unless a PerseusMatch constant is already
# defined: put the sibling lib/ directory on the load path, then require it.
unless Object.const_defined?(:PerseusMatch)
  lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
  $LOAD_PATH << lib_dir

  require 'perseus_match'
end
5
+
6
# Runs the given block and, when an expectation inside it fails, prints the
# supplied context objects before re-raising the failure unchanged.
#
# args:: objects to print (via +p+, framed by blank lines) when the block
#        raises Spec::Expectations::ExpectationNotMetError; nothing is
#        printed when no arguments were given.
#
# Returns the block's value when no expectation fails. Exceptions of any
# other class are not rescued and propagate without output.
def inform_on_error(*args)
  yield
rescue Spec::Expectations::ExpectationNotMetError
  unless args.empty?
    puts
    p(*args)
    puts
  end

  # Bare raise re-raises the exception currently being handled.
  raise
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,9 +9,19 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-15 00:00:00 +02:00
12
+ date: 2008-12-09 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: ruby-backports
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
15
25
  - !ruby/object:Gem::Dependency
16
26
  name: ruby-nuggets
17
27
  type: :runtime
@@ -20,7 +30,7 @@ dependencies:
20
30
  requirements:
21
31
  - - ">="
22
32
  - !ruby/object:Gem::Version
23
- version: 0.3.0
33
+ version: 0.4.0
24
34
  version:
25
35
  description: Fuzzy string matching based on linguistic analysis
26
36
  email: jens.wille@uni-koeln.de
@@ -33,29 +43,35 @@ extra_rdoc_files:
33
43
  - ChangeLog
34
44
  - README
35
45
  files:
36
- - lib/perseus_match.rb
46
+ - lib/perseus_match/list.rb
37
47
  - lib/perseus_match/version.rb
38
48
  - lib/perseus_match/token_set.rb
39
- - lib/perseus_match/list.rb
40
49
  - lib/perseus_match/cluster.rb
50
+ - lib/perseus_match.rb
41
51
  - bin/perseus_match
52
+ - Rakefile
42
53
  - COPYING
43
- - README
44
54
  - ChangeLog
45
- - Rakefile
55
+ - LINGO_BASE
56
+ - README
57
+ - spec/spec_helper.rb
58
+ - spec/perseus_match/list_spec.rb
59
+ - spec/perseus_match/cluster_spec.rb
60
+ - spec/perseus_match/token_set_spec.rb
61
+ - spec/perseus_match_spec.rb
46
62
  has_rdoc: true
47
63
  homepage: http://prometheus.rubyforge.org/perseus_match
48
64
  post_install_message:
49
65
  rdoc_options:
50
- - --charset
51
- - UTF-8
66
+ - --line-numbers
67
+ - --inline-source
52
68
  - --title
53
69
  - perseus_match Application documentation
54
70
  - --main
55
71
  - README
72
+ - --charset
73
+ - UTF-8
56
74
  - --all
57
- - --line-numbers
58
- - --inline-source
59
75
  require_paths:
60
76
  - lib
61
77
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -73,7 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
73
89
  requirements: []
74
90
 
75
91
  rubyforge_project: prometheus
76
- rubygems_version: 1.2.0
92
+ rubygems_version: 1.3.1
77
93
  signing_key:
78
94
  specification_version: 2
79
95
  summary: Fuzzy string matching based on linguistic analysis