perseus_match 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README +1 -1
- data/Rakefile +1 -1
- data/bin/perseus_match +22 -8
- data/lib/perseus_match/core_ext.rb +14 -0
- data/lib/perseus_match/token.rb +43 -0
- data/lib/perseus_match/token_set.rb +171 -116
- data/lib/perseus_match/version.rb +1 -1
- data/lib/perseus_match.rb +39 -13
- data/spec/perseus_match/token_set_spec.rb +80 -56
- data/spec/perseus_match/token_spec.rb +23 -0
- data/spec/perseus_match_spec.rb +3 -8
- metadata +15 -2
data/README
CHANGED
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ begin
|
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
16
|
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
|
17
|
-
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
|
17
|
+
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0'], ['unicode', '>= 0.1.1']]
|
18
18
|
}
|
19
19
|
}}
|
20
20
|
rescue LoadError
|
data/bin/perseus_match
CHANGED
@@ -130,7 +130,7 @@ end
|
|
130
130
|
|
131
131
|
unknowns = Set.new if options[:unknowns]
|
132
132
|
|
133
|
-
PerseusMatch
|
133
|
+
PerseusMatch.tokenize(file, unknowns || !options[:silent])
|
134
134
|
|
135
135
|
if unknowns
|
136
136
|
File.open(options[:unknowns], 'w') { |f|
|
@@ -154,7 +154,12 @@ list_options = { :minimal => options[:minimal] }
|
|
154
154
|
threshold, count, count_all = options[:threshold], 0, 0
|
155
155
|
|
156
156
|
action = if options[:check]
|
157
|
-
require '
|
157
|
+
require 'csv'
|
158
|
+
|
159
|
+
if CSV.const_defined?(:Reader)
|
160
|
+
require 'fastercsv'
|
161
|
+
CSV = FasterCSV
|
162
|
+
end
|
158
163
|
|
159
164
|
format = if options[:align]
|
160
165
|
require 'jcode'
|
@@ -184,23 +189,23 @@ action = if options[:check]
|
|
184
189
|
positives = negatives = false_positives = false_negatives = 0.0
|
185
190
|
|
186
191
|
phrases.each { |line|
|
187
|
-
phrase, target, threshold, operator, _ = *
|
192
|
+
phrase, target, threshold, operator, _ = *CSV.parse_line(line)
|
188
193
|
|
189
194
|
threshold ||= global_threshold
|
190
195
|
operator ||= '>'
|
191
196
|
assign = operator =~ />/ || operator == '=='
|
192
197
|
|
193
198
|
begin
|
194
|
-
PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
199
|
+
res = PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
195
200
|
|
196
201
|
count += 1
|
197
202
|
assign ? positives += 1 : negatives += 1
|
198
203
|
|
199
|
-
puts format[line,
|
204
|
+
puts format[line, "OK -- #{res.value} (#{res.pm.distance})"] unless adjust_coeff || failed_only
|
200
205
|
rescue PerseusMatch::CheckFailedError => err
|
201
206
|
assign ? false_negatives += 1 : false_positives += 1
|
202
207
|
|
203
|
-
puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
|
208
|
+
puts format[line, "FAILED -- #{err.value} (#{err.pm.distance})"] unless adjust_coeff
|
204
209
|
end
|
205
210
|
|
206
211
|
count_all += 1
|
@@ -222,8 +227,17 @@ action = if options[:check]
|
|
222
227
|
precision = divide[positives, positives + false_positives]
|
223
228
|
f1 = divide[2 * recall * precision, recall + precision]
|
224
229
|
|
225
|
-
|
226
|
-
|
230
|
+
error_all = divide[ # trivial: assign all
|
231
|
+
negatives + false_positives,
|
232
|
+
positives + negatives + false_positives + false_negatives
|
233
|
+
]
|
234
|
+
error_none = divide[ # trivial: assign none
|
235
|
+
positives + false_negatives,
|
236
|
+
positives + negatives + false_positives + false_negatives
|
237
|
+
]
|
238
|
+
|
239
|
+
stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f (ALL = %0.4f, NONE = %0.4f)' % [
|
240
|
+
recall * 100, precision * 100, f1, error, error_all, error_none
|
227
241
|
]
|
228
242
|
|
229
243
|
stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
|
@@ -0,0 +1,43 @@
|
|
1
|
+
class PerseusMatch

  # A token form paired with its word class ("wc"). The token itself is a
  # String whose content is the bare form, i.e. the input with any trailing
  # "/wc" or "|wc" suffix removed.
  class Token < String

    # Matches a trailing word class suffix introduced by "/" or "|" and
    # captures the word class itself.
    WC_RE = %r{[/|]([^/|]*)\z}

    # Wildcard word class: passing it to #match? makes every token match.
    ANY_WC = '*'.freeze

    attr_reader :form, :wc

    # form:: the token text, optionally ending in a "/wc" or "|wc" suffix
    # wc::   explicit word class; takes precedence over a suffix in +form+
    def initialize(form, wc = nil)
      @form = form.sub(WC_RE, '')

      # The capture belongs to the substitution above and is nil when
      # +form+ carried no word class suffix.
      @wc = wc || Regexp.last_match(1)

      super(@form)
    end

    # Returns true if this token's word class is among +wcs+ (a single word
    # class, an array of them, or nil), or if +wcs+ contains ANY_WC.
    def match?(wcs)
      wcs = [*wcs].compact
      wcs.include?(wc) || wcs.include?(ANY_WC)
    end

    # Returns true if the word class is "?".
    def unk?
      wc == '?'
    end

    # Loose equality: only the form is compared. Against another Token the
    # two forms are compared; against anything else (e.g. a plain String)
    # the form is compared with the object itself.
    def ==(other)
      other.is_a?(self.class) ? form == other.form : form == other
    end

    # Strict equality: same form *and* same word class. Objects that do not
    # expose a word class (such as plain Strings) are never eql? to a Token;
    # without the guard they would pass #== and then fail on the missing
    # +wc+ method.
    def eql?(other)
      other.respond_to?(:wc) && self == other && wc == other.wc
    end

    # The String representation of the form followed by "/" and the word
    # class.
    def inspect
      "#{super}/#{wc}"
    end

    alias_method :to_s, :inspect

  end

end
|
@@ -40,20 +40,32 @@ require 'nuggets/util/i18n'
|
|
40
40
|
begin
|
41
41
|
require 'text/soundex'
|
42
42
|
rescue LoadError
|
43
|
-
warn "
|
43
|
+
warn "Could not load the Text gem -- Soundex functionality will not be available"
|
44
44
|
end
|
45
45
|
|
46
46
|
LINGO_BASE = ENV['PM_LINGO_BASE'] || (
|
47
47
|
File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
|
48
48
|
)
|
49
49
|
|
50
|
-
LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
|
51
|
-
|
50
|
+
if LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
|
51
|
+
begin
|
52
|
+
require File.join(LINGO_BASE, 'lib', 'const')
|
53
|
+
rescue LoadError
|
54
|
+
end
|
55
|
+
else
|
56
|
+
warn "Lingo installation not found at #{LINGO_BASE} -- proceeding anyway"
|
57
|
+
end
|
58
|
+
|
59
|
+
unless Object.const_defined?(:PRINTABLE_CHAR)
|
60
|
+
PRINTABLE_CHAR = '[\w-]'
|
61
|
+
end
|
62
|
+
|
63
|
+
PRINTABLE_CHAR_RE = %r{(?:#{PRINTABLE_CHAR})+}
|
52
64
|
|
53
65
|
lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
|
54
66
|
YAML.load_file(file)
|
55
67
|
else
|
56
|
-
warn "
|
68
|
+
warn "Lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
|
57
69
|
|
58
70
|
{
|
59
71
|
'meeting' => {
|
@@ -78,30 +90,81 @@ class PerseusMatch
|
|
78
90
|
|
79
91
|
class TokenSet < Array
|
80
92
|
|
81
|
-
|
82
|
-
return @tokens[form] if @tokens
|
93
|
+
class << self
|
83
94
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
)
|
90
|
-
|
95
|
+
def tokenize(form, unknowns = false)
|
96
|
+
form.downcase!
|
97
|
+
return @tokens[form] if @tokens ||= nil
|
98
|
+
|
99
|
+
@_tokens = Hash.new
|
100
|
+
@tokens = Hash.new { |h, k| h[k] = new(k, @_tokens[k] || []) }
|
101
|
+
|
102
|
+
tokens_file = ENV['PM_TOKENS_FILE'] || 'perseus.tokens'
|
103
|
+
|
104
|
+
if File.readable?(tokens_file)
|
105
|
+
File.open(tokens_file) { |f| parse(f, unknowns, @_tokens) }
|
106
|
+
@tokens[form]
|
107
|
+
else
|
108
|
+
raise "Lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
|
109
|
+
|
110
|
+
cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
|
111
|
+
YAML.dump(LINGO_CONFIG, t)
|
112
|
+
}
|
113
|
+
|
114
|
+
file = file?(form) || begin
|
115
|
+
temp = Tempfile.open('perseus_match_temp') { |t| t.puts form }
|
116
|
+
temp.path
|
117
|
+
end
|
118
|
+
|
119
|
+
ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
|
120
|
+
|
121
|
+
if keep = ENV['PM_KEEP_TOKENS']
|
122
|
+
keep = File.expand_path(keep =~ /\A(?:1|y(?:es)?|true)\z/i ? tokens_file : keep)
|
123
|
+
end
|
124
|
+
|
125
|
+
begin
|
126
|
+
Dir.chdir(LINGO_BASE) {
|
127
|
+
tokens = %x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}
|
128
|
+
File.open(keep, 'w') { |f| f.puts tokens } if keep
|
129
|
+
parse(tokens, unknowns, @_tokens)
|
130
|
+
}
|
131
|
+
ensure
|
132
|
+
cfg.unlink
|
133
|
+
temp.unlink if temp
|
134
|
+
end
|
135
|
+
|
136
|
+
if temp
|
137
|
+
tokens, @tokens = @tokens[form], nil
|
138
|
+
tokens
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def file?(form)
|
144
|
+
file = Pathname.new(form).absolute? ? form : File.expand_path(form)
|
145
|
+
file if File.file?(file) && File.readable?(file)
|
146
|
+
end
|
147
|
+
|
148
|
+
private
|
91
149
|
|
92
|
-
parse =
|
93
|
-
|
150
|
+
def parse(output, unknowns = false, tokens = {})
|
151
|
+
sanitize = lambda { |a|
|
152
|
+
a.sub!(Token::WC_RE, '')
|
153
|
+
a.downcase!
|
154
|
+
}
|
155
|
+
|
156
|
+
output.each_line { |res|
|
94
157
|
case res
|
95
158
|
when /<(.*?)\s=\s\[(.*)\]>/
|
96
159
|
a, b = $1, $2
|
97
|
-
a
|
160
|
+
sanitize[a]
|
98
161
|
|
99
|
-
|
162
|
+
tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten.map { |t| Token.new(t) }
|
100
163
|
when /<(.*)>/, /:(.*):/
|
101
|
-
a, b = $1, $1.
|
102
|
-
a
|
164
|
+
a, b = $1, Token.new($1.downcase)
|
165
|
+
sanitize[a]
|
103
166
|
|
104
|
-
if unknowns && b
|
167
|
+
if unknowns && b.unk?
|
105
168
|
if unknowns.respond_to?(:<<)
|
106
169
|
unknowns << a
|
107
170
|
else
|
@@ -109,134 +172,65 @@ class PerseusMatch
|
|
109
172
|
end
|
110
173
|
end
|
111
174
|
|
112
|
-
|
175
|
+
tokens[a] ||= [b]
|
113
176
|
end
|
114
177
|
}
|
115
|
-
}
|
116
|
-
|
117
|
-
if File.readable?(t = 'perseus.tokens')
|
118
|
-
File.open(t) { |f| parse[f] }
|
119
|
-
@tokens[form]
|
120
|
-
else
|
121
|
-
raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
|
122
|
-
|
123
|
-
cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
|
124
|
-
YAML.dump(LINGO_CONFIG, t)
|
125
|
-
}
|
126
|
-
|
127
|
-
file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
|
128
178
|
|
129
|
-
|
130
|
-
temp = Tempfile.open('perseus_match_temp') { |t|
|
131
|
-
t.puts form
|
132
|
-
}
|
133
|
-
|
134
|
-
file = temp.path
|
135
|
-
end
|
136
|
-
|
137
|
-
ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
|
138
|
-
|
139
|
-
begin
|
140
|
-
Dir.chdir(LINGO_BASE) {
|
141
|
-
parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
|
142
|
-
}
|
143
|
-
ensure
|
144
|
-
cfg.unlink
|
145
|
-
temp.unlink if temp
|
146
|
-
end
|
147
|
-
|
148
|
-
if temp
|
149
|
-
tokens, @tokens = @tokens[form], nil
|
150
|
-
tokens
|
151
|
-
else
|
152
|
-
@tokens[form]
|
153
|
-
end
|
179
|
+
tokens
|
154
180
|
end
|
181
|
+
|
155
182
|
end
|
156
183
|
|
157
184
|
private :push, :<<, :[]= # maybe more...
|
158
185
|
|
159
|
-
attr_reader :form
|
186
|
+
attr_reader :form, :tokens
|
160
187
|
|
161
188
|
def initialize(form, tokens = nil)
|
162
189
|
super(tokens || self.class.tokenize(form))
|
163
190
|
|
164
191
|
@form = form
|
165
|
-
@tokens = to_a
|
192
|
+
@tokens = to_a
|
166
193
|
end
|
167
194
|
|
168
195
|
def distance(other)
|
169
|
-
|
170
|
-
size1, size2 = tokens1.size, tokens2.size
|
171
|
-
|
172
|
-
return size2 if tokens1.empty?
|
173
|
-
return size1 if tokens2.empty?
|
174
|
-
|
175
|
-
distance, costs = nil, (0..size2).to_a
|
176
|
-
|
177
|
-
0.upto(size1 - 1) { |index1|
|
178
|
-
token1, cost = tokens1[index1], index1 + 1
|
179
|
-
|
180
|
-
0.upto(size2 - 1) { |index2|
|
181
|
-
penalty = token1 == tokens2[index2] ? 0 : 1
|
182
|
-
|
183
|
-
# rcov hack :-(
|
184
|
-
_ = [
|
185
|
-
costs[index2 + 1] + 1, # insertion
|
186
|
-
cost + 1, # deletion
|
187
|
-
costs[index2] + penalty # substitution
|
188
|
-
]
|
189
|
-
distance = _.min
|
190
|
-
|
191
|
-
costs[index2], cost = cost, distance
|
192
|
-
}
|
193
|
-
|
194
|
-
costs[size2] = distance
|
195
|
-
}
|
196
|
-
|
197
|
-
distance + 1 # > 0 !?!
|
196
|
+
(forms | other.forms).size - (forms & other.forms).size
|
198
197
|
end
|
199
198
|
|
200
|
-
def
|
201
|
-
|
202
|
-
token.sub(%r{[/|].*?\z}, '')
|
203
|
-
}
|
199
|
+
def forms
|
200
|
+
@forms ||= map { |token| token.form }
|
204
201
|
end
|
205
202
|
|
206
203
|
def disjoint?(other)
|
207
|
-
(
|
204
|
+
(forms.flatten & other.forms.flatten).flatten.empty?
|
208
205
|
end
|
209
206
|
|
210
207
|
def inclexcl(inclexcl = {})
|
211
|
-
incl(inclexcl[:incl] ||
|
208
|
+
incl(inclexcl[:incl] || Token::ANY_WC).excl(inclexcl[:excl])
|
212
209
|
end
|
213
210
|
|
214
|
-
def incl(
|
215
|
-
(
|
216
|
-
match?(token, wc)
|
217
|
-
}.to_token_set(form)
|
211
|
+
def incl(wcs)
|
212
|
+
self.class.new(form, select { |token| token.match?(wcs) })
|
218
213
|
end
|
219
214
|
|
220
|
-
def excl(
|
221
|
-
(
|
222
|
-
match?(token, wc)
|
223
|
-
}.to_token_set(form)
|
215
|
+
def excl(wcs)
|
216
|
+
self.class.new(form, reject { |token| token.match?(wcs) })
|
224
217
|
end
|
225
218
|
|
226
219
|
def soundex
|
227
|
-
|
220
|
+
ensure_soundex!
|
228
221
|
|
229
|
-
@soundex ||= map { |token|
|
230
|
-
token.sub(
|
231
|
-
|
222
|
+
@soundex ||= self.class.new(form, map { |token|
|
223
|
+
form = token.form.replace_diacritics.sub(/\W+/, '')
|
224
|
+
Token.new(Text::Soundex.soundex(form) || '', token.wc)
|
225
|
+
})
|
232
226
|
end
|
233
227
|
|
234
|
-
def
|
235
|
-
|
228
|
+
def ==(other)
|
229
|
+
tokens == other.tokens
|
236
230
|
end
|
237
231
|
|
238
232
|
def eql?(other)
|
239
|
-
|
233
|
+
self == other && form == other.form
|
240
234
|
end
|
241
235
|
|
242
236
|
def inspect
|
@@ -247,16 +241,77 @@ class PerseusMatch
|
|
247
241
|
|
248
242
|
private
|
249
243
|
|
250
|
-
def
|
251
|
-
|
244
|
+
def ensure_soundex!
|
245
|
+
unless defined?(Text::Soundex)
|
246
|
+
raise RuntimeError, "Soundex functionality not available", caller(1)
|
247
|
+
end
|
252
248
|
end
|
253
249
|
|
254
250
|
end
|
255
251
|
|
256
|
-
class
|
252
|
+
class PhraseTokenSet < TokenSet

  class << self

    # Returns the phrase token set for +form+, building it on first use and
    # caching it per form afterwards. Each run of printable characters in
    # +form+ is tokenized individually through TokenSet.tokenize, with
    # +unknowns+ passed through unchanged.
    def tokenize(form, unknowns = false)
      @tokens ||= {}

      @tokens[form] ||= begin
        token_sets = form.scan(PRINTABLE_CHAR_RE).map { |word|
          TokenSet.tokenize(word, unknowns)
        }

        new(form, token_sets)
      end
    end

  end

  alias_method :phrase, :form
  alias_method :token_sets, :tokens

  # Edit distance between this phrase and +other+, computed over their
  # token sets with a single rolling cost row. Insertion and deletion cost
  # 1; substitution costs the distance between the two token sets involved.
  #
  # NOTE(review): the previous comment claimed
  # (size1 - size2).abs <= distance <= [size1, size2].max -- the upper bound
  # presumes the per-token-set distance never exceeds 1; confirm.
  def distance(other)
    rows, cols = token_sets, other.token_sets
    row_count, col_count = rows.size, cols.size

    return col_count if row_count == 0
    return row_count if col_count == 0

    result = nil
    costs = (0..col_count).to_a

    rows.each_with_index { |row_set, row|
      left = row + 1

      cols.each_with_index { |col_set, col|
        penalty = row_set.distance(col_set)

        # kept as a separate statement (coverage tool workaround)
        candidates = [
          costs[col + 1] + 1,  # insertion
          left + 1,            # deletion
          costs[col] + penalty # substitution
        ]
        result = candidates.min

        costs[col], left = left, result
      }

      costs[col_count] = result
    }

    result
  end

  # The forms of every contained token set, memoized.
  def forms
    @forms ||= map { |set| set.forms }
  end

  # A new set for the same form in which every token set has been narrowed
  # via its own +incl+ with +wcs+.
  def incl(wcs)
    narrowed = map { |set| set.incl(wcs) }
    self.class.new(form, narrowed)
  end

  # A new set for the same form in which every token set has been filtered
  # via its own +excl+ with +wcs+.
  def excl(wcs)
    filtered = map { |set| set.excl(wcs) }
    self.class.new(form, filtered)
  end

  # A new set for the same form built from the +soundex+ of every token
  # set, memoized. The availability check runs on every call, before the
  # memoized value is consulted.
  def soundex
    ensure_soundex!

    @soundex ||= begin
      encoded = map { |set| set.soundex }
      self.class.new(form, encoded)
    end
  end

end
|
data/lib/perseus_match.rb
CHANGED
@@ -26,8 +26,11 @@
|
|
26
26
|
###############################################################################
|
27
27
|
#++
|
28
28
|
|
29
|
+
require 'perseus_match/core_ext'
|
30
|
+
|
29
31
|
require 'perseus_match/list'
|
30
32
|
require 'perseus_match/cluster'
|
33
|
+
require 'perseus_match/token'
|
31
34
|
require 'perseus_match/token_set'
|
32
35
|
|
33
36
|
require 'perseus_match/version'
|
@@ -36,7 +39,7 @@ class PerseusMatch
|
|
36
39
|
|
37
40
|
Infinity = 1.0 / 0
|
38
41
|
|
39
|
-
DEFAULT_COEFF =
|
42
|
+
DEFAULT_COEFF = 2
|
40
43
|
|
41
44
|
DISTANCE_SPEC = [ # {
|
42
45
|
[{}, 1], # {} => 1,
|
@@ -68,8 +71,22 @@ class PerseusMatch
|
|
68
71
|
end
|
69
72
|
|
70
73
|
def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
|
71
|
-
|
72
|
-
value
|
74
|
+
pm = new(phrase, target, pm_options)
|
75
|
+
value = pm.send(attribute)
|
76
|
+
|
77
|
+
if value.send(operator, threshold)
|
78
|
+
Struct.new(:pm, :value, :threshold, :operator).new(pm, value, threshold, operator)
|
79
|
+
else
|
80
|
+
raise CheckFailedError.new(pm, value, threshold, operator)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
def tokenize(form, unknowns = false)
|
85
|
+
if file = TokenSet.file?(form)
|
86
|
+
TokenSet.tokenize(file, unknowns)
|
87
|
+
else
|
88
|
+
PhraseTokenSet.tokenize(form, unknowns)
|
89
|
+
end
|
73
90
|
end
|
74
91
|
|
75
92
|
end
|
@@ -77,8 +94,8 @@ class PerseusMatch
|
|
77
94
|
attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
|
78
95
|
|
79
96
|
def initialize(phrase, target, options = {})
|
80
|
-
@phrase = phrase.to_s
|
81
|
-
@target = target.to_s
|
97
|
+
@phrase = sanitize(phrase.to_s)
|
98
|
+
@target = sanitize(target.to_s)
|
82
99
|
|
83
100
|
@default_coeff = options[:default_coeff] || DEFAULT_COEFF
|
84
101
|
@distance_spec = options[:distance_spec] || DISTANCE_SPEC
|
@@ -89,11 +106,11 @@ class PerseusMatch
|
|
89
106
|
end
|
90
107
|
|
91
108
|
def phrase_tokens
|
92
|
-
@phrase_tokens ||= tokenize(phrase)
|
109
|
+
@phrase_tokens ||= self.class.tokenize(phrase)
|
93
110
|
end
|
94
111
|
|
95
112
|
def target_tokens
|
96
|
-
@target_tokens ||= tokenize(target)
|
113
|
+
@target_tokens ||= self.class.tokenize(target)
|
97
114
|
end
|
98
115
|
|
99
116
|
# 0 <= distance <= Infinity
|
@@ -104,13 +121,13 @@ class PerseusMatch
|
|
104
121
|
# 1 >= similarity >= 0
|
105
122
|
def similarity(coeff = nil)
|
106
123
|
coeff ||= default_coeff # passed arg may be nil
|
107
|
-
@similarity[coeff] ||=
|
124
|
+
@similarity[coeff] ||= normalize_distance(coeff)
|
108
125
|
end
|
109
126
|
|
110
127
|
private
|
111
128
|
|
112
|
-
def
|
113
|
-
|
129
|
+
def sanitize(str)
|
130
|
+
str.gsub(/\s*\(.*?\)|\s*\[.*?\]/, '').sub(/\s*[\/:].*/, '')
|
114
131
|
end
|
115
132
|
|
116
133
|
def calculate_distance
|
@@ -148,16 +165,25 @@ class PerseusMatch
|
|
148
165
|
distance
|
149
166
|
end
|
150
167
|
|
168
|
+
def normalize_distance(coeff)
|
169
|
+
length = phrase_tokens.size + target_tokens.size
|
170
|
+
return 0 if length == 0
|
171
|
+
|
172
|
+
norm = Math.log(length ** Math.sqrt(2)) * coeff * total_weight * Math::E
|
173
|
+
|
174
|
+
1 / Math.exp(distance / norm)
|
175
|
+
end
|
176
|
+
|
151
177
|
def total_weight
|
152
178
|
@total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
|
153
179
|
end
|
154
180
|
|
155
181
|
class CheckFailedError < StandardError
|
156
182
|
|
157
|
-
attr_reader :value, :threshold, :operator
|
183
|
+
attr_reader :pm, :value, :threshold, :operator
|
158
184
|
|
159
|
-
def initialize(value, threshold, operator)
|
160
|
-
@value, @threshold, @operator = value, threshold, operator
|
185
|
+
def initialize(pm, value, threshold, operator)
|
186
|
+
@pm, @value, @threshold, @operator = pm, value, threshold, operator
|
161
187
|
end
|
162
188
|
|
163
189
|
def to_s
|
@@ -1,81 +1,105 @@
|
|
1
|
-
describe PerseusMatch::
|
1
|
+
describe PerseusMatch::PhraseTokenSet do
|
2
2
|
|
3
|
-
|
4
|
-
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
5
|
-
end
|
3
|
+
describe 'with lingo' do
|
6
4
|
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
before :all do
|
6
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
7
|
+
@original_phrase_tokens = PerseusMatch::PhraseTokenSet.instance_variable_get(:@tokens)
|
8
|
+
end
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
after :all do
|
11
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
12
|
+
PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, @original_phrase_tokens)
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
before :each do
|
16
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
17
|
+
PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, nil)
|
18
|
+
end
|
18
19
|
|
19
|
-
|
20
|
-
|
21
|
-
|
20
|
+
it 'should tokenize a string' do
|
21
|
+
PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
|
22
|
+
end
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
|
24
|
+
it 'should report strictly equal PhraseTokenSets as ==' do
|
25
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('foo bar')
|
26
|
+
end
|
26
27
|
|
27
|
-
|
28
|
-
|
29
|
-
|
28
|
+
it 'should report strictly equal PhraseTokenSets as eql' do
|
29
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should be_eql(PerseusMatch::PhraseTokenSet.new('foo bar'))
|
30
|
+
end
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
32
|
+
it 'should report slightly equal PhraseTokenSets as ==' do
|
33
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('Foo Bar')
|
34
|
+
end
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
|
36
|
+
it 'should *not* report slightly equal PhraseTokenSets as eql' do
|
37
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should_not be_eql(PerseusMatch::PhraseTokenSet.new('Foo Bar'))
|
38
|
+
end
|
38
39
|
|
39
|
-
|
40
|
+
it 'should collect unknown tokens' do
|
41
|
+
unknowns = []
|
42
|
+
PerseusMatch::PhraseTokenSet.tokenize('foo bar', unknowns)
|
43
|
+
unknowns.should == %w[foo]
|
44
|
+
end
|
40
45
|
|
41
|
-
|
46
|
+
it 'should include form in inspect' do
|
47
|
+
PerseusMatch::PhraseTokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
|
48
|
+
end
|
42
49
|
|
43
|
-
|
44
|
-
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
45
|
-
end
|
50
|
+
end if LINGO_FOUND
|
46
51
|
|
47
|
-
|
48
|
-
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
49
|
-
end
|
52
|
+
describe 'without lingo' do
|
50
53
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
+
before :all do
|
55
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
56
|
+
end
|
57
|
+
|
58
|
+
after :all do
|
59
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
60
|
+
end
|
54
61
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
LINGO_BASE.replace('')
|
62
|
+
before :each do
|
63
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
64
|
+
end
|
59
65
|
|
60
|
-
|
61
|
-
|
62
|
-
|
66
|
+
it 'should take a prepared file for tokenization' do
|
67
|
+
# prevent lingo from being used
|
68
|
+
lingo_base = LINGO_BASE.dup
|
69
|
+
LINGO_BASE.replace('')
|
63
70
|
|
64
|
-
|
65
|
-
|
71
|
+
temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
|
72
|
+
t.puts *%w[<foo|?> <bar|?>]
|
73
|
+
}
|
66
74
|
|
67
|
-
|
68
|
-
|
75
|
+
path = temp.path
|
76
|
+
link = 'perseus.tokens'
|
69
77
|
|
70
|
-
|
78
|
+
Dir.chdir(File.dirname(path)) {
|
79
|
+
begin
|
80
|
+
File.symlink(path, link)
|
81
|
+
PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
|
82
|
+
ensure
|
83
|
+
File.unlink(link) if File.symlink?(link) && File.readlink(link) == path
|
84
|
+
end
|
85
|
+
}
|
86
|
+
|
87
|
+
temp.unlink
|
88
|
+
|
89
|
+
# reset lingo base
|
90
|
+
LINGO_BASE.replace(lingo_base)
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|
71
94
|
|
72
|
-
|
73
|
-
|
95
|
+
it 'should raise an error if asked for Soundex but is not available' do
|
96
|
+
soundex = Text.send(:remove_const, :Soundex)
|
74
97
|
|
75
|
-
|
98
|
+
lambda {
|
99
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').soundex
|
100
|
+
}.should raise_error(RuntimeError, /soundex/i)
|
76
101
|
|
77
|
-
|
78
|
-
LINGO_BASE.replace(lingo_base)
|
102
|
+
Text::Soundex = soundex
|
79
103
|
end
|
80
104
|
|
81
105
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Specs for the loose (==) and strict (eql?) equality of tokens and for the
# word class showing up in their inspect output.
describe PerseusMatch::Token do

  # Builds a token with the form 'foo' and the given word class.
  foo = lambda { |wc| PerseusMatch::Token.new('foo', wc) }

  it 'should report strictly equal Tokens as ==' do
    foo['a'].should == foo['a']
  end

  it 'should report strictly equal Tokens as eql' do
    foo['a'].should be_eql(foo['a'])
  end

  it 'should report slightly equal Tokens as ==' do
    foo['a'].should == foo['b']
  end

  it 'should *not* report slightly equal Tokens as eql' do
    foo['a'].should_not be_eql(foo['b'])
  end

  it 'should include the word class in inspect' do
    foo['a'].inspect.to_s.should =~ /\/a\z/
  end

end
|
data/spec/perseus_match_spec.rb
CHANGED
@@ -37,7 +37,7 @@ describe PerseusMatch do
|
|
37
37
|
t.puts *phrases
|
38
38
|
}
|
39
39
|
|
40
|
-
PerseusMatch
|
40
|
+
PerseusMatch.tokenize(temp.path)
|
41
41
|
|
42
42
|
temp.unlink
|
43
43
|
|
@@ -158,13 +158,8 @@ describe PerseusMatch do
|
|
158
158
|
|
159
159
|
it 'should be checkable (2)' do
|
160
160
|
lambda {
|
161
|
-
|
162
|
-
|
163
|
-
rescue PerseusMatch::CheckFailedError => err
|
164
|
-
err.to_s.should =~ /0/
|
165
|
-
raise err
|
166
|
-
end
|
167
|
-
}.should raise_error(PerseusMatch::CheckFailedError)
|
161
|
+
PerseusMatch.check!('foo', 'bar', 0, :>)
|
162
|
+
}.should raise_error(PerseusMatch::CheckFailedError, /0/)
|
168
163
|
end
|
169
164
|
|
170
165
|
end if LINGO_FOUND
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-02-24 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -32,6 +32,16 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.4.0
|
34
34
|
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: unicode
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.1.1
|
44
|
+
version:
|
35
45
|
description: Fuzzy string matching based on linguistic analysis
|
36
46
|
email: jens.wille@uni-koeln.de
|
37
47
|
executables:
|
@@ -43,6 +53,8 @@ extra_rdoc_files:
|
|
43
53
|
- ChangeLog
|
44
54
|
- README
|
45
55
|
files:
|
56
|
+
- lib/perseus_match/token.rb
|
57
|
+
- lib/perseus_match/core_ext.rb
|
46
58
|
- lib/perseus_match/list.rb
|
47
59
|
- lib/perseus_match/version.rb
|
48
60
|
- lib/perseus_match/token_set.rb
|
@@ -56,6 +68,7 @@ files:
|
|
56
68
|
- spec/spec_helper.rb
|
57
69
|
- spec/perseus_match/list_spec.rb
|
58
70
|
- spec/perseus_match/cluster_spec.rb
|
71
|
+
- spec/perseus_match/token_spec.rb
|
59
72
|
- spec/perseus_match/token_set_spec.rb
|
60
73
|
- spec/perseus_match_spec.rb
|
61
74
|
- sample/config.yaml
|