perseus_match 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +1 -1
- data/Rakefile +1 -1
- data/bin/perseus_match +22 -8
- data/lib/perseus_match/core_ext.rb +14 -0
- data/lib/perseus_match/token.rb +43 -0
- data/lib/perseus_match/token_set.rb +171 -116
- data/lib/perseus_match/version.rb +1 -1
- data/lib/perseus_match.rb +39 -13
- data/spec/perseus_match/token_set_spec.rb +80 -56
- data/spec/perseus_match/token_spec.rb +23 -0
- data/spec/perseus_match_spec.rb +3 -8
- metadata +15 -2
data/README
CHANGED
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ begin
|
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
16
|
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
|
17
|
-
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
|
17
|
+
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0'], ['unicode', '>= 0.1.1']]
|
18
18
|
}
|
19
19
|
}}
|
20
20
|
rescue LoadError
|
data/bin/perseus_match
CHANGED
@@ -130,7 +130,7 @@ end
|
|
130
130
|
|
131
131
|
unknowns = Set.new if options[:unknowns]
|
132
132
|
|
133
|
-
PerseusMatch
|
133
|
+
PerseusMatch.tokenize(file, unknowns || !options[:silent])
|
134
134
|
|
135
135
|
if unknowns
|
136
136
|
File.open(options[:unknowns], 'w') { |f|
|
@@ -154,7 +154,12 @@ list_options = { :minimal => options[:minimal] }
|
|
154
154
|
threshold, count, count_all = options[:threshold], 0, 0
|
155
155
|
|
156
156
|
action = if options[:check]
|
157
|
-
require '
|
157
|
+
require 'csv'
|
158
|
+
|
159
|
+
if CSV.const_defined?(:Reader)
|
160
|
+
require 'fastercsv'
|
161
|
+
CSV = FasterCSV
|
162
|
+
end
|
158
163
|
|
159
164
|
format = if options[:align]
|
160
165
|
require 'jcode'
|
@@ -184,23 +189,23 @@ action = if options[:check]
|
|
184
189
|
positives = negatives = false_positives = false_negatives = 0.0
|
185
190
|
|
186
191
|
phrases.each { |line|
|
187
|
-
phrase, target, threshold, operator, _ = *
|
192
|
+
phrase, target, threshold, operator, _ = *CSV.parse_line(line)
|
188
193
|
|
189
194
|
threshold ||= global_threshold
|
190
195
|
operator ||= '>'
|
191
196
|
assign = operator =~ />/ || operator == '=='
|
192
197
|
|
193
198
|
begin
|
194
|
-
PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
199
|
+
res = PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
195
200
|
|
196
201
|
count += 1
|
197
202
|
assign ? positives += 1 : negatives += 1
|
198
203
|
|
199
|
-
puts format[line,
|
204
|
+
puts format[line, "OK -- #{res.value} (#{res.pm.distance})"] unless adjust_coeff || failed_only
|
200
205
|
rescue PerseusMatch::CheckFailedError => err
|
201
206
|
assign ? false_negatives += 1 : false_positives += 1
|
202
207
|
|
203
|
-
puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
|
208
|
+
puts format[line, "FAILED -- #{err.value} (#{err.pm.distance})"] unless adjust_coeff
|
204
209
|
end
|
205
210
|
|
206
211
|
count_all += 1
|
@@ -222,8 +227,17 @@ action = if options[:check]
|
|
222
227
|
precision = divide[positives, positives + false_positives]
|
223
228
|
f1 = divide[2 * recall * precision, recall + precision]
|
224
229
|
|
225
|
-
|
226
|
-
|
230
|
+
error_all = divide[ # trivial: assign all
|
231
|
+
negatives + false_positives,
|
232
|
+
positives + negatives + false_positives + false_negatives
|
233
|
+
]
|
234
|
+
error_none = divide[ # trivial: assign none
|
235
|
+
positives + false_negatives,
|
236
|
+
positives + negatives + false_positives + false_negatives
|
237
|
+
]
|
238
|
+
|
239
|
+
stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f (ALL = %0.4f, NONE = %0.4f)' % [
|
240
|
+
recall * 100, precision * 100, f1, error, error_all, error_none
|
227
241
|
]
|
228
242
|
|
229
243
|
stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
|
@@ -0,0 +1,43 @@
|
|
1
|
+
class PerseusMatch
|
2
|
+
|
3
|
+
class Token < String
|
4
|
+
|
5
|
+
WC_RE = %r{[/|]([^/|]*)\z}
|
6
|
+
|
7
|
+
ANY_WC = '*'.freeze
|
8
|
+
|
9
|
+
attr_reader :form, :wc
|
10
|
+
|
11
|
+
def initialize(form, wc = nil)
|
12
|
+
@form = form.sub(WC_RE, '')
|
13
|
+
@wc = wc || $1
|
14
|
+
|
15
|
+
super(@form)
|
16
|
+
end
|
17
|
+
|
18
|
+
def match?(wcs)
|
19
|
+
wcs = [*wcs].compact
|
20
|
+
wcs.include?(wc) || wcs.include?(ANY_WC)
|
21
|
+
end
|
22
|
+
|
23
|
+
def unk?
|
24
|
+
wc == '?'
|
25
|
+
end
|
26
|
+
|
27
|
+
def ==(other)
|
28
|
+
other.is_a?(self.class) ? form == other.form : form == other
|
29
|
+
end
|
30
|
+
|
31
|
+
def eql?(other)
|
32
|
+
self == other && wc == other.wc
|
33
|
+
end
|
34
|
+
|
35
|
+
def inspect
|
36
|
+
"#{super}/#{wc}"
|
37
|
+
end
|
38
|
+
|
39
|
+
alias_method :to_s, :inspect
|
40
|
+
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
@@ -40,20 +40,32 @@ require 'nuggets/util/i18n'
|
|
40
40
|
begin
|
41
41
|
require 'text/soundex'
|
42
42
|
rescue LoadError
|
43
|
-
warn "
|
43
|
+
warn "Could not load the Text gem -- Soundex functionality will not be available"
|
44
44
|
end
|
45
45
|
|
46
46
|
LINGO_BASE = ENV['PM_LINGO_BASE'] || (
|
47
47
|
File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
|
48
48
|
)
|
49
49
|
|
50
|
-
LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
|
51
|
-
|
50
|
+
if LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
|
51
|
+
begin
|
52
|
+
require File.join(LINGO_BASE, 'lib', 'const')
|
53
|
+
rescue LoadError
|
54
|
+
end
|
55
|
+
else
|
56
|
+
warn "Lingo installation not found at #{LINGO_BASE} -- proceeding anyway"
|
57
|
+
end
|
58
|
+
|
59
|
+
unless Object.const_defined?(:PRINTABLE_CHAR)
|
60
|
+
PRINTABLE_CHAR = '[\w-]'
|
61
|
+
end
|
62
|
+
|
63
|
+
PRINTABLE_CHAR_RE = %r{(?:#{PRINTABLE_CHAR})+}
|
52
64
|
|
53
65
|
lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
|
54
66
|
YAML.load_file(file)
|
55
67
|
else
|
56
|
-
warn "
|
68
|
+
warn "Lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
|
57
69
|
|
58
70
|
{
|
59
71
|
'meeting' => {
|
@@ -78,30 +90,81 @@ class PerseusMatch
|
|
78
90
|
|
79
91
|
class TokenSet < Array
|
80
92
|
|
81
|
-
|
82
|
-
return @tokens[form] if @tokens
|
93
|
+
class << self
|
83
94
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
)
|
90
|
-
|
95
|
+
def tokenize(form, unknowns = false)
|
96
|
+
form.downcase!
|
97
|
+
return @tokens[form] if @tokens ||= nil
|
98
|
+
|
99
|
+
@_tokens = Hash.new
|
100
|
+
@tokens = Hash.new { |h, k| h[k] = new(k, @_tokens[k] || []) }
|
101
|
+
|
102
|
+
tokens_file = ENV['PM_TOKENS_FILE'] || 'perseus.tokens'
|
103
|
+
|
104
|
+
if File.readable?(tokens_file)
|
105
|
+
File.open(tokens_file) { |f| parse(f, unknowns, @_tokens) }
|
106
|
+
@tokens[form]
|
107
|
+
else
|
108
|
+
raise "Lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
|
109
|
+
|
110
|
+
cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
|
111
|
+
YAML.dump(LINGO_CONFIG, t)
|
112
|
+
}
|
113
|
+
|
114
|
+
file = file?(form) || begin
|
115
|
+
temp = Tempfile.open('perseus_match_temp') { |t| t.puts form }
|
116
|
+
temp.path
|
117
|
+
end
|
118
|
+
|
119
|
+
ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
|
120
|
+
|
121
|
+
if keep = ENV['PM_KEEP_TOKENS']
|
122
|
+
keep = File.expand_path(keep =~ /\A(?:1|y(?:es)?|true)\z/i ? tokens_file : keep)
|
123
|
+
end
|
124
|
+
|
125
|
+
begin
|
126
|
+
Dir.chdir(LINGO_BASE) {
|
127
|
+
tokens = %x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}
|
128
|
+
File.open(keep, 'w') { |f| f.puts tokens } if keep
|
129
|
+
parse(tokens, unknowns, @_tokens)
|
130
|
+
}
|
131
|
+
ensure
|
132
|
+
cfg.unlink
|
133
|
+
temp.unlink if temp
|
134
|
+
end
|
135
|
+
|
136
|
+
if temp
|
137
|
+
tokens, @tokens = @tokens[form], nil
|
138
|
+
tokens
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def file?(form)
|
144
|
+
file = Pathname.new(form).absolute? ? form : File.expand_path(form)
|
145
|
+
file if File.file?(file) && File.readable?(file)
|
146
|
+
end
|
147
|
+
|
148
|
+
private
|
91
149
|
|
92
|
-
parse =
|
93
|
-
|
150
|
+
def parse(output, unknowns = false, tokens = {})
|
151
|
+
sanitize = lambda { |a|
|
152
|
+
a.sub!(Token::WC_RE, '')
|
153
|
+
a.downcase!
|
154
|
+
}
|
155
|
+
|
156
|
+
output.each_line { |res|
|
94
157
|
case res
|
95
158
|
when /<(.*?)\s=\s\[(.*)\]>/
|
96
159
|
a, b = $1, $2
|
97
|
-
a
|
160
|
+
sanitize[a]
|
98
161
|
|
99
|
-
|
162
|
+
tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten.map { |t| Token.new(t) }
|
100
163
|
when /<(.*)>/, /:(.*):/
|
101
|
-
a, b = $1, $1.
|
102
|
-
a
|
164
|
+
a, b = $1, Token.new($1.downcase)
|
165
|
+
sanitize[a]
|
103
166
|
|
104
|
-
if unknowns && b
|
167
|
+
if unknowns && b.unk?
|
105
168
|
if unknowns.respond_to?(:<<)
|
106
169
|
unknowns << a
|
107
170
|
else
|
@@ -109,134 +172,65 @@ class PerseusMatch
|
|
109
172
|
end
|
110
173
|
end
|
111
174
|
|
112
|
-
|
175
|
+
tokens[a] ||= [b]
|
113
176
|
end
|
114
177
|
}
|
115
|
-
}
|
116
|
-
|
117
|
-
if File.readable?(t = 'perseus.tokens')
|
118
|
-
File.open(t) { |f| parse[f] }
|
119
|
-
@tokens[form]
|
120
|
-
else
|
121
|
-
raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
|
122
|
-
|
123
|
-
cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
|
124
|
-
YAML.dump(LINGO_CONFIG, t)
|
125
|
-
}
|
126
|
-
|
127
|
-
file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
|
128
178
|
|
129
|
-
|
130
|
-
temp = Tempfile.open('perseus_match_temp') { |t|
|
131
|
-
t.puts form
|
132
|
-
}
|
133
|
-
|
134
|
-
file = temp.path
|
135
|
-
end
|
136
|
-
|
137
|
-
ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
|
138
|
-
|
139
|
-
begin
|
140
|
-
Dir.chdir(LINGO_BASE) {
|
141
|
-
parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
|
142
|
-
}
|
143
|
-
ensure
|
144
|
-
cfg.unlink
|
145
|
-
temp.unlink if temp
|
146
|
-
end
|
147
|
-
|
148
|
-
if temp
|
149
|
-
tokens, @tokens = @tokens[form], nil
|
150
|
-
tokens
|
151
|
-
else
|
152
|
-
@tokens[form]
|
153
|
-
end
|
179
|
+
tokens
|
154
180
|
end
|
181
|
+
|
155
182
|
end
|
156
183
|
|
157
184
|
private :push, :<<, :[]= # maybe more...
|
158
185
|
|
159
|
-
attr_reader :form
|
186
|
+
attr_reader :form, :tokens
|
160
187
|
|
161
188
|
def initialize(form, tokens = nil)
|
162
189
|
super(tokens || self.class.tokenize(form))
|
163
190
|
|
164
191
|
@form = form
|
165
|
-
@tokens = to_a
|
192
|
+
@tokens = to_a
|
166
193
|
end
|
167
194
|
|
168
195
|
def distance(other)
|
169
|
-
|
170
|
-
size1, size2 = tokens1.size, tokens2.size
|
171
|
-
|
172
|
-
return size2 if tokens1.empty?
|
173
|
-
return size1 if tokens2.empty?
|
174
|
-
|
175
|
-
distance, costs = nil, (0..size2).to_a
|
176
|
-
|
177
|
-
0.upto(size1 - 1) { |index1|
|
178
|
-
token1, cost = tokens1[index1], index1 + 1
|
179
|
-
|
180
|
-
0.upto(size2 - 1) { |index2|
|
181
|
-
penalty = token1 == tokens2[index2] ? 0 : 1
|
182
|
-
|
183
|
-
# rcov hack :-(
|
184
|
-
_ = [
|
185
|
-
costs[index2 + 1] + 1, # insertion
|
186
|
-
cost + 1, # deletion
|
187
|
-
costs[index2] + penalty # substitution
|
188
|
-
]
|
189
|
-
distance = _.min
|
190
|
-
|
191
|
-
costs[index2], cost = cost, distance
|
192
|
-
}
|
193
|
-
|
194
|
-
costs[size2] = distance
|
195
|
-
}
|
196
|
-
|
197
|
-
distance + 1 # > 0 !?!
|
196
|
+
(forms | other.forms).size - (forms & other.forms).size
|
198
197
|
end
|
199
198
|
|
200
|
-
def
|
201
|
-
|
202
|
-
token.sub(%r{[/|].*?\z}, '')
|
203
|
-
}
|
199
|
+
def forms
|
200
|
+
@forms ||= map { |token| token.form }
|
204
201
|
end
|
205
202
|
|
206
203
|
def disjoint?(other)
|
207
|
-
(
|
204
|
+
(forms.flatten & other.forms.flatten).flatten.empty?
|
208
205
|
end
|
209
206
|
|
210
207
|
def inclexcl(inclexcl = {})
|
211
|
-
incl(inclexcl[:incl] ||
|
208
|
+
incl(inclexcl[:incl] || Token::ANY_WC).excl(inclexcl[:excl])
|
212
209
|
end
|
213
210
|
|
214
|
-
def incl(
|
215
|
-
(
|
216
|
-
match?(token, wc)
|
217
|
-
}.to_token_set(form)
|
211
|
+
def incl(wcs)
|
212
|
+
self.class.new(form, select { |token| token.match?(wcs) })
|
218
213
|
end
|
219
214
|
|
220
|
-
def excl(
|
221
|
-
(
|
222
|
-
match?(token, wc)
|
223
|
-
}.to_token_set(form)
|
215
|
+
def excl(wcs)
|
216
|
+
self.class.new(form, reject { |token| token.match?(wcs) })
|
224
217
|
end
|
225
218
|
|
226
219
|
def soundex
|
227
|
-
|
220
|
+
ensure_soundex!
|
228
221
|
|
229
|
-
@soundex ||= map { |token|
|
230
|
-
token.sub(
|
231
|
-
|
222
|
+
@soundex ||= self.class.new(form, map { |token|
|
223
|
+
form = token.form.replace_diacritics.sub(/\W+/, '')
|
224
|
+
Token.new(Text::Soundex.soundex(form) || '', token.wc)
|
225
|
+
})
|
232
226
|
end
|
233
227
|
|
234
|
-
def
|
235
|
-
|
228
|
+
def ==(other)
|
229
|
+
tokens == other.tokens
|
236
230
|
end
|
237
231
|
|
238
232
|
def eql?(other)
|
239
|
-
|
233
|
+
self == other && form == other.form
|
240
234
|
end
|
241
235
|
|
242
236
|
def inspect
|
@@ -247,16 +241,77 @@ class PerseusMatch
|
|
247
241
|
|
248
242
|
private
|
249
243
|
|
250
|
-
def
|
251
|
-
|
244
|
+
def ensure_soundex!
|
245
|
+
unless defined?(Text::Soundex)
|
246
|
+
raise RuntimeError, "Soundex functionality not available", caller(1)
|
247
|
+
end
|
252
248
|
end
|
253
249
|
|
254
250
|
end
|
255
251
|
|
256
|
-
class
|
252
|
+
class PhraseTokenSet < TokenSet
|
253
|
+
|
254
|
+
class << self
|
255
|
+
|
256
|
+
def tokenize(form, unknowns = false)
|
257
|
+
(@tokens ||= {})[form] ||= new(form, form.scan(PRINTABLE_CHAR_RE).map { |i|
|
258
|
+
TokenSet.tokenize(i, unknowns)
|
259
|
+
})
|
260
|
+
end
|
261
|
+
|
262
|
+
end
|
263
|
+
|
264
|
+
alias_method :phrase, :form
|
265
|
+
alias_method :token_sets, :tokens
|
266
|
+
|
267
|
+
# (size1 - size2).abs <= distance <= [size1, size2].max
|
268
|
+
def distance(other)
|
269
|
+
token_sets1, token_sets2 = token_sets, other.token_sets
|
270
|
+
size1, size2 = token_sets1.size, token_sets2.size
|
271
|
+
|
272
|
+
return size2 if size1 == 0
|
273
|
+
return size1 if size2 == 0
|
274
|
+
|
275
|
+
distance, costs = nil, (0..size2).to_a
|
276
|
+
|
277
|
+
0.upto(size1 - 1) { |index1|
|
278
|
+
token_set1, cost = token_sets1[index1], index1 + 1
|
279
|
+
|
280
|
+
0.upto(size2 - 1) { |index2|
|
281
|
+
penalty = token_set1.distance(token_sets2[index2])
|
282
|
+
|
283
|
+
# rcov hack :-(
|
284
|
+
_ = [
|
285
|
+
costs[index2 + 1] + 1, # insertion
|
286
|
+
cost + 1, # deletion
|
287
|
+
costs[index2] + penalty # substitution
|
288
|
+
]
|
289
|
+
distance = _.min
|
257
290
|
|
258
|
-
|
259
|
-
|
291
|
+
costs[index2], cost = cost, distance
|
292
|
+
}
|
293
|
+
|
294
|
+
costs[size2] = distance
|
295
|
+
}
|
296
|
+
|
297
|
+
distance
|
298
|
+
end
|
299
|
+
|
300
|
+
def forms
|
301
|
+
@forms ||= map { |token_set| token_set.forms }
|
302
|
+
end
|
303
|
+
|
304
|
+
def incl(wcs)
|
305
|
+
self.class.new(form, map { |token_set| token_set.incl(wcs) })
|
306
|
+
end
|
307
|
+
|
308
|
+
def excl(wcs)
|
309
|
+
self.class.new(form, map { |token_set| token_set.excl(wcs) })
|
310
|
+
end
|
311
|
+
|
312
|
+
def soundex
|
313
|
+
ensure_soundex!
|
314
|
+
@soundex ||= self.class.new(form, map { |token_set| token_set.soundex })
|
260
315
|
end
|
261
316
|
|
262
317
|
end
|
data/lib/perseus_match.rb
CHANGED
@@ -26,8 +26,11 @@
|
|
26
26
|
###############################################################################
|
27
27
|
#++
|
28
28
|
|
29
|
+
require 'perseus_match/core_ext'
|
30
|
+
|
29
31
|
require 'perseus_match/list'
|
30
32
|
require 'perseus_match/cluster'
|
33
|
+
require 'perseus_match/token'
|
31
34
|
require 'perseus_match/token_set'
|
32
35
|
|
33
36
|
require 'perseus_match/version'
|
@@ -36,7 +39,7 @@ class PerseusMatch
|
|
36
39
|
|
37
40
|
Infinity = 1.0 / 0
|
38
41
|
|
39
|
-
DEFAULT_COEFF =
|
42
|
+
DEFAULT_COEFF = 2
|
40
43
|
|
41
44
|
DISTANCE_SPEC = [ # {
|
42
45
|
[{}, 1], # {} => 1,
|
@@ -68,8 +71,22 @@ class PerseusMatch
|
|
68
71
|
end
|
69
72
|
|
70
73
|
def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
|
71
|
-
|
72
|
-
value
|
74
|
+
pm = new(phrase, target, pm_options)
|
75
|
+
value = pm.send(attribute)
|
76
|
+
|
77
|
+
if value.send(operator, threshold)
|
78
|
+
Struct.new(:pm, :value, :threshold, :operator).new(pm, value, threshold, operator)
|
79
|
+
else
|
80
|
+
raise CheckFailedError.new(pm, value, threshold, operator)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
def tokenize(form, unknowns = false)
|
85
|
+
if file = TokenSet.file?(form)
|
86
|
+
TokenSet.tokenize(file, unknowns)
|
87
|
+
else
|
88
|
+
PhraseTokenSet.tokenize(form, unknowns)
|
89
|
+
end
|
73
90
|
end
|
74
91
|
|
75
92
|
end
|
@@ -77,8 +94,8 @@ class PerseusMatch
|
|
77
94
|
attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
|
78
95
|
|
79
96
|
def initialize(phrase, target, options = {})
|
80
|
-
@phrase = phrase.to_s
|
81
|
-
@target = target.to_s
|
97
|
+
@phrase = sanitize(phrase.to_s)
|
98
|
+
@target = sanitize(target.to_s)
|
82
99
|
|
83
100
|
@default_coeff = options[:default_coeff] || DEFAULT_COEFF
|
84
101
|
@distance_spec = options[:distance_spec] || DISTANCE_SPEC
|
@@ -89,11 +106,11 @@ class PerseusMatch
|
|
89
106
|
end
|
90
107
|
|
91
108
|
def phrase_tokens
|
92
|
-
@phrase_tokens ||= tokenize(phrase)
|
109
|
+
@phrase_tokens ||= self.class.tokenize(phrase)
|
93
110
|
end
|
94
111
|
|
95
112
|
def target_tokens
|
96
|
-
@target_tokens ||= tokenize(target)
|
113
|
+
@target_tokens ||= self.class.tokenize(target)
|
97
114
|
end
|
98
115
|
|
99
116
|
# 0 <= distance <= Infinity
|
@@ -104,13 +121,13 @@ class PerseusMatch
|
|
104
121
|
# 1 >= similarity >= 0
|
105
122
|
def similarity(coeff = nil)
|
106
123
|
coeff ||= default_coeff # passed arg may be nil
|
107
|
-
@similarity[coeff] ||=
|
124
|
+
@similarity[coeff] ||= normalize_distance(coeff)
|
108
125
|
end
|
109
126
|
|
110
127
|
private
|
111
128
|
|
112
|
-
def
|
113
|
-
|
129
|
+
def sanitize(str)
|
130
|
+
str.gsub(/\s*\(.*?\)|\s*\[.*?\]/, '').sub(/\s*[\/:].*/, '')
|
114
131
|
end
|
115
132
|
|
116
133
|
def calculate_distance
|
@@ -148,16 +165,25 @@ class PerseusMatch
|
|
148
165
|
distance
|
149
166
|
end
|
150
167
|
|
168
|
+
def normalize_distance(coeff)
|
169
|
+
length = phrase_tokens.size + target_tokens.size
|
170
|
+
return 0 if length == 0
|
171
|
+
|
172
|
+
norm = Math.log(length ** Math.sqrt(2)) * coeff * total_weight * Math::E
|
173
|
+
|
174
|
+
1 / Math.exp(distance / norm)
|
175
|
+
end
|
176
|
+
|
151
177
|
def total_weight
|
152
178
|
@total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
|
153
179
|
end
|
154
180
|
|
155
181
|
class CheckFailedError < StandardError
|
156
182
|
|
157
|
-
attr_reader :value, :threshold, :operator
|
183
|
+
attr_reader :pm, :value, :threshold, :operator
|
158
184
|
|
159
|
-
def initialize(value, threshold, operator)
|
160
|
-
@value, @threshold, @operator = value, threshold, operator
|
185
|
+
def initialize(pm, value, threshold, operator)
|
186
|
+
@pm, @value, @threshold, @operator = pm, value, threshold, operator
|
161
187
|
end
|
162
188
|
|
163
189
|
def to_s
|
@@ -1,81 +1,105 @@
|
|
1
|
-
describe PerseusMatch::
|
1
|
+
describe PerseusMatch::PhraseTokenSet do
|
2
2
|
|
3
|
-
|
4
|
-
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
5
|
-
end
|
3
|
+
describe 'with lingo' do
|
6
4
|
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
before :all do
|
6
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
7
|
+
@original_phrase_tokens = PerseusMatch::PhraseTokenSet.instance_variable_get(:@tokens)
|
8
|
+
end
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
after :all do
|
11
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
12
|
+
PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, @original_phrase_tokens)
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
before :each do
|
16
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
17
|
+
PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, nil)
|
18
|
+
end
|
18
19
|
|
19
|
-
|
20
|
-
|
21
|
-
|
20
|
+
it 'should tokenize a string' do
|
21
|
+
PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
|
22
|
+
end
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
|
24
|
+
it 'should report strictly equal PhraseTokenSets as ==' do
|
25
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('foo bar')
|
26
|
+
end
|
26
27
|
|
27
|
-
|
28
|
-
|
29
|
-
|
28
|
+
it 'should report strictly equal PhraseTokenSets as eql' do
|
29
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should be_eql(PerseusMatch::PhraseTokenSet.new('foo bar'))
|
30
|
+
end
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
32
|
+
it 'should report slightly equal PhraseTokenSets as ==' do
|
33
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('Foo Bar')
|
34
|
+
end
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
|
36
|
+
it 'should *not* report slightly equal PhraseTokenSets as eql' do
|
37
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').should_not be_eql(PerseusMatch::PhraseTokenSet.new('Foo Bar'))
|
38
|
+
end
|
38
39
|
|
39
|
-
|
40
|
+
it 'should collect unknown tokens' do
|
41
|
+
unknowns = []
|
42
|
+
PerseusMatch::PhraseTokenSet.tokenize('foo bar', unknowns)
|
43
|
+
unknowns.should == %w[foo]
|
44
|
+
end
|
40
45
|
|
41
|
-
|
46
|
+
it 'should include form in inspect' do
|
47
|
+
PerseusMatch::PhraseTokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
|
48
|
+
end
|
42
49
|
|
43
|
-
|
44
|
-
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
45
|
-
end
|
50
|
+
end if LINGO_FOUND
|
46
51
|
|
47
|
-
|
48
|
-
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
49
|
-
end
|
52
|
+
describe 'without lingo' do
|
50
53
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
+
before :all do
|
55
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
56
|
+
end
|
57
|
+
|
58
|
+
after :all do
|
59
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
60
|
+
end
|
54
61
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
LINGO_BASE.replace('')
|
62
|
+
before :each do
|
63
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
64
|
+
end
|
59
65
|
|
60
|
-
|
61
|
-
|
62
|
-
|
66
|
+
it 'should take a prepared file for tokenization' do
|
67
|
+
# prevent lingo from being used
|
68
|
+
lingo_base = LINGO_BASE.dup
|
69
|
+
LINGO_BASE.replace('')
|
63
70
|
|
64
|
-
|
65
|
-
|
71
|
+
temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
|
72
|
+
t.puts *%w[<foo|?> <bar|?>]
|
73
|
+
}
|
66
74
|
|
67
|
-
|
68
|
-
|
75
|
+
path = temp.path
|
76
|
+
link = 'perseus.tokens'
|
69
77
|
|
70
|
-
|
78
|
+
Dir.chdir(File.dirname(path)) {
|
79
|
+
begin
|
80
|
+
File.symlink(path, link)
|
81
|
+
PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
|
82
|
+
ensure
|
83
|
+
File.unlink(link) if File.symlink?(link) && File.readlink(link) == path
|
84
|
+
end
|
85
|
+
}
|
86
|
+
|
87
|
+
temp.unlink
|
88
|
+
|
89
|
+
# reset lingo base
|
90
|
+
LINGO_BASE.replace(lingo_base)
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|
71
94
|
|
72
|
-
|
73
|
-
|
95
|
+
it 'should raise an error if asked for Soundex but is not available' do
|
96
|
+
soundex = Text.send(:remove_const, :Soundex)
|
74
97
|
|
75
|
-
|
98
|
+
lambda {
|
99
|
+
PerseusMatch::PhraseTokenSet.new('foo bar').soundex
|
100
|
+
}.should raise_error(RuntimeError, /soundex/i)
|
76
101
|
|
77
|
-
|
78
|
-
LINGO_BASE.replace(lingo_base)
|
102
|
+
Text::Soundex = soundex
|
79
103
|
end
|
80
104
|
|
81
105
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
describe PerseusMatch::Token do
|
2
|
+
|
3
|
+
it 'should report strictly equal Tokens as ==' do
|
4
|
+
PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'a')
|
5
|
+
end
|
6
|
+
|
7
|
+
it 'should report strictly equal Tokens as eql' do
|
8
|
+
PerseusMatch::Token.new('foo', 'a').should be_eql(PerseusMatch::Token.new('foo', 'a'))
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should report slightly equal Tokens as ==' do
|
12
|
+
PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'b')
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should *not* report slightly equal Tokens as eql' do
|
16
|
+
PerseusMatch::Token.new('foo', 'a').should_not be_eql(PerseusMatch::Token.new('foo', 'b'))
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should include the word class in inspect' do
|
20
|
+
PerseusMatch::Token.new('foo', 'a').inspect.to_s.should =~ /\/a\z/
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
data/spec/perseus_match_spec.rb
CHANGED
@@ -37,7 +37,7 @@ describe PerseusMatch do
|
|
37
37
|
t.puts *phrases
|
38
38
|
}
|
39
39
|
|
40
|
-
PerseusMatch
|
40
|
+
PerseusMatch.tokenize(temp.path)
|
41
41
|
|
42
42
|
temp.unlink
|
43
43
|
|
@@ -158,13 +158,8 @@ describe PerseusMatch do
|
|
158
158
|
|
159
159
|
it 'should be checkable (2)' do
|
160
160
|
lambda {
|
161
|
-
|
162
|
-
|
163
|
-
rescue PerseusMatch::CheckFailedError => err
|
164
|
-
err.to_s.should =~ /0/
|
165
|
-
raise err
|
166
|
-
end
|
167
|
-
}.should raise_error(PerseusMatch::CheckFailedError)
|
161
|
+
PerseusMatch.check!('foo', 'bar', 0, :>)
|
162
|
+
}.should raise_error(PerseusMatch::CheckFailedError, /0/)
|
168
163
|
end
|
169
164
|
|
170
165
|
end if LINGO_FOUND
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-02-24 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -32,6 +32,16 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.4.0
|
34
34
|
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: unicode
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.1.1
|
44
|
+
version:
|
35
45
|
description: Fuzzy string matching based on linguistic analysis
|
36
46
|
email: jens.wille@uni-koeln.de
|
37
47
|
executables:
|
@@ -43,6 +53,8 @@ extra_rdoc_files:
|
|
43
53
|
- ChangeLog
|
44
54
|
- README
|
45
55
|
files:
|
56
|
+
- lib/perseus_match/token.rb
|
57
|
+
- lib/perseus_match/core_ext.rb
|
46
58
|
- lib/perseus_match/list.rb
|
47
59
|
- lib/perseus_match/version.rb
|
48
60
|
- lib/perseus_match/token_set.rb
|
@@ -56,6 +68,7 @@ files:
|
|
56
68
|
- spec/spec_helper.rb
|
57
69
|
- spec/perseus_match/list_spec.rb
|
58
70
|
- spec/perseus_match/cluster_spec.rb
|
71
|
+
- spec/perseus_match/token_spec.rb
|
59
72
|
- spec/perseus_match/token_set_spec.rb
|
60
73
|
- spec/perseus_match_spec.rb
|
61
74
|
- sample/config.yaml
|