perseus_match 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LINGO_BASE +1 -0
- data/README +8 -6
- data/Rakefile +2 -2
- data/bin/perseus_match +226 -22
- data/lib/perseus_match/cluster.rb +9 -8
- data/lib/perseus_match/list.rb +31 -9
- data/lib/perseus_match/token_set.rb +105 -90
- data/lib/perseus_match/version.rb +1 -1
- data/lib/perseus_match.rb +67 -21
- data/spec/perseus_match/cluster_spec.rb +45 -0
- data/spec/perseus_match/list_spec.rb +16 -0
- data/spec/perseus_match/token_set_spec.rb +65 -0
- data/spec/perseus_match_spec.rb +168 -0
- data/spec/spec_helper.rb +18 -0
- metadata +28 -12
data/LINGO_BASE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
/home/jw/devel/lingo/trunk
|
data/README
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
== VERSION
|
4
4
|
|
5
|
-
This documentation refers to perseus_match version 0.0.2
|
5
|
+
This documentation refers to perseus_match version 0.0.3
|
6
6
|
|
7
7
|
|
8
8
|
== DESCRIPTION
|
@@ -10,15 +10,17 @@ This documentation refers to perseus_match version 0.0.2
|
|
10
10
|
Fuzzy string matching based on linguistic analysis.
|
11
11
|
|
12
12
|
|
13
|
-
==
|
13
|
+
== LINKS
|
14
14
|
|
15
|
-
|
15
|
+
<b></b>
|
16
|
+
Documentation:: <http://prometheus.rubyforge.org/perseus_match>
|
17
|
+
Source code:: <http://github.com/blackwinter/perseus_match>
|
18
|
+
Rubyforge project:: <http://rubyforge.org/projects/prometheus>
|
16
19
|
|
17
20
|
|
18
|
-
==
|
21
|
+
== AUTHORS
|
19
22
|
|
20
|
-
* <
|
21
|
-
* <http://github.com/blackwinter/perseus_match>
|
23
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
22
24
|
|
23
25
|
|
24
26
|
== LICENSE AND COPYRIGHT
|
data/Rakefile
CHANGED
@@ -13,8 +13,8 @@ begin
|
|
13
13
|
:version => PerseusMatch::VERSION,
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
|
-
:extra_files => FileList['[A-Z]*'].to_a,
|
17
|
-
:dependencies => [['ruby-nuggets', '>= 0.
|
16
|
+
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
|
17
|
+
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
|
18
18
|
}
|
19
19
|
}}
|
20
20
|
rescue LoadError
|
data/bin/perseus_match
CHANGED
@@ -2,8 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'optparse'
|
4
4
|
require 'benchmark'
|
5
|
+
require 'yaml'
|
5
6
|
|
6
7
|
require 'rubygems'
|
8
|
+
require 'nuggets/enumerable/minmax'
|
7
9
|
require 'nuggets/numeric/duration'
|
8
10
|
|
9
11
|
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
@@ -14,19 +16,29 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
|
|
14
16
|
abort USAGE if ARGV.empty?
|
15
17
|
|
16
18
|
options = {
|
17
|
-
:
|
18
|
-
:threshold
|
19
|
-
:sort
|
19
|
+
:config => nil,
|
20
|
+
:threshold => 0,
|
21
|
+
:sort => false,
|
22
|
+
:stats => false,
|
23
|
+
:lingo => false,
|
24
|
+
:minimal => false,
|
25
|
+
:separate => false,
|
26
|
+
:check => false,
|
27
|
+
:failed_only => false,
|
28
|
+
:align => false,
|
29
|
+
:adjust_coeff => false
|
20
30
|
}
|
21
31
|
|
22
32
|
OptionParser.new { |opts|
|
23
33
|
opts.banner = USAGE
|
24
34
|
|
25
|
-
opts.separator ''
|
35
|
+
opts.separator ' '
|
26
36
|
opts.separator 'Options:'
|
27
37
|
|
28
|
-
opts.on('--
|
29
|
-
|
38
|
+
opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
|
39
|
+
abort "Can't find config file: #{f}." unless File.readable?(f)
|
40
|
+
|
41
|
+
options[:config] = f
|
30
42
|
}
|
31
43
|
|
32
44
|
opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
|
@@ -37,7 +49,51 @@ OptionParser.new { |opts|
|
|
37
49
|
options[:sort] = true
|
38
50
|
}
|
39
51
|
|
40
|
-
opts.
|
52
|
+
opts.on('-S', '--stats', 'Output some statistics at the end') {
|
53
|
+
options[:stats] = true
|
54
|
+
}
|
55
|
+
|
56
|
+
opts.on('-v', '--verbose', 'Print additional information during processing') {
|
57
|
+
options[:verbose] = true
|
58
|
+
}
|
59
|
+
|
60
|
+
opts.separator ' '
|
61
|
+
opts.separator ' * Calculating similarities (default)'
|
62
|
+
opts.separator ' '
|
63
|
+
|
64
|
+
opts.on('-m', '--minimal', 'Produce minimal pairs only') {
|
65
|
+
options[:minimal] = true
|
66
|
+
}
|
67
|
+
|
68
|
+
opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
|
69
|
+
options[:separate] = p || ''
|
70
|
+
}
|
71
|
+
|
72
|
+
opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
|
73
|
+
options[:lingo] = true
|
74
|
+
}
|
75
|
+
|
76
|
+
opts.separator ' '
|
77
|
+
opts.separator ' * Checking pairs'
|
78
|
+
opts.separator ' '
|
79
|
+
|
80
|
+
opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
|
81
|
+
options[:check] = true
|
82
|
+
}
|
83
|
+
|
84
|
+
opts.on('-f', '--failed', 'Print only failed checks') {
|
85
|
+
options[:failed_only] = true
|
86
|
+
}
|
87
|
+
|
88
|
+
opts.on('-a', '--align', 'Align check results') {
|
89
|
+
options[:align] = true
|
90
|
+
}
|
91
|
+
|
92
|
+
opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
|
93
|
+
options[:adjust_coeff] = true
|
94
|
+
}
|
95
|
+
|
96
|
+
opts.separator ' '
|
41
97
|
opts.separator 'Generic options:'
|
42
98
|
|
43
99
|
opts.on('-h', '--help', 'Print this help message and exit') {
|
@@ -57,33 +113,179 @@ end
|
|
57
113
|
|
58
114
|
PerseusMatch::TokenSet.tokenize(file)
|
59
115
|
|
60
|
-
|
116
|
+
skip_re = %r{\A\s*(?:#|\z)}o
|
117
|
+
phrases = []
|
118
|
+
|
119
|
+
File.open(file).each { |line|
|
120
|
+
phrases << line.chomp unless line =~ skip_re
|
121
|
+
}.close
|
122
|
+
|
123
|
+
pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
|
124
|
+
pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
|
125
|
+
list_options = { :minimal => options[:minimal] }
|
61
126
|
|
62
127
|
threshold, count, count_all = options[:threshold], 0, 0
|
63
128
|
|
64
|
-
|
65
|
-
|
66
|
-
require 'pp'
|
129
|
+
action = if options[:check]
|
130
|
+
require 'fastercsv'
|
67
131
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
132
|
+
format = if options[:align]
|
133
|
+
require 'jcode'
|
134
|
+
|
135
|
+
width = phrases.max(:jlength) + 3
|
136
|
+
|
137
|
+
lambda { |line, res|
|
138
|
+
"#{line} #{'.' * (width - line.jlength)} [#{res}]"
|
139
|
+
}
|
75
140
|
else
|
76
|
-
|
77
|
-
|
78
|
-
|
141
|
+
lambda { |line, res|
|
142
|
+
"#{line} [#{res}]"
|
143
|
+
}
|
144
|
+
end
|
145
|
+
|
146
|
+
phrases.sort! if options[:sort]
|
147
|
+
phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
|
148
|
+
|
149
|
+
global_threshold = options[:threshold]
|
150
|
+
failed_only = options[:failed_only]
|
151
|
+
collect_stats = options[:stats]
|
152
|
+
adjust_coeff = options[:adjust_coeff]
|
153
|
+
|
154
|
+
_action = lambda { |*args|
|
155
|
+
pm_options[:default_coeff] = args.first unless args.empty?
|
156
|
+
|
157
|
+
count, count_all = 0, 0
|
158
|
+
positives = negatives = false_positives = false_negatives = 0.0
|
159
|
+
|
160
|
+
phrases.each { |line, spec|
|
161
|
+
phrase, target, threshold, operator, _ = *spec
|
162
|
+
|
163
|
+
threshold ||= global_threshold
|
164
|
+
operator ||= '>'
|
165
|
+
assign = operator =~ />/
|
166
|
+
|
167
|
+
begin
|
168
|
+
PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
169
|
+
|
79
170
|
count += 1
|
171
|
+
assign ? positives += 1 : negatives += 1
|
172
|
+
|
173
|
+
puts format[line, 'OK'] unless adjust_coeff || failed_only
|
174
|
+
rescue PerseusMatch::CheckFailedError => err
|
175
|
+
assign ? false_negatives += 1 : false_positives += 1
|
176
|
+
|
177
|
+
puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
|
80
178
|
end
|
81
179
|
count_all += 1
|
82
180
|
}
|
181
|
+
|
182
|
+
divide = lambda { |numerator, denominator|
|
183
|
+
denominator == 0 ? 0 : numerator / denominator
|
184
|
+
}
|
185
|
+
|
186
|
+
if collect_stats || adjust_coeff
|
187
|
+
error = divide[
|
188
|
+
false_positives + false_negatives,
|
189
|
+
positives + negatives + false_positives + false_negatives
|
190
|
+
]
|
191
|
+
end
|
192
|
+
|
193
|
+
if collect_stats
|
194
|
+
recall = divide[positives, positives + false_negatives]
|
195
|
+
precision = divide[positives, positives + false_positives]
|
196
|
+
f1 = divide[2 * recall * precision, recall + precision]
|
197
|
+
|
198
|
+
warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
|
199
|
+
recall * 100, precision * 100, f1, error
|
200
|
+
]
|
201
|
+
end
|
202
|
+
|
203
|
+
error if adjust_coeff
|
204
|
+
}
|
205
|
+
|
206
|
+
if adjust_coeff
|
207
|
+
lambda {
|
208
|
+
step, max = 1, 100
|
209
|
+
|
210
|
+
start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
|
211
|
+
start_err = _action[start_coeff]
|
212
|
+
|
213
|
+
previous_coeff = next_coeff = start_coeff
|
214
|
+
previous_err = next_err = start_err
|
215
|
+
|
216
|
+
max.times { break if (previous_err = _action[previous_coeff -= step]) != start_err }
|
217
|
+
max.times { break if (next_err = _action[next_coeff += step]) != start_err }
|
218
|
+
|
219
|
+
best_err = [start_err, previous_err, next_err].min
|
220
|
+
|
221
|
+
if best_err == start_err
|
222
|
+
best_coeff = start_coeff
|
223
|
+
else
|
224
|
+
if best_err == previous_err
|
225
|
+
max.times {
|
226
|
+
break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
|
227
|
+
previous_err, previous_coeff = current_err, current_coeff
|
228
|
+
}
|
229
|
+
|
230
|
+
best_err, best_coeff = previous_err, previous_coeff
|
231
|
+
else
|
232
|
+
max.times {
|
233
|
+
break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
|
234
|
+
next_err, next_coeff = current_err, current_coeff
|
235
|
+
}
|
236
|
+
|
237
|
+
best_err, best_coeff = next_err, next_coeff
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
|
242
|
+
}
|
243
|
+
else
|
244
|
+
_action
|
83
245
|
end
|
84
|
-
|
246
|
+
else
|
247
|
+
format =
|
248
|
+
options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
|
249
|
+
options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
|
250
|
+
lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
|
251
|
+
|
252
|
+
if options[:sort]
|
253
|
+
require 'pp'
|
254
|
+
|
255
|
+
lambda {
|
256
|
+
pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
|
257
|
+
if pm.similarity >= threshold
|
258
|
+
res = format[pm]
|
259
|
+
count += 1
|
260
|
+
end
|
261
|
+
count_all += 1
|
262
|
+
res
|
263
|
+
}.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
|
264
|
+
}
|
265
|
+
else
|
266
|
+
lambda {
|
267
|
+
separator, previous_phrase = options[:separate], nil
|
268
|
+
|
269
|
+
PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
|
270
|
+
if separator && pm.phrase != previous_phrase ||= pm.phrase
|
271
|
+
puts separator
|
272
|
+
previous_phrase = pm.phrase
|
273
|
+
end
|
274
|
+
|
275
|
+
if pm.similarity >= threshold
|
276
|
+
puts format[pm]
|
277
|
+
count += 1
|
278
|
+
end
|
279
|
+
|
280
|
+
count_all += 1
|
281
|
+
}
|
282
|
+
}
|
283
|
+
end
|
284
|
+
end
|
85
285
|
|
86
286
|
if options[:stats]
|
287
|
+
time = Benchmark.realtime(&action)
|
288
|
+
|
87
289
|
hms, x, y = time.to_hms(2), time / count, time / count_all
|
88
290
|
|
89
291
|
precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
|
@@ -91,4 +293,6 @@ if options[:stats]
|
|
91
293
|
warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
|
92
294
|
phrases.size, count, count_all, hms, x, y
|
93
295
|
]
|
296
|
+
else
|
297
|
+
action.call
|
94
298
|
end
|
@@ -30,10 +30,10 @@ class PerseusMatch
|
|
30
30
|
|
31
31
|
class Cluster < Hash
|
32
32
|
|
33
|
-
def initialize(phrases = [])
|
33
|
+
def initialize(phrases = [], pm_options = {}, list_options = {})
|
34
34
|
super() { |h, k| h[k] = [] }
|
35
35
|
|
36
|
-
List.
|
36
|
+
List.pair(phrases, pm_options, list_options) { |pm| add(pm) }
|
37
37
|
end
|
38
38
|
|
39
39
|
def add(pm)
|
@@ -42,7 +42,7 @@ class PerseusMatch
|
|
42
42
|
|
43
43
|
alias_method :<<, :add
|
44
44
|
|
45
|
-
def sort_by(attribute, *args
|
45
|
+
def sort_by(attribute, *args)
|
46
46
|
options = args.last.is_a?(Hash) ? args.pop : {}
|
47
47
|
|
48
48
|
_ = map { |phrase, matches|
|
@@ -63,15 +63,15 @@ class PerseusMatch
|
|
63
63
|
lambda { |match| res[match] < threshold } :
|
64
64
|
lambda { |match| res[match] > threshold }
|
65
65
|
|
66
|
-
matches.reject!
|
66
|
+
matches.reject!(&condition)
|
67
67
|
end
|
68
68
|
|
69
69
|
if limit = options[:limit]
|
70
|
-
matches.slice!(limit..-1)
|
70
|
+
matches.slice!(limit..-1) if matches.size > limit
|
71
71
|
end
|
72
72
|
|
73
73
|
# transform entries if so requested
|
74
|
-
matches.map!(
|
74
|
+
matches.map! { |match| yield(match) } if block_given?
|
75
75
|
|
76
76
|
[phrase, matches]
|
77
77
|
}.sort
|
@@ -79,8 +79,9 @@ class PerseusMatch
|
|
79
79
|
_ # rcov hack :-(
|
80
80
|
end
|
81
81
|
|
82
|
-
def sort(options = {}
|
83
|
-
|
82
|
+
def sort(options = {})
|
83
|
+
args = [:similarity, options.delete(:coeff), options]
|
84
|
+
block_given? ? sort_by(*args) { |*a| yield(*a) } : sort_by(*args)
|
84
85
|
end
|
85
86
|
|
86
87
|
def rank(options = {})
|
data/lib/perseus_match/list.rb
CHANGED
@@ -32,24 +32,46 @@ class PerseusMatch
|
|
32
32
|
|
33
33
|
class << self
|
34
34
|
|
35
|
-
def pair(phrases)
|
35
|
+
def pair(phrases, pm_options = {}, list_options = {})
|
36
36
|
phrases.uniq!
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
pairs = [] unless block_given?
|
39
|
+
|
40
|
+
unless list_options[:minimal]
|
41
|
+
# => pairs.size = phrases.size ** 2
|
42
|
+
|
43
|
+
phrases.each { |phrase|
|
44
|
+
phrases.each { |target|
|
45
|
+
pm = PerseusMatch.new(phrase, target, pm_options)
|
46
|
+
block_given? ? yield(pm) : pairs << pm
|
47
|
+
}
|
48
|
+
}
|
49
|
+
else
|
50
|
+
# => pairs.size = (phrases.size ** 2 - phrases.size) / 2
|
51
|
+
|
52
|
+
size = phrases.size
|
53
|
+
|
54
|
+
1.upto(size) { |i|
|
55
|
+
phrase = phrases[i - 1]
|
56
|
+
|
57
|
+
i.upto(size - 1) { |j|
|
58
|
+
pm = PerseusMatch.new(phrase, phrases[j], pm_options)
|
59
|
+
block_given? ? yield(pm) : pairs << pm
|
60
|
+
}
|
41
61
|
}
|
42
|
-
|
62
|
+
end
|
63
|
+
|
64
|
+
pairs || phrases
|
43
65
|
end
|
44
66
|
|
45
67
|
end
|
46
68
|
|
47
|
-
|
48
|
-
|
49
|
-
def initialize(phrases = [])
|
50
|
-
self.class.pair(phrases) { |pm| add(pm) }
|
69
|
+
def initialize(phrases = [], pm_options = {}, list_options = {})
|
70
|
+
self.class.pair(phrases, pm_options, list_options) { |pm| add(pm) }
|
51
71
|
end
|
52
72
|
|
73
|
+
alias_method :add, :push
|
74
|
+
|
53
75
|
end
|
54
76
|
|
55
77
|
end
|
@@ -28,44 +28,52 @@
|
|
28
28
|
|
29
29
|
$KCODE = 'u'
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
LINGO_CONFIG = {
|
34
|
-
'meeting' => {
|
35
|
-
'attendees' => [
|
36
|
-
{ 'textreader' => { 'files'=> 'STDIN' } },
|
37
|
-
{ 'tokenizer' => { } },
|
38
|
-
{ 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
|
39
|
-
{ 'decomposer' => { 'source' => 'sys-dic' } },
|
40
|
-
{ 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
|
41
|
-
{ 'synonymer' => { 'source' => 'sys-syn', 'out' => 'syn', 'skip'=>'?,t' } },
|
42
|
-
{ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } }
|
43
|
-
]
|
44
|
-
}
|
45
|
-
}
|
46
|
-
|
47
|
-
require 'tempfile'
|
31
|
+
require 'pathname'
|
32
|
+
require 'rbconfig'
|
48
33
|
require 'yaml'
|
49
34
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
def make_tmpname(basename, n)
|
55
|
-
case basename
|
56
|
-
when Array
|
57
|
-
prefix, suffix = *basename
|
58
|
-
else
|
59
|
-
prefix, suffix = basename, ''
|
60
|
-
end
|
35
|
+
require 'rubygems'
|
36
|
+
require 'backports/tempfile'
|
37
|
+
require 'nuggets/tempfile/open'
|
38
|
+
require 'nuggets/util/i18n'
|
61
39
|
|
62
|
-
|
63
|
-
|
64
|
-
|
40
|
+
begin
|
41
|
+
require 'text/soundex'
|
42
|
+
rescue LoadError
|
43
|
+
warn "could not load the Text gem -- soundex functionality will not be available"
|
44
|
+
end
|
65
45
|
|
66
|
-
|
46
|
+
LINGO_BASE = ENV['PM_LINGO_BASE'] || (
|
47
|
+
File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
|
48
|
+
)
|
49
|
+
|
50
|
+
LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
|
51
|
+
warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
|
52
|
+
|
53
|
+
lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
|
54
|
+
YAML.load_file(file)
|
55
|
+
else
|
56
|
+
warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
|
57
|
+
|
58
|
+
{
|
59
|
+
'meeting' => {
|
60
|
+
'attendees' => [
|
61
|
+
{ 'tokenizer' => { } },
|
62
|
+
{ 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
|
63
|
+
{ 'decomposer' => { 'source' => 'sys-dic' } },
|
64
|
+
{ 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
|
65
|
+
{ 'synonymer' => { 'source' => 'sys-syn', 'skip' => '?,t' } },
|
66
|
+
]
|
67
|
+
}
|
68
|
+
}
|
67
69
|
end
|
68
|
-
|
70
|
+
|
71
|
+
lingo_config['meeting']['attendees'].
|
72
|
+
unshift({ 'textreader' => { 'files'=> 'STDIN' } }).
|
73
|
+
push({ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } })
|
74
|
+
|
75
|
+
LINGO_CONFIG = lingo_config
|
76
|
+
|
69
77
|
class PerseusMatch
|
70
78
|
|
71
79
|
class TokenSet < Array
|
@@ -73,10 +81,8 @@ class PerseusMatch
|
|
73
81
|
def self.tokenize(form)
|
74
82
|
return @tokens[form] if @tokens
|
75
83
|
|
76
|
-
@_tokens = {}
|
77
|
-
|
78
|
-
k, @_tokens.has_key?(k) ? @_tokens[k] :
|
79
|
-
k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
|
84
|
+
@_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
|
85
|
+
k, (@_tokens[k] || []) | k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
|
80
86
|
)}
|
81
87
|
|
82
88
|
parse = lambda { |x|
|
@@ -85,8 +91,11 @@ class PerseusMatch
|
|
85
91
|
when /<(.*?)\s=\s\[(.*)\]>/
|
86
92
|
a, b = $1, $2
|
87
93
|
@_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
|
88
|
-
|
89
|
-
|
94
|
+
when /<(.*)>/, /:(.*):/
|
95
|
+
a, b = $1, $1.dup
|
96
|
+
@_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
|
97
|
+
|
98
|
+
warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
|
90
99
|
end
|
91
100
|
}
|
92
101
|
}
|
@@ -95,29 +104,32 @@ class PerseusMatch
|
|
95
104
|
File.open(t) { |f| parse[f] }
|
96
105
|
@tokens[form]
|
97
106
|
else
|
98
|
-
|
99
|
-
|
100
|
-
cfg.
|
107
|
+
raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
|
108
|
+
|
109
|
+
cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
|
110
|
+
YAML.dump(LINGO_CONFIG, t)
|
111
|
+
}
|
101
112
|
|
102
|
-
file = form
|
113
|
+
file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
|
103
114
|
|
104
115
|
unless File.file?(file) && File.readable?(file)
|
105
|
-
temp = Tempfile.
|
106
|
-
|
107
|
-
|
116
|
+
temp = Tempfile.open('perseus_match_temp') { |t|
|
117
|
+
t.puts form
|
118
|
+
}
|
108
119
|
|
109
120
|
file = temp.path
|
110
121
|
end
|
111
122
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
123
|
+
begin
|
124
|
+
Dir.chdir(LINGO_BASE) { parse[%x{
|
125
|
+
#{Config::CONFIG['ruby_install_name']} lingo.rb -c "#{cfg.path}" < "#{file}"
|
126
|
+
}] }
|
127
|
+
ensure
|
128
|
+
cfg.unlink
|
129
|
+
temp.unlink if temp
|
130
|
+
end
|
117
131
|
|
118
132
|
if temp
|
119
|
-
temp.unlink
|
120
|
-
|
121
133
|
tokens, @tokens = @tokens[form], nil
|
122
134
|
tokens
|
123
135
|
else
|
@@ -138,42 +150,40 @@ class PerseusMatch
|
|
138
150
|
end
|
139
151
|
|
140
152
|
def distance(other)
|
141
|
-
|
153
|
+
tokens1, tokens2 = tokens, other.tokens
|
154
|
+
size1, size2 = tokens1.size, tokens2.size
|
142
155
|
|
143
|
-
|
144
|
-
|
145
|
-
distance += 1
|
156
|
+
return size2 if tokens1.empty?
|
157
|
+
return size1 if tokens2.empty?
|
146
158
|
|
147
|
-
|
148
|
-
end
|
149
|
-
}
|
159
|
+
distance, costs = nil, (0..size2).to_a
|
150
160
|
|
151
|
-
|
152
|
-
|
161
|
+
0.upto(size1 - 1) { |index1|
|
162
|
+
token1, cost = tokens1[index1], index1 + 1
|
153
163
|
|
154
|
-
|
155
|
-
|
156
|
-
token.sub(%r{[/|].*?\z}, '')
|
157
|
-
}
|
158
|
-
end
|
164
|
+
0.upto(size2 - 1) { |index2|
|
165
|
+
penalty = token1 == tokens2[index2] ? 0 : 1
|
159
166
|
|
160
|
-
|
161
|
-
|
162
|
-
|
167
|
+
# rcov hack :-(
|
168
|
+
_ = [
|
169
|
+
costs[index2 + 1] + 1, # insertion
|
170
|
+
cost + 1, # deletion
|
171
|
+
costs[index2] + penalty # substitution
|
172
|
+
]
|
173
|
+
distance = _.min
|
163
174
|
|
164
|
-
|
165
|
-
|
166
|
-
end
|
175
|
+
costs[index2], cost = cost, distance
|
176
|
+
}
|
167
177
|
|
168
|
-
|
169
|
-
(self & other).inject([]) { |memo, token|
|
170
|
-
memo + [token] * [count(token), other.count(token)].max
|
178
|
+
costs[size2] = distance
|
171
179
|
}
|
180
|
+
|
181
|
+
distance + 1 # > 0 !?!
|
172
182
|
end
|
173
183
|
|
174
|
-
def
|
175
|
-
|
176
|
-
|
184
|
+
def tokens(wc = true)
|
185
|
+
wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
|
186
|
+
token.sub(%r{[/|].*?\z}, '')
|
177
187
|
}
|
178
188
|
end
|
179
189
|
|
@@ -186,26 +196,31 @@ class PerseusMatch
|
|
186
196
|
end
|
187
197
|
|
188
198
|
def incl(*wc)
|
189
|
-
(@incl ||= {})[wc = [*wc].compact] ||=
|
190
|
-
|
199
|
+
(@incl ||= {})[wc = [*wc].compact] ||= select { |token|
|
200
|
+
match?(token, wc)
|
191
201
|
}.to_token_set(form)
|
192
202
|
end
|
193
203
|
|
194
204
|
def excl(*wc)
|
195
|
-
(@excl ||= {})[wc = [*wc].compact] ||=
|
196
|
-
|
205
|
+
(@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
|
206
|
+
match?(token, wc)
|
197
207
|
}.to_token_set(form)
|
198
208
|
end
|
199
209
|
|
200
|
-
def
|
201
|
-
|
210
|
+
def soundex
|
211
|
+
raise "soundex functionality not available" unless defined?(Text::Soundex)
|
212
|
+
|
213
|
+
@soundex ||= map { |token|
|
214
|
+
token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
|
215
|
+
}.to_token_set(form)
|
202
216
|
end
|
203
217
|
|
204
|
-
def
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
218
|
+
def soundex!
|
219
|
+
replace soundex
|
220
|
+
end
|
221
|
+
|
222
|
+
def eql?(other)
|
223
|
+
tokens == other.tokens && form == other.form
|
209
224
|
end
|
210
225
|
|
211
226
|
def inspect
|
data/lib/perseus_match.rb
CHANGED
@@ -38,35 +38,53 @@ class PerseusMatch
|
|
38
38
|
|
39
39
|
DEFAULT_COEFF = 20
|
40
40
|
|
41
|
-
DISTANCE_SPEC = {
|
42
|
-
{}
|
43
|
-
{ :excl => %w[a t] } => 1,
|
44
|
-
{ :incl => 's' } => 2,
|
45
|
-
{ :incl => 'y' } => 4,
|
46
|
-
{ :sort => true } => 4
|
47
|
-
|
41
|
+
DISTANCE_SPEC = [ # {
|
42
|
+
[{}, 1], # {} => 1,
|
43
|
+
[{ :excl => %w[a t] }, 2], # { :excl => %w[a t] } => 1,
|
44
|
+
[{ :incl => 's' }, 3], # { :incl => 's' } => 2,
|
45
|
+
[{ :incl => 'y' }, 4], # { :incl => 'y' } => 4,
|
46
|
+
[{ :sort => true }, 4], # { :sort => true } => 4,
|
47
|
+
[{ :soundex => true }, 4] # { :soundex => true } => 8
|
48
|
+
] # }
|
48
49
|
|
49
50
|
class << self
|
50
51
|
|
51
|
-
def
|
52
|
-
|
52
|
+
def distance(*args)
|
53
|
+
new(*args).distance
|
53
54
|
end
|
54
55
|
|
55
|
-
def
|
56
|
-
|
56
|
+
def match(phrases, pm_options = {})
|
57
|
+
List.new(phrases, pm_options)
|
58
|
+
end
|
59
|
+
|
60
|
+
def cluster(phrases, options = {}, pm_options = {})
|
61
|
+
Cluster.new(phrases, pm_options).rank(options)
|
62
|
+
end
|
63
|
+
|
64
|
+
def check(*args)
|
65
|
+
check!(*args)
|
66
|
+
rescue CheckFailedError
|
67
|
+
false
|
68
|
+
end
|
69
|
+
|
70
|
+
def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
|
71
|
+
value = new(phrase, target, pm_options).send(attribute)
|
72
|
+
value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
|
57
73
|
end
|
58
74
|
|
59
75
|
end
|
60
76
|
|
61
|
-
attr_reader :phrase, :target, :distance_spec, :default_coeff
|
77
|
+
attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
|
62
78
|
|
63
79
|
def initialize(phrase, target, options = {})
|
64
|
-
@phrase = phrase
|
65
|
-
@target = target
|
80
|
+
@phrase = phrase.to_s
|
81
|
+
@target = target.to_s
|
66
82
|
|
67
83
|
@default_coeff = options[:default_coeff] || DEFAULT_COEFF
|
68
84
|
@distance_spec = options[:distance_spec] || DISTANCE_SPEC
|
69
85
|
|
86
|
+
@verbose = options[:verbose]
|
87
|
+
|
70
88
|
@similarity = {}
|
71
89
|
end
|
72
90
|
|
@@ -97,7 +115,7 @@ class PerseusMatch
|
|
97
115
|
|
98
116
|
def calculate_distance
|
99
117
|
return Infinity if phrase_tokens.disjoint?(target_tokens)
|
100
|
-
return 0 if phrase_tokens
|
118
|
+
return 0 if phrase_tokens.eql?(target_tokens)
|
101
119
|
|
102
120
|
distance_spec.inject(0) { |distance, (options, weight)|
|
103
121
|
distance + token_distance(options) * weight
|
@@ -105,19 +123,47 @@ class PerseusMatch
|
|
105
123
|
end
|
106
124
|
|
107
125
|
def token_distance(options = {})
|
108
|
-
|
109
|
-
|
126
|
+
tokens1 = phrase_tokens.inclexcl(options)
|
127
|
+
tokens2 = target_tokens.inclexcl(options)
|
110
128
|
|
111
129
|
if options[:sort]
|
112
|
-
|
113
|
-
|
130
|
+
tokens1 = tokens1.sort
|
131
|
+
tokens2 = tokens2.sort
|
114
132
|
end
|
115
133
|
|
116
|
-
|
134
|
+
if options[:soundex]
|
135
|
+
tokens1 = tokens1.soundex
|
136
|
+
tokens2 = tokens2.soundex
|
137
|
+
end
|
138
|
+
|
139
|
+
distance = tokens1.distance(tokens2)
|
140
|
+
|
141
|
+
warn <<-EOT if verbose
|
142
|
+
#{options.inspect}:
|
143
|
+
#{tokens1.inspect}
|
144
|
+
#{tokens2.inspect}
|
145
|
+
=> #{distance}
|
146
|
+
EOT
|
147
|
+
|
148
|
+
distance
|
117
149
|
end
|
118
150
|
|
119
151
|
def total_weight
|
120
|
-
distance_spec.
|
152
|
+
@total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
|
153
|
+
end
|
154
|
+
|
155
|
+
class CheckFailedError < StandardError
|
156
|
+
|
157
|
+
attr_reader :value, :threshold, :operator
|
158
|
+
|
159
|
+
def initialize(value, threshold, operator)
|
160
|
+
@value, @threshold, @operator = value, threshold, operator
|
161
|
+
end
|
162
|
+
|
163
|
+
def to_s
|
164
|
+
"FAILED: #{value} #{operator} #{threshold}"
|
165
|
+
end
|
166
|
+
|
121
167
|
end
|
122
168
|
|
123
169
|
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
describe PerseusMatch::Cluster do
|
2
|
+
|
3
|
+
it 'should accept limit option in sort_by' do
|
4
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :limit => 1).all? { |phrase, matches|
|
5
|
+
matches.size.should == 1
|
6
|
+
matches.size.should == matches.nitems
|
7
|
+
}
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should accept threshold option in sort_by (1a)' do
|
11
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
|
12
|
+
matches.size.should be_zero
|
13
|
+
matches.size.should == matches.nitems
|
14
|
+
}
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'should accept threshold option in sort_by (1b)' do
|
18
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0).all? { |phrase, matches|
|
19
|
+
matches.size.should == 2
|
20
|
+
matches.size.should == matches.nitems
|
21
|
+
}
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should accept threshold option in sort_by (2)' do
|
25
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'c').all? { |phrase, matches|
|
26
|
+
matches.size.should == 1
|
27
|
+
matches.size.should == matches.nitems
|
28
|
+
}
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should accept both limit and threshold options in sort_by (1)' do
|
32
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'z', :limit => 1).all? { |phrase, matches|
|
33
|
+
matches.size.should == 1
|
34
|
+
matches.size.should == matches.nitems
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should accept both limit and threshold options in sort_by (2)' do
|
39
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'a', :limit => 1).all? { |phrase, matches|
|
40
|
+
matches.size.should be_zero
|
41
|
+
matches.size.should == matches.nitems
|
42
|
+
}
|
43
|
+
end
|
44
|
+
|
45
|
+
end if LINGO_FOUND
|
@@ -0,0 +1,16 @@
|
|
1
|
+
describe PerseusMatch::List, '::pair' do
|
2
|
+
|
3
|
+
before :all do
|
4
|
+
@phrases = %w[foo bar baz]
|
5
|
+
@size = @phrases.size
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'should produce full list of pairs with correct size' do
|
9
|
+
PerseusMatch::List.pair(@phrases).size.should == @size ** 2
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should produce minimal list of pairs with correct size' do
|
13
|
+
PerseusMatch::List.pair(@phrases, {}, :minimal => true).size.should == (@size ** 2 - @size) / 2
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
describe PerseusMatch::TokenSet, ' with lingo' do
|
2
|
+
|
3
|
+
before :each do
|
4
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
5
|
+
end
|
6
|
+
|
7
|
+
it 'should tokenize a string' do
|
8
|
+
PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should report strictly equal TokenSets as ==' do
|
12
|
+
PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should report strictly equal TokenSets as eql' do
|
16
|
+
PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should report slightly equal TokenSets as ==' do
|
20
|
+
PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should *not* report slightly equal TokenSets as eql' do
|
24
|
+
PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should include form in inspect' do
|
28
|
+
PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
|
29
|
+
end
|
30
|
+
|
31
|
+
end if LINGO_FOUND
|
32
|
+
|
33
|
+
describe PerseusMatch::TokenSet, ' without lingo' do
|
34
|
+
|
35
|
+
before :each do
|
36
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should take a prepared file for tokenization' do
|
40
|
+
# prevent lingo from being used
|
41
|
+
lingo_base = LINGO_BASE.dup
|
42
|
+
LINGO_BASE.replace('')
|
43
|
+
|
44
|
+
temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
|
45
|
+
t.puts *%w[<foo|?> <bar|?>]
|
46
|
+
}
|
47
|
+
|
48
|
+
path = temp.path
|
49
|
+
link = 'perseus.tokens'
|
50
|
+
|
51
|
+
Dir.chdir(File.dirname(path)) {
|
52
|
+
File.symlink(path, link)
|
53
|
+
|
54
|
+
PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
|
55
|
+
|
56
|
+
File.unlink(link)
|
57
|
+
}
|
58
|
+
|
59
|
+
temp.unlink
|
60
|
+
|
61
|
+
# reset lingo base
|
62
|
+
LINGO_BASE.replace(lingo_base)
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nuggets/tempfile/open'
|
3
|
+
require 'nuggets/util/i18n'
|
4
|
+
|
5
|
+
# Behavioural spec for PerseusMatch: similarity thresholds, symmetry,
# distance, clustering and the check/check! helpers. Only defined when
# LINGO_FOUND is truthy.
describe PerseusMatch do

  before :all do
    @highly_similar = [
      'Anbetung der Könige',
      'Die Anbetung der Könige'
    ]  # ok

    @similar = [
      # @highly_similar + ...
      'Die Anbetung der Heiligen Drei Könige',
      'dIE AnBeTuNg der heILIGen dREI KÖniGE'
    ]  # ok

    @unfortunately_similar = [
      # @similar + ...
      'Die Die Die Anbetung der Könige',
      'Die Könige der Anbetung',
      'Königsanbetung hoch drei'
    ]  # *not* ok -- eventually try to drop these below the threshold

    @somewhat_similar = @highly_similar + @similar + @unfortunately_similar

    # the similar groups plus two unrelated phrases and the empty string
    corpus = @somewhat_similar + [
      'Drei mal drei macht sechs',
      'Das Ende dieses Blödsinns',
      ''
    ]

    # one phrase per line
    tokens_file = Tempfile.open('perseus_match_spec_temp') do |io|
      io.puts *corpus
    end

    # presumably primes the tokenizer for the whole corpus in a single
    # pass -- confirm against TokenSet.tokenize
    PerseusMatch::TokenSet.tokenize(tokens_file.path)

    tokens_file.unlink

    @matchings = PerseusMatch.match(corpus)
  end

  it 'should identify identical (non-empty) strings as identical' do
    @matchings.each do |m|
      next if m.phrase.empty?
      next unless m.phrase == m.target

      inform_on_error(m) { m.similarity.should == 1.0 }
    end
  end

  it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each do |m|
      next if m.phrase.empty?
      next unless m.phrase.replace_diacritics.downcase == m.target.replace_diacritics.downcase

      inform_on_error(m) { m.similarity.should > 0.95 }
    end
  end

  it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
    @matchings.each do |m|
      next if m.phrase.empty?
      next unless m.phrase.replace_diacritics.downcase != m.target.replace_diacritics.downcase

      inform_on_error(m) { m.similarity.should < 0.98 }
    end
  end

  it 'should identify disjunct (non-empty) strings as disjunct' do
    @matchings.each do |m|
      next if m.phrase.empty?
      next unless m.phrase_tokens.disjoint?(m.target_tokens)

      inform_on_error(m) { m.similarity.should == 0.0 }
    end
  end

  it 'should identify empty string as disjunct with anything, even with itself' do
    @matchings.each do |m|
      next unless m.phrase.empty? || m.target.empty?

      inform_on_error(m) { m.similarity.should == 0.0 }
    end
  end

  it 'should identify certain strings as highly similar (1)' do
    @matchings.each do |m|
      next unless @highly_similar.include?(m.phrase) && @highly_similar.include?(m.target)

      inform_on_error(m) { m.similarity.should > 0.9 }
    end
  end

  it 'should identify certain strings as highly similar (2)' do
    @highly_similar.each do |left|
      @highly_similar.each do |right|
        inform_on_error([left, right]) { PerseusMatch.check(left, right, 0.9).should be_true }
      end
    end
  end

  it 'should identify certain strings as similar (1)' do
    @matchings.each do |m|
      next unless @similar.include?(m.phrase) && @similar.include?(m.target)

      inform_on_error(m) { m.similarity.should > 0.8 }
    end
  end

  it 'should identify certain strings as similar (2)' do
    @similar.each do |left|
      @similar.each do |right|
        inform_on_error([left, right]) { PerseusMatch.check(left, right, 0.8).should be_true }
      end
    end
  end

  it 'should *not* identify other strings as similar (1)' do
    @matchings.each do |m|
      next unless @somewhat_similar.include?(m.phrase) && !@somewhat_similar.include?(m.target)

      inform_on_error(m) { m.similarity.should_not > 0.8 }
    end
  end

  it 'should *not* identify other strings as similar (2)' do
    @matchings.each do |m|
      next unless @somewhat_similar.include?(m.phrase) && !@somewhat_similar.include?(m.target)

      inform_on_error(m) { PerseusMatch.check(m.phrase, m.target, 0.8).should be_false }
    end
  end

  it 'should be symmetrical' do
    # similarity recorded per [phrase, target] pair; when the mirrored pair
    # has already been seen, both directions must agree
    seen = {}

    @matchings.each do |m|
      mirrored = seen[[m.target, m.phrase]]

      if mirrored
        inform_on_error(m) { mirrored.should == m.similarity }
      else
        seen[[m.phrase, m.target]] = m.similarity
      end
    end
  end

  it 'should calculate pair distance' do
    PerseusMatch.distance('foo', 'bar').class.should < Numeric
  end

  it 'should be clusterable' do
    PerseusMatch.cluster(@somewhat_similar).should be_an_instance_of(Array)
  end

  it 'should be checkable (1)' do
    PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
  end

  it 'should be checkable (2)' do
    # inspect the error message first, then re-raise so that raise_error
    # still sees the exception
    lambda {
      begin
        PerseusMatch.check!('foo', 'bar', 0, :>)
      rescue PerseusMatch::CheckFailedError => failure
        failure.to_s.should =~ /0/
        raise failure
      end
    }.should raise_error(PerseusMatch::CheckFailedError)
  end

end if LINGO_FOUND
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
unless Object.const_defined?(:PerseusMatch)
|
2
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
3
|
+
require 'perseus_match'
|
4
|
+
end
|
5
|
+
|
6
|
+
# Runs the given block and returns its result. If the block fails an
# expectation (Spec::Expectations::ExpectationNotMetError), the supplied
# +args+ -- typically the data under test -- are printed with Kernel#p,
# framed by blank lines, before the error is re-raised unchanged. Nothing is
# printed when no +args+ were given; any other exception propagates without
# output.
def inform_on_error(*args)
  yield
rescue Spec::Expectations::ExpectationNotMetError
  unless args.empty?
    puts
    p(*args)
    puts
  end

  # bare raise re-raises the expectation failure currently being handled
  raise
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,9 +9,19 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-12-09 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: ruby-backports
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
15
25
|
- !ruby/object:Gem::Dependency
|
16
26
|
name: ruby-nuggets
|
17
27
|
type: :runtime
|
@@ -20,7 +30,7 @@ dependencies:
|
|
20
30
|
requirements:
|
21
31
|
- - ">="
|
22
32
|
- !ruby/object:Gem::Version
|
23
|
-
version: 0.
|
33
|
+
version: 0.4.0
|
24
34
|
version:
|
25
35
|
description: Fuzzy string matching based on linguistic analysis
|
26
36
|
email: jens.wille@uni-koeln.de
|
@@ -33,29 +43,35 @@ extra_rdoc_files:
|
|
33
43
|
- ChangeLog
|
34
44
|
- README
|
35
45
|
files:
|
36
|
-
- lib/perseus_match.rb
|
46
|
+
- lib/perseus_match/list.rb
|
37
47
|
- lib/perseus_match/version.rb
|
38
48
|
- lib/perseus_match/token_set.rb
|
39
|
-
- lib/perseus_match/list.rb
|
40
49
|
- lib/perseus_match/cluster.rb
|
50
|
+
- lib/perseus_match.rb
|
41
51
|
- bin/perseus_match
|
52
|
+
- Rakefile
|
42
53
|
- COPYING
|
43
|
-
- README
|
44
54
|
- ChangeLog
|
45
|
-
-
|
55
|
+
- LINGO_BASE
|
56
|
+
- README
|
57
|
+
- spec/spec_helper.rb
|
58
|
+
- spec/perseus_match/list_spec.rb
|
59
|
+
- spec/perseus_match/cluster_spec.rb
|
60
|
+
- spec/perseus_match/token_set_spec.rb
|
61
|
+
- spec/perseus_match_spec.rb
|
46
62
|
has_rdoc: true
|
47
63
|
homepage: http://prometheus.rubyforge.org/perseus_match
|
48
64
|
post_install_message:
|
49
65
|
rdoc_options:
|
50
|
-
- --
|
51
|
-
-
|
66
|
+
- --line-numbers
|
67
|
+
- --inline-source
|
52
68
|
- --title
|
53
69
|
- perseus_match Application documentation
|
54
70
|
- --main
|
55
71
|
- README
|
72
|
+
- --charset
|
73
|
+
- UTF-8
|
56
74
|
- --all
|
57
|
-
- --line-numbers
|
58
|
-
- --inline-source
|
59
75
|
require_paths:
|
60
76
|
- lib
|
61
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -73,7 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
89
|
requirements: []
|
74
90
|
|
75
91
|
rubyforge_project: prometheus
|
76
|
-
rubygems_version: 1.
|
92
|
+
rubygems_version: 1.3.1
|
77
93
|
signing_key:
|
78
94
|
specification_version: 2
|
79
95
|
summary: Fuzzy string matching based on linguistic analysis
|