perseus_match 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LINGO_BASE +1 -0
- data/README +8 -6
- data/Rakefile +2 -2
- data/bin/perseus_match +226 -22
- data/lib/perseus_match/cluster.rb +9 -8
- data/lib/perseus_match/list.rb +31 -9
- data/lib/perseus_match/token_set.rb +105 -90
- data/lib/perseus_match/version.rb +1 -1
- data/lib/perseus_match.rb +67 -21
- data/spec/perseus_match/cluster_spec.rb +45 -0
- data/spec/perseus_match/list_spec.rb +16 -0
- data/spec/perseus_match/token_set_spec.rb +65 -0
- data/spec/perseus_match_spec.rb +168 -0
- data/spec/spec_helper.rb +18 -0
- metadata +28 -12
data/LINGO_BASE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
/home/jw/devel/lingo/trunk
|
data/README
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
== VERSION
|
4
4
|
|
5
|
-
This documentation refers to perseus_match version 0.0.
|
5
|
+
This documentation refers to perseus_match version 0.0.3
|
6
6
|
|
7
7
|
|
8
8
|
== DESCRIPTION
|
@@ -10,15 +10,17 @@ This documentation refers to perseus_match version 0.0.2
|
|
10
10
|
Fuzzy string matching based on linguistic analysis.
|
11
11
|
|
12
12
|
|
13
|
-
==
|
13
|
+
== LINKS
|
14
14
|
|
15
|
-
|
15
|
+
<b></b>
|
16
|
+
Documentation:: <http://prometheus.rubyforge.org/perseus_match>
|
17
|
+
Source code:: <http://github.com/blackwinter/perseus_match>
|
18
|
+
Rubyforge project:: <http://rubyforge.org/projects/prometheus>
|
16
19
|
|
17
20
|
|
18
|
-
==
|
21
|
+
== AUTHORS
|
19
22
|
|
20
|
-
* <
|
21
|
-
* <http://github.com/blackwinter/perseus_match>
|
23
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
22
24
|
|
23
25
|
|
24
26
|
== LICENSE AND COPYRIGHT
|
data/Rakefile
CHANGED
@@ -13,8 +13,8 @@ begin
|
|
13
13
|
:version => PerseusMatch::VERSION,
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
|
-
:extra_files => FileList['[A-Z]*'].to_a,
|
17
|
-
:dependencies => [['ruby-nuggets', '>= 0.
|
16
|
+
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
|
17
|
+
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
|
18
18
|
}
|
19
19
|
}}
|
20
20
|
rescue LoadError
|
data/bin/perseus_match
CHANGED
@@ -2,8 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'optparse'
|
4
4
|
require 'benchmark'
|
5
|
+
require 'yaml'
|
5
6
|
|
6
7
|
require 'rubygems'
|
8
|
+
require 'nuggets/enumerable/minmax'
|
7
9
|
require 'nuggets/numeric/duration'
|
8
10
|
|
9
11
|
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
@@ -14,19 +16,29 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
|
|
14
16
|
abort USAGE if ARGV.empty?
|
15
17
|
|
16
18
|
options = {
|
17
|
-
:
|
18
|
-
:threshold
|
19
|
-
:sort
|
19
|
+
:config => nil,
|
20
|
+
:threshold => 0,
|
21
|
+
:sort => false,
|
22
|
+
:stats => false,
|
23
|
+
:lingo => false,
|
24
|
+
:minimal => false,
|
25
|
+
:separate => false,
|
26
|
+
:check => false,
|
27
|
+
:failed_only => false,
|
28
|
+
:align => false,
|
29
|
+
:adjust_coeff => false
|
20
30
|
}
|
21
31
|
|
22
32
|
OptionParser.new { |opts|
|
23
33
|
opts.banner = USAGE
|
24
34
|
|
25
|
-
opts.separator ''
|
35
|
+
opts.separator ' '
|
26
36
|
opts.separator 'Options:'
|
27
37
|
|
28
|
-
opts.on('--
|
29
|
-
|
38
|
+
opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
|
39
|
+
abort "Can't find config file: #{f}." unless File.readable?(f)
|
40
|
+
|
41
|
+
options[:config] = f
|
30
42
|
}
|
31
43
|
|
32
44
|
opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
|
@@ -37,7 +49,51 @@ OptionParser.new { |opts|
|
|
37
49
|
options[:sort] = true
|
38
50
|
}
|
39
51
|
|
40
|
-
opts.
|
52
|
+
opts.on('-S', '--stats', 'Output some statistics at the end') {
|
53
|
+
options[:stats] = true
|
54
|
+
}
|
55
|
+
|
56
|
+
opts.on('-v', '--verbose', 'Print additional information during processing') {
|
57
|
+
options[:verbose] = true
|
58
|
+
}
|
59
|
+
|
60
|
+
opts.separator ' '
|
61
|
+
opts.separator ' * Calculating similarities (default)'
|
62
|
+
opts.separator ' '
|
63
|
+
|
64
|
+
opts.on('-m', '--minimal', 'Produce minimal pairs only') {
|
65
|
+
options[:minimal] = true
|
66
|
+
}
|
67
|
+
|
68
|
+
opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
|
69
|
+
options[:separate] = p || ''
|
70
|
+
}
|
71
|
+
|
72
|
+
opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
|
73
|
+
options[:lingo] = true
|
74
|
+
}
|
75
|
+
|
76
|
+
opts.separator ' '
|
77
|
+
opts.separator ' * Checking pairs'
|
78
|
+
opts.separator ' '
|
79
|
+
|
80
|
+
opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
|
81
|
+
options[:check] = true
|
82
|
+
}
|
83
|
+
|
84
|
+
opts.on('-f', '--failed', 'Print only failed checks') {
|
85
|
+
options[:failed_only] = true
|
86
|
+
}
|
87
|
+
|
88
|
+
opts.on('-a', '--align', 'Align check results') {
|
89
|
+
options[:align] = true
|
90
|
+
}
|
91
|
+
|
92
|
+
opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
|
93
|
+
options[:adjust_coeff] = true
|
94
|
+
}
|
95
|
+
|
96
|
+
opts.separator ' '
|
41
97
|
opts.separator 'Generic options:'
|
42
98
|
|
43
99
|
opts.on('-h', '--help', 'Print this help message and exit') {
|
@@ -57,33 +113,179 @@ end
|
|
57
113
|
|
58
114
|
PerseusMatch::TokenSet.tokenize(file)
|
59
115
|
|
60
|
-
|
116
|
+
skip_re = %r{\A\s*(?:#|\z)}o
|
117
|
+
phrases = []
|
118
|
+
|
119
|
+
File.open(file).each { |line|
|
120
|
+
phrases << line.chomp unless line =~ skip_re
|
121
|
+
}.close
|
122
|
+
|
123
|
+
pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
|
124
|
+
pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
|
125
|
+
list_options = { :minimal => options[:minimal] }
|
61
126
|
|
62
127
|
threshold, count, count_all = options[:threshold], 0, 0
|
63
128
|
|
64
|
-
|
65
|
-
|
66
|
-
require 'pp'
|
129
|
+
action = if options[:check]
|
130
|
+
require 'fastercsv'
|
67
131
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
132
|
+
format = if options[:align]
|
133
|
+
require 'jcode'
|
134
|
+
|
135
|
+
width = phrases.max(:jlength) + 3
|
136
|
+
|
137
|
+
lambda { |line, res|
|
138
|
+
"#{line} #{'.' * (width - line.jlength)} [#{res}]"
|
139
|
+
}
|
75
140
|
else
|
76
|
-
|
77
|
-
|
78
|
-
|
141
|
+
lambda { |line, res|
|
142
|
+
"#{line} [#{res}]"
|
143
|
+
}
|
144
|
+
end
|
145
|
+
|
146
|
+
phrases.sort! if options[:sort]
|
147
|
+
phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
|
148
|
+
|
149
|
+
global_threshold = options[:threshold]
|
150
|
+
failed_only = options[:failed_only]
|
151
|
+
collect_stats = options[:stats]
|
152
|
+
adjust_coeff = options[:adjust_coeff]
|
153
|
+
|
154
|
+
_action = lambda { |*args|
|
155
|
+
pm_options[:default_coeff] = args.first unless args.empty?
|
156
|
+
|
157
|
+
count, count_all = 0, 0
|
158
|
+
positives = negatives = false_positives = false_negatives = 0.0
|
159
|
+
|
160
|
+
phrases.each { |line, spec|
|
161
|
+
phrase, target, threshold, operator, _ = *spec
|
162
|
+
|
163
|
+
threshold ||= global_threshold
|
164
|
+
operator ||= '>'
|
165
|
+
assign = operator =~ />/
|
166
|
+
|
167
|
+
begin
|
168
|
+
PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
169
|
+
|
79
170
|
count += 1
|
171
|
+
assign ? positives += 1 : negatives += 1
|
172
|
+
|
173
|
+
puts format[line, 'OK'] unless adjust_coeff || failed_only
|
174
|
+
rescue PerseusMatch::CheckFailedError => err
|
175
|
+
assign ? false_negatives += 1 : false_positives += 1
|
176
|
+
|
177
|
+
puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
|
80
178
|
end
|
81
179
|
count_all += 1
|
82
180
|
}
|
181
|
+
|
182
|
+
divide = lambda { |numerator, denominator|
|
183
|
+
denominator == 0 ? 0 : numerator / denominator
|
184
|
+
}
|
185
|
+
|
186
|
+
if collect_stats || adjust_coeff
|
187
|
+
error = divide[
|
188
|
+
false_positives + false_negatives,
|
189
|
+
positives + negatives + false_positives + false_negatives
|
190
|
+
]
|
191
|
+
end
|
192
|
+
|
193
|
+
if collect_stats
|
194
|
+
recall = divide[positives, positives + false_negatives]
|
195
|
+
precision = divide[positives, positives + false_positives]
|
196
|
+
f1 = divide[2 * recall * precision, recall + precision]
|
197
|
+
|
198
|
+
warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
|
199
|
+
recall * 100, precision * 100, f1, error
|
200
|
+
]
|
201
|
+
end
|
202
|
+
|
203
|
+
error if adjust_coeff
|
204
|
+
}
|
205
|
+
|
206
|
+
if adjust_coeff
|
207
|
+
lambda {
|
208
|
+
step, max = 1, 100
|
209
|
+
|
210
|
+
start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
|
211
|
+
start_err = _action[start_coeff]
|
212
|
+
|
213
|
+
previous_coeff = next_coeff = start_coeff
|
214
|
+
previous_err = next_err = start_err
|
215
|
+
|
216
|
+
max.times { break if (previous_err = _action[previous_coeff -= step]) != start_err }
|
217
|
+
max.times { break if (next_err = _action[next_coeff += step]) != start_err }
|
218
|
+
|
219
|
+
best_err = [start_err, previous_err, next_err].min
|
220
|
+
|
221
|
+
if best_err == start_err
|
222
|
+
best_coeff = start_coeff
|
223
|
+
else
|
224
|
+
if best_err == previous_err
|
225
|
+
max.times {
|
226
|
+
break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
|
227
|
+
previous_err, previous_coeff = current_err, current_coeff
|
228
|
+
}
|
229
|
+
|
230
|
+
best_err, best_coeff = previous_err, previous_coeff
|
231
|
+
else
|
232
|
+
max.times {
|
233
|
+
break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
|
234
|
+
next_err, next_coeff = current_err, current_coeff
|
235
|
+
}
|
236
|
+
|
237
|
+
best_err, best_coeff = next_err, next_coeff
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
|
242
|
+
}
|
243
|
+
else
|
244
|
+
_action
|
83
245
|
end
|
84
|
-
|
246
|
+
else
|
247
|
+
format =
|
248
|
+
options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
|
249
|
+
options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
|
250
|
+
lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
|
251
|
+
|
252
|
+
if options[:sort]
|
253
|
+
require 'pp'
|
254
|
+
|
255
|
+
lambda {
|
256
|
+
pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
|
257
|
+
if pm.similarity >= threshold
|
258
|
+
res = format[pm]
|
259
|
+
count += 1
|
260
|
+
end
|
261
|
+
count_all += 1
|
262
|
+
res
|
263
|
+
}.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
|
264
|
+
}
|
265
|
+
else
|
266
|
+
lambda {
|
267
|
+
separator, previous_phrase = options[:separate], nil
|
268
|
+
|
269
|
+
PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
|
270
|
+
if separator && pm.phrase != previous_phrase ||= pm.phrase
|
271
|
+
puts separator
|
272
|
+
previous_phrase = pm.phrase
|
273
|
+
end
|
274
|
+
|
275
|
+
if pm.similarity >= threshold
|
276
|
+
puts format[pm]
|
277
|
+
count += 1
|
278
|
+
end
|
279
|
+
|
280
|
+
count_all += 1
|
281
|
+
}
|
282
|
+
}
|
283
|
+
end
|
284
|
+
end
|
85
285
|
|
86
286
|
if options[:stats]
|
287
|
+
time = Benchmark.realtime(&action)
|
288
|
+
|
87
289
|
hms, x, y = time.to_hms(2), time / count, time / count_all
|
88
290
|
|
89
291
|
precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
|
@@ -91,4 +293,6 @@ if options[:stats]
|
|
91
293
|
warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
|
92
294
|
phrases.size, count, count_all, hms, x, y
|
93
295
|
]
|
296
|
+
else
|
297
|
+
action.call
|
94
298
|
end
|
@@ -30,10 +30,10 @@ class PerseusMatch
|
|
30
30
|
|
31
31
|
class Cluster < Hash
|
32
32
|
|
33
|
-
def initialize(phrases = [])
|
33
|
+
def initialize(phrases = [], pm_options = {}, list_options = {})
|
34
34
|
super() { |h, k| h[k] = [] }
|
35
35
|
|
36
|
-
List.
|
36
|
+
List.pair(phrases, pm_options, list_options) { |pm| add(pm) }
|
37
37
|
end
|
38
38
|
|
39
39
|
def add(pm)
|
@@ -42,7 +42,7 @@ class PerseusMatch
|
|
42
42
|
|
43
43
|
alias_method :<<, :add
|
44
44
|
|
45
|
-
def sort_by(attribute, *args
|
45
|
+
def sort_by(attribute, *args)
|
46
46
|
options = args.last.is_a?(Hash) ? args.pop : {}
|
47
47
|
|
48
48
|
_ = map { |phrase, matches|
|
@@ -63,15 +63,15 @@ class PerseusMatch
|
|
63
63
|
lambda { |match| res[match] < threshold } :
|
64
64
|
lambda { |match| res[match] > threshold }
|
65
65
|
|
66
|
-
matches.reject!
|
66
|
+
matches.reject!(&condition)
|
67
67
|
end
|
68
68
|
|
69
69
|
if limit = options[:limit]
|
70
|
-
matches.slice!(limit..-1)
|
70
|
+
matches.slice!(limit..-1) if matches.size > limit
|
71
71
|
end
|
72
72
|
|
73
73
|
# transform entries if so requested
|
74
|
-
matches.map!(
|
74
|
+
matches.map! { |match| yield(match) } if block_given?
|
75
75
|
|
76
76
|
[phrase, matches]
|
77
77
|
}.sort
|
@@ -79,8 +79,9 @@ class PerseusMatch
|
|
79
79
|
_ # rcov hack :-(
|
80
80
|
end
|
81
81
|
|
82
|
-
def sort(options = {}
|
83
|
-
|
82
|
+
def sort(options = {})
|
83
|
+
args = [:similarity, options.delete(:coeff), options]
|
84
|
+
block_given? ? sort_by(*args) { |*a| yield(*a) } : sort_by(*args)
|
84
85
|
end
|
85
86
|
|
86
87
|
def rank(options = {})
|
data/lib/perseus_match/list.rb
CHANGED
@@ -32,24 +32,46 @@ class PerseusMatch
|
|
32
32
|
|
33
33
|
class << self
|
34
34
|
|
35
|
-
def pair(phrases)
|
35
|
+
def pair(phrases, pm_options = {}, list_options = {})
|
36
36
|
phrases.uniq!
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
pairs = [] unless block_given?
|
39
|
+
|
40
|
+
unless list_options[:minimal]
|
41
|
+
# => pairs.size = phrases.size ** 2
|
42
|
+
|
43
|
+
phrases.each { |phrase|
|
44
|
+
phrases.each { |target|
|
45
|
+
pm = PerseusMatch.new(phrase, target, pm_options)
|
46
|
+
block_given? ? yield(pm) : pairs << pm
|
47
|
+
}
|
48
|
+
}
|
49
|
+
else
|
50
|
+
# => pairs.size = (phrases.size ** 2 - phrases.size) / 2
|
51
|
+
|
52
|
+
size = phrases.size
|
53
|
+
|
54
|
+
1.upto(size) { |i|
|
55
|
+
phrase = phrases[i - 1]
|
56
|
+
|
57
|
+
i.upto(size - 1) { |j|
|
58
|
+
pm = PerseusMatch.new(phrase, phrases[j], pm_options)
|
59
|
+
block_given? ? yield(pm) : pairs << pm
|
60
|
+
}
|
41
61
|
}
|
42
|
-
|
62
|
+
end
|
63
|
+
|
64
|
+
pairs || phrases
|
43
65
|
end
|
44
66
|
|
45
67
|
end
|
46
68
|
|
47
|
-
|
48
|
-
|
49
|
-
def initialize(phrases = [])
|
50
|
-
self.class.pair(phrases) { |pm| add(pm) }
|
69
|
+
def initialize(phrases = [], pm_options = {}, list_options = {})
|
70
|
+
self.class.pair(phrases, pm_options, list_options) { |pm| add(pm) }
|
51
71
|
end
|
52
72
|
|
73
|
+
alias_method :add, :push
|
74
|
+
|
53
75
|
end
|
54
76
|
|
55
77
|
end
|
@@ -28,44 +28,52 @@
|
|
28
28
|
|
29
29
|
$KCODE = 'u'
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
LINGO_CONFIG = {
|
34
|
-
'meeting' => {
|
35
|
-
'attendees' => [
|
36
|
-
{ 'textreader' => { 'files'=> 'STDIN' } },
|
37
|
-
{ 'tokenizer' => { } },
|
38
|
-
{ 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
|
39
|
-
{ 'decomposer' => { 'source' => 'sys-dic' } },
|
40
|
-
{ 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
|
41
|
-
{ 'synonymer' => { 'source' => 'sys-syn', 'out' => 'syn', 'skip'=>'?,t' } },
|
42
|
-
{ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } }
|
43
|
-
]
|
44
|
-
}
|
45
|
-
}
|
46
|
-
|
47
|
-
require 'tempfile'
|
31
|
+
require 'pathname'
|
32
|
+
require 'rbconfig'
|
48
33
|
require 'yaml'
|
49
34
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
def make_tmpname(basename, n)
|
55
|
-
case basename
|
56
|
-
when Array
|
57
|
-
prefix, suffix = *basename
|
58
|
-
else
|
59
|
-
prefix, suffix = basename, ''
|
60
|
-
end
|
35
|
+
require 'rubygems'
|
36
|
+
require 'backports/tempfile'
|
37
|
+
require 'nuggets/tempfile/open'
|
38
|
+
require 'nuggets/util/i18n'
|
61
39
|
|
62
|
-
|
63
|
-
|
64
|
-
|
40
|
+
begin
|
41
|
+
require 'text/soundex'
|
42
|
+
rescue LoadError
|
43
|
+
warn "could not load the Text gem -- soundex functionality will not be available"
|
44
|
+
end
|
65
45
|
|
66
|
-
|
46
|
+
LINGO_BASE = ENV['PM_LINGO_BASE'] || (
|
47
|
+
File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
|
48
|
+
)
|
49
|
+
|
50
|
+
LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
|
51
|
+
warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
|
52
|
+
|
53
|
+
lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
|
54
|
+
YAML.load_file(file)
|
55
|
+
else
|
56
|
+
warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
|
57
|
+
|
58
|
+
{
|
59
|
+
'meeting' => {
|
60
|
+
'attendees' => [
|
61
|
+
{ 'tokenizer' => { } },
|
62
|
+
{ 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
|
63
|
+
{ 'decomposer' => { 'source' => 'sys-dic' } },
|
64
|
+
{ 'multiworder' => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
|
65
|
+
{ 'synonymer' => { 'source' => 'sys-syn', 'skip' => '?,t' } },
|
66
|
+
]
|
67
|
+
}
|
68
|
+
}
|
67
69
|
end
|
68
|
-
|
70
|
+
|
71
|
+
lingo_config['meeting']['attendees'].
|
72
|
+
unshift({ 'textreader' => { 'files'=> 'STDIN' } }).
|
73
|
+
push({ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } })
|
74
|
+
|
75
|
+
LINGO_CONFIG = lingo_config
|
76
|
+
|
69
77
|
class PerseusMatch
|
70
78
|
|
71
79
|
class TokenSet < Array
|
@@ -73,10 +81,8 @@ class PerseusMatch
|
|
73
81
|
def self.tokenize(form)
|
74
82
|
return @tokens[form] if @tokens
|
75
83
|
|
76
|
-
@_tokens = {}
|
77
|
-
|
78
|
-
k, @_tokens.has_key?(k) ? @_tokens[k] :
|
79
|
-
k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
|
84
|
+
@_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
|
85
|
+
k, (@_tokens[k] || []) | k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
|
80
86
|
)}
|
81
87
|
|
82
88
|
parse = lambda { |x|
|
@@ -85,8 +91,11 @@ class PerseusMatch
|
|
85
91
|
when /<(.*?)\s=\s\[(.*)\]>/
|
86
92
|
a, b = $1, $2
|
87
93
|
@_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
|
88
|
-
|
89
|
-
|
94
|
+
when /<(.*)>/, /:(.*):/
|
95
|
+
a, b = $1, $1.dup
|
96
|
+
@_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
|
97
|
+
|
98
|
+
warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
|
90
99
|
end
|
91
100
|
}
|
92
101
|
}
|
@@ -95,29 +104,32 @@ class PerseusMatch
|
|
95
104
|
File.open(t) { |f| parse[f] }
|
96
105
|
@tokens[form]
|
97
106
|
else
|
98
|
-
|
99
|
-
|
100
|
-
cfg.
|
107
|
+
raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
|
108
|
+
|
109
|
+
cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
|
110
|
+
YAML.dump(LINGO_CONFIG, t)
|
111
|
+
}
|
101
112
|
|
102
|
-
file = form
|
113
|
+
file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
|
103
114
|
|
104
115
|
unless File.file?(file) && File.readable?(file)
|
105
|
-
temp = Tempfile.
|
106
|
-
|
107
|
-
|
116
|
+
temp = Tempfile.open('perseus_match_temp') { |t|
|
117
|
+
t.puts form
|
118
|
+
}
|
108
119
|
|
109
120
|
file = temp.path
|
110
121
|
end
|
111
122
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
123
|
+
begin
|
124
|
+
Dir.chdir(LINGO_BASE) { parse[%x{
|
125
|
+
#{Config::CONFIG['ruby_install_name']} lingo.rb -c "#{cfg.path}" < "#{file}"
|
126
|
+
}] }
|
127
|
+
ensure
|
128
|
+
cfg.unlink
|
129
|
+
temp.unlink if temp
|
130
|
+
end
|
117
131
|
|
118
132
|
if temp
|
119
|
-
temp.unlink
|
120
|
-
|
121
133
|
tokens, @tokens = @tokens[form], nil
|
122
134
|
tokens
|
123
135
|
else
|
@@ -138,42 +150,40 @@ class PerseusMatch
|
|
138
150
|
end
|
139
151
|
|
140
152
|
def distance(other)
|
141
|
-
|
153
|
+
tokens1, tokens2 = tokens, other.tokens
|
154
|
+
size1, size2 = tokens1.size, tokens2.size
|
142
155
|
|
143
|
-
|
144
|
-
|
145
|
-
distance += 1
|
156
|
+
return size2 if tokens1.empty?
|
157
|
+
return size1 if tokens2.empty?
|
146
158
|
|
147
|
-
|
148
|
-
end
|
149
|
-
}
|
159
|
+
distance, costs = nil, (0..size2).to_a
|
150
160
|
|
151
|
-
|
152
|
-
|
161
|
+
0.upto(size1 - 1) { |index1|
|
162
|
+
token1, cost = tokens1[index1], index1 + 1
|
153
163
|
|
154
|
-
|
155
|
-
|
156
|
-
token.sub(%r{[/|].*?\z}, '')
|
157
|
-
}
|
158
|
-
end
|
164
|
+
0.upto(size2 - 1) { |index2|
|
165
|
+
penalty = token1 == tokens2[index2] ? 0 : 1
|
159
166
|
|
160
|
-
|
161
|
-
|
162
|
-
|
167
|
+
# rcov hack :-(
|
168
|
+
_ = [
|
169
|
+
costs[index2 + 1] + 1, # insertion
|
170
|
+
cost + 1, # deletion
|
171
|
+
costs[index2] + penalty # substitution
|
172
|
+
]
|
173
|
+
distance = _.min
|
163
174
|
|
164
|
-
|
165
|
-
|
166
|
-
end
|
175
|
+
costs[index2], cost = cost, distance
|
176
|
+
}
|
167
177
|
|
168
|
-
|
169
|
-
(self & other).inject([]) { |memo, token|
|
170
|
-
memo + [token] * [count(token), other.count(token)].max
|
178
|
+
costs[size2] = distance
|
171
179
|
}
|
180
|
+
|
181
|
+
distance + 1 # > 0 !?!
|
172
182
|
end
|
173
183
|
|
174
|
-
def
|
175
|
-
|
176
|
-
|
184
|
+
def tokens(wc = true)
|
185
|
+
wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
|
186
|
+
token.sub(%r{[/|].*?\z}, '')
|
177
187
|
}
|
178
188
|
end
|
179
189
|
|
@@ -186,26 +196,31 @@ class PerseusMatch
|
|
186
196
|
end
|
187
197
|
|
188
198
|
def incl(*wc)
|
189
|
-
(@incl ||= {})[wc = [*wc].compact] ||=
|
190
|
-
|
199
|
+
(@incl ||= {})[wc = [*wc].compact] ||= select { |token|
|
200
|
+
match?(token, wc)
|
191
201
|
}.to_token_set(form)
|
192
202
|
end
|
193
203
|
|
194
204
|
def excl(*wc)
|
195
|
-
(@excl ||= {})[wc = [*wc].compact] ||=
|
196
|
-
|
205
|
+
(@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
|
206
|
+
match?(token, wc)
|
197
207
|
}.to_token_set(form)
|
198
208
|
end
|
199
209
|
|
200
|
-
def
|
201
|
-
|
210
|
+
def soundex
|
211
|
+
raise "soundex functionality not available" unless defined?(Text::Soundex)
|
212
|
+
|
213
|
+
@soundex ||= map { |token|
|
214
|
+
token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
|
215
|
+
}.to_token_set(form)
|
202
216
|
end
|
203
217
|
|
204
|
-
def
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
218
|
+
def soundex!
|
219
|
+
replace soundex
|
220
|
+
end
|
221
|
+
|
222
|
+
def eql?(other)
|
223
|
+
tokens == other.tokens && form == other.form
|
209
224
|
end
|
210
225
|
|
211
226
|
def inspect
|
data/lib/perseus_match.rb
CHANGED
@@ -38,35 +38,53 @@ class PerseusMatch
|
|
38
38
|
|
39
39
|
DEFAULT_COEFF = 20
|
40
40
|
|
41
|
-
DISTANCE_SPEC = {
|
42
|
-
{}
|
43
|
-
{ :excl => %w[a t] } => 1,
|
44
|
-
{ :incl => 's' } => 2,
|
45
|
-
{ :incl => 'y' } => 4,
|
46
|
-
{ :sort => true } => 4
|
47
|
-
|
41
|
+
DISTANCE_SPEC = [ # {
|
42
|
+
[{}, 1], # {} => 1,
|
43
|
+
[{ :excl => %w[a t] }, 2], # { :excl => %w[a t] } => 1,
|
44
|
+
[{ :incl => 's' }, 3], # { :incl => 's' } => 2,
|
45
|
+
[{ :incl => 'y' }, 4], # { :incl => 'y' } => 4,
|
46
|
+
[{ :sort => true }, 4], # { :sort => true } => 4,
|
47
|
+
[{ :soundex => true }, 4] # { :soundex => true } => 8
|
48
|
+
] # }
|
48
49
|
|
49
50
|
class << self
|
50
51
|
|
51
|
-
def
|
52
|
-
|
52
|
+
def distance(*args)
|
53
|
+
new(*args).distance
|
53
54
|
end
|
54
55
|
|
55
|
-
def
|
56
|
-
|
56
|
+
def match(phrases, pm_options = {})
|
57
|
+
List.new(phrases, pm_options)
|
58
|
+
end
|
59
|
+
|
60
|
+
def cluster(phrases, options = {}, pm_options = {})
|
61
|
+
Cluster.new(phrases, pm_options).rank(options)
|
62
|
+
end
|
63
|
+
|
64
|
+
def check(*args)
|
65
|
+
check!(*args)
|
66
|
+
rescue CheckFailedError
|
67
|
+
false
|
68
|
+
end
|
69
|
+
|
70
|
+
def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
|
71
|
+
value = new(phrase, target, pm_options).send(attribute)
|
72
|
+
value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
|
57
73
|
end
|
58
74
|
|
59
75
|
end
|
60
76
|
|
61
|
-
attr_reader :phrase, :target, :distance_spec, :default_coeff
|
77
|
+
attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
|
62
78
|
|
63
79
|
def initialize(phrase, target, options = {})
|
64
|
-
@phrase = phrase
|
65
|
-
@target = target
|
80
|
+
@phrase = phrase.to_s
|
81
|
+
@target = target.to_s
|
66
82
|
|
67
83
|
@default_coeff = options[:default_coeff] || DEFAULT_COEFF
|
68
84
|
@distance_spec = options[:distance_spec] || DISTANCE_SPEC
|
69
85
|
|
86
|
+
@verbose = options[:verbose]
|
87
|
+
|
70
88
|
@similarity = {}
|
71
89
|
end
|
72
90
|
|
@@ -97,7 +115,7 @@ class PerseusMatch
|
|
97
115
|
|
98
116
|
def calculate_distance
|
99
117
|
return Infinity if phrase_tokens.disjoint?(target_tokens)
|
100
|
-
return 0 if phrase_tokens
|
118
|
+
return 0 if phrase_tokens.eql?(target_tokens)
|
101
119
|
|
102
120
|
distance_spec.inject(0) { |distance, (options, weight)|
|
103
121
|
distance + token_distance(options) * weight
|
@@ -105,19 +123,47 @@ class PerseusMatch
|
|
105
123
|
end
|
106
124
|
|
107
125
|
def token_distance(options = {})
|
108
|
-
|
109
|
-
|
126
|
+
tokens1 = phrase_tokens.inclexcl(options)
|
127
|
+
tokens2 = target_tokens.inclexcl(options)
|
110
128
|
|
111
129
|
if options[:sort]
|
112
|
-
|
113
|
-
|
130
|
+
tokens1 = tokens1.sort
|
131
|
+
tokens2 = tokens2.sort
|
114
132
|
end
|
115
133
|
|
116
|
-
|
134
|
+
if options[:soundex]
|
135
|
+
tokens1 = tokens1.soundex
|
136
|
+
tokens2 = tokens2.soundex
|
137
|
+
end
|
138
|
+
|
139
|
+
distance = tokens1.distance(tokens2)
|
140
|
+
|
141
|
+
warn <<-EOT if verbose
|
142
|
+
#{options.inspect}:
|
143
|
+
#{tokens1.inspect}
|
144
|
+
#{tokens2.inspect}
|
145
|
+
=> #{distance}
|
146
|
+
EOT
|
147
|
+
|
148
|
+
distance
|
117
149
|
end
|
118
150
|
|
119
151
|
def total_weight
|
120
|
-
distance_spec.
|
152
|
+
@total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
|
153
|
+
end
|
154
|
+
|
155
|
+
class CheckFailedError < StandardError
|
156
|
+
|
157
|
+
attr_reader :value, :threshold, :operator
|
158
|
+
|
159
|
+
def initialize(value, threshold, operator)
|
160
|
+
@value, @threshold, @operator = value, threshold, operator
|
161
|
+
end
|
162
|
+
|
163
|
+
def to_s
|
164
|
+
"FAILED: #{value} #{operator} #{threshold}"
|
165
|
+
end
|
166
|
+
|
121
167
|
end
|
122
168
|
|
123
169
|
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
describe PerseusMatch::Cluster do
|
2
|
+
|
3
|
+
it 'should accept limit option in sort_by' do
|
4
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :limit => 1).all? { |phrase, matches|
|
5
|
+
matches.size.should == 1
|
6
|
+
matches.size.should == matches.nitems
|
7
|
+
}
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'should accept threshold option in sort_by (1a)' do
|
11
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
|
12
|
+
matches.size.should be_zero
|
13
|
+
matches.size.should == matches.nitems
|
14
|
+
}
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'should accept threshold option in sort_by (1b)' do
|
18
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0).all? { |phrase, matches|
|
19
|
+
matches.size.should == 2
|
20
|
+
matches.size.should == matches.nitems
|
21
|
+
}
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should accept threshold option in sort_by (2)' do
|
25
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'c').all? { |phrase, matches|
|
26
|
+
matches.size.should == 1
|
27
|
+
matches.size.should == matches.nitems
|
28
|
+
}
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'should accept both limit and threshold options in sort_by (1)' do
|
32
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'z', :limit => 1).all? { |phrase, matches|
|
33
|
+
matches.size.should == 1
|
34
|
+
matches.size.should == matches.nitems
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should accept both limit and threshold options in sort_by (2)' do
|
39
|
+
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'a', :limit => 1).all? { |phrase, matches|
|
40
|
+
matches.size.should be_zero
|
41
|
+
matches.size.should == matches.nitems
|
42
|
+
}
|
43
|
+
end
|
44
|
+
|
45
|
+
end if LINGO_FOUND
|
@@ -0,0 +1,16 @@
|
|
1
|
+
describe PerseusMatch::List, '::pair' do
|
2
|
+
|
3
|
+
before :all do
|
4
|
+
@phrases = %w[foo bar baz]
|
5
|
+
@size = @phrases.size
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'should produce full list of pairs with correct size' do
|
9
|
+
PerseusMatch::List.pair(@phrases).size.should == @size ** 2
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should produce minimal list of pairs with correct size' do
|
13
|
+
PerseusMatch::List.pair(@phrases, {}, :minimal => true).size.should == (@size ** 2 - @size) / 2
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
describe PerseusMatch::TokenSet, ' with lingo' do
|
2
|
+
|
3
|
+
before :each do
|
4
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
5
|
+
end
|
6
|
+
|
7
|
+
it 'should tokenize a string' do
|
8
|
+
PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should report strictly equal TokenSets as ==' do
|
12
|
+
PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should report strictly equal TokenSets as eql' do
|
16
|
+
PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should report slightly equal TokenSets as ==' do
|
20
|
+
PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'should *not* report slightly equal TokenSets as eql' do
|
24
|
+
PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should include form in inspect' do
|
28
|
+
PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
|
29
|
+
end
|
30
|
+
|
31
|
+
end if LINGO_FOUND
|
32
|
+
|
33
|
+
describe PerseusMatch::TokenSet, ' without lingo' do
|
34
|
+
|
35
|
+
before :each do
|
36
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'should take a prepared file for tokenization' do
|
40
|
+
# prevent lingo from being used
|
41
|
+
lingo_base = LINGO_BASE.dup
|
42
|
+
LINGO_BASE.replace('')
|
43
|
+
|
44
|
+
temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
|
45
|
+
t.puts *%w[<foo|?> <bar|?>]
|
46
|
+
}
|
47
|
+
|
48
|
+
path = temp.path
|
49
|
+
link = 'perseus.tokens'
|
50
|
+
|
51
|
+
Dir.chdir(File.dirname(path)) {
|
52
|
+
File.symlink(path, link)
|
53
|
+
|
54
|
+
PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
|
55
|
+
|
56
|
+
File.unlink(link)
|
57
|
+
}
|
58
|
+
|
59
|
+
temp.unlink
|
60
|
+
|
61
|
+
# reset lingo base
|
62
|
+
LINGO_BASE.replace(lingo_base)
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nuggets/tempfile/open'
|
3
|
+
require 'nuggets/util/i18n'
|
4
|
+
|
5
|
+
describe PerseusMatch do
|
6
|
+
|
7
|
+
before :all do
|
8
|
+
@highly_similar = [
|
9
|
+
'Anbetung der Könige',
|
10
|
+
'Die Anbetung der Könige'
|
11
|
+
] # ok
|
12
|
+
|
13
|
+
@similar = [
|
14
|
+
# @highly_similar + ...
|
15
|
+
'Die Anbetung der Heiligen Drei Könige',
|
16
|
+
'dIE AnBeTuNg der heILIGen dREI KÖniGE'
|
17
|
+
] # ok
|
18
|
+
|
19
|
+
@unfortunately_similar = [
|
20
|
+
# @similar + ...
|
21
|
+
'Die Die Die Anbetung der Könige',
|
22
|
+
'Die Könige der Anbetung',
|
23
|
+
'Königsanbetung hoch drei'
|
24
|
+
] # *not* ok -- eventually try to drop these below the threshold
|
25
|
+
|
26
|
+
@somewhat_similar = @highly_similar + @similar + @unfortunately_similar
|
27
|
+
|
28
|
+
phrases = @somewhat_similar + [
|
29
|
+
'Drei mal drei macht sechs',
|
30
|
+
'Das Ende dieses Blödsinns',
|
31
|
+
''
|
32
|
+
]
|
33
|
+
|
34
|
+
temp = Tempfile.open('perseus_match_spec_temp') { |t|
|
35
|
+
t.puts *phrases
|
36
|
+
}
|
37
|
+
|
38
|
+
PerseusMatch::TokenSet.tokenize(temp.path)
|
39
|
+
|
40
|
+
temp.unlink
|
41
|
+
|
42
|
+
@matchings = PerseusMatch.match(phrases)
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'should identify identical (non-empty) strings as identical' do
|
46
|
+
@matchings.each { |matching|
|
47
|
+
if !matching.phrase.empty? && matching.phrase == matching.target
|
48
|
+
inform_on_error(matching) { matching.similarity.should == 1.0 }
|
49
|
+
end
|
50
|
+
}
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
|
54
|
+
@matchings.each { |matching|
|
55
|
+
if !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase == matching.target.replace_diacritics.downcase
|
56
|
+
inform_on_error(matching) { matching.similarity.should > 0.95 }
|
57
|
+
end
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
|
62
|
+
@matchings.each { |matching|
|
63
|
+
if !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase != matching.target.replace_diacritics.downcase
|
64
|
+
inform_on_error(matching) { matching.similarity.should < 0.98 }
|
65
|
+
end
|
66
|
+
}
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should identify disjunct (non-empty) strings as disjunct' do
|
70
|
+
@matchings.each { |matching|
|
71
|
+
if !matching.phrase.empty? && matching.phrase_tokens.disjoint?(matching.target_tokens)
|
72
|
+
inform_on_error(matching) { matching.similarity.should == 0.0 }
|
73
|
+
end
|
74
|
+
}
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'should identify empty string as disjunct with anything, even with itself' do
|
78
|
+
@matchings.each { |matching|
|
79
|
+
if matching.phrase.empty? || matching.target.empty?
|
80
|
+
inform_on_error(matching) { matching.similarity.should == 0.0 }
|
81
|
+
end
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'should identify certain strings as highly similar (1)' do
|
86
|
+
@matchings.each { |matching|
|
87
|
+
if @highly_similar.include?(matching.phrase) && @highly_similar.include?(matching.target)
|
88
|
+
inform_on_error(matching) { matching.similarity.should > 0.9 }
|
89
|
+
end
|
90
|
+
}
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'should identify certain strings as highly similar (2)' do
|
94
|
+
@highly_similar.each { |phrase|
|
95
|
+
@highly_similar.each { |target|
|
96
|
+
inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.9).should be_true }
|
97
|
+
}
|
98
|
+
}
|
99
|
+
end
|
100
|
+
|
101
|
+
it 'should identify certain strings as similar (1)' do
|
102
|
+
@matchings.each { |matching|
|
103
|
+
if @similar.include?(matching.phrase) && @similar.include?(matching.target)
|
104
|
+
inform_on_error(matching) { matching.similarity.should > 0.8 }
|
105
|
+
end
|
106
|
+
}
|
107
|
+
end
|
108
|
+
|
109
|
+
it 'should identify certain strings as similar (2)' do
|
110
|
+
@similar.each { |phrase|
|
111
|
+
@similar.each { |target|
|
112
|
+
inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.8).should be_true }
|
113
|
+
}
|
114
|
+
}
|
115
|
+
end
|
116
|
+
|
117
|
+
it 'should *not* identify other strings as similar (1)' do
|
118
|
+
@matchings.each { |matching|
|
119
|
+
if @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)
|
120
|
+
inform_on_error(matching) { matching.similarity.should_not > 0.8 }
|
121
|
+
end
|
122
|
+
}
|
123
|
+
end
|
124
|
+
|
125
|
+
it 'should *not* identify other strings as similar (2)' do
|
126
|
+
@matchings.each { |matching|
|
127
|
+
if @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)
|
128
|
+
inform_on_error(matching) { PerseusMatch.check(matching.phrase, matching.target, 0.8).should be_false }
|
129
|
+
end
|
130
|
+
}
|
131
|
+
end
|
132
|
+
|
133
|
+
it 'should be symmetrical' do
|
134
|
+
similarities = {}
|
135
|
+
|
136
|
+
@matchings.each { |matching|
|
137
|
+
if similarity = similarities[[matching.target, matching.phrase]]
|
138
|
+
inform_on_error(matching) { similarity.should == matching.similarity }
|
139
|
+
else
|
140
|
+
similarities[[matching.phrase, matching.target]] = matching.similarity
|
141
|
+
end
|
142
|
+
}
|
143
|
+
end
|
144
|
+
|
145
|
+
it 'should calculate pair distance' do
|
146
|
+
PerseusMatch.distance('foo', 'bar').class.should < Numeric
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'should be clusterable' do
|
150
|
+
PerseusMatch.cluster(@somewhat_similar).should be_an_instance_of(Array)
|
151
|
+
end
|
152
|
+
|
153
|
+
it 'should be checkable (1)' do
|
154
|
+
PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
|
155
|
+
end
|
156
|
+
|
157
|
+
it 'should be checkable (2)' do
|
158
|
+
lambda {
|
159
|
+
begin
|
160
|
+
PerseusMatch.check!('foo', 'bar', 0, :>)
|
161
|
+
rescue PerseusMatch::CheckFailedError => err
|
162
|
+
err.to_s.should =~ /0/
|
163
|
+
raise err
|
164
|
+
end
|
165
|
+
}.should raise_error(PerseusMatch::CheckFailedError)
|
166
|
+
end
|
167
|
+
|
168
|
+
end if LINGO_FOUND
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
unless Object.const_defined?(:PerseusMatch)
|
2
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
3
|
+
require 'perseus_match'
|
4
|
+
end
|
5
|
+
|
6
|
+
def inform_on_error(*args)
|
7
|
+
begin
|
8
|
+
yield
|
9
|
+
rescue Spec::Expectations::ExpectationNotMetError => err
|
10
|
+
unless args.empty?
|
11
|
+
puts
|
12
|
+
p *args
|
13
|
+
puts
|
14
|
+
end
|
15
|
+
|
16
|
+
raise
|
17
|
+
end
|
18
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,9 +9,19 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-12-09 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: ruby-backports
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
15
25
|
- !ruby/object:Gem::Dependency
|
16
26
|
name: ruby-nuggets
|
17
27
|
type: :runtime
|
@@ -20,7 +30,7 @@ dependencies:
|
|
20
30
|
requirements:
|
21
31
|
- - ">="
|
22
32
|
- !ruby/object:Gem::Version
|
23
|
-
version: 0.
|
33
|
+
version: 0.4.0
|
24
34
|
version:
|
25
35
|
description: Fuzzy string matching based on linguistic analysis
|
26
36
|
email: jens.wille@uni-koeln.de
|
@@ -33,29 +43,35 @@ extra_rdoc_files:
|
|
33
43
|
- ChangeLog
|
34
44
|
- README
|
35
45
|
files:
|
36
|
-
- lib/perseus_match.rb
|
46
|
+
- lib/perseus_match/list.rb
|
37
47
|
- lib/perseus_match/version.rb
|
38
48
|
- lib/perseus_match/token_set.rb
|
39
|
-
- lib/perseus_match/list.rb
|
40
49
|
- lib/perseus_match/cluster.rb
|
50
|
+
- lib/perseus_match.rb
|
41
51
|
- bin/perseus_match
|
52
|
+
- Rakefile
|
42
53
|
- COPYING
|
43
|
-
- README
|
44
54
|
- ChangeLog
|
45
|
-
- Rakefile
|
55
|
+
- LINGO_BASE
|
56
|
+
- README
|
57
|
+
- spec/spec_helper.rb
|
58
|
+
- spec/perseus_match/list_spec.rb
|
59
|
+
- spec/perseus_match/cluster_spec.rb
|
60
|
+
- spec/perseus_match/token_set_spec.rb
|
61
|
+
- spec/perseus_match_spec.rb
|
46
62
|
has_rdoc: true
|
47
63
|
homepage: http://prometheus.rubyforge.org/perseus_match
|
48
64
|
post_install_message:
|
49
65
|
rdoc_options:
|
50
|
-
- --
|
51
|
-
-
|
66
|
+
- --line-numbers
|
67
|
+
- --inline-source
|
52
68
|
- --title
|
53
69
|
- perseus_match Application documentation
|
54
70
|
- --main
|
55
71
|
- README
|
72
|
+
- --charset
|
73
|
+
- UTF-8
|
56
74
|
- --all
|
57
|
-
- --line-numbers
|
58
|
-
- --inline-source
|
59
75
|
require_paths:
|
60
76
|
- lib
|
61
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -73,7 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
89
|
requirements: []
|
74
90
|
|
75
91
|
rubyforge_project: prometheus
|
76
|
-
rubygems_version: 1.
|
92
|
+
rubygems_version: 1.3.1
|
77
93
|
signing_key:
|
78
94
|
specification_version: 2
|
79
95
|
summary: Fuzzy string matching based on linguistic analysis
|