blackwinter-perseus_match 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +1 -1
- data/Rakefile +1 -1
- data/bin/perseus_match +55 -34
- data/lib/perseus_match/token_set.rb +14 -4
- data/lib/perseus_match/version.rb +1 -1
- data/sample/check.csv +27 -0
- data/sample/config.yaml +28 -0
- data/sample/lingo.cfg +8 -0
- data/sample/phrases.txt +9 -0
- data/spec/perseus_match/cluster_spec.rb +2 -1
- data/spec/perseus_match/token_set_spec.rb +16 -0
- metadata +6 -3
data/README
CHANGED
data/Rakefile
CHANGED
@@ -13,7 +13,7 @@ begin
|
|
13
13
|
:version => PerseusMatch::VERSION,
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
|
-
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
|
16
|
+
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
|
17
17
|
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
|
18
18
|
}
|
19
19
|
}}
|
data/bin/perseus_match
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require 'optparse'
|
4
4
|
require 'benchmark'
|
5
5
|
require 'yaml'
|
6
|
+
require 'set'
|
6
7
|
|
7
8
|
require 'rubygems'
|
8
9
|
require 'nuggets/enumerable/minmax'
|
@@ -20,9 +21,11 @@ options = {
|
|
20
21
|
:threshold => 0,
|
21
22
|
:sort => false,
|
22
23
|
:stats => false,
|
23
|
-
:
|
24
|
+
:silent => false,
|
25
|
+
:unknowns => nil,
|
24
26
|
:minimal => false,
|
25
27
|
:separate => false,
|
28
|
+
:lingo => false,
|
26
29
|
:check => false,
|
27
30
|
:failed_only => false,
|
28
31
|
:align => false,
|
@@ -57,6 +60,14 @@ OptionParser.new { |opts|
|
|
57
60
|
options[:verbose] = true
|
58
61
|
}
|
59
62
|
|
63
|
+
opts.on('-n', '--silent', 'Suppress warnings') {
|
64
|
+
options[:silent] = true
|
65
|
+
}
|
66
|
+
|
67
|
+
opts.on('-u', '--unknowns FILE', 'Write unknown tokens as lingo dictionary to FILE') { |f|
|
68
|
+
options[:unknowns] = f
|
69
|
+
}
|
70
|
+
|
60
71
|
opts.separator ' '
|
61
72
|
opts.separator ' * Calculating similarities (default)'
|
62
73
|
opts.separator ' '
|
@@ -111,14 +122,24 @@ else
|
|
111
122
|
abort "Input file not found: #{file}" unless File.readable?(file)
|
112
123
|
end
|
113
124
|
|
114
|
-
|
125
|
+
unknowns = Set.new if options[:unknowns]
|
126
|
+
|
127
|
+
PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
|
128
|
+
|
129
|
+
if unknowns
|
130
|
+
File.open(options[:unknowns], 'w') { |f|
|
131
|
+
unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
|
132
|
+
}
|
133
|
+
|
134
|
+
unknowns = nil
|
135
|
+
end
|
115
136
|
|
116
137
|
skip_re = %r{\A\s*(?:#|\z)}o
|
117
138
|
phrases = []
|
118
139
|
|
119
|
-
File.open(file)
|
120
|
-
phrases << line.chomp unless line =~ skip_re
|
121
|
-
}
|
140
|
+
File.open(file) { |f|
|
141
|
+
f.each { |line| phrases << line.chomp unless line =~ skip_re }
|
142
|
+
}
|
122
143
|
|
123
144
|
pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
|
124
145
|
pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
|
@@ -144,7 +165,6 @@ action = if options[:check]
|
|
144
165
|
end
|
145
166
|
|
146
167
|
phrases.sort! if options[:sort]
|
147
|
-
phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
|
148
168
|
|
149
169
|
global_threshold = options[:threshold]
|
150
170
|
failed_only = options[:failed_only]
|
@@ -157,12 +177,12 @@ action = if options[:check]
|
|
157
177
|
count, count_all = 0, 0
|
158
178
|
positives = negatives = false_positives = false_negatives = 0.0
|
159
179
|
|
160
|
-
phrases.each { |line
|
161
|
-
phrase, target, threshold, operator, _ = *
|
180
|
+
phrases.each { |line|
|
181
|
+
phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
|
162
182
|
|
163
183
|
threshold ||= global_threshold
|
164
184
|
operator ||= '>'
|
165
|
-
assign = operator =~ />/
|
185
|
+
assign = operator =~ />/ || operator == '=='
|
166
186
|
|
167
187
|
begin
|
168
188
|
PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
@@ -176,6 +196,7 @@ action = if options[:check]
|
|
176
196
|
|
177
197
|
puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
|
178
198
|
end
|
199
|
+
|
179
200
|
count_all += 1
|
180
201
|
}
|
181
202
|
|
@@ -195,9 +216,13 @@ action = if options[:check]
|
|
195
216
|
precision = divide[positives, positives + false_positives]
|
196
217
|
f1 = divide[2 * recall * precision, recall + precision]
|
197
218
|
|
198
|
-
|
219
|
+
stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
|
199
220
|
recall * 100, precision * 100, f1, error
|
200
221
|
]
|
222
|
+
|
223
|
+
stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
|
224
|
+
|
225
|
+
warn stats
|
201
226
|
end
|
202
227
|
|
203
228
|
error if adjust_coeff
|
@@ -222,23 +247,19 @@ action = if options[:check]
|
|
222
247
|
best_coeff = start_coeff
|
223
248
|
else
|
224
249
|
if best_err == previous_err
|
225
|
-
|
226
|
-
|
227
|
-
previous_err, previous_coeff = current_err, current_coeff
|
228
|
-
}
|
229
|
-
|
230
|
-
best_err, best_coeff = previous_err, previous_coeff
|
250
|
+
step *= -1
|
251
|
+
best_coeff = previous_coeff
|
231
252
|
else
|
232
|
-
|
233
|
-
break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
|
234
|
-
next_err, next_coeff = current_err, current_coeff
|
235
|
-
}
|
236
|
-
|
237
|
-
best_err, best_coeff = next_err, next_coeff
|
253
|
+
best_coeff = next_coeff
|
238
254
|
end
|
255
|
+
|
256
|
+
max.times {
|
257
|
+
break if (current_err = _action[current_coeff = best_coeff + step]) > best_err
|
258
|
+
best_err, best_coeff = current_err, current_coeff
|
259
|
+
}
|
239
260
|
end
|
240
261
|
|
241
|
-
puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
|
262
|
+
puts 'Coeff = %d (%d), Err = %0.4f (%0.4f)' % [best_coeff, start_coeff, best_err, start_err]
|
242
263
|
}
|
243
264
|
else
|
244
265
|
_action
|
@@ -246,38 +267,38 @@ action = if options[:check]
|
|
246
267
|
else
|
247
268
|
format =
|
248
269
|
options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
|
249
|
-
options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
|
270
|
+
options[:sort] ? lambda { |pm| " #{[pm.target, pm.distance, pm.similarity].inspect}" } :
|
250
271
|
lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
|
251
272
|
|
252
273
|
if options[:sort]
|
253
|
-
require 'pp'
|
254
|
-
|
255
274
|
lambda {
|
256
|
-
|
275
|
+
PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
|
276
|
+
count_all += 1
|
277
|
+
|
257
278
|
if pm.similarity >= threshold
|
258
|
-
res = format[pm]
|
259
279
|
count += 1
|
280
|
+
format[pm]
|
260
281
|
end
|
261
|
-
|
262
|
-
|
263
|
-
}
|
282
|
+
}.each { |phrase, matches|
|
283
|
+
puts "#{phrase.inspect}:", matches.compact
|
284
|
+
}
|
264
285
|
}
|
265
286
|
else
|
266
287
|
lambda {
|
267
288
|
separator, previous_phrase = options[:separate], nil
|
268
289
|
|
269
290
|
PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
|
291
|
+
count_all += 1
|
292
|
+
|
270
293
|
if separator && pm.phrase != previous_phrase ||= pm.phrase
|
271
294
|
puts separator
|
272
295
|
previous_phrase = pm.phrase
|
273
296
|
end
|
274
297
|
|
275
298
|
if pm.similarity >= threshold
|
276
|
-
puts format[pm]
|
277
299
|
count += 1
|
300
|
+
puts format[pm]
|
278
301
|
end
|
279
|
-
|
280
|
-
count_all += 1
|
281
302
|
}
|
282
303
|
}
|
283
304
|
end
|
@@ -78,7 +78,7 @@ class PerseusMatch
|
|
78
78
|
|
79
79
|
class TokenSet < Array
|
80
80
|
|
81
|
-
def self.tokenize(form)
|
81
|
+
def self.tokenize(form, unknowns = false)
|
82
82
|
return @tokens[form] if @tokens
|
83
83
|
|
84
84
|
@_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
|
@@ -90,12 +90,22 @@ class PerseusMatch
|
|
90
90
|
case res
|
91
91
|
when /<(.*?)\s=\s\[(.*)\]>/
|
92
92
|
a, b = $1, $2
|
93
|
-
|
93
|
+
a.sub!(/\|.*/, '')
|
94
|
+
|
95
|
+
@_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
|
94
96
|
when /<(.*)>/, /:(.*):/
|
95
97
|
a, b = $1, $1.dup
|
96
|
-
|
98
|
+
a.sub!(/[\/|].*/, '')
|
99
|
+
|
100
|
+
if unknowns && b =~ /\|\?\z/
|
101
|
+
if unknowns.respond_to?(:<<)
|
102
|
+
unknowns << a
|
103
|
+
else
|
104
|
+
warn "UNK: #{a} [#{res.strip}]"
|
105
|
+
end
|
106
|
+
end
|
97
107
|
|
98
|
-
|
108
|
+
@_tokens[a] ||= [b.replace_diacritics.downcase]
|
99
109
|
end
|
100
110
|
}
|
101
111
|
}
|
data/sample/check.csv
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# phrase,target,threshold[,operator (default: >)]
|
2
|
+
|
3
|
+
"Anbetung der Könige","Die Anbetung der Könige",0.95
|
4
|
+
"Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
|
5
|
+
"Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
|
6
|
+
"Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
|
7
|
+
"Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
|
8
|
+
|
9
|
+
"Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
|
10
|
+
"Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
|
11
|
+
"Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
|
12
|
+
"Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
|
13
|
+
|
14
|
+
"Die Anbetung der Heiligen Drei Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95
|
15
|
+
|
16
|
+
"Anbetung der Könige","Die Die Die Anbetung der Könige",0.95,<
|
17
|
+
"Anbetung der Könige","Die Die Die Anbetung der Könige",0.8
|
18
|
+
"Anbetung der Könige","Die Könige der Anbetung",0.95,<
|
19
|
+
"Anbetung der Könige","Die Könige der Anbetung",0.8
|
20
|
+
"Anbetung der Könige","Königsanbetung hoch drei",0.95,<
|
21
|
+
"Anbetung der Könige","Königsanbetung hoch drei",0.8
|
22
|
+
|
23
|
+
"Anbetung der Könige","Drei mal drei macht sechs",0.5,<
|
24
|
+
"Anbetung der Könige","Das Ende dieses Blödsinns",0.5,<
|
25
|
+
|
26
|
+
# test ;-)
|
27
|
+
,,,
|
data/sample/config.yaml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
---
|
2
|
+
:distance_spec:
|
3
|
+
|
4
|
+
# default, as is
|
5
|
+
# - - {}
|
6
|
+
# - 1
|
7
|
+
|
8
|
+
# ignore (exclude) adjectives and particles
|
9
|
+
- - :excl: [a, t]
|
10
|
+
- 2
|
11
|
+
|
12
|
+
# consider (include) only substantives
|
13
|
+
- - :incl: s
|
14
|
+
- 3
|
15
|
+
|
16
|
+
# consider (include) only synonyms
|
17
|
+
# - - :incl: y
|
18
|
+
# - 4
|
19
|
+
|
20
|
+
# sort the tokens when comparing
|
21
|
+
# - - :sort: true
|
22
|
+
# - 4
|
23
|
+
|
24
|
+
# replace tokens by their soundex value
|
25
|
+
- - :soundex: true
|
26
|
+
- 4
|
27
|
+
|
28
|
+
:default_coeff: 35
|
data/sample/lingo.cfg
ADDED
data/sample/phrases.txt
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Anbetung der Könige
|
2
|
+
Das Ende dieses Blödsinns
|
3
|
+
dIE AnBeTuNg der heILIGen dREI KÖniGE
|
4
|
+
Die Anbetung der Heiligen Drei Könige
|
5
|
+
Die Anbetung der Könige
|
6
|
+
Die Die Die Anbetung der Könige
|
7
|
+
Die Könige der Anbetung
|
8
|
+
Drei mal drei macht sechs
|
9
|
+
Königsanbetung hoch drei
|
@@ -9,8 +9,9 @@ describe PerseusMatch::Cluster do
|
|
9
9
|
|
10
10
|
it 'should accept threshold option in sort_by (1a)' do
|
11
11
|
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
|
12
|
-
matches.size.should
|
12
|
+
matches.size.should == 1
|
13
13
|
matches.size.should == matches.nitems
|
14
|
+
matches.each { |match| match.target.should == phrase }
|
14
15
|
}
|
15
16
|
end
|
16
17
|
|
@@ -4,6 +4,14 @@ describe PerseusMatch::TokenSet, ' with lingo' do
|
|
4
4
|
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
5
5
|
end
|
6
6
|
|
7
|
+
before :all do
|
8
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
9
|
+
end
|
10
|
+
|
11
|
+
after :all do
|
12
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
13
|
+
end
|
14
|
+
|
7
15
|
it 'should tokenize a string' do
|
8
16
|
PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
|
9
17
|
end
|
@@ -36,6 +44,14 @@ describe PerseusMatch::TokenSet, ' without lingo' do
|
|
36
44
|
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
37
45
|
end
|
38
46
|
|
47
|
+
before :all do
|
48
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
49
|
+
end
|
50
|
+
|
51
|
+
after :all do
|
52
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
53
|
+
end
|
54
|
+
|
39
55
|
it 'should take a prepared file for tokenization' do
|
40
56
|
# prevent lingo from being used
|
41
57
|
lingo_base = LINGO_BASE.dup
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blackwinter-perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-01-13 00:00:00 -08:00
|
13
13
|
default_executable: perseus_match
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -50,13 +50,16 @@ files:
|
|
50
50
|
- Rakefile
|
51
51
|
- COPYING
|
52
52
|
- ChangeLog
|
53
|
-
- LINGO_BASE
|
54
53
|
- README
|
55
54
|
- spec/spec_helper.rb
|
56
55
|
- spec/perseus_match/list_spec.rb
|
57
56
|
- spec/perseus_match/cluster_spec.rb
|
58
57
|
- spec/perseus_match/token_set_spec.rb
|
59
58
|
- spec/perseus_match_spec.rb
|
59
|
+
- sample/config.yaml
|
60
|
+
- sample/lingo.cfg
|
61
|
+
- sample/phrases.txt
|
62
|
+
- sample/check.csv
|
60
63
|
has_rdoc: true
|
61
64
|
homepage: http://prometheus.rubyforge.org/perseus_match
|
62
65
|
post_install_message:
|