perseus_match 0.0.3 → 0.0.4
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- data/README +1 -1
- data/Rakefile +1 -1
- data/bin/perseus_match +55 -34
- data/lib/perseus_match/token_set.rb +14 -4
- data/lib/perseus_match/version.rb +1 -1
- data/sample/check.csv +27 -0
- data/sample/config.yaml +28 -0
- data/sample/lingo.cfg +8 -0
- data/sample/phrases.txt +9 -0
- data/spec/perseus_match/cluster_spec.rb +2 -1
- data/spec/perseus_match/token_set_spec.rb +16 -0
- metadata +6 -3
- data/LINGO_BASE +0 -1
data/README
CHANGED
data/Rakefile
CHANGED
|
@@ -13,7 +13,7 @@ begin
|
|
|
13
13
|
:version => PerseusMatch::VERSION,
|
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
|
16
|
-
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
|
|
16
|
+
:extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
|
|
17
17
|
:dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
|
|
18
18
|
}
|
|
19
19
|
}}
|
data/bin/perseus_match
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require 'optparse'
|
|
4
4
|
require 'benchmark'
|
|
5
5
|
require 'yaml'
|
|
6
|
+
require 'set'
|
|
6
7
|
|
|
7
8
|
require 'rubygems'
|
|
8
9
|
require 'nuggets/enumerable/minmax'
|
|
@@ -20,9 +21,11 @@ options = {
|
|
|
20
21
|
:threshold => 0,
|
|
21
22
|
:sort => false,
|
|
22
23
|
:stats => false,
|
|
23
|
-
:
|
|
24
|
+
:silent => false,
|
|
25
|
+
:unknowns => nil,
|
|
24
26
|
:minimal => false,
|
|
25
27
|
:separate => false,
|
|
28
|
+
:lingo => false,
|
|
26
29
|
:check => false,
|
|
27
30
|
:failed_only => false,
|
|
28
31
|
:align => false,
|
|
@@ -57,6 +60,14 @@ OptionParser.new { |opts|
|
|
|
57
60
|
options[:verbose] = true
|
|
58
61
|
}
|
|
59
62
|
|
|
63
|
+
opts.on('-n', '--silent', 'Suppress warnings') {
|
|
64
|
+
options[:silent] = true
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
opts.on('-u', '--unknowns FILE', 'Write unknown tokens as lingo dictionary to FILE') { |f|
|
|
68
|
+
options[:unknowns] = f
|
|
69
|
+
}
|
|
70
|
+
|
|
60
71
|
opts.separator ' '
|
|
61
72
|
opts.separator ' * Calculating similarities (default)'
|
|
62
73
|
opts.separator ' '
|
|
@@ -111,14 +122,24 @@ else
|
|
|
111
122
|
abort "Input file not found: #{file}" unless File.readable?(file)
|
|
112
123
|
end
|
|
113
124
|
|
|
114
|
-
|
|
125
|
+
unknowns = Set.new if options[:unknowns]
|
|
126
|
+
|
|
127
|
+
PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
|
|
128
|
+
|
|
129
|
+
if unknowns
|
|
130
|
+
File.open(options[:unknowns], 'w') { |f|
|
|
131
|
+
unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
unknowns = nil
|
|
135
|
+
end
|
|
115
136
|
|
|
116
137
|
skip_re = %r{\A\s*(?:#|\z)}o
|
|
117
138
|
phrases = []
|
|
118
139
|
|
|
119
|
-
File.open(file)
|
|
120
|
-
phrases << line.chomp unless line =~ skip_re
|
|
121
|
-
}
|
|
140
|
+
File.open(file) { |f|
|
|
141
|
+
f.each { |line| phrases << line.chomp unless line =~ skip_re }
|
|
142
|
+
}
|
|
122
143
|
|
|
123
144
|
pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
|
|
124
145
|
pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
|
|
@@ -144,7 +165,6 @@ action = if options[:check]
|
|
|
144
165
|
end
|
|
145
166
|
|
|
146
167
|
phrases.sort! if options[:sort]
|
|
147
|
-
phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
|
|
148
168
|
|
|
149
169
|
global_threshold = options[:threshold]
|
|
150
170
|
failed_only = options[:failed_only]
|
|
@@ -157,12 +177,12 @@ action = if options[:check]
|
|
|
157
177
|
count, count_all = 0, 0
|
|
158
178
|
positives = negatives = false_positives = false_negatives = 0.0
|
|
159
179
|
|
|
160
|
-
phrases.each { |line
|
|
161
|
-
phrase, target, threshold, operator, _ = *
|
|
180
|
+
phrases.each { |line|
|
|
181
|
+
phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
|
|
162
182
|
|
|
163
183
|
threshold ||= global_threshold
|
|
164
184
|
operator ||= '>'
|
|
165
|
-
assign = operator =~ />/
|
|
185
|
+
assign = operator =~ />/ || operator == '=='
|
|
166
186
|
|
|
167
187
|
begin
|
|
168
188
|
PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
|
|
@@ -176,6 +196,7 @@ action = if options[:check]
|
|
|
176
196
|
|
|
177
197
|
puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
|
|
178
198
|
end
|
|
199
|
+
|
|
179
200
|
count_all += 1
|
|
180
201
|
}
|
|
181
202
|
|
|
@@ -195,9 +216,13 @@ action = if options[:check]
|
|
|
195
216
|
precision = divide[positives, positives + false_positives]
|
|
196
217
|
f1 = divide[2 * recall * precision, recall + precision]
|
|
197
218
|
|
|
198
|
-
|
|
219
|
+
stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
|
|
199
220
|
recall * 100, precision * 100, f1, error
|
|
200
221
|
]
|
|
222
|
+
|
|
223
|
+
stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
|
|
224
|
+
|
|
225
|
+
warn stats
|
|
201
226
|
end
|
|
202
227
|
|
|
203
228
|
error if adjust_coeff
|
|
@@ -222,23 +247,19 @@ action = if options[:check]
|
|
|
222
247
|
best_coeff = start_coeff
|
|
223
248
|
else
|
|
224
249
|
if best_err == previous_err
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
previous_err, previous_coeff = current_err, current_coeff
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
best_err, best_coeff = previous_err, previous_coeff
|
|
250
|
+
step *= -1
|
|
251
|
+
best_coeff = previous_coeff
|
|
231
252
|
else
|
|
232
|
-
|
|
233
|
-
break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
|
|
234
|
-
next_err, next_coeff = current_err, current_coeff
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
best_err, best_coeff = next_err, next_coeff
|
|
253
|
+
best_coeff = next_coeff
|
|
238
254
|
end
|
|
255
|
+
|
|
256
|
+
max.times {
|
|
257
|
+
break if (current_err = _action[current_coeff = best_coeff + step]) > best_err
|
|
258
|
+
best_err, best_coeff = current_err, current_coeff
|
|
259
|
+
}
|
|
239
260
|
end
|
|
240
261
|
|
|
241
|
-
puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
|
|
262
|
+
puts 'Coeff = %d (%d), Err = %0.4f (%0.4f)' % [best_coeff, start_coeff, best_err, start_err]
|
|
242
263
|
}
|
|
243
264
|
else
|
|
244
265
|
_action
|
|
@@ -246,38 +267,38 @@ action = if options[:check]
|
|
|
246
267
|
else
|
|
247
268
|
format =
|
|
248
269
|
options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
|
|
249
|
-
options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
|
|
270
|
+
options[:sort] ? lambda { |pm| " #{[pm.target, pm.distance, pm.similarity].inspect}" } :
|
|
250
271
|
lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
|
|
251
272
|
|
|
252
273
|
if options[:sort]
|
|
253
|
-
require 'pp'
|
|
254
|
-
|
|
255
274
|
lambda {
|
|
256
|
-
|
|
275
|
+
PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
|
|
276
|
+
count_all += 1
|
|
277
|
+
|
|
257
278
|
if pm.similarity >= threshold
|
|
258
|
-
res = format[pm]
|
|
259
279
|
count += 1
|
|
280
|
+
format[pm]
|
|
260
281
|
end
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
}
|
|
282
|
+
}.each { |phrase, matches|
|
|
283
|
+
puts "#{phrase.inspect}:", matches.compact
|
|
284
|
+
}
|
|
264
285
|
}
|
|
265
286
|
else
|
|
266
287
|
lambda {
|
|
267
288
|
separator, previous_phrase = options[:separate], nil
|
|
268
289
|
|
|
269
290
|
PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
|
|
291
|
+
count_all += 1
|
|
292
|
+
|
|
270
293
|
if separator && pm.phrase != previous_phrase ||= pm.phrase
|
|
271
294
|
puts separator
|
|
272
295
|
previous_phrase = pm.phrase
|
|
273
296
|
end
|
|
274
297
|
|
|
275
298
|
if pm.similarity >= threshold
|
|
276
|
-
puts format[pm]
|
|
277
299
|
count += 1
|
|
300
|
+
puts format[pm]
|
|
278
301
|
end
|
|
279
|
-
|
|
280
|
-
count_all += 1
|
|
281
302
|
}
|
|
282
303
|
}
|
|
283
304
|
end
|
|
@@ -78,7 +78,7 @@ class PerseusMatch
|
|
|
78
78
|
|
|
79
79
|
class TokenSet < Array
|
|
80
80
|
|
|
81
|
-
def self.tokenize(form)
|
|
81
|
+
def self.tokenize(form, unknowns = false)
|
|
82
82
|
return @tokens[form] if @tokens
|
|
83
83
|
|
|
84
84
|
@_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
|
|
@@ -90,12 +90,22 @@ class PerseusMatch
|
|
|
90
90
|
case res
|
|
91
91
|
when /<(.*?)\s=\s\[(.*)\]>/
|
|
92
92
|
a, b = $1, $2
|
|
93
|
-
|
|
93
|
+
a.sub!(/\|.*/, '')
|
|
94
|
+
|
|
95
|
+
@_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
|
|
94
96
|
when /<(.*)>/, /:(.*):/
|
|
95
97
|
a, b = $1, $1.dup
|
|
96
|
-
|
|
98
|
+
a.sub!(/[\/|].*/, '')
|
|
99
|
+
|
|
100
|
+
if unknowns && b =~ /\|\?\z/
|
|
101
|
+
if unknowns.respond_to?(:<<)
|
|
102
|
+
unknowns << a
|
|
103
|
+
else
|
|
104
|
+
warn "UNK: #{a} [#{res.strip}]"
|
|
105
|
+
end
|
|
106
|
+
end
|
|
97
107
|
|
|
98
|
-
|
|
108
|
+
@_tokens[a] ||= [b.replace_diacritics.downcase]
|
|
99
109
|
end
|
|
100
110
|
}
|
|
101
111
|
}
|
data/sample/check.csv
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# phrase,target,threshold[,operator (default: >)]
|
|
2
|
+
|
|
3
|
+
"Anbetung der Könige","Die Anbetung der Könige",0.95
|
|
4
|
+
"Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
|
|
5
|
+
"Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
|
|
6
|
+
"Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
|
|
7
|
+
"Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
|
|
8
|
+
|
|
9
|
+
"Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
|
|
10
|
+
"Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
|
|
11
|
+
"Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
|
|
12
|
+
"Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
|
|
13
|
+
|
|
14
|
+
"Die Anbetung der Heiligen Drei Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95
|
|
15
|
+
|
|
16
|
+
"Anbetung der Könige","Die Die Die Anbetung der Könige",0.95,<
|
|
17
|
+
"Anbetung der Könige","Die Die Die Anbetung der Könige",0.8
|
|
18
|
+
"Anbetung der Könige","Die Könige der Anbetung",0.95,<
|
|
19
|
+
"Anbetung der Könige","Die Könige der Anbetung",0.8
|
|
20
|
+
"Anbetung der Könige","Königsanbetung hoch drei",0.95,<
|
|
21
|
+
"Anbetung der Könige","Königsanbetung hoch drei",0.8
|
|
22
|
+
|
|
23
|
+
"Anbetung der Könige","Drei mal drei macht sechs",0.5,<
|
|
24
|
+
"Anbetung der Könige","Das Ende dieses Blödsinns",0.5,<
|
|
25
|
+
|
|
26
|
+
# test ;-)
|
|
27
|
+
,,,
|
data/sample/config.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
---
|
|
2
|
+
:distance_spec:
|
|
3
|
+
|
|
4
|
+
# default, as is
|
|
5
|
+
# - - {}
|
|
6
|
+
# - 1
|
|
7
|
+
|
|
8
|
+
# ignore (exclude) adjectives and particles
|
|
9
|
+
- - :excl: [a, t]
|
|
10
|
+
- 2
|
|
11
|
+
|
|
12
|
+
# consider (include) only substantives
|
|
13
|
+
- - :incl: s
|
|
14
|
+
- 3
|
|
15
|
+
|
|
16
|
+
# consider (include) only synonyms
|
|
17
|
+
# - - :incl: y
|
|
18
|
+
# - 4
|
|
19
|
+
|
|
20
|
+
# sort the tokens when comparing
|
|
21
|
+
# - - :sort: true
|
|
22
|
+
# - 4
|
|
23
|
+
|
|
24
|
+
# replace tokens by their soundex value
|
|
25
|
+
- - :soundex: true
|
|
26
|
+
- 4
|
|
27
|
+
|
|
28
|
+
:default_coeff: 35
|
data/sample/lingo.cfg
ADDED
data/sample/phrases.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Anbetung der Könige
|
|
2
|
+
Das Ende dieses Blödsinns
|
|
3
|
+
dIE AnBeTuNg der heILIGen dREI KÖniGE
|
|
4
|
+
Die Anbetung der Heiligen Drei Könige
|
|
5
|
+
Die Anbetung der Könige
|
|
6
|
+
Die Die Die Anbetung der Könige
|
|
7
|
+
Die Könige der Anbetung
|
|
8
|
+
Drei mal drei macht sechs
|
|
9
|
+
Königsanbetung hoch drei
|
|
@@ -9,8 +9,9 @@ describe PerseusMatch::Cluster do
|
|
|
9
9
|
|
|
10
10
|
it 'should accept threshold option in sort_by (1a)' do
|
|
11
11
|
PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
|
|
12
|
-
matches.size.should
|
|
12
|
+
matches.size.should == 1
|
|
13
13
|
matches.size.should == matches.nitems
|
|
14
|
+
matches.each { |match| match.target.should == phrase }
|
|
14
15
|
}
|
|
15
16
|
end
|
|
16
17
|
|
|
@@ -4,6 +4,14 @@ describe PerseusMatch::TokenSet, ' with lingo' do
|
|
|
4
4
|
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
|
5
5
|
end
|
|
6
6
|
|
|
7
|
+
before :all do
|
|
8
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
after :all do
|
|
12
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
|
13
|
+
end
|
|
14
|
+
|
|
7
15
|
it 'should tokenize a string' do
|
|
8
16
|
PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
|
|
9
17
|
end
|
|
@@ -36,6 +44,14 @@ describe PerseusMatch::TokenSet, ' without lingo' do
|
|
|
36
44
|
PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
|
|
37
45
|
end
|
|
38
46
|
|
|
47
|
+
before :all do
|
|
48
|
+
@original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
after :all do
|
|
52
|
+
PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
|
|
53
|
+
end
|
|
54
|
+
|
|
39
55
|
it 'should take a prepared file for tokenization' do
|
|
40
56
|
# prevent lingo from being used
|
|
41
57
|
lingo_base = LINGO_BASE.dup
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: perseus_match
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.3
|
|
4
|
+
version: 0.0.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jens Wille
|
|
@@ -9,7 +9,7 @@ autorequire:
|
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
11
|
|
|
12
|
-
date:
|
|
12
|
+
date: 2009-01-13 00:00:00 +01:00
|
|
13
13
|
default_executable:
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
@@ -52,13 +52,16 @@ files:
|
|
|
52
52
|
- Rakefile
|
|
53
53
|
- COPYING
|
|
54
54
|
- ChangeLog
|
|
55
|
-
- LINGO_BASE
|
|
56
55
|
- README
|
|
57
56
|
- spec/spec_helper.rb
|
|
58
57
|
- spec/perseus_match/list_spec.rb
|
|
59
58
|
- spec/perseus_match/cluster_spec.rb
|
|
60
59
|
- spec/perseus_match/token_set_spec.rb
|
|
61
60
|
- spec/perseus_match_spec.rb
|
|
61
|
+
- sample/config.yaml
|
|
62
|
+
- sample/lingo.cfg
|
|
63
|
+
- sample/phrases.txt
|
|
64
|
+
- sample/check.csv
|
|
62
65
|
has_rdoc: true
|
|
63
66
|
homepage: http://prometheus.rubyforge.org/perseus_match
|
|
64
67
|
post_install_message:
|
data/LINGO_BASE
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
/home/jw/devel/lingo/trunk
|