perseus_match 0.0.3 → 0.0.4

data/README CHANGED
@@ -2,7 +2,7 @@
 
  == VERSION
 
- This documentation refers to perseus_match version 0.0.3
+ This documentation refers to perseus_match version 0.0.4
 
 
  == DESCRIPTION
data/Rakefile CHANGED
@@ -13,7 +13,7 @@ begin
    :version      => PerseusMatch::VERSION,
    :summary      => %q{Fuzzy string matching based on linguistic analysis},
    :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
-   :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
+   :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
    :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
  }
  }}
data/bin/perseus_match CHANGED
@@ -3,6 +3,7 @@
  require 'optparse'
  require 'benchmark'
  require 'yaml'
+ require 'set'
 
  require 'rubygems'
  require 'nuggets/enumerable/minmax'
@@ -20,9 +21,11 @@ options = {
  :threshold   => 0,
  :sort        => false,
  :stats       => false,
- :lingo       => false,
+ :silent      => false,
+ :unknowns    => nil,
  :minimal     => false,
  :separate    => false,
+ :lingo       => false,
  :check       => false,
  :failed_only => false,
  :align       => false,
@@ -57,6 +60,14 @@ OptionParser.new { |opts|
    options[:verbose] = true
  }
 
+ opts.on('-n', '--silent', 'Suppress warnings') {
+   options[:silent] = true
+ }
+
+ opts.on('-u', '--unknowns FILE', 'Write unknown tokens as lingo dictionary to FILE') { |f|
+   options[:unknowns] = f
+ }
+
  opts.separator ' '
  opts.separator ' * Calculating similarities (default)'
  opts.separator ' '
@@ -111,14 +122,24 @@ else
    abort "Input file not found: #{file}" unless File.readable?(file)
  end
 
- PerseusMatch::TokenSet.tokenize(file)
+ unknowns = Set.new if options[:unknowns]
+
+ PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
+
+ if unknowns
+   File.open(options[:unknowns], 'w') { |f|
+     unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
+   }
+
+   unknowns = nil
+ end
 
  skip_re = %r{\A\s*(?:#|\z)}o
  phrases = []
 
- File.open(file).each { |line|
-   phrases << line.chomp unless line =~ skip_re
- }.close
+ File.open(file) { |f|
+   f.each { |line| phrases << line.chomp unless line =~ skip_re }
+ }
 
  pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
  pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
@@ -144,7 +165,6 @@ action = if options[:check]
  end
 
  phrases.sort! if options[:sort]
- phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
 
  global_threshold = options[:threshold]
  failed_only = options[:failed_only]
@@ -157,12 +177,12 @@ action = if options[:check]
  count, count_all = 0, 0
  positives = negatives = false_positives = false_negatives = 0.0
 
- phrases.each { |line, spec|
-   phrase, target, threshold, operator, _ = *spec
+ phrases.each { |line|
+   phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
 
    threshold ||= global_threshold
    operator  ||= '>'
-   assign = operator =~ />/
+   assign = operator =~ />/ || operator == '=='
 
    begin
      PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
@@ -176,6 +196,7 @@ action = if options[:check]
 
      puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
    end
+
    count_all += 1
  }
 
@@ -195,9 +216,13 @@ action = if options[:check]
  precision = divide[positives, positives + false_positives]
  f1 = divide[2 * recall * precision, recall + precision]
 
- warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
+ stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
    recall * 100, precision * 100, f1, error
  ]
+
+ stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
+
+ warn stats
  end
 
  error if adjust_coeff
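
The R/P/F1 figures reported above are the usual retrieval metrics computed from the counters kept in the check loop: recall = TP / (TP + FN), precision = TP / (TP + FP), and F1 is their harmonic mean. A standalone restatement with made-up counts, only to spell the formulas out (the divide guard here is local to this sketch; the script defines its own helper elsewhere):

    # illustrative only -- the metrics printed by the --check action
    divide = lambda { |x, y| y.zero? ? 0.0 : x / y }

    positives, false_positives, false_negatives = 42.0, 6.0, 8.0  # example counts

    recall    = divide[positives, positives + false_negatives]
    precision = divide[positives, positives + false_positives]
    f1        = divide[2 * recall * precision, recall + precision]

    puts 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f' % [recall * 100, precision * 100, f1]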
@@ -222,23 +247,19 @@ action = if options[:check]
    best_coeff = start_coeff
  else
    if best_err == previous_err
-     max.times {
-       break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
-       previous_err, previous_coeff = current_err, current_coeff
-     }
-
-     best_err, best_coeff = previous_err, previous_coeff
+     step *= -1
+     best_coeff = previous_coeff
    else
-     max.times {
-       break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
-       next_err, next_coeff = current_err, current_coeff
-     }
-
-     best_err, best_coeff = next_err, next_coeff
+     best_coeff = next_coeff
    end
+
+   max.times {
+     break if (current_err = _action[current_coeff = best_coeff + step]) > best_err
+     best_err, best_coeff = current_err, current_coeff
+   }
  end
 
- puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
+ puts 'Coeff = %d (%d), Err = %0.4f (%0.4f)' % [best_coeff, start_coeff, best_err, start_err]
  }
  else
  _action
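
The rewritten coefficient search above collapses the two former loops into one: if the probe did not improve on the starting error the step direction is flipped, and the single remaining loop walks in that direction while the error keeps dropping. A minimal, self-contained sketch of that idea (`climb` and the block-based error function are illustrative names, not the script's API):

    # hill-climbing sketch (illustrative): keep stepping while the error drops
    def climb(start_coeff, step, max)
      best_coeff = start_coeff
      best_err   = yield(best_coeff)

      # reverse direction if the first probe is no better than the start
      step *= -1 if yield(best_coeff + step) >= best_err

      max.times {
        current_err = yield(current_coeff = best_coeff + step)
        break if current_err > best_err
        best_err, best_coeff = current_err, current_coeff
      }

      [best_coeff, best_err]
    end

    p climb(35, 5, 10) { |coeff| (coeff - 50).abs / 100.0 }  # toy error function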
@@ -246,38 +267,38 @@ action = if options[:check]
  else
  format =
    options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
-   options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
+   options[:sort] ? lambda { |pm| " #{[pm.target, pm.distance, pm.similarity].inspect}" } :
                     lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
 
  if options[:sort]
-   require 'pp'
-
    lambda {
-     pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+     PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+       count_all += 1
+
        if pm.similarity >= threshold
-         res = format[pm]
          count += 1
+         format[pm]
        end
-       count_all += 1
-       res
-     }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
+     }.each { |phrase, matches|
+       puts "#{phrase.inspect}:", matches.compact
+     }
    }
  else
    lambda {
      separator, previous_phrase = options[:separate], nil
 
      PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
+       count_all += 1
+
        if separator && pm.phrase != previous_phrase ||= pm.phrase
          puts separator
          previous_phrase = pm.phrase
        end
 
        if pm.similarity >= threshold
-         puts format[pm]
          count += 1
+         puts format[pm]
        end
-
-       count_all += 1
      }
    }
  end
@@ -78,7 +78,7 @@ class PerseusMatch
 
  class TokenSet < Array
 
-   def self.tokenize(form)
+   def self.tokenize(form, unknowns = false)
      return @tokens[form] if @tokens
 
      @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
@@ -90,12 +90,22 @@ class PerseusMatch
        case res
          when /<(.*?)\s=\s\[(.*)\]>/
            a, b = $1, $2
-           @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
+           a.sub!(/\|.*/, '')
+
+           @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
          when /<(.*)>/, /:(.*):/
            a, b = $1, $1.dup
-           @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
+           a.sub!(/[\/|].*/, '')
+
+           if unknowns && b =~ /\|\?\z/
+             if unknowns.respond_to?(:<<)
+               unknowns << a
+             else
+               warn "UNK: #{a} [#{res.strip}]"
+             end
+           end
 
-           warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
+           @_tokens[a] ||= [b.replace_diacritics.downcase]
        end
      }
    }
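
The new `unknowns` parameter of `tokenize` is duck-typed: anything that responds to `<<` collects the unknown tokens silently, any other truthy value keeps the old behaviour of printing a `UNK:` warning per token, and `false` suppresses both. A sketch of the collecting mode, mirroring what `bin/perseus_match --unknowns FILE` does above (the require path and file names are assumptions for the sake of the example):

    require 'rubygems'
    require 'set'
    require 'perseus_match'  # assumed require path for the library

    unknowns = Set.new
    PerseusMatch::TokenSet.tokenize('phrases.txt', unknowns)  # collect instead of warn

    # dump the collected tokens as lingo dictionary entries, as --unknowns does
    File.open('unknowns.dic', 'w') { |f|
      unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
    }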
@@ -4,7 +4,7 @@ class PerseusMatch
 
  MAJOR = 0
  MINOR = 0
- TINY  = 3
+ TINY  = 4
 
  class << self
 
data/sample/check.csv ADDED
@@ -0,0 +1,27 @@
+ # phrase,target,threshold[,operator (default: >)]
+
+ "Anbetung der Könige","Die Anbetung der Könige",0.95
+ "Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
+ "Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
+ "Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
+ "Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
+
+ "Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
+ "Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
+ "Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
+ "Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
+
+ "Die Anbetung der Heiligen Drei Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95
+
+ "Anbetung der Könige","Die Die Die Anbetung der Könige",0.95,<
+ "Anbetung der Könige","Die Die Die Anbetung der Könige",0.8
+ "Anbetung der Könige","Die Könige der Anbetung",0.95,<
+ "Anbetung der Könige","Die Könige der Anbetung",0.8
+ "Anbetung der Könige","Königsanbetung hoch drei",0.95,<
+ "Anbetung der Könige","Königsanbetung hoch drei",0.8
+
+ "Anbetung der Könige","Drei mal drei macht sechs",0.5,<
+ "Anbetung der Könige","Das Ende dieses Blödsinns",0.5,<
+
+ # test ;-)
+ ,,,
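
Each non-comment line of this file is parsed by the check action shown earlier: FasterCSV splits it, the threshold falls back to the global `-t` value (default 0), the operator defaults to `>`, and both `>` and `==` count as "expected to match". A small standalone illustration of that per-line handling (the `fastercsv` require is an assumption; the real check loop lives in bin/perseus_match):

    require 'rubygems'
    require 'fastercsv'  # assumed; provides the FasterCSV parser used by the script

    line = '"Anbetung der Könige","Die Anbetung der Könige",0.95'
    phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)

    threshold ||= 0   # falls back to the global -t value (default 0)
    operator  ||= '>'
    assign = operator =~ />/ || operator == '=='  # truthy => line is expected to match

    p [phrase, target, threshold.to_f, operator, !!assign]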
data/sample/config.yaml ADDED
@@ -0,0 +1,28 @@
+ ---
+ :distance_spec:
+
+   # default, as is
+   # - - {}
+   #   - 1
+
+   # ignore (exclude) adjectives and particles
+   - - :excl: [a, t]
+     - 2
+
+   # consider (include) only substantives
+   - - :incl: s
+     - 3
+
+   # consider (include) only synonyms
+   # - - :incl: y
+   #   - 4
+
+   # sort the tokens when comparing
+   # - - :sort: true
+   #   - 4
+
+   # replace tokens by their soundex value
+   - - :soundex: true
+     - 4
+
+ :default_coeff: 35
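
This sample is what `bin/perseus_match -c FILE` loads via `YAML.load_file` and passes on as `pm_options`: each active `:distance_spec` entry pairs a token-filter hash with what looks like a numeric weight, and `:default_coeff` seeds the coefficient used by `--check`. A sketch of merely inspecting it (how PerseusMatch consumes the spec internally is not shown in this diff):

    require 'yaml'

    # mirrors the script's YAML.load_file(options[:config]) call
    pm_options = YAML.load_file('sample/config.yaml')

    pm_options[:distance_spec].each { |spec, weight|
      puts 'weighted %s: %s' % [weight.inspect, spec.inspect]
    }
    puts "default coefficient: #{pm_options[:default_coeff]}"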
data/sample/lingo.cfg ADDED
@@ -0,0 +1,8 @@
+ ---
+ meeting:
+   attendees:
+     - tokenizer: { }
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+     - decomposer: { source: 'sys-dic' }
+     - multiworder: { source: 'sys-mul', stopper: 'PUNC,OTHR' }
+     - synonymer: { source: 'sys-syn', skip: '?,t' }
data/sample/phrases.txt ADDED
@@ -0,0 +1,9 @@
+ Anbetung der Könige
+ Das Ende dieses Blödsinns
+ dIE AnBeTuNg der heILIGen dREI KÖniGE
+ Die Anbetung der Heiligen Drei Könige
+ Die Anbetung der Könige
+ Die Die Die Anbetung der Könige
+ Die Könige der Anbetung
+ Drei mal drei macht sechs
+ Königsanbetung hoch drei
data/spec/perseus_match/cluster_spec.rb CHANGED
@@ -9,8 +9,9 @@ describe PerseusMatch::Cluster do
 
  it 'should accept threshold option in sort_by (1a)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
-     matches.size.should be_zero
+     matches.size.should == 1
      matches.size.should == matches.nitems
+     matches.each { |match| match.target.should == phrase }
    }
  end
 
data/spec/perseus_match/token_set_spec.rb CHANGED
@@ -4,6 +4,14 @@ describe PerseusMatch::TokenSet, ' with lingo' do
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end
 
+ before :all do
+   @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+ end
+
+ after :all do
+   PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+ end
+
  it 'should tokenize a string' do
    PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
  end
@@ -36,6 +44,14 @@ describe PerseusMatch::TokenSet, ' without lingo' do
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end
 
+ before :all do
+   @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+ end
+
+ after :all do
+   PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+ end
+
  it 'should take a prepared file for tokenization' do
    # prevent lingo from being used
    lingo_base = LINGO_BASE.dup
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: perseus_match
  version: !ruby/object:Gem::Version
-   version: 0.0.3
+   version: 0.0.4
  platform: ruby
  authors:
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2008-12-09 00:00:00 +01:00
+ date: 2009-01-13 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -52,13 +52,16 @@ files:
  - Rakefile
  - COPYING
  - ChangeLog
- - LINGO_BASE
  - README
  - spec/spec_helper.rb
  - spec/perseus_match/list_spec.rb
  - spec/perseus_match/cluster_spec.rb
  - spec/perseus_match/token_set_spec.rb
  - spec/perseus_match_spec.rb
+ - sample/config.yaml
+ - sample/lingo.cfg
+ - sample/phrases.txt
+ - sample/check.csv
  has_rdoc: true
  homepage: http://prometheus.rubyforge.org/perseus_match
  post_install_message:
data/LINGO_BASE DELETED
@@ -1 +0,0 @@
- /home/jw/devel/lingo/trunk