perseus_match 0.0.3 → 0.0.4

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.3
5
+ This documentation refers to perseus_match version 0.0.4
6
6
 
7
7
 
8
8
  == DESCRIPTION
data/Rakefile CHANGED
@@ -13,7 +13,7 @@ begin
13
13
  :version => PerseusMatch::VERSION,
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
- :extra_files => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
16
+ :extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
17
17
  :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
18
18
  }
19
19
  }}
data/bin/perseus_match CHANGED
@@ -3,6 +3,7 @@
3
3
  require 'optparse'
4
4
  require 'benchmark'
5
5
  require 'yaml'
6
+ require 'set'
6
7
 
7
8
  require 'rubygems'
8
9
  require 'nuggets/enumerable/minmax'
@@ -20,9 +21,11 @@ options = {
20
21
  :threshold => 0,
21
22
  :sort => false,
22
23
  :stats => false,
23
- :lingo => false,
24
+ :silent => false,
25
+ :unknowns => nil,
24
26
  :minimal => false,
25
27
  :separate => false,
28
+ :lingo => false,
26
29
  :check => false,
27
30
  :failed_only => false,
28
31
  :align => false,
@@ -57,6 +60,14 @@ OptionParser.new { |opts|
57
60
  options[:verbose] = true
58
61
  }
59
62
 
63
+ opts.on('-n', '--silent', 'Suppress warnings') {
64
+ options[:silent] = true
65
+ }
66
+
67
+ opts.on('-u', '--unknowns FILE', 'Write unknown tokens as lingo dictionary to FILE') { |f|
68
+ options[:unknowns] = f
69
+ }
70
+
60
71
  opts.separator ' '
61
72
  opts.separator ' * Calculating similarities (default)'
62
73
  opts.separator ' '
@@ -111,14 +122,24 @@ else
111
122
  abort "Input file not found: #{file}" unless File.readable?(file)
112
123
  end
113
124
 
114
- PerseusMatch::TokenSet.tokenize(file)
125
+ unknowns = Set.new if options[:unknowns]
126
+
127
+ PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
128
+
129
+ if unknowns
130
+ File.open(options[:unknowns], 'w') { |f|
131
+ unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
132
+ }
133
+
134
+ unknowns = nil
135
+ end
115
136
 
116
137
  skip_re = %r{\A\s*(?:#|\z)}o
117
138
  phrases = []
118
139
 
119
- File.open(file).each { |line|
120
- phrases << line.chomp unless line =~ skip_re
121
- }.close
140
+ File.open(file) { |f|
141
+ f.each { |line| phrases << line.chomp unless line =~ skip_re }
142
+ }
122
143
 
123
144
  pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
124
145
  pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
@@ -144,7 +165,6 @@ action = if options[:check]
144
165
  end
145
166
 
146
167
  phrases.sort! if options[:sort]
147
- phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
148
168
 
149
169
  global_threshold = options[:threshold]
150
170
  failed_only = options[:failed_only]
@@ -157,12 +177,12 @@ action = if options[:check]
157
177
  count, count_all = 0, 0
158
178
  positives = negatives = false_positives = false_negatives = 0.0
159
179
 
160
- phrases.each { |line, spec|
161
- phrase, target, threshold, operator, _ = *spec
180
+ phrases.each { |line|
181
+ phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
162
182
 
163
183
  threshold ||= global_threshold
164
184
  operator ||= '>'
165
- assign = operator =~ />/
185
+ assign = operator =~ />/ || operator == '=='
166
186
 
167
187
  begin
168
188
  PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
@@ -176,6 +196,7 @@ action = if options[:check]
176
196
 
177
197
  puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
178
198
  end
199
+
179
200
  count_all += 1
180
201
  }
181
202
 
@@ -195,9 +216,13 @@ action = if options[:check]
195
216
  precision = divide[positives, positives + false_positives]
196
217
  f1 = divide[2 * recall * precision, recall + precision]
197
218
 
198
- warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
219
+ stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
199
220
  recall * 100, precision * 100, f1, error
200
221
  ]
222
+
223
+ stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
224
+
225
+ warn stats
201
226
  end
202
227
 
203
228
  error if adjust_coeff
@@ -222,23 +247,19 @@ action = if options[:check]
222
247
  best_coeff = start_coeff
223
248
  else
224
249
  if best_err == previous_err
225
- max.times {
226
- break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
227
- previous_err, previous_coeff = current_err, current_coeff
228
- }
229
-
230
- best_err, best_coeff = previous_err, previous_coeff
250
+ step *= -1
251
+ best_coeff = previous_coeff
231
252
  else
232
- max.times {
233
- break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
234
- next_err, next_coeff = current_err, current_coeff
235
- }
236
-
237
- best_err, best_coeff = next_err, next_coeff
253
+ best_coeff = next_coeff
238
254
  end
255
+
256
+ max.times {
257
+ break if (current_err = _action[current_coeff = best_coeff + step]) > best_err
258
+ best_err, best_coeff = current_err, current_coeff
259
+ }
239
260
  end
240
261
 
241
- puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
262
+ puts 'Coeff = %d (%d), Err = %0.4f (%0.4f)' % [best_coeff, start_coeff, best_err, start_err]
242
263
  }
243
264
  else
244
265
  _action
@@ -246,38 +267,38 @@ action = if options[:check]
246
267
  else
247
268
  format =
248
269
  options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
249
- options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
270
+ options[:sort] ? lambda { |pm| " #{[pm.target, pm.distance, pm.similarity].inspect}" } :
250
271
  lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
251
272
 
252
273
  if options[:sort]
253
- require 'pp'
254
-
255
274
  lambda {
256
- pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
275
+ PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
276
+ count_all += 1
277
+
257
278
  if pm.similarity >= threshold
258
- res = format[pm]
259
279
  count += 1
280
+ format[pm]
260
281
  end
261
- count_all += 1
262
- res
263
- }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
282
+ }.each { |phrase, matches|
283
+ puts "#{phrase.inspect}:", matches.compact
284
+ }
264
285
  }
265
286
  else
266
287
  lambda {
267
288
  separator, previous_phrase = options[:separate], nil
268
289
 
269
290
  PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
291
+ count_all += 1
292
+
270
293
  if separator && pm.phrase != previous_phrase ||= pm.phrase
271
294
  puts separator
272
295
  previous_phrase = pm.phrase
273
296
  end
274
297
 
275
298
  if pm.similarity >= threshold
276
- puts format[pm]
277
299
  count += 1
300
+ puts format[pm]
278
301
  end
279
-
280
- count_all += 1
281
302
  }
282
303
  }
283
304
  end
@@ -78,7 +78,7 @@ class PerseusMatch
78
78
 
79
79
  class TokenSet < Array
80
80
 
81
- def self.tokenize(form)
81
+ def self.tokenize(form, unknowns = false)
82
82
  return @tokens[form] if @tokens
83
83
 
84
84
  @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
@@ -90,12 +90,22 @@ class PerseusMatch
90
90
  case res
91
91
  when /<(.*?)\s=\s\[(.*)\]>/
92
92
  a, b = $1, $2
93
- @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
93
+ a.sub!(/\|.*/, '')
94
+
95
+ @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
94
96
  when /<(.*)>/, /:(.*):/
95
97
  a, b = $1, $1.dup
96
- @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
98
+ a.sub!(/[\/|].*/, '')
99
+
100
+ if unknowns && b =~ /\|\?\z/
101
+ if unknowns.respond_to?(:<<)
102
+ unknowns << a
103
+ else
104
+ warn "UNK: #{a} [#{res.strip}]"
105
+ end
106
+ end
97
107
 
98
- warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
108
+ @_tokens[a] ||= [b.replace_diacritics.downcase]
99
109
  end
100
110
  }
101
111
  }
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 3
7
+ TINY = 4
8
8
 
9
9
  class << self
10
10
 
data/sample/check.csv ADDED
@@ -0,0 +1,27 @@
1
+ # phrase,target,threshold[,operator (default: >)]
2
+
3
+ "Anbetung der Könige","Die Anbetung der Könige",0.95
4
+ "Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
5
+ "Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
6
+ "Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
7
+ "Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
8
+
9
+ "Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
10
+ "Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
11
+ "Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
12
+ "Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
13
+
14
+ "Die Anbetung der Heiligen Drei Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95
15
+
16
+ "Anbetung der Könige","Die Die Die Anbetung der Könige",0.95,<
17
+ "Anbetung der Könige","Die Die Die Anbetung der Könige",0.8
18
+ "Anbetung der Könige","Die Könige der Anbetung",0.95,<
19
+ "Anbetung der Könige","Die Könige der Anbetung",0.8
20
+ "Anbetung der Könige","Königsanbetung hoch drei",0.95,<
21
+ "Anbetung der Könige","Königsanbetung hoch drei",0.8
22
+
23
+ "Anbetung der Könige","Drei mal drei macht sechs",0.5,<
24
+ "Anbetung der Könige","Das Ende dieses Blödsinns",0.5,<
25
+
26
+ # test ;-)
27
+ ,,,
@@ -0,0 +1,28 @@
1
+ ---
2
+ :distance_spec:
3
+
4
+ # default, as is
5
+ # - - {}
6
+ # - 1
7
+
8
+ # ignore (exclude) adjectives and particles
9
+ - - :excl: [a, t]
10
+ - 2
11
+
12
+ # consider (include) only substantives
13
+ - - :incl: s
14
+ - 3
15
+
16
+ # consider (include) only synonyms
17
+ # - - :incl: y
18
+ # - 4
19
+
20
+ # sort the tokens when comparing
21
+ # - - :sort: true
22
+ # - 4
23
+
24
+ # replace tokens by their soundex value
25
+ - - :soundex: true
26
+ - 4
27
+
28
+ :default_coeff: 35
data/sample/lingo.cfg ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ meeting:
3
+ attendees:
4
+ - tokenizer: { }
5
+ - wordsearcher: { source: 'sys-dic', mode: 'first' }
6
+ - decomposer: { source: 'sys-dic' }
7
+ - multiworder: { source: 'sys-mul', stopper: 'PUNC,OTHR' }
8
+ - synonymer: { source: 'sys-syn', skip: '?,t' }
@@ -0,0 +1,9 @@
1
+ Anbetung der Könige
2
+ Das Ende dieses Blödsinns
3
+ dIE AnBeTuNg der heILIGen dREI KÖniGE
4
+ Die Anbetung der Heiligen Drei Könige
5
+ Die Anbetung der Könige
6
+ Die Die Die Anbetung der Könige
7
+ Die Könige der Anbetung
8
+ Drei mal drei macht sechs
9
+ Königsanbetung hoch drei
@@ -9,8 +9,9 @@ describe PerseusMatch::Cluster do
9
9
 
10
10
  it 'should accept threshold option in sort_by (1a)' do
11
11
  PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
12
- matches.size.should be_zero
12
+ matches.size.should == 1
13
13
  matches.size.should == matches.nitems
14
+ matches.each { |match| match.target.should == phrase }
14
15
  }
15
16
  end
16
17
 
@@ -4,6 +4,14 @@ describe PerseusMatch::TokenSet, ' with lingo' do
4
4
  PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
5
5
  end
6
6
 
7
+ before :all do
8
+ @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
9
+ end
10
+
11
+ after :all do
12
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
13
+ end
14
+
7
15
  it 'should tokenize a string' do
8
16
  PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
9
17
  end
@@ -36,6 +44,14 @@ describe PerseusMatch::TokenSet, ' without lingo' do
36
44
  PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
37
45
  end
38
46
 
47
+ before :all do
48
+ @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
49
+ end
50
+
51
+ after :all do
52
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
53
+ end
54
+
39
55
  it 'should take a prepared file for tokenization' do
40
56
  # prevent lingo from being used
41
57
  lingo_base = LINGO_BASE.dup
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-12-09 00:00:00 +01:00
12
+ date: 2009-01-13 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -52,13 +52,16 @@ files:
52
52
  - Rakefile
53
53
  - COPYING
54
54
  - ChangeLog
55
- - LINGO_BASE
56
55
  - README
57
56
  - spec/spec_helper.rb
58
57
  - spec/perseus_match/list_spec.rb
59
58
  - spec/perseus_match/cluster_spec.rb
60
59
  - spec/perseus_match/token_set_spec.rb
61
60
  - spec/perseus_match_spec.rb
61
+ - sample/config.yaml
62
+ - sample/lingo.cfg
63
+ - sample/phrases.txt
64
+ - sample/check.csv
62
65
  has_rdoc: true
63
66
  homepage: http://prometheus.rubyforge.org/perseus_match
64
67
  post_install_message:
data/LINGO_BASE DELETED
@@ -1 +0,0 @@
1
- /home/jw/devel/lingo/trunk