blackwinter-perseus_match 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.6
5
+ This documentation refers to perseus_match version 0.0.7
6
6
 
7
7
 
8
8
  == DESCRIPTION
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ begin
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
16
  :extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
17
- :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
17
+ :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0'], ['unicode', '>= 0.1.1']]
18
18
  }
19
19
  }}
20
20
  rescue LoadError
@@ -130,7 +130,7 @@ end
130
130
 
131
131
  unknowns = Set.new if options[:unknowns]
132
132
 
133
- PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
133
+ PerseusMatch.tokenize(file, unknowns || !options[:silent])
134
134
 
135
135
  if unknowns
136
136
  File.open(options[:unknowns], 'w') { |f|
@@ -154,7 +154,12 @@ list_options = { :minimal => options[:minimal] }
154
154
  threshold, count, count_all = options[:threshold], 0, 0
155
155
 
156
156
  action = if options[:check]
157
- require 'fastercsv'
157
+ require 'csv'
158
+
159
+ if CSV.const_defined?(:Reader)
160
+ require 'fastercsv'
161
+ CSV = FasterCSV
162
+ end
158
163
 
159
164
  format = if options[:align]
160
165
  require 'jcode'
@@ -184,23 +189,23 @@ action = if options[:check]
184
189
  positives = negatives = false_positives = false_negatives = 0.0
185
190
 
186
191
  phrases.each { |line|
187
- phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
192
+ phrase, target, threshold, operator, _ = *CSV.parse_line(line)
188
193
 
189
194
  threshold ||= global_threshold
190
195
  operator ||= '>'
191
196
  assign = operator =~ />/ || operator == '=='
192
197
 
193
198
  begin
194
- PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
199
+ res = PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
195
200
 
196
201
  count += 1
197
202
  assign ? positives += 1 : negatives += 1
198
203
 
199
- puts format[line, 'OK'] unless adjust_coeff || failed_only
204
+ puts format[line, "OK -- #{res.value} (#{res.pm.distance})"] unless adjust_coeff || failed_only
200
205
  rescue PerseusMatch::CheckFailedError => err
201
206
  assign ? false_negatives += 1 : false_positives += 1
202
207
 
203
- puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
208
+ puts format[line, "FAILED -- #{err.value} (#{err.pm.distance})"] unless adjust_coeff
204
209
  end
205
210
 
206
211
  count_all += 1
@@ -222,8 +227,17 @@ action = if options[:check]
222
227
  precision = divide[positives, positives + false_positives]
223
228
  f1 = divide[2 * recall * precision, recall + precision]
224
229
 
225
- stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
226
- recall * 100, precision * 100, f1, error
230
+ error_all = divide[ # trivial: assign all
231
+ negatives + false_positives,
232
+ positives + negatives + false_positives + false_negatives
233
+ ]
234
+ error_none = divide[ # trivial: assign none
235
+ positives + false_negatives,
236
+ positives + negatives + false_positives + false_negatives
237
+ ]
238
+
239
+ stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f (ALL = %0.4f, NONE = %0.4f)' % [
240
+ recall * 100, precision * 100, f1, error, error_all, error_none
227
241
  ]
228
242
 
229
243
  stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
@@ -26,8 +26,11 @@
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
+ require 'perseus_match/core_ext'
30
+
29
31
  require 'perseus_match/list'
30
32
  require 'perseus_match/cluster'
33
+ require 'perseus_match/token'
31
34
  require 'perseus_match/token_set'
32
35
 
33
36
  require 'perseus_match/version'
@@ -36,7 +39,7 @@ class PerseusMatch
36
39
 
37
40
  Infinity = 1.0 / 0
38
41
 
39
- DEFAULT_COEFF = 20
42
+ DEFAULT_COEFF = 2
40
43
 
41
44
  DISTANCE_SPEC = [ # {
42
45
  [{}, 1], # {} => 1,
@@ -68,8 +71,22 @@ class PerseusMatch
68
71
  end
69
72
 
70
73
  def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
71
- value = new(phrase, target, pm_options).send(attribute)
72
- value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
74
+ pm = new(phrase, target, pm_options)
75
+ value = pm.send(attribute)
76
+
77
+ if value.send(operator, threshold)
78
+ Struct.new(:pm, :value, :threshold, :operator).new(pm, value, threshold, operator)
79
+ else
80
+ raise CheckFailedError.new(pm, value, threshold, operator)
81
+ end
82
+ end
83
+
84
+ def tokenize(form, unknowns = false)
85
+ if file = TokenSet.file?(form)
86
+ TokenSet.tokenize(file, unknowns)
87
+ else
88
+ PhraseTokenSet.tokenize(form, unknowns)
89
+ end
73
90
  end
74
91
 
75
92
  end
@@ -77,8 +94,8 @@ class PerseusMatch
77
94
  attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
78
95
 
79
96
  def initialize(phrase, target, options = {})
80
- @phrase = phrase.to_s
81
- @target = target.to_s
97
+ @phrase = sanitize(phrase.to_s)
98
+ @target = sanitize(target.to_s)
82
99
 
83
100
  @default_coeff = options[:default_coeff] || DEFAULT_COEFF
84
101
  @distance_spec = options[:distance_spec] || DISTANCE_SPEC
@@ -89,11 +106,11 @@ class PerseusMatch
89
106
  end
90
107
 
91
108
  def phrase_tokens
92
- @phrase_tokens ||= tokenize(phrase)
109
+ @phrase_tokens ||= self.class.tokenize(phrase)
93
110
  end
94
111
 
95
112
  def target_tokens
96
- @target_tokens ||= tokenize(target)
113
+ @target_tokens ||= self.class.tokenize(target)
97
114
  end
98
115
 
99
116
  # 0 <= distance <= Infinity
@@ -104,13 +121,13 @@ class PerseusMatch
104
121
  # 1 >= similarity >= 0
105
122
  def similarity(coeff = nil)
106
123
  coeff ||= default_coeff # passed arg may be nil
107
- @similarity[coeff] ||= 1 / Math.exp(distance / (coeff * total_weight))
124
+ @similarity[coeff] ||= normalize_distance(coeff)
108
125
  end
109
126
 
110
127
  private
111
128
 
112
- def tokenize(str)
113
- TokenSet.new(str)
129
+ def sanitize(str)
130
+ str.gsub(/\s*\(.*?\)|\s*\[.*?\]/, '').sub(/\s*[\/:].*/, '')
114
131
  end
115
132
 
116
133
  def calculate_distance
@@ -148,16 +165,25 @@ class PerseusMatch
148
165
  distance
149
166
  end
150
167
 
168
+ def normalize_distance(coeff)
169
+ length = phrase_tokens.size + target_tokens.size
170
+ return 0 if length == 0
171
+
172
+ norm = Math.log(length ** Math.sqrt(2)) * coeff * total_weight * Math::E
173
+
174
+ 1 / Math.exp(distance / norm)
175
+ end
176
+
151
177
  def total_weight
152
178
  @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
153
179
  end
154
180
 
155
181
  class CheckFailedError < StandardError
156
182
 
157
- attr_reader :value, :threshold, :operator
183
+ attr_reader :pm, :value, :threshold, :operator
158
184
 
159
- def initialize(value, threshold, operator)
160
- @value, @threshold, @operator = value, threshold, operator
185
+ def initialize(pm, value, threshold, operator)
186
+ @pm, @value, @threshold, @operator = pm, value, threshold, operator
161
187
  end
162
188
 
163
189
  def to_s
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'unicode'
3
+
4
+ class String
5
+
6
+ def downcase
7
+ Unicode.downcase(self)
8
+ end
9
+
10
+ def downcase!
11
+ replace downcase
12
+ end
13
+
14
+ end
@@ -0,0 +1,43 @@
1
+ class PerseusMatch
2
+
3
+ class Token < String
4
+
5
+ WC_RE = %r{[/|]([^/|]*)\z}
6
+
7
+ ANY_WC = '*'.freeze
8
+
9
+ attr_reader :form, :wc
10
+
11
+ def initialize(form, wc = nil)
12
+ @form = form.sub(WC_RE, '')
13
+ @wc = wc || $1
14
+
15
+ super(@form)
16
+ end
17
+
18
+ def match?(wcs)
19
+ wcs = [*wcs].compact
20
+ wcs.include?(wc) || wcs.include?(ANY_WC)
21
+ end
22
+
23
+ def unk?
24
+ wc == '?'
25
+ end
26
+
27
+ def ==(other)
28
+ other.is_a?(self.class) ? form == other.form : form == other
29
+ end
30
+
31
+ def eql?(other)
32
+ self == other && wc == other.wc
33
+ end
34
+
35
+ def inspect
36
+ "#{super}/#{wc}"
37
+ end
38
+
39
+ alias_method :to_s, :inspect
40
+
41
+ end
42
+
43
+ end
@@ -40,20 +40,32 @@ require 'nuggets/util/i18n'
40
40
  begin
41
41
  require 'text/soundex'
42
42
  rescue LoadError
43
- warn "could not load the Text gem -- soundex functionality will not be available"
43
+ warn "Could not load the Text gem -- Soundex functionality will not be available"
44
44
  end
45
45
 
46
46
  LINGO_BASE = ENV['PM_LINGO_BASE'] || (
47
47
  File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
48
48
  )
49
49
 
50
- LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
51
- warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
50
+ if LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
51
+ begin
52
+ require File.join(LINGO_BASE, 'lib', 'const')
53
+ rescue LoadError
54
+ end
55
+ else
56
+ warn "Lingo installation not found at #{LINGO_BASE} -- proceeding anyway"
57
+ end
58
+
59
+ unless Object.const_defined?(:PRINTABLE_CHAR)
60
+ PRINTABLE_CHAR = '[\w-]'
61
+ end
62
+
63
+ PRINTABLE_CHAR_RE = %r{(?:#{PRINTABLE_CHAR})+}
52
64
 
53
65
  lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
54
66
  YAML.load_file(file)
55
67
  else
56
- warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
68
+ warn "Lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
57
69
 
58
70
  {
59
71
  'meeting' => {
@@ -78,30 +90,81 @@ class PerseusMatch
78
90
 
79
91
  class TokenSet < Array
80
92
 
81
- def self.tokenize(form, unknowns = false)
82
- return @tokens[form] if @tokens
93
+ class << self
83
94
 
84
- @_tokens, @tokens = {}, Hash.new { |h, k|
85
- h[k] = new(
86
- k, (@_tokens[k] || []) | (
87
- k.scan(/\w+/) + k.scan(/[\w-]+/)
88
- ).map { |i| @_tokens[i] }.flatten.compact
89
- )
90
- }
95
+ def tokenize(form, unknowns = false)
96
+ form.downcase!
97
+ return @tokens[form] if @tokens ||= nil
98
+
99
+ @_tokens = Hash.new
100
+ @tokens = Hash.new { |h, k| h[k] = new(k, @_tokens[k] || []) }
101
+
102
+ tokens_file = ENV['PM_TOKENS_FILE'] || 'perseus.tokens'
103
+
104
+ if File.readable?(tokens_file)
105
+ File.open(tokens_file) { |f| parse(f, unknowns, @_tokens) }
106
+ @tokens[form]
107
+ else
108
+ raise "Lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
109
+
110
+ cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
111
+ YAML.dump(LINGO_CONFIG, t)
112
+ }
113
+
114
+ file = file?(form) || begin
115
+ temp = Tempfile.open('perseus_match_temp') { |t| t.puts form }
116
+ temp.path
117
+ end
118
+
119
+ ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
120
+
121
+ if keep = ENV['PM_KEEP_TOKENS']
122
+ keep = File.expand_path(keep =~ /\A(?:1|y(?:es)?|true)\z/i ? tokens_file : keep)
123
+ end
124
+
125
+ begin
126
+ Dir.chdir(LINGO_BASE) {
127
+ tokens = %x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}
128
+ File.open(keep, 'w') { |f| f.puts tokens } if keep
129
+ parse(tokens, unknowns, @_tokens)
130
+ }
131
+ ensure
132
+ cfg.unlink
133
+ temp.unlink if temp
134
+ end
135
+
136
+ if temp
137
+ tokens, @tokens = @tokens[form], nil
138
+ tokens
139
+ end
140
+ end
141
+ end
142
+
143
+ def file?(form)
144
+ file = Pathname.new(form).absolute? ? form : File.expand_path(form)
145
+ file if File.file?(file) && File.readable?(file)
146
+ end
147
+
148
+ private
91
149
 
92
- parse = lambda { |x|
93
- x.each_line { |res|
150
+ def parse(output, unknowns = false, tokens = {})
151
+ sanitize = lambda { |a|
152
+ a.sub!(Token::WC_RE, '')
153
+ a.downcase!
154
+ }
155
+
156
+ output.each_line { |res|
94
157
  case res
95
158
  when /<(.*?)\s=\s\[(.*)\]>/
96
159
  a, b = $1, $2
97
- a.sub!(/\|.*/, '')
160
+ sanitize[a]
98
161
 
99
- @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
162
+ tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten.map { |t| Token.new(t) }
100
163
  when /<(.*)>/, /:(.*):/
101
- a, b = $1, $1.dup
102
- a.sub!(/[\/|].*/, '')
164
+ a, b = $1, Token.new($1.downcase)
165
+ sanitize[a]
103
166
 
104
- if unknowns && b =~ /\|\?\z/
167
+ if unknowns && b.unk?
105
168
  if unknowns.respond_to?(:<<)
106
169
  unknowns << a
107
170
  else
@@ -109,134 +172,65 @@ class PerseusMatch
109
172
  end
110
173
  end
111
174
 
112
- @_tokens[a] ||= [b.replace_diacritics.downcase]
175
+ tokens[a] ||= [b]
113
176
  end
114
177
  }
115
- }
116
-
117
- if File.readable?(t = 'perseus.tokens')
118
- File.open(t) { |f| parse[f] }
119
- @tokens[form]
120
- else
121
- raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
122
-
123
- cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
124
- YAML.dump(LINGO_CONFIG, t)
125
- }
126
-
127
- file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
128
178
 
129
- unless File.file?(file) && File.readable?(file)
130
- temp = Tempfile.open('perseus_match_temp') { |t|
131
- t.puts form
132
- }
133
-
134
- file = temp.path
135
- end
136
-
137
- ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
138
-
139
- begin
140
- Dir.chdir(LINGO_BASE) {
141
- parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
142
- }
143
- ensure
144
- cfg.unlink
145
- temp.unlink if temp
146
- end
147
-
148
- if temp
149
- tokens, @tokens = @tokens[form], nil
150
- tokens
151
- else
152
- @tokens[form]
153
- end
179
+ tokens
154
180
  end
181
+
155
182
  end
156
183
 
157
184
  private :push, :<<, :[]= # maybe more...
158
185
 
159
- attr_reader :form
186
+ attr_reader :form, :tokens
160
187
 
161
188
  def initialize(form, tokens = nil)
162
189
  super(tokens || self.class.tokenize(form))
163
190
 
164
191
  @form = form
165
- @tokens = to_a.flatten
192
+ @tokens = to_a
166
193
  end
167
194
 
168
195
  def distance(other)
169
- tokens1, tokens2 = tokens, other.tokens
170
- size1, size2 = tokens1.size, tokens2.size
171
-
172
- return size2 if tokens1.empty?
173
- return size1 if tokens2.empty?
174
-
175
- distance, costs = nil, (0..size2).to_a
176
-
177
- 0.upto(size1 - 1) { |index1|
178
- token1, cost = tokens1[index1], index1 + 1
179
-
180
- 0.upto(size2 - 1) { |index2|
181
- penalty = token1 == tokens2[index2] ? 0 : 1
182
-
183
- # rcov hack :-(
184
- _ = [
185
- costs[index2 + 1] + 1, # insertion
186
- cost + 1, # deletion
187
- costs[index2] + penalty # substitution
188
- ]
189
- distance = _.min
190
-
191
- costs[index2], cost = cost, distance
192
- }
193
-
194
- costs[size2] = distance
195
- }
196
-
197
- distance + 1 # > 0 !?!
196
+ (forms | other.forms).size - (forms & other.forms).size
198
197
  end
199
198
 
200
- def tokens(wc = true)
201
- wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
202
- token.sub(%r{[/|].*?\z}, '')
203
- }
199
+ def forms
200
+ @forms ||= map { |token| token.form }
204
201
  end
205
202
 
206
203
  def disjoint?(other)
207
- (tokens(false) & other.tokens(false)).empty?
204
+ (forms.flatten & other.forms.flatten).flatten.empty?
208
205
  end
209
206
 
210
207
  def inclexcl(inclexcl = {})
211
- incl(inclexcl[:incl] || '.*').excl(inclexcl[:excl])
208
+ incl(inclexcl[:incl] || Token::ANY_WC).excl(inclexcl[:excl])
212
209
  end
213
210
 
214
- def incl(*wc)
215
- (@incl ||= {})[wc = [*wc].compact] ||= select { |token|
216
- match?(token, wc)
217
- }.to_token_set(form)
211
+ def incl(wcs)
212
+ self.class.new(form, select { |token| token.match?(wcs) })
218
213
  end
219
214
 
220
- def excl(*wc)
221
- (@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
222
- match?(token, wc)
223
- }.to_token_set(form)
215
+ def excl(wcs)
216
+ self.class.new(form, reject { |token| token.match?(wcs) })
224
217
  end
225
218
 
226
219
  def soundex
227
- raise "soundex functionality not available" unless defined?(Text::Soundex)
220
+ ensure_soundex!
228
221
 
229
- @soundex ||= map { |token|
230
- token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
231
- }.to_token_set(form)
222
+ @soundex ||= self.class.new(form, map { |token|
223
+ form = token.form.replace_diacritics.sub(/\W+/, '')
224
+ Token.new(Text::Soundex.soundex(form) || '', token.wc)
225
+ })
232
226
  end
233
227
 
234
- def soundex!
235
- replace soundex
228
+ def ==(other)
229
+ tokens == other.tokens
236
230
  end
237
231
 
238
232
  def eql?(other)
239
- tokens == other.tokens && form == other.form
233
+ self == other && form == other.form
240
234
  end
241
235
 
242
236
  def inspect
@@ -247,16 +241,77 @@ class PerseusMatch
247
241
 
248
242
  private
249
243
 
250
- def match?(token, wc)
251
- token =~ %r{[/|](?:#{wc.join('|')})\z}
244
+ def ensure_soundex!
245
+ unless defined?(Text::Soundex)
246
+ raise RuntimeError, "Soundex functionality not available", caller(1)
247
+ end
252
248
  end
253
249
 
254
250
  end
255
251
 
256
- class ::Array
252
+ class PhraseTokenSet < TokenSet
253
+
254
+ class << self
255
+
256
+ def tokenize(form, unknowns = false)
257
+ (@tokens ||= {})[form] ||= new(form, form.scan(PRINTABLE_CHAR_RE).map { |i|
258
+ TokenSet.tokenize(i, unknowns)
259
+ })
260
+ end
261
+
262
+ end
263
+
264
+ alias_method :phrase, :form
265
+ alias_method :token_sets, :tokens
266
+
267
+ # (size1 - size2).abs <= distance <= [size1, size2].max
268
+ def distance(other)
269
+ token_sets1, token_sets2 = token_sets, other.token_sets
270
+ size1, size2 = token_sets1.size, token_sets2.size
271
+
272
+ return size2 if size1 == 0
273
+ return size1 if size2 == 0
274
+
275
+ distance, costs = nil, (0..size2).to_a
276
+
277
+ 0.upto(size1 - 1) { |index1|
278
+ token_set1, cost = token_sets1[index1], index1 + 1
279
+
280
+ 0.upto(size2 - 1) { |index2|
281
+ penalty = token_set1.distance(token_sets2[index2])
282
+
283
+ # rcov hack :-(
284
+ _ = [
285
+ costs[index2 + 1] + 1, # insertion
286
+ cost + 1, # deletion
287
+ costs[index2] + penalty # substitution
288
+ ]
289
+ distance = _.min
257
290
 
258
- def to_token_set(form)
259
- TokenSet.new(form, self)
291
+ costs[index2], cost = cost, distance
292
+ }
293
+
294
+ costs[size2] = distance
295
+ }
296
+
297
+ distance
298
+ end
299
+
300
+ def forms
301
+ @forms ||= map { |token_set| token_set.forms }
302
+ end
303
+
304
+ def incl(wcs)
305
+ self.class.new(form, map { |token_set| token_set.incl(wcs) })
306
+ end
307
+
308
+ def excl(wcs)
309
+ self.class.new(form, map { |token_set| token_set.excl(wcs) })
310
+ end
311
+
312
+ def soundex
313
+ ensure_soundex!
314
+ @soundex ||= self.class.new(form, map { |token_set| token_set.soundex })
260
315
  end
261
316
 
262
317
  end
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 6
7
+ TINY = 7
8
8
 
9
9
  class << self
10
10
 
@@ -1,81 +1,105 @@
1
- describe PerseusMatch::TokenSet, ' with lingo' do
1
+ describe PerseusMatch::PhraseTokenSet do
2
2
 
3
- before :each do
4
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
5
- end
3
+ describe 'with lingo' do
6
4
 
7
- before :all do
8
- @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
9
- end
5
+ before :all do
6
+ @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
7
+ @original_phrase_tokens = PerseusMatch::PhraseTokenSet.instance_variable_get(:@tokens)
8
+ end
10
9
 
11
- after :all do
12
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
13
- end
10
+ after :all do
11
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
12
+ PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, @original_phrase_tokens)
13
+ end
14
14
 
15
- it 'should tokenize a string' do
16
- PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
17
- end
15
+ before :each do
16
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
17
+ PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, nil)
18
+ end
18
19
 
19
- it 'should report strictly equal TokenSets as ==' do
20
- PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
21
- end
20
+ it 'should tokenize a string' do
21
+ PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
22
+ end
22
23
 
23
- it 'should report strictly equal TokenSets as eql' do
24
- PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
25
- end
24
+ it 'should report strictly equal PhraseTokenSets as ==' do
25
+ PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('foo bar')
26
+ end
26
27
 
27
- it 'should report slightly equal TokenSets as ==' do
28
- PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
29
- end
28
+ it 'should report strictly equal PhraseTokenSets as eql' do
29
+ PerseusMatch::PhraseTokenSet.new('foo bar').should be_eql(PerseusMatch::PhraseTokenSet.new('foo bar'))
30
+ end
30
31
 
31
- it 'should *not* report slightly equal TokenSets as eql' do
32
- PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
33
- end
32
+ it 'should report slightly equal PhraseTokenSets as ==' do
33
+ PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('Foo Bar')
34
+ end
34
35
 
35
- it 'should include form in inspect' do
36
- PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
37
- end
36
+ it 'should *not* report slightly equal PhraseTokenSets as eql' do
37
+ PerseusMatch::PhraseTokenSet.new('foo bar').should_not be_eql(PerseusMatch::PhraseTokenSet.new('Foo Bar'))
38
+ end
38
39
 
39
- end if LINGO_FOUND
40
+ it 'should collect unknown tokens' do
41
+ unknowns = []
42
+ PerseusMatch::PhraseTokenSet.tokenize('foo bar', unknowns)
43
+ unknowns.should == %w[foo]
44
+ end
40
45
 
41
- describe PerseusMatch::TokenSet, ' without lingo' do
46
+ it 'should include form in inspect' do
47
+ PerseusMatch::PhraseTokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
48
+ end
42
49
 
43
- before :each do
44
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
45
- end
50
+ end if LINGO_FOUND
46
51
 
47
- before :all do
48
- @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
49
- end
52
+ describe 'without lingo' do
50
53
 
51
- after :all do
52
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
53
- end
54
+ before :all do
55
+ @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
56
+ end
57
+
58
+ after :all do
59
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
60
+ end
54
61
 
55
- it 'should take a prepared file for tokenization' do
56
- # prevent lingo from being used
57
- lingo_base = LINGO_BASE.dup
58
- LINGO_BASE.replace('')
62
+ before :each do
63
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
64
+ end
59
65
 
60
- temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
61
- t.puts *%w[<foo|?> <bar|?>]
62
- }
66
+ it 'should take a prepared file for tokenization' do
67
+ # prevent lingo from being used
68
+ lingo_base = LINGO_BASE.dup
69
+ LINGO_BASE.replace('')
63
70
 
64
- path = temp.path
65
- link = 'perseus.tokens'
71
+ temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
72
+ t.puts *%w[<foo|?> <bar|?>]
73
+ }
66
74
 
67
- Dir.chdir(File.dirname(path)) {
68
- File.symlink(path, link)
75
+ path = temp.path
76
+ link = 'perseus.tokens'
69
77
 
70
- PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
78
+ Dir.chdir(File.dirname(path)) {
79
+ begin
80
+ File.symlink(path, link)
81
+ PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
82
+ ensure
83
+ File.unlink(link) if File.symlink?(link) && File.readlink(link) == path
84
+ end
85
+ }
86
+
87
+ temp.unlink
88
+
89
+ # reset lingo base
90
+ LINGO_BASE.replace(lingo_base)
91
+ end
92
+
93
+ end
71
94
 
72
- File.unlink(link)
73
- }
95
+ it 'should raise an error if asked for Soundex but is not available' do
96
+ soundex = Text.send(:remove_const, :Soundex)
74
97
 
75
- temp.unlink
98
+ lambda {
99
+ PerseusMatch::PhraseTokenSet.new('foo bar').soundex
100
+ }.should raise_error(RuntimeError, /soundex/i)
76
101
 
77
- # reset lingo base
78
- LINGO_BASE.replace(lingo_base)
102
+ Text::Soundex = soundex
79
103
  end
80
104
 
81
105
  end
@@ -0,0 +1,23 @@
1
+ describe PerseusMatch::Token do
2
+
3
+ it 'should report strictly equal Tokens as ==' do
4
+ PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'a')
5
+ end
6
+
7
+ it 'should report strictly equal Tokens as eql' do
8
+ PerseusMatch::Token.new('foo', 'a').should be_eql(PerseusMatch::Token.new('foo', 'a'))
9
+ end
10
+
11
+ it 'should report slightly equal Tokens as ==' do
12
+ PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'b')
13
+ end
14
+
15
+ it 'should *not* report slightly equal Tokens as eql' do
16
+ PerseusMatch::Token.new('foo', 'a').should_not be_eql(PerseusMatch::Token.new('foo', 'b'))
17
+ end
18
+
19
+ it 'should include the word class in inspect' do
20
+ PerseusMatch::Token.new('foo', 'a').inspect.to_s.should =~ /\/a\z/
21
+ end
22
+
23
+ end
@@ -37,7 +37,7 @@ describe PerseusMatch do
37
37
  t.puts *phrases
38
38
  }
39
39
 
40
- PerseusMatch::TokenSet.tokenize(temp.path)
40
+ PerseusMatch.tokenize(temp.path)
41
41
 
42
42
  temp.unlink
43
43
 
@@ -158,13 +158,8 @@ describe PerseusMatch do
158
158
 
159
159
  it 'should be checkable (2)' do
160
160
  lambda {
161
- begin
162
- PerseusMatch.check!('foo', 'bar', 0, :>)
163
- rescue PerseusMatch::CheckFailedError => err
164
- err.to_s.should =~ /0/
165
- raise err
166
- end
167
- }.should raise_error(PerseusMatch::CheckFailedError)
161
+ PerseusMatch.check!('foo', 'bar', 0, :>)
162
+ }.should raise_error(PerseusMatch::CheckFailedError, /0/)
168
163
  end
169
164
 
170
165
  end if LINGO_FOUND
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blackwinter-perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-01-26 00:00:00 -08:00
12
+ date: 2009-02-24 00:00:00 -08:00
13
13
  default_executable: perseus_match
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ruby-backports
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,6 +24,7 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: ruby-nuggets
27
+ type: :runtime
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
@@ -30,6 +32,16 @@ dependencies:
30
32
  - !ruby/object:Gem::Version
31
33
  version: 0.4.0
32
34
  version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: unicode
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.1.1
44
+ version:
33
45
  description: Fuzzy string matching based on linguistic analysis
34
46
  email: jens.wille@uni-koeln.de
35
47
  executables:
@@ -41,6 +53,8 @@ extra_rdoc_files:
41
53
  - ChangeLog
42
54
  - README
43
55
  files:
56
+ - lib/perseus_match/token.rb
57
+ - lib/perseus_match/core_ext.rb
44
58
  - lib/perseus_match/list.rb
45
59
  - lib/perseus_match/version.rb
46
60
  - lib/perseus_match/token_set.rb
@@ -54,6 +68,7 @@ files:
54
68
  - spec/spec_helper.rb
55
69
  - spec/perseus_match/list_spec.rb
56
70
  - spec/perseus_match/cluster_spec.rb
71
+ - spec/perseus_match/token_spec.rb
57
72
  - spec/perseus_match/token_set_spec.rb
58
73
  - spec/perseus_match_spec.rb
59
74
  - sample/config.yaml