perseus_match 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.6
5
+ This documentation refers to perseus_match version 0.0.7
6
6
 
7
7
 
8
8
  == DESCRIPTION
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ begin
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
16
  :extra_files => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
17
- :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
17
+ :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0'], ['unicode', '>= 0.1.1']]
18
18
  }
19
19
  }}
20
20
  rescue LoadError
data/bin/perseus_match CHANGED
@@ -130,7 +130,7 @@ end
130
130
 
131
131
  unknowns = Set.new if options[:unknowns]
132
132
 
133
- PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
133
+ PerseusMatch.tokenize(file, unknowns || !options[:silent])
134
134
 
135
135
  if unknowns
136
136
  File.open(options[:unknowns], 'w') { |f|
@@ -154,7 +154,12 @@ list_options = { :minimal => options[:minimal] }
154
154
  threshold, count, count_all = options[:threshold], 0, 0
155
155
 
156
156
  action = if options[:check]
157
- require 'fastercsv'
157
+ require 'csv'
158
+
159
+ if CSV.const_defined?(:Reader)
160
+ require 'fastercsv'
161
+ CSV = FasterCSV
162
+ end
158
163
 
159
164
  format = if options[:align]
160
165
  require 'jcode'
@@ -184,23 +189,23 @@ action = if options[:check]
184
189
  positives = negatives = false_positives = false_negatives = 0.0
185
190
 
186
191
  phrases.each { |line|
187
- phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
192
+ phrase, target, threshold, operator, _ = *CSV.parse_line(line)
188
193
 
189
194
  threshold ||= global_threshold
190
195
  operator ||= '>'
191
196
  assign = operator =~ />/ || operator == '=='
192
197
 
193
198
  begin
194
- PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
199
+ res = PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
195
200
 
196
201
  count += 1
197
202
  assign ? positives += 1 : negatives += 1
198
203
 
199
- puts format[line, 'OK'] unless adjust_coeff || failed_only
204
+ puts format[line, "OK -- #{res.value} (#{res.pm.distance})"] unless adjust_coeff || failed_only
200
205
  rescue PerseusMatch::CheckFailedError => err
201
206
  assign ? false_negatives += 1 : false_positives += 1
202
207
 
203
- puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
208
+ puts format[line, "FAILED -- #{err.value} (#{err.pm.distance})"] unless adjust_coeff
204
209
  end
205
210
 
206
211
  count_all += 1
@@ -222,8 +227,17 @@ action = if options[:check]
222
227
  precision = divide[positives, positives + false_positives]
223
228
  f1 = divide[2 * recall * precision, recall + precision]
224
229
 
225
- stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
226
- recall * 100, precision * 100, f1, error
230
+ error_all = divide[ # trivial: assign all
231
+ negatives + false_positives,
232
+ positives + negatives + false_positives + false_negatives
233
+ ]
234
+ error_none = divide[ # trivial: assign none
235
+ positives + false_negatives,
236
+ positives + negatives + false_positives + false_negatives
237
+ ]
238
+
239
+ stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f (ALL = %0.4f, NONE = %0.4f)' % [
240
+ recall * 100, precision * 100, f1, error, error_all, error_none
227
241
  ]
228
242
 
229
243
  stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'unicode'
3
+
4
+ class String
5
+
6
+ def downcase
7
+ Unicode.downcase(self)
8
+ end
9
+
10
+ def downcase!
11
+ replace downcase
12
+ end
13
+
14
+ end
@@ -0,0 +1,43 @@
1
+ class PerseusMatch
2
+
3
+ class Token < String
4
+
5
+ WC_RE = %r{[/|]([^/|]*)\z}
6
+
7
+ ANY_WC = '*'.freeze
8
+
9
+ attr_reader :form, :wc
10
+
11
+ def initialize(form, wc = nil)
12
+ @form = form.sub(WC_RE, '')
13
+ @wc = wc || $1
14
+
15
+ super(@form)
16
+ end
17
+
18
+ def match?(wcs)
19
+ wcs = [*wcs].compact
20
+ wcs.include?(wc) || wcs.include?(ANY_WC)
21
+ end
22
+
23
+ def unk?
24
+ wc == '?'
25
+ end
26
+
27
+ def ==(other)
28
+ other.is_a?(self.class) ? form == other.form : form == other
29
+ end
30
+
31
+ def eql?(other)
32
+ self == other && wc == other.wc
33
+ end
34
+
35
+ def inspect
36
+ "#{super}/#{wc}"
37
+ end
38
+
39
+ alias_method :to_s, :inspect
40
+
41
+ end
42
+
43
+ end
@@ -40,20 +40,32 @@ require 'nuggets/util/i18n'
40
40
  begin
41
41
  require 'text/soundex'
42
42
  rescue LoadError
43
- warn "could not load the Text gem -- soundex functionality will not be available"
43
+ warn "Could not load the Text gem -- Soundex functionality will not be available"
44
44
  end
45
45
 
46
46
  LINGO_BASE = ENV['PM_LINGO_BASE'] || (
47
47
  File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
48
48
  )
49
49
 
50
- LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
51
- warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
50
+ if LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
51
+ begin
52
+ require File.join(LINGO_BASE, 'lib', 'const')
53
+ rescue LoadError
54
+ end
55
+ else
56
+ warn "Lingo installation not found at #{LINGO_BASE} -- proceeding anyway"
57
+ end
58
+
59
+ unless Object.const_defined?(:PRINTABLE_CHAR)
60
+ PRINTABLE_CHAR = '[\w-]'
61
+ end
62
+
63
+ PRINTABLE_CHAR_RE = %r{(?:#{PRINTABLE_CHAR})+}
52
64
 
53
65
  lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
54
66
  YAML.load_file(file)
55
67
  else
56
- warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
68
+ warn "Lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
57
69
 
58
70
  {
59
71
  'meeting' => {
@@ -78,30 +90,81 @@ class PerseusMatch
78
90
 
79
91
  class TokenSet < Array
80
92
 
81
- def self.tokenize(form, unknowns = false)
82
- return @tokens[form] if @tokens
93
+ class << self
83
94
 
84
- @_tokens, @tokens = {}, Hash.new { |h, k|
85
- h[k] = new(
86
- k, (@_tokens[k] || []) | (
87
- k.scan(/\w+/) + k.scan(/[\w-]+/)
88
- ).map { |i| @_tokens[i] }.flatten.compact
89
- )
90
- }
95
+ def tokenize(form, unknowns = false)
96
+ form.downcase!
97
+ return @tokens[form] if @tokens ||= nil
98
+
99
+ @_tokens = Hash.new
100
+ @tokens = Hash.new { |h, k| h[k] = new(k, @_tokens[k] || []) }
101
+
102
+ tokens_file = ENV['PM_TOKENS_FILE'] || 'perseus.tokens'
103
+
104
+ if File.readable?(tokens_file)
105
+ File.open(tokens_file) { |f| parse(f, unknowns, @_tokens) }
106
+ @tokens[form]
107
+ else
108
+ raise "Lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
109
+
110
+ cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
111
+ YAML.dump(LINGO_CONFIG, t)
112
+ }
113
+
114
+ file = file?(form) || begin
115
+ temp = Tempfile.open('perseus_match_temp') { |t| t.puts form }
116
+ temp.path
117
+ end
118
+
119
+ ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
120
+
121
+ if keep = ENV['PM_KEEP_TOKENS']
122
+ keep = File.expand_path(keep =~ /\A(?:1|y(?:es)?|true)\z/i ? tokens_file : keep)
123
+ end
124
+
125
+ begin
126
+ Dir.chdir(LINGO_BASE) {
127
+ tokens = %x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}
128
+ File.open(keep, 'w') { |f| f.puts tokens } if keep
129
+ parse(tokens, unknowns, @_tokens)
130
+ }
131
+ ensure
132
+ cfg.unlink
133
+ temp.unlink if temp
134
+ end
135
+
136
+ if temp
137
+ tokens, @tokens = @tokens[form], nil
138
+ tokens
139
+ end
140
+ end
141
+ end
142
+
143
+ def file?(form)
144
+ file = Pathname.new(form).absolute? ? form : File.expand_path(form)
145
+ file if File.file?(file) && File.readable?(file)
146
+ end
147
+
148
+ private
91
149
 
92
- parse = lambda { |x|
93
- x.each_line { |res|
150
+ def parse(output, unknowns = false, tokens = {})
151
+ sanitize = lambda { |a|
152
+ a.sub!(Token::WC_RE, '')
153
+ a.downcase!
154
+ }
155
+
156
+ output.each_line { |res|
94
157
  case res
95
158
  when /<(.*?)\s=\s\[(.*)\]>/
96
159
  a, b = $1, $2
97
- a.sub!(/\|.*/, '')
160
+ sanitize[a]
98
161
 
99
- @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
162
+ tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten.map { |t| Token.new(t) }
100
163
  when /<(.*)>/, /:(.*):/
101
- a, b = $1, $1.dup
102
- a.sub!(/[\/|].*/, '')
164
+ a, b = $1, Token.new($1.downcase)
165
+ sanitize[a]
103
166
 
104
- if unknowns && b =~ /\|\?\z/
167
+ if unknowns && b.unk?
105
168
  if unknowns.respond_to?(:<<)
106
169
  unknowns << a
107
170
  else
@@ -109,134 +172,65 @@ class PerseusMatch
109
172
  end
110
173
  end
111
174
 
112
- @_tokens[a] ||= [b.replace_diacritics.downcase]
175
+ tokens[a] ||= [b]
113
176
  end
114
177
  }
115
- }
116
-
117
- if File.readable?(t = 'perseus.tokens')
118
- File.open(t) { |f| parse[f] }
119
- @tokens[form]
120
- else
121
- raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
122
-
123
- cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
124
- YAML.dump(LINGO_CONFIG, t)
125
- }
126
-
127
- file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
128
178
 
129
- unless File.file?(file) && File.readable?(file)
130
- temp = Tempfile.open('perseus_match_temp') { |t|
131
- t.puts form
132
- }
133
-
134
- file = temp.path
135
- end
136
-
137
- ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
138
-
139
- begin
140
- Dir.chdir(LINGO_BASE) {
141
- parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
142
- }
143
- ensure
144
- cfg.unlink
145
- temp.unlink if temp
146
- end
147
-
148
- if temp
149
- tokens, @tokens = @tokens[form], nil
150
- tokens
151
- else
152
- @tokens[form]
153
- end
179
+ tokens
154
180
  end
181
+
155
182
  end
156
183
 
157
184
  private :push, :<<, :[]= # maybe more...
158
185
 
159
- attr_reader :form
186
+ attr_reader :form, :tokens
160
187
 
161
188
  def initialize(form, tokens = nil)
162
189
  super(tokens || self.class.tokenize(form))
163
190
 
164
191
  @form = form
165
- @tokens = to_a.flatten
192
+ @tokens = to_a
166
193
  end
167
194
 
168
195
  def distance(other)
169
- tokens1, tokens2 = tokens, other.tokens
170
- size1, size2 = tokens1.size, tokens2.size
171
-
172
- return size2 if tokens1.empty?
173
- return size1 if tokens2.empty?
174
-
175
- distance, costs = nil, (0..size2).to_a
176
-
177
- 0.upto(size1 - 1) { |index1|
178
- token1, cost = tokens1[index1], index1 + 1
179
-
180
- 0.upto(size2 - 1) { |index2|
181
- penalty = token1 == tokens2[index2] ? 0 : 1
182
-
183
- # rcov hack :-(
184
- _ = [
185
- costs[index2 + 1] + 1, # insertion
186
- cost + 1, # deletion
187
- costs[index2] + penalty # substitution
188
- ]
189
- distance = _.min
190
-
191
- costs[index2], cost = cost, distance
192
- }
193
-
194
- costs[size2] = distance
195
- }
196
-
197
- distance + 1 # > 0 !?!
196
+ (forms | other.forms).size - (forms & other.forms).size
198
197
  end
199
198
 
200
- def tokens(wc = true)
201
- wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
202
- token.sub(%r{[/|].*?\z}, '')
203
- }
199
+ def forms
200
+ @forms ||= map { |token| token.form }
204
201
  end
205
202
 
206
203
  def disjoint?(other)
207
- (tokens(false) & other.tokens(false)).empty?
204
+ (forms.flatten & other.forms.flatten).flatten.empty?
208
205
  end
209
206
 
210
207
  def inclexcl(inclexcl = {})
211
- incl(inclexcl[:incl] || '.*').excl(inclexcl[:excl])
208
+ incl(inclexcl[:incl] || Token::ANY_WC).excl(inclexcl[:excl])
212
209
  end
213
210
 
214
- def incl(*wc)
215
- (@incl ||= {})[wc = [*wc].compact] ||= select { |token|
216
- match?(token, wc)
217
- }.to_token_set(form)
211
+ def incl(wcs)
212
+ self.class.new(form, select { |token| token.match?(wcs) })
218
213
  end
219
214
 
220
- def excl(*wc)
221
- (@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
222
- match?(token, wc)
223
- }.to_token_set(form)
215
+ def excl(wcs)
216
+ self.class.new(form, reject { |token| token.match?(wcs) })
224
217
  end
225
218
 
226
219
  def soundex
227
- raise "soundex functionality not available" unless defined?(Text::Soundex)
220
+ ensure_soundex!
228
221
 
229
- @soundex ||= map { |token|
230
- token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
231
- }.to_token_set(form)
222
+ @soundex ||= self.class.new(form, map { |token|
223
+ form = token.form.replace_diacritics.sub(/\W+/, '')
224
+ Token.new(Text::Soundex.soundex(form) || '', token.wc)
225
+ })
232
226
  end
233
227
 
234
- def soundex!
235
- replace soundex
228
+ def ==(other)
229
+ tokens == other.tokens
236
230
  end
237
231
 
238
232
  def eql?(other)
239
- tokens == other.tokens && form == other.form
233
+ self == other && form == other.form
240
234
  end
241
235
 
242
236
  def inspect
@@ -247,16 +241,77 @@ class PerseusMatch
247
241
 
248
242
  private
249
243
 
250
- def match?(token, wc)
251
- token =~ %r{[/|](?:#{wc.join('|')})\z}
244
+ def ensure_soundex!
245
+ unless defined?(Text::Soundex)
246
+ raise RuntimeError, "Soundex functionality not available", caller(1)
247
+ end
252
248
  end
253
249
 
254
250
  end
255
251
 
256
- class ::Array
252
+ class PhraseTokenSet < TokenSet
253
+
254
+ class << self
255
+
256
+ def tokenize(form, unknowns = false)
257
+ (@tokens ||= {})[form] ||= new(form, form.scan(PRINTABLE_CHAR_RE).map { |i|
258
+ TokenSet.tokenize(i, unknowns)
259
+ })
260
+ end
261
+
262
+ end
263
+
264
+ alias_method :phrase, :form
265
+ alias_method :token_sets, :tokens
266
+
267
+ # (size1 - size2).abs <= distance <= [size1, size2].max
268
+ def distance(other)
269
+ token_sets1, token_sets2 = token_sets, other.token_sets
270
+ size1, size2 = token_sets1.size, token_sets2.size
271
+
272
+ return size2 if size1 == 0
273
+ return size1 if size2 == 0
274
+
275
+ distance, costs = nil, (0..size2).to_a
276
+
277
+ 0.upto(size1 - 1) { |index1|
278
+ token_set1, cost = token_sets1[index1], index1 + 1
279
+
280
+ 0.upto(size2 - 1) { |index2|
281
+ penalty = token_set1.distance(token_sets2[index2])
282
+
283
+ # rcov hack :-(
284
+ _ = [
285
+ costs[index2 + 1] + 1, # insertion
286
+ cost + 1, # deletion
287
+ costs[index2] + penalty # substitution
288
+ ]
289
+ distance = _.min
257
290
 
258
- def to_token_set(form)
259
- TokenSet.new(form, self)
291
+ costs[index2], cost = cost, distance
292
+ }
293
+
294
+ costs[size2] = distance
295
+ }
296
+
297
+ distance
298
+ end
299
+
300
+ def forms
301
+ @forms ||= map { |token_set| token_set.forms }
302
+ end
303
+
304
+ def incl(wcs)
305
+ self.class.new(form, map { |token_set| token_set.incl(wcs) })
306
+ end
307
+
308
+ def excl(wcs)
309
+ self.class.new(form, map { |token_set| token_set.excl(wcs) })
310
+ end
311
+
312
+ def soundex
313
+ ensure_soundex!
314
+ @soundex ||= self.class.new(form, map { |token_set| token_set.soundex })
260
315
  end
261
316
 
262
317
  end
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 6
7
+ TINY = 7
8
8
 
9
9
  class << self
10
10
 
data/lib/perseus_match.rb CHANGED
@@ -26,8 +26,11 @@
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
+ require 'perseus_match/core_ext'
30
+
29
31
  require 'perseus_match/list'
30
32
  require 'perseus_match/cluster'
33
+ require 'perseus_match/token'
31
34
  require 'perseus_match/token_set'
32
35
 
33
36
  require 'perseus_match/version'
@@ -36,7 +39,7 @@ class PerseusMatch
36
39
 
37
40
  Infinity = 1.0 / 0
38
41
 
39
- DEFAULT_COEFF = 20
42
+ DEFAULT_COEFF = 2
40
43
 
41
44
  DISTANCE_SPEC = [ # {
42
45
  [{}, 1], # {} => 1,
@@ -68,8 +71,22 @@ class PerseusMatch
68
71
  end
69
72
 
70
73
  def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
71
- value = new(phrase, target, pm_options).send(attribute)
72
- value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
74
+ pm = new(phrase, target, pm_options)
75
+ value = pm.send(attribute)
76
+
77
+ if value.send(operator, threshold)
78
+ Struct.new(:pm, :value, :threshold, :operator).new(pm, value, threshold, operator)
79
+ else
80
+ raise CheckFailedError.new(pm, value, threshold, operator)
81
+ end
82
+ end
83
+
84
+ def tokenize(form, unknowns = false)
85
+ if file = TokenSet.file?(form)
86
+ TokenSet.tokenize(file, unknowns)
87
+ else
88
+ PhraseTokenSet.tokenize(form, unknowns)
89
+ end
73
90
  end
74
91
 
75
92
  end
@@ -77,8 +94,8 @@ class PerseusMatch
77
94
  attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
78
95
 
79
96
  def initialize(phrase, target, options = {})
80
- @phrase = phrase.to_s
81
- @target = target.to_s
97
+ @phrase = sanitize(phrase.to_s)
98
+ @target = sanitize(target.to_s)
82
99
 
83
100
  @default_coeff = options[:default_coeff] || DEFAULT_COEFF
84
101
  @distance_spec = options[:distance_spec] || DISTANCE_SPEC
@@ -89,11 +106,11 @@ class PerseusMatch
89
106
  end
90
107
 
91
108
  def phrase_tokens
92
- @phrase_tokens ||= tokenize(phrase)
109
+ @phrase_tokens ||= self.class.tokenize(phrase)
93
110
  end
94
111
 
95
112
  def target_tokens
96
- @target_tokens ||= tokenize(target)
113
+ @target_tokens ||= self.class.tokenize(target)
97
114
  end
98
115
 
99
116
  # 0 <= distance <= Infinity
@@ -104,13 +121,13 @@ class PerseusMatch
104
121
  # 1 >= similarity >= 0
105
122
  def similarity(coeff = nil)
106
123
  coeff ||= default_coeff # passed arg may be nil
107
- @similarity[coeff] ||= 1 / Math.exp(distance / (coeff * total_weight))
124
+ @similarity[coeff] ||= normalize_distance(coeff)
108
125
  end
109
126
 
110
127
  private
111
128
 
112
- def tokenize(str)
113
- TokenSet.new(str)
129
+ def sanitize(str)
130
+ str.gsub(/\s*\(.*?\)|\s*\[.*?\]/, '').sub(/\s*[\/:].*/, '')
114
131
  end
115
132
 
116
133
  def calculate_distance
@@ -148,16 +165,25 @@ class PerseusMatch
148
165
  distance
149
166
  end
150
167
 
168
+ def normalize_distance(coeff)
169
+ length = phrase_tokens.size + target_tokens.size
170
+ return 0 if length == 0
171
+
172
+ norm = Math.log(length ** Math.sqrt(2)) * coeff * total_weight * Math::E
173
+
174
+ 1 / Math.exp(distance / norm)
175
+ end
176
+
151
177
  def total_weight
152
178
  @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
153
179
  end
154
180
 
155
181
  class CheckFailedError < StandardError
156
182
 
157
- attr_reader :value, :threshold, :operator
183
+ attr_reader :pm, :value, :threshold, :operator
158
184
 
159
- def initialize(value, threshold, operator)
160
- @value, @threshold, @operator = value, threshold, operator
185
+ def initialize(pm, value, threshold, operator)
186
+ @pm, @value, @threshold, @operator = pm, value, threshold, operator
161
187
  end
162
188
 
163
189
  def to_s
@@ -1,81 +1,105 @@
1
- describe PerseusMatch::TokenSet, ' with lingo' do
1
+ describe PerseusMatch::PhraseTokenSet do
2
2
 
3
- before :each do
4
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
5
- end
3
+ describe 'with lingo' do
6
4
 
7
- before :all do
8
- @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
9
- end
5
+ before :all do
6
+ @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
7
+ @original_phrase_tokens = PerseusMatch::PhraseTokenSet.instance_variable_get(:@tokens)
8
+ end
10
9
 
11
- after :all do
12
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
13
- end
10
+ after :all do
11
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
12
+ PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, @original_phrase_tokens)
13
+ end
14
14
 
15
- it 'should tokenize a string' do
16
- PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
17
- end
15
+ before :each do
16
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
17
+ PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, nil)
18
+ end
18
19
 
19
- it 'should report strictly equal TokenSets as ==' do
20
- PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
21
- end
20
+ it 'should tokenize a string' do
21
+ PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
22
+ end
22
23
 
23
- it 'should report strictly equal TokenSets as eql' do
24
- PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
25
- end
24
+ it 'should report strictly equal PhraseTokenSets as ==' do
25
+ PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('foo bar')
26
+ end
26
27
 
27
- it 'should report slightly equal TokenSets as ==' do
28
- PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
29
- end
28
+ it 'should report strictly equal PhraseTokenSets as eql' do
29
+ PerseusMatch::PhraseTokenSet.new('foo bar').should be_eql(PerseusMatch::PhraseTokenSet.new('foo bar'))
30
+ end
30
31
 
31
- it 'should *not* report slightly equal TokenSets as eql' do
32
- PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
33
- end
32
+ it 'should report slightly equal PhraseTokenSets as ==' do
33
+ PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('Foo Bar')
34
+ end
34
35
 
35
- it 'should include form in inspect' do
36
- PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
37
- end
36
+ it 'should *not* report slightly equal PhraseTokenSets as eql' do
37
+ PerseusMatch::PhraseTokenSet.new('foo bar').should_not be_eql(PerseusMatch::PhraseTokenSet.new('Foo Bar'))
38
+ end
38
39
 
39
- end if LINGO_FOUND
40
+ it 'should collect unknown tokens' do
41
+ unknowns = []
42
+ PerseusMatch::PhraseTokenSet.tokenize('foo bar', unknowns)
43
+ unknowns.should == %w[foo]
44
+ end
40
45
 
41
- describe PerseusMatch::TokenSet, ' without lingo' do
46
+ it 'should include form in inspect' do
47
+ PerseusMatch::PhraseTokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
48
+ end
42
49
 
43
- before :each do
44
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
45
- end
50
+ end if LINGO_FOUND
46
51
 
47
- before :all do
48
- @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
49
- end
52
+ describe 'without lingo' do
50
53
 
51
- after :all do
52
- PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
53
- end
54
+ before :all do
55
+ @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
56
+ end
57
+
58
+ after :all do
59
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
60
+ end
54
61
 
55
- it 'should take a prepared file for tokenization' do
56
- # prevent lingo from being used
57
- lingo_base = LINGO_BASE.dup
58
- LINGO_BASE.replace('')
62
+ before :each do
63
+ PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
64
+ end
59
65
 
60
- temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
61
- t.puts *%w[<foo|?> <bar|?>]
62
- }
66
+ it 'should take a prepared file for tokenization' do
67
+ # prevent lingo from being used
68
+ lingo_base = LINGO_BASE.dup
69
+ LINGO_BASE.replace('')
63
70
 
64
- path = temp.path
65
- link = 'perseus.tokens'
71
+ temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
72
+ t.puts *%w[<foo|?> <bar|?>]
73
+ }
66
74
 
67
- Dir.chdir(File.dirname(path)) {
68
- File.symlink(path, link)
75
+ path = temp.path
76
+ link = 'perseus.tokens'
69
77
 
70
- PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
78
+ Dir.chdir(File.dirname(path)) {
79
+ begin
80
+ File.symlink(path, link)
81
+ PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
82
+ ensure
83
+ File.unlink(link) if File.symlink?(link) && File.readlink(link) == path
84
+ end
85
+ }
86
+
87
+ temp.unlink
88
+
89
+ # reset lingo base
90
+ LINGO_BASE.replace(lingo_base)
91
+ end
92
+
93
+ end
71
94
 
72
- File.unlink(link)
73
- }
95
+ it 'should raise an error if asked for Soundex but is not available' do
96
+ soundex = Text.send(:remove_const, :Soundex)
74
97
 
75
- temp.unlink
98
+ lambda {
99
+ PerseusMatch::PhraseTokenSet.new('foo bar').soundex
100
+ }.should raise_error(RuntimeError, /soundex/i)
76
101
 
77
- # reset lingo base
78
- LINGO_BASE.replace(lingo_base)
102
+ Text::Soundex = soundex
79
103
  end
80
104
 
81
105
  end
@@ -0,0 +1,23 @@
1
+ describe PerseusMatch::Token do
2
+
3
+ it 'should report strictly equal Tokens as ==' do
4
+ PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'a')
5
+ end
6
+
7
+ it 'should report strictly equal Tokens as eql' do
8
+ PerseusMatch::Token.new('foo', 'a').should be_eql(PerseusMatch::Token.new('foo', 'a'))
9
+ end
10
+
11
+ it 'should report slightly equal Tokens as ==' do
12
+ PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'b')
13
+ end
14
+
15
+ it 'should *not* report slightly equal Tokens as eql' do
16
+ PerseusMatch::Token.new('foo', 'a').should_not be_eql(PerseusMatch::Token.new('foo', 'b'))
17
+ end
18
+
19
+ it 'should include the word class in inspect' do
20
+ PerseusMatch::Token.new('foo', 'a').inspect.to_s.should =~ /\/a\z/
21
+ end
22
+
23
+ end
@@ -37,7 +37,7 @@ describe PerseusMatch do
37
37
  t.puts *phrases
38
38
  }
39
39
 
40
- PerseusMatch::TokenSet.tokenize(temp.path)
40
+ PerseusMatch.tokenize(temp.path)
41
41
 
42
42
  temp.unlink
43
43
 
@@ -158,13 +158,8 @@ describe PerseusMatch do
158
158
 
159
159
  it 'should be checkable (2)' do
160
160
  lambda {
161
- begin
162
- PerseusMatch.check!('foo', 'bar', 0, :>)
163
- rescue PerseusMatch::CheckFailedError => err
164
- err.to_s.should =~ /0/
165
- raise err
166
- end
167
- }.should raise_error(PerseusMatch::CheckFailedError)
161
+ PerseusMatch.check!('foo', 'bar', 0, :>)
162
+ }.should raise_error(PerseusMatch::CheckFailedError, /0/)
168
163
  end
169
164
 
170
165
  end if LINGO_FOUND
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-01-26 00:00:00 +01:00
12
+ date: 2009-02-24 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -32,6 +32,16 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: 0.4.0
34
34
  version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: unicode
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.1.1
44
+ version:
35
45
  description: Fuzzy string matching based on linguistic analysis
36
46
  email: jens.wille@uni-koeln.de
37
47
  executables:
@@ -43,6 +53,8 @@ extra_rdoc_files:
43
53
  - ChangeLog
44
54
  - README
45
55
  files:
56
+ - lib/perseus_match/token.rb
57
+ - lib/perseus_match/core_ext.rb
46
58
  - lib/perseus_match/list.rb
47
59
  - lib/perseus_match/version.rb
48
60
  - lib/perseus_match/token_set.rb
@@ -56,6 +68,7 @@ files:
56
68
  - spec/spec_helper.rb
57
69
  - spec/perseus_match/list_spec.rb
58
70
  - spec/perseus_match/cluster_spec.rb
71
+ - spec/perseus_match/token_spec.rb
59
72
  - spec/perseus_match/token_set_spec.rb
60
73
  - spec/perseus_match_spec.rb
61
74
  - sample/config.yaml