blackwinter-perseus_match 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ = Revision history for perseus_match
2
+
3
+ == 0.0.1 [2008-08-11]
4
+
5
+ * Birthday :-)
data/README ADDED
@@ -0,0 +1,41 @@
1
+ = perseus_match - Fuzzy string matching based on linguistic analysis
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to perseus_match version 0.0.3
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ Fuzzy string matching based on linguistic analysis.
11
+
12
+
13
+ == LINKS
14
+
15
+ <b></b>
16
+ Documentation:: <http://prometheus.rubyforge.org/perseus_match>
17
+ Source code:: <http://github.com/blackwinter/perseus_match>
18
+ Rubyforge project:: <http://rubyforge.org/projects/prometheus>
19
+
20
+
21
+ == AUTHORS
22
+
23
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
24
+
25
+
26
+ == LICENSE AND COPYRIGHT
27
+
28
+ Copyright (C) 2008 Cologne University of Applied Sciences,
29
+ Claudiusstr. 1, 50678 Cologne, Germany
30
+
31
+ perseus_match is free software: you can redistribute it and/or modify it under
32
+ the terms of the GNU General Public License as published by the Free Software
33
+ Foundation, either version 3 of the License, or (at your option) any later
34
+ version.
35
+
36
+ perseus_match is distributed in the hope that it will be useful, but WITHOUT
37
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
38
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
39
+
40
+ You should have received a copy of the GNU General Public License along with
41
+ perseus_match. If not, see <http://www.gnu.org/licenses/>.
@@ -0,0 +1,24 @@
1
# Rakefile for perseus_match: hands the RubyForge and gem settings below to
# Hen.lay! (provided by the 'hen' gem).

# Load the version file through a path anchored at this Rakefile. A bare
# relative require only resolves while "." is on $LOAD_PATH, which is no
# longer the case from Ruby 1.9.2 on.
require File.expand_path('lib/perseus_match/version', File.dirname(__FILE__))

begin
  require 'hen'

  Hen.lay! {{
    :rubyforge => {
      :project => %q{prometheus},
      :package => %q{perseus_match}
    },

    :gem => {
      :version      => PerseusMatch::VERSION,
      :summary      => %q{Fuzzy string matching based on linguistic analysis},
      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
      :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
    }
  }}
rescue LoadError
  # NOTE(review): this also catches a LoadError raised from inside Hen.lay!,
  # in which case the message below would be misleading.
  abort "Please install the 'hen' gem first."
end

### Place your custom Rake tasks here.
@@ -0,0 +1,298 @@
1
+ #! /usr/bin/ruby
2
+
3
+ require 'optparse'
4
+ require 'benchmark'
5
+ require 'yaml'
6
+
7
+ require 'rubygems'
8
+ require 'nuggets/enumerable/minmax'
9
+ require 'nuggets/numeric/duration'
10
+
11
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
12
+
13
+ require 'perseus_match'
14
+
15
# One-line usage summary; shown on argument errors and used as the banner of
# the option parser's help text.
USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
abort USAGE if ARGV.empty?

# Default settings. Note that :verbose is deliberately absent: it is only set
# when -v is given, so that later code can tell "not specified" (has_key? is
# false) apart from an explicit choice.
options = {
  :config       => nil,
  :threshold    => 0,
  :sort         => false,
  :stats        => false,
  :lingo        => false,
  :minimal      => false,
  :separate     => false,
  :check        => false,
  :failed_only  => false,
  :align        => false,
  :adjust_coeff => false
}

parser = OptionParser.new { |opts|
  opts.banner = USAGE

  opts.separator ' '
  opts.separator 'Options:'

  opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
    abort "Can't find config file: #{f}." unless File.readable?(f)

    options[:config] = f
  }

  opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
    options[:threshold] = t
  }

  opts.on('-s', '--sort', 'Sort results (considerably slower!)') {
    options[:sort] = true
  }

  opts.on('-S', '--stats', 'Output some statistics at the end') {
    options[:stats] = true
  }

  opts.on('-v', '--verbose', 'Print additional information during processing') {
    options[:verbose] = true
  }

  opts.separator ' '
  opts.separator ' * Calculating similarities (default)'
  opts.separator ' '

  opts.on('-m', '--minimal', 'Produce minimal pairs only') {
    options[:minimal] = true
  }

  opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
    # The argument is optional; without it an empty string is stored, which
    # `puts` later renders as an empty line.
    options[:separate] = p || ''
  }

  opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
    options[:lingo] = true
  }

  opts.separator ' '
  opts.separator ' * Checking pairs'
  opts.separator ' '

  opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
    options[:check] = true
  }

  opts.on('-f', '--failed', 'Print only failed checks') {
    options[:failed_only] = true
  }

  opts.on('-a', '--align', 'Align check results') {
    options[:align] = true
  }

  opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
    options[:adjust_coeff] = true
  }

  opts.separator ' '
  opts.separator 'Generic options:'

  # Help and version are successful outcomes: write to stdout and exit with
  # status 0 instead of going through `abort` (stderr, status 1).
  opts.on('-h', '--help', 'Print this help message and exit') {
    puts opts
    exit
  }

  opts.on('--version', 'Print program version and exit') {
    puts "#{File.basename($0)} v#{PerseusMatch::VERSION}"
    exit
  }
}

begin
  parser.parse!
rescue OptionParser::ParseError => err
  # Unknown switches or malformed arguments (e.g. a non-numeric threshold)
  # are user errors; report them briefly rather than with a backtrace.
  abort "#{err}\n#{USAGE}"
end
107
+
108
# The first remaining argument (after option parsing) is the input file.
file = ARGV.shift

abort "No input file specified.\n#{USAGE}" unless file
abort "Input file not found: #{file}" unless File.readable?(file)

# NOTE(review): presumably pre-tokenizes the whole input file so that later
# TokenSet lookups can reuse the result -- confirm in perseus_match/token_set.
PerseusMatch::TokenSet.tokenize(file)

# Matches lines that are empty or whitespace-only, and lines whose first
# non-whitespace character is '#'; such lines are not treated as phrases.
skip_re = %r{\A\s*(?:#|\z)}o
phrases = []

# File.foreach closes the file itself, even if the block raises.
File.foreach(file) { |line|
  phrases << line.chomp unless line =~ skip_re
}

# An empty YAML file does not load as a Hash (false or nil, depending on the
# YAML library version), so fall back to an empty Hash in that case as well.
pm_options = (options[:config] && YAML.load_file(options[:config])) || {}

# Only override the config's verbosity when -v was actually given.
pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
list_options = { :minimal => options[:minimal] }

# count/count_all are updated by the action lambdas built further below and
# read again when statistics are printed.
threshold, count, count_all = options[:threshold], 0, 0
128
+
129
# Build the lambda that performs the actual work. Which one is built depends
# on the mode:
#
# * --check:        verify the CSV pairs in the input against their thresholds
#                   (optionally searching for the coefficient with least error)
# * --sort:         pretty-print the sorted cluster of all phrases
# * otherwise:      print every phrase pair whose similarity reaches the
#                   threshold
#
# All lambdas update the outer +count+ (matches/passed checks) and
# +count_all+ (everything examined).
action = if options[:check]
  require 'fastercsv'

  # Formatter for one result line: the original input line plus the verdict.
  format = if options[:align]
    require 'jcode'

    width = phrases.max(:jlength) + 3

    lambda { |line, res|
      "#{line} #{'.' * (width - line.jlength)} [#{res}]"
    }
  else
    lambda { |line, res|
      "#{line} [#{res}]"
    }
  end

  phrases.sort! if options[:sort]

  # Keep the raw line (for output) next to its parsed CSV fields.
  phrases.map! { |line| [line, FasterCSV.parse_line(line)] }

  global_threshold = options[:threshold]
  failed_only      = options[:failed_only]
  collect_stats    = options[:stats]
  adjust_coeff     = options[:adjust_coeff]

  # Safe division for the ratios below: an empty denominator yields 0.
  divide = lambda { |numerator, denominator|
    denominator == 0 ? 0 : numerator / denominator
  }

  # Runs all checks once. An optional argument is installed as the default
  # coefficient before running. Returns the error rate when adjusting the
  # coefficient, nil otherwise.
  run_checks = lambda { |*args|
    pm_options[:default_coeff] = args.first unless args.empty?

    count, count_all = 0, 0
    positives = negatives = false_positives = false_negatives = 0.0

    phrases.each { |line, spec|
      # CSV columns: phrase, target, optional threshold, optional operator.
      # A separate name is used for the per-line threshold so the outer
      # +threshold+ is not overwritten.
      phrase, target, spec_threshold, operator, _ = *spec

      spec_threshold ||= global_threshold
      operator       ||= '>'

      # Operators containing '>' assert that the pair should match.
      assign = operator =~ />/

      begin
        PerseusMatch.check!(phrase, target, spec_threshold.to_f, operator, pm_options)

        count += 1
        assign ? positives += 1 : negatives += 1

        puts format[line, 'OK'] unless adjust_coeff || failed_only
      rescue PerseusMatch::CheckFailedError => err
        assign ? false_negatives += 1 : false_positives += 1

        puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
      end

      count_all += 1
    }

    if collect_stats || adjust_coeff
      error = divide[
        false_positives + false_negatives,
        positives + negatives + false_positives + false_negatives
      ]
    end

    if collect_stats
      recall    = divide[positives, positives + false_negatives]
      precision = divide[positives, positives + false_positives]
      f1        = divide[2 * recall * precision, recall + precision]

      warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
        recall * 100, precision * 100, f1, error
      ]
    end

    error if adjust_coeff
  }

  if adjust_coeff
    # Local search around the starting coefficient: first find, in each
    # direction, the nearest coefficient whose error differs from the starting
    # error; then keep walking in the better direction while the error does
    # not get worse.
    lambda {
      step, max = 1, 100

      start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
      start_err   = run_checks[start_coeff]

      previous_coeff = next_coeff = start_coeff
      previous_err   = next_err   = start_err

      # Downward scan. Coefficients of zero or below are never tried: they
      # make the similarity formula produce NaN or values above 1, outside
      # its documented 0..1 range, so their error rates are meaningless.
      max.times {
        break if previous_coeff - step <= 0

        previous_err = run_checks[previous_coeff -= step]
        break if previous_err != start_err
      }

      # Upward scan.
      max.times {
        break if (next_err = run_checks[next_coeff += step]) != start_err
      }

      best_err = [start_err, previous_err, next_err].min

      if best_err == start_err
        best_coeff = start_coeff
      elsif best_err == previous_err
        max.times {
          current_coeff = previous_coeff - step
          break if current_coeff <= 0  # same lower bound as above

          current_err = run_checks[current_coeff]
          break if current_err > previous_err

          previous_err, previous_coeff = current_err, current_coeff
        }

        best_err, best_coeff = previous_err, previous_coeff
      else
        max.times {
          current_coeff = next_coeff + step

          current_err = run_checks[current_coeff]
          break if current_err > next_err

          next_err, next_coeff = current_err, current_coeff
        }

        best_err, best_coeff = next_err, next_coeff
      end

      puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
    }
  else
    run_checks
  end
else
  # Output format for one match; --lingo takes precedence over --sort.
  format = if options[:lingo]
    lambda { |pm| "#{pm.phrase}*#{pm.target}" }
  elsif options[:sort]
    lambda { |pm| [pm.target, pm.distance, pm.similarity] }
  else
    lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
  end

  if options[:sort]
    require 'pp'

    lambda {
      cluster = PerseusMatch::Cluster.new(phrases, pm_options, list_options)

      # The block yields the formatted match, or nil for matches below the
      # threshold; those nils are stripped from nested arrays before printing.
      sorted = cluster.sort { |pm|
        matched = pm.similarity >= threshold

        count     += 1 if matched
        count_all += 1

        matched ? format[pm] : nil
      }

      pp sorted.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
    }
  else
    lambda {
      separator, previous_phrase = options[:separate], nil

      PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
        if separator
          # Initialised from the first pair, so no separator is printed
          # before the first block.
          previous_phrase ||= pm.phrase

          if pm.phrase != previous_phrase
            puts separator
            previous_phrase = pm.phrase
          end
        end

        if pm.similarity >= threshold
          puts format[pm]
          count += 1
        end

        count_all += 1
      }
    }
  end
end
285
+
286
# Run the prepared action; with --stats, time it and report the number of
# phrases, matched/total counts, elapsed time and the time per matched and
# per examined item on stderr.
if options[:stats]
  time = Benchmark.realtime(&action)

  hms, x, y = time.to_hms(2), time / count, time / count_all

  # Number of decimal places needed to show two digits beyond the leading
  # zeros of +secs+. The value is rendered in fixed notation first, because
  # Float#to_s switches to scientific notation ("1.0e-05") for small values,
  # which would hide the leading zeros and print such timings as "0.00".
  precision = lambda { |secs|
    secs > 0 ? ('%.20f' % secs).delete('.')[/\A0*/].length + 2 : 2
  }

  warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
    phrases.size, count, count_all, hms, x, y
  ]
else
  action.call
end
@@ -0,0 +1,169 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # perseus_match -- Fuzzy string matching based on linguistic analysis #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'perseus_match/list'
30
+ require 'perseus_match/cluster'
31
+ require 'perseus_match/token_set'
32
+
33
+ require 'perseus_match/version'
34
+
35
+ class PerseusMatch
36
+
37
+ Infinity = 1.0 / 0
38
+
39
+ DEFAULT_COEFF = 20
40
+
41
+ DISTANCE_SPEC = [ # {
42
+ [{}, 1], # {} => 1,
43
+ [{ :excl => %w[a t] }, 2], # { :excl => %w[a t] } => 1,
44
+ [{ :incl => 's' }, 3], # { :incl => 's' } => 2,
45
+ [{ :incl => 'y' }, 4], # { :incl => 'y' } => 4,
46
+ [{ :sort => true }, 4], # { :sort => true } => 4,
47
+ [{ :soundex => true }, 4] # { :soundex => true } => 8
48
+ ] # }
49
+
50
+ class << self
51
+
52
+ def distance(*args)
53
+ new(*args).distance
54
+ end
55
+
56
+ def match(phrases, pm_options = {})
57
+ List.new(phrases, pm_options)
58
+ end
59
+
60
+ def cluster(phrases, options = {}, pm_options = {})
61
+ Cluster.new(phrases, pm_options).rank(options)
62
+ end
63
+
64
+ def check(*args)
65
+ check!(*args)
66
+ rescue CheckFailedError
67
+ false
68
+ end
69
+
70
+ def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
71
+ value = new(phrase, target, pm_options).send(attribute)
72
+ value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
73
+ end
74
+
75
+ end
76
+
77
+ attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
78
+
79
+ def initialize(phrase, target, options = {})
80
+ @phrase = phrase.to_s
81
+ @target = target.to_s
82
+
83
+ @default_coeff = options[:default_coeff] || DEFAULT_COEFF
84
+ @distance_spec = options[:distance_spec] || DISTANCE_SPEC
85
+
86
+ @verbose = options[:verbose]
87
+
88
+ @similarity = {}
89
+ end
90
+
91
+ def phrase_tokens
92
+ @phrase_tokens ||= tokenize(phrase)
93
+ end
94
+
95
+ def target_tokens
96
+ @target_tokens ||= tokenize(target)
97
+ end
98
+
99
+ # 0 <= distance <= Infinity
100
+ def distance
101
+ @distance ||= calculate_distance
102
+ end
103
+
104
+ # 1 >= similarity >= 0
105
+ def similarity(coeff = nil)
106
+ coeff ||= default_coeff # passed arg may be nil
107
+ @similarity[coeff] ||= 1 / Math.exp(distance / (coeff * total_weight))
108
+ end
109
+
110
+ private
111
+
112
+ def tokenize(str)
113
+ TokenSet.new(str)
114
+ end
115
+
116
+ def calculate_distance
117
+ return Infinity if phrase_tokens.disjoint?(target_tokens)
118
+ return 0 if phrase_tokens.eql?(target_tokens)
119
+
120
+ distance_spec.inject(0) { |distance, (options, weight)|
121
+ distance + token_distance(options) * weight
122
+ }
123
+ end
124
+
125
+ def token_distance(options = {})
126
+ tokens1 = phrase_tokens.inclexcl(options)
127
+ tokens2 = target_tokens.inclexcl(options)
128
+
129
+ if options[:sort]
130
+ tokens1 = tokens1.sort
131
+ tokens2 = tokens2.sort
132
+ end
133
+
134
+ if options[:soundex]
135
+ tokens1 = tokens1.soundex
136
+ tokens2 = tokens2.soundex
137
+ end
138
+
139
+ distance = tokens1.distance(tokens2)
140
+
141
+ warn <<-EOT if verbose
142
+ #{options.inspect}:
143
+ #{tokens1.inspect}
144
+ #{tokens2.inspect}
145
+ => #{distance}
146
+ EOT
147
+
148
+ distance
149
+ end
150
+
151
+ def total_weight
152
+ @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
153
+ end
154
+
155
+ class CheckFailedError < StandardError
156
+
157
+ attr_reader :value, :threshold, :operator
158
+
159
+ def initialize(value, threshold, operator)
160
+ @value, @threshold, @operator = value, threshold, operator
161
+ end
162
+
163
+ def to_s
164
+ "FAILED: #{value} #{operator} #{threshold}"
165
+ end
166
+
167
+ end
168
+
169
+ end