blackwinter-perseus_match 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,94 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ class PerseusMatch
30
+
31
# Groups match objects by phrase: every key is a phrase, every value the
# array of matches that have been added for that phrase.
class Cluster < Hash

  # Creates the cluster and fills it with every match that List.pair yields
  # for +phrases+. +pm_options+ and +list_options+ are handed on to
  # List.pair untouched.
  def initialize(phrases = [], pm_options = {}, list_options = {})
    # unknown phrases start out with their own empty array of matches
    super() { |h, k| h[k] = [] }

    List.pair(phrases, pm_options, list_options) { |pm| add(pm) }
  end

  # Appends +pm+ to the matches stored under <tt>pm.phrase</tt> and returns
  # that array.
  def add(pm)
    self[pm.phrase] << pm
  end

  alias_method :<<, :add

  # Returns an array of <tt>[phrase, matches]</tt> pairs, sorted by phrase.
  # For each phrase the matches are ordered by the result of sending
  # +attribute+ (with the remaining +args+) to every match; numeric results
  # are ordered descending, anything else ascending.
  #
  # A trailing Hash in +args+ is taken as options:
  # <tt>:threshold</tt>:: drops matches whose result is below the threshold
  #                       (numeric results) or above it (other results).
  # <tt>:limit</tt>::     keeps at most that many matches per phrase.
  #
  # If a block is given, every remaining match is replaced by the block's
  # result for it.
  def sort_by(attribute, *args)
    options = args.last.is_a?(Hash) ? args.pop : {}

    threshold, limit = options[:threshold], options[:limit]

    map { |phrase, matches|
      res = {}

      # Array#sort_by; remembers each match's result for the checks below
      matches = matches.sort_by { |match|
        res[match] = match.send(attribute, *args)
      }

      # premise: if any is, then all are (i.e., only first needs checking)
      numeric = res.values.first.is_a?(Numeric)

      # sort numeric results in reverse order
      matches.reverse! if numeric

      if threshold
        matches.reject! { |match|
          numeric ? res[match] < threshold : res[match] > threshold
        }
      end

      if limit
        matches.slice!(limit..-1) if matches.size > limit
      end

      # transform entries if so requested
      matches.map! { |match| yield(match) } if block_given?

      [phrase, matches]
    }.sort
  end

  # Sorts by +similarity+, see #sort_by. The <tt>:coeff</tt> option is
  # passed as argument to +similarity+; the remaining +options+ are the
  # ones #sort_by understands. An optional block is forwarded to #sort_by.
  def sort(options = {}, &block)
    # work on a copy so the caller's hash keeps its :coeff entry
    # (and may be frozen)
    options = options.dup

    sort_by(:similarity, options.delete(:coeff), options, &block)
  end

  # Like #sort, but every match is replaced by a pair of its target and
  # its similarity (computed with the <tt>:coeff</tt> option).
  def rank(options = {})
    coeff = options[:coeff]
    sort(options) { |match| [match.target, match.similarity(coeff)] }
  end

end
93
+
94
+ end
@@ -0,0 +1,77 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ class PerseusMatch
30
+
31
# An Array of PerseusMatch objects, one for every pairing of the phrases
# it was created from.
class List < Array

  class << self

    # Builds a PerseusMatch for pairings of the (de-duplicated) +phrases+,
    # passing +pm_options+ to each PerseusMatch.new.
    #
    # By default every phrase is paired with every phrase, itself
    # included. With <tt>list_options[:minimal]</tt> set, only pairings of
    # a phrase with the phrases following it are built.
    #
    # With a block, each match is yielded and the de-duplicated phrases
    # are returned; without a block, the array of matches is returned.
    #
    # The +phrases+ argument itself is left unmodified.
    def pair(phrases, pm_options = {}, list_options = {})
      # non-destructive: don't alter (or choke on a frozen) caller array
      phrases = phrases.uniq

      pairs = [] unless block_given?

      if list_options[:minimal]
        # => pairs.size = (phrases.size ** 2 - phrases.size) / 2

        last = phrases.size - 1

        phrases.each_with_index { |phrase, index|
          (index + 1).upto(last) { |other|
            pm = PerseusMatch.new(phrase, phrases[other], pm_options)
            block_given? ? yield(pm) : pairs << pm
          }
        }
      else
        # => pairs.size = phrases.size ** 2

        phrases.each { |phrase|
          phrases.each { |target|
            pm = PerseusMatch.new(phrase, target, pm_options)
            block_given? ? yield(pm) : pairs << pm
          }
        }
      end

      pairs || phrases
    end

  end

  # Fills the list with the matches List.pair yields for the given
  # arguments.
  def initialize(phrases = [], pm_options = {}, list_options = {})
    self.class.pair(phrases, pm_options, list_options) { |pm| add(pm) }
  end

  alias_method :add, :push

end
76
+
77
+ end
@@ -0,0 +1,248 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ $KCODE = 'u'
30
+
31
+ require 'pathname'
32
+ require 'rbconfig'
33
+ require 'yaml'
34
+
35
+ require 'rubygems'
36
+ require 'backports/tempfile'
37
+ require 'nuggets/tempfile/open'
38
+ require 'nuggets/util/i18n'
39
+
40
+ begin
41
+ require 'text/soundex'
42
+ rescue LoadError
43
+ warn "could not load the Text gem -- soundex functionality will not be available"
44
+ end
45
+
46
# Location of the lingo installation: the PM_LINGO_BASE environment
# variable if set, else the contents of a readable file named LINGO_BASE
# in the current directory (trailing newline removed), else the current
# directory.
LINGO_BASE = ENV['PM_LINGO_BASE'] || begin
  if File.readable?('LINGO_BASE')
    File.read('LINGO_BASE').chomp
  else
    '.'
  end
end

# Whether a readable lingo.rb exists below LINGO_BASE.
LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))

unless LINGO_FOUND
  warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway"
end

# The lingo configuration is read from the file named by PM_LINGO_CONFIG
# (or from lingo.cfg); if that file is not readable, a built-in default
# is used instead.
config_file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg'

if File.readable?(config_file)
  lingo_config = YAML.load_file(config_file)
else
  # only complain if a config was explicitly asked for
  if ENV.has_key?('PM_LINGO_CONFIG')
    warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default"
  end

  lingo_config = {
    'meeting' => {
      'attendees' => [
        { 'tokenizer'    => { } },
        { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
        { 'decomposer'   => { 'source' => 'sys-dic' } },
        { 'multiworder'  => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
        { 'synonymer'    => { 'source' => 'sys-syn', 'skip' => '?,t' } },
      ]
    }
  }
end

# Whatever the configuration says, input is read from STDIN (first
# attendee) and results are emitted through the debugger (last attendee).
attendees = lingo_config['meeting']['attendees']
attendees.unshift({ 'textreader' => { 'files'=> 'STDIN' } })
attendees.push({ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } })

# The effective lingo configuration.
LINGO_CONFIG = lingo_config
76
+
77
+ class PerseusMatch
78
+
79
# An Array of tokens belonging to a form (phrase). Tokens are strings that
# may carry a suffix introduced by <tt>/</tt> or <tt>|</tt> (presumably the
# word class assigned by lingo -- verify against lingo's output format).
class TokenSet < Array

  # Returns the TokenSet for +form+.
  #
  # Token data is read from a readable file named <tt>perseus.tokens</tt>
  # in the current directory, if there is one; otherwise lingo is run (in
  # LINGO_BASE, configured by LINGO_CONFIG) and its output is parsed.
  # +form+ may name an existing file to feed to lingo; if it doesn't, the
  # form itself is written to a temporary file and used as input.
  #
  # Parsed token data is cached on the class and reused by later calls,
  # except when +form+ was processed through a temporary file (then the
  # cache is discarded after the result has been built).
  #
  # Raises a RuntimeError if lingo is needed but LINGO_FOUND is false.
  def self.tokenize(form)
    return @tokens[form] if @tokens

    # @_tokens maps a word to its tokens as parsed below; @tokens lazily
    # builds the TokenSet for a form from the form's own entry plus the
    # entries of the words it consists of
    @_tokens = {}
    @tokens  = Hash.new { |h, k|
      h[k] = new(
        k, (@_tokens[k] || []) | k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
      )
    }

    parse = lambda { |x|
      # each_line works for both File objects and Strings (the output
      # of %x{} below), whereas String#each is gone since Ruby 1.9
      x.each_line { |res|
        case res
          when /<(.*?)\s=\s\[(.*)\]>/
            a, b = $1, $2
            @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
          when /<(.*)>/, /:(.*):/
            raw = $1

            # key is the token without its suffix; a token without any
            # suffix is filed under itself (sub! would have returned nil
            # here and lumped all such tokens under the nil key)
            key = raw.sub(/[\/|].*/, '')
            @_tokens[key] ||= [raw.replace_diacritics.downcase]

            warn "UNK: #{key} [#{res.strip}]" if raw =~ /\|\?\z/
        end
      }
    }

    if File.readable?(token_file = 'perseus.tokens')
      File.open(token_file) { |f| parse[f] }
      @tokens[form]
    else
      raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND

      # NOTE(review): both Tempfile.open calls rely on the Tempfile
      # itself being returned after the block -- presumably provided
      # by nuggets/tempfile/open; confirm.
      cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
        YAML.dump(LINGO_CONFIG, t)
      }

      file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)

      unless File.file?(file) && File.readable?(file)
        temp = Tempfile.open('perseus_match_temp') { |t|
          t.puts form
        }

        file = temp.path
      end

      begin
        # RbConfig (from 'rbconfig') supersedes the former Config constant.
        # NOTE(review): the paths are only wrapped in double quotes, not
        # shell-escaped.
        ruby = RbConfig::CONFIG['ruby_install_name']

        Dir.chdir(LINGO_BASE) {
          parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
        }
      ensure
        cfg.unlink
        temp.unlink if temp
      end

      if temp
        # one-off input: hand out the result, but drop the cache
        tokens, @tokens = @tokens[form], nil
        tokens
      else
        @tokens[form]
      end
    end
  end

  private :push, :<<, :[]=  # maybe more...

  # The form (phrase) this token set belongs to.
  attr_reader :form

  # Creates the token set for +form+ from +tokens+, or from the result of
  # TokenSet.tokenize if no +tokens+ are given.
  def initialize(form, tokens = nil)
    super(tokens || self.class.tokenize(form))

    @form   = form
    @tokens = to_a.flatten
  end

  # Computes an edit distance between this token set's tokens and those of
  # +other+, counting one for every inserted, deleted or substituted token.
  #
  # If either side has no tokens, the size of the other side is returned.
  # Otherwise the result is the computed distance plus one, i.e. it is
  # always greater than zero in that case.
  def distance(other)
    tokens1, tokens2 = tokens, other.tokens
    size1, size2 = tokens1.size, tokens2.size

    return size2 if tokens1.empty?
    return size1 if tokens2.empty?

    distance, costs = nil, (0..size2).to_a

    0.upto(size1 - 1) { |index1|
      token1, cost = tokens1[index1], index1 + 1

      0.upto(size2 - 1) { |index2|
        penalty = token1 == tokens2[index2] ? 0 : 1

        distance = [
          costs[index2 + 1] + 1,   # insertion
          cost + 1,                # deletion
          costs[index2] + penalty  # substitution
        ].min

        costs[index2], cost = cost, distance
      }

      costs[size2] = distance
    }

    distance + 1  # > 0 !?!
  end

  # Returns the flattened tokens; with +wc+ set to false, each token is
  # stripped of its suffix (everything from the first <tt>/</tt> or
  # <tt>|</tt> on). The stripped list is computed once and memoized.
  def tokens(wc = true)
    wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
      token.sub(%r{[/|].*?\z}, '')
    }
  end

  # Returns true if this token set and +other+ have no suffix-less token
  # in common.
  def disjoint?(other)
    (tokens(false) & other.tokens(false)).empty?
  end

  # Applies #incl with <tt>inclexcl[:incl]</tt> (default: <tt>'.*'</tt>)
  # and then #excl with <tt>inclexcl[:excl]</tt> to the result.
  def inclexcl(inclexcl = {})
    incl(inclexcl[:incl] || '.*').excl(inclexcl[:excl])
  end

  # Returns the token set of those tokens whose suffix matches any of the
  # patterns +wc+ (nils are ignored). Results are memoized per pattern
  # list.
  def incl(*wc)
    wc = wc.compact

    (@incl ||= {})[wc] ||= select { |token|
      match?(token, wc)
    }.to_token_set(form)
  end

  # Returns the token set of those tokens whose suffix matches none of
  # the patterns +wc+ (nils are ignored). Results are memoized per
  # pattern list.
  def excl(*wc)
    wc = wc.compact

    (@excl ||= {})[wc] ||= reject { |token|
      match?(token, wc)
    }.to_token_set(form)
  end

  # Returns the (memoized) token set in which the part of each token
  # before its suffix is replaced by its Soundex code.
  #
  # Raises a RuntimeError if Text::Soundex is not available.
  def soundex
    raise "soundex functionality not available" unless defined?(Text::Soundex)

    @soundex ||= map { |token|
      token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
    }.to_token_set(form)
  end

  # Replaces the contents of this token set with those of #soundex.
  def soundex!
    replace soundex
  end

  # Two token sets are eql if both their tokens and their forms are equal.
  def eql?(other)
    tokens == other.tokens && form == other.form
  end

  # The Array representation, followed by the form in angle brackets.
  def inspect
    "#{super}<#{form}>"
  end

  alias_method :to_s, :inspect

  private

  # Tests whether +token+ ends in a <tt>/</tt> or <tt>|</tt> followed by
  # something matching any of the patterns in +wc+.
  def match?(token, wc)
    token =~ %r{[/|](?:#{wc.join('|')})\z}
  end

end
239
+
240
# Core extension: lets any Array be turned into a TokenSet.
::Array.class_eval do

  # Wraps the receiver's elements in a new TokenSet for +form+.
  def to_token_set(form)
    TokenSet.new(form, self)
  end

end
247
+
248
+ end
@@ -0,0 +1,27 @@
1
class PerseusMatch

  # Holds the components of the library's version number.
  module Version

    MAJOR = 0
    MINOR = 0
    TINY  = 3

    # The version as array: <tt>[MAJOR, MINOR, TINY]</tt>.
    def self.to_a
      [MAJOR, MINOR, TINY]
    end

    # The version as dot-separated string.
    def self.to_s
      to_a * '.'
    end

  end

  # The library's version string.
  VERSION = Version.to_s

end
@@ -0,0 +1,45 @@
1
# Specs for the :limit and :threshold options of Cluster#sort_by. Only run
# when a lingo installation was found (LINGO_FOUND).
#
# Entries are walked with +each+ so that every phrase is checked no matter
# what the expectations return, and non-nil items are counted via
# <tt>compact.size</tt>, since Array#nitems no longer exists as of Ruby 1.9.
describe PerseusMatch::Cluster do

  it 'should accept limit option in sort_by' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :limit => 1).each { |phrase, matches|
      matches.size.should == 1
      matches.size.should == matches.compact.size
    }
  end

  it 'should accept threshold option in sort_by (1a)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).each { |phrase, matches|
      matches.size.should be_zero
      matches.size.should == matches.compact.size
    }
  end

  it 'should accept threshold option in sort_by (1b)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0).each { |phrase, matches|
      matches.size.should == 2
      matches.size.should == matches.compact.size
    }
  end

  it 'should accept threshold option in sort_by (2)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'c').each { |phrase, matches|
      matches.size.should == 1
      matches.size.should == matches.compact.size
    }
  end

  it 'should accept both limit and threshold options in sort_by (1)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'z', :limit => 1).each { |phrase, matches|
      matches.size.should == 1
      matches.size.should == matches.compact.size
    }
  end

  it 'should accept both limit and threshold options in sort_by (2)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'a', :limit => 1).each { |phrase, matches|
      matches.size.should be_zero
      matches.size.should == matches.compact.size
    }
  end

end if LINGO_FOUND