simstring_pure 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/simstring +28 -0
  3. data/lib/simstring_pure.rb +305 -0
  4. metadata +46 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9626d0b524a0b66517fb736af6d39d03bf2d61a3
4
+ data.tar.gz: 91817a82fc24bda6d383f8c429b3a81cea0989b5
5
+ SHA512:
6
+ metadata.gz: c2182f5348eeadb1ab1f7a9a04cb63117174041a1aab34a411a17c97bc4f585da96099a1b31cfc097696c2295fce2fe040fdcd291df2497660d449a8931a273e
7
+ data.tar.gz: 9f548a200810181726e2d9a74155222c0de9a2f74d92803933b35a3f626ca8e183b6d24753a85b4860e80b84017b1b8a1bdcc5d7a5cad38a5819db4187c7ab31
data/bin/simstring ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'simstring_pure'
4
+
5
# Command-line entry point: builds a trigram SimString database from a
# dictionary file (one entry per line), runs a cosine-similarity search for the
# query string, pretty-prints the matches and reports how long each phase took.
#
# Arguments are read from ARGV:
#   1. filename             - path of the dictionary file
#   2. query_string         - string to search for
#   3. similarity_threshold - optional, parsed with #to_f, defaults to 0.7
#
# Exits with a usage message when the filename or the query string is missing
# (previously this surfaced as a TypeError deep inside the library).
def main
  filename, query_string, similarity_threshold = *ARGV
  unless filename && query_string
    abort "usage: simstring <dictionary_file> <query_string> [similarity_threshold]"
  end
  similarity_threshold = (similarity_threshold || 0.7).to_f

  # A monotonic clock is immune to wall-clock adjustments while we are timing.
  clock = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }

  t1 = clock.call

  ngram_builder = SimString::NGramBuilder.new(3)
  db = SimString::Database.new(ngram_builder)

  # Stream the file line by line instead of loading every line into memory first.
  File.foreach(filename) { |line| db.add(line.strip) }

  t2 = clock.call

  matcher = SimString::StringMatcher.new(db, SimString::CosineMeasure.new)

  pp matcher.search(query_string, similarity_threshold)

  t3 = clock.call

  puts "#{t2 - t1} seconds to build database"
  puts "#{t3 - t2} seconds to search"
end
27
+
28
+ main
@@ -0,0 +1,305 @@
1
+ require 'set'
2
+ require 'pp'
3
+
4
+ module SimString
5
+
6
# Abstract interface for objects that turn a string into its set of features.
# Concrete extractors override #features.
class FeatureExtractor
  # Computes the features of +string+.
  #
  # Subclasses are expected to return a Set of features; this base
  # implementation always raises RuntimeError ("Not implemented.").
  def features(string)
    raise RuntimeError, "Not implemented."
  end
end
12
+
13
# A character n-gram together with its 1-based occurrence number inside the
# string it was extracted from, so that repeated n-grams stay distinct features.
NGram = Struct.new(:ngram, :index)

# Feature extractor that represents a string by its numbered character n-grams.
class NGramBuilder < FeatureExtractor
  SENTINAL_CHAR = "\u00A0" # non-breaking space

  # The n-gram length.
  attr_accessor :n

  def initialize(n)
    self.n = n
  end

  # Returns a Set of NGram structs for +string+.
  #
  # The string is padded on both sides with (n - 1) sentinel characters so
  # that its first and last characters also take part in n full n-grams. The
  # k-th occurrence of an n-gram is emitted as NGram.new(ngram, k).
  def features(string)
    padding = SENTINAL_CHAR * (n - 1)
    padded = padding + string + padding

    # Count occurrences, keeping n-grams in order of first appearance.
    occurrences = Hash.new(0)
    padded.each_char.each_cons(n) { |chars| occurrences[chars.join] += 1 }

    result = Set.new
    occurrences.each do |gram, count|
      1.upto(count) { |occurrence| result << NGram.new(gram, occurrence) }
    end
    result
  end
end
32
+
33
# Abstract base class for similarity measures.
#
# #min_feature_size and #max_feature_size bound the feature-set sizes of
# candidate matches: a database string can only be an approximate match for
# the query if its feature-set size lies in the closed interval
# [min_feature_size(...), max_feature_size(...)].
#
# Every method here raises RuntimeError ("Not implemented."); concrete
# measures override all of them.
class Measure
  # Lower bound of the candidate feature-set size.
  #   query_size - int, number of features in the query
  #   alpha      - double, the similarity threshold
  def min_feature_size(db, query_size, alpha)
    raise RuntimeError, "Not implemented."
  end

  # Upper bound of the candidate feature-set size.
  #   query_size - int, number of features in the query
  #   alpha      - double, the similarity threshold
  def max_feature_size(db, query_size, alpha)
    raise RuntimeError, "Not implemented."
  end

  # Returns tau, the number of features that two strings x and y must have in
  # common for their similarity coefficient to be at least alpha.
  #   query_size - int, number of features in x
  #   y_size     - int, number of features in y
  #   alpha      - double, the similarity threshold
  def minimum_common_feature_count(query_size, y_size, alpha)
    raise RuntimeError, "Not implemented."
  end

  # Similarity coefficient of two feature sets.
  def similarity(x_feature_set, y_feature_set)
    raise RuntimeError, "Not implemented."
  end
end
65
+
66
# Cosine similarity over feature sets: |X & Y| / sqrt(|X| * |Y|).
class CosineMeasure < Measure
  # Smallest candidate feature-set size: ceil(alpha^2 * |X|).
  def min_feature_size(db, query_size, alpha)
    alpha_squared = alpha * alpha
    (alpha_squared * query_size).ceil
  end

  # Largest candidate feature-set size: floor(|X| / alpha^2).
  def max_feature_size(db, query_size, alpha)
    alpha_squared = alpha * alpha
    (query_size.to_f / alpha_squared).floor
  end

  # Minimum overlap tau: ceil(alpha * sqrt(|X| * |Y|)).
  def minimum_common_feature_count(query_size, y_size, alpha)
    geometric_mean = Math.sqrt(query_size * y_size)
    (alpha * geometric_mean).ceil
  end

  # Cosine coefficient of the two feature sets.
  def similarity(x_feature_set, y_feature_set)
    shared = (x_feature_set & y_feature_set).size
    shared.to_f / Math.sqrt(x_feature_set.size * y_feature_set.size)
  end
end
83
+
84
# Dice similarity over feature sets: 2 * |X & Y| / (|X| + |Y|).
class DiceMeasure < Measure
  # Smallest candidate feature-set size: ceil(alpha / (2 - alpha) * |X|).
  #   query_size - int, number of features in the query
  #   alpha      - double, the similarity threshold
  def min_feature_size(db, query_size, alpha)
    ((alpha.to_f / (2 - alpha)) * query_size).ceil.to_i
  end

  # Largest candidate feature-set size: floor((2 - alpha) / alpha * |X|).
  #   query_size - int, number of features in the query
  #   alpha      - double, the similarity threshold
  def max_feature_size(db, query_size, alpha)
    (((2 - alpha).to_f / alpha) * query_size).floor.to_i
  end

  # Minimum number of shared features (tau) for a Dice coefficient >= alpha.
  #
  # From 2 * c / (|X| + |Y|) >= alpha it follows that
  # c >= 0.5 * alpha * (|X| + |Y|). The sizes must be *added*; multiplying
  # them overestimates tau and silently discards valid matches.
  #   query_size - int, number of features in x
  #   y_size     - int, number of features in y
  #   alpha      - double, the similarity threshold
  def minimum_common_feature_count(query_size, y_size, alpha)
    (0.5 * alpha * (query_size + y_size)).ceil.to_i
  end

  # Dice coefficient of the two feature sets.
  def similarity(x_feature_set, y_feature_set)
    (2 * (x_feature_set & y_feature_set).size).to_f / (x_feature_set.size + y_feature_set.size)
  end
end
101
+
102
# Exact matching expressed as a measure: two strings match only when their
# feature sets are equal.
class ExactMeasure < Measure
  # A candidate must have exactly as many features as the query.
  def min_feature_size(db, query_size, alpha)
    query_size
  end

  # A candidate must have exactly as many features as the query.
  def max_feature_size(db, query_size, alpha)
    query_size
  end

  # Every query feature has to be shared, regardless of y_size and alpha.
  def minimum_common_feature_count(query_size, y_size, alpha)
    query_size
  end

  # 1.0 when both feature sets are equal, 0.0 otherwise.
  def similarity(x_feature_set, y_feature_set)
    x_feature_set == y_feature_set ? 1.0 : 0.0
  end
end
123
+
124
# Jaccard similarity over feature sets: |X & Y| / |X | Y|.
class JaccardMeasure < Measure
  # Smallest candidate feature-set size: ceil(alpha * |X|).
  def min_feature_size(db, query_size, alpha)
    scaled = alpha * query_size
    scaled.ceil
  end

  # Largest candidate feature-set size: floor(|X| / alpha).
  def max_feature_size(db, query_size, alpha)
    scaled = query_size.to_f / alpha
    scaled.floor
  end

  # Minimum overlap tau: ceil(alpha * (|X| + |Y|) / (1 + alpha)).
  def minimum_common_feature_count(query_size, y_size, alpha)
    combined_size = query_size + y_size
    (alpha * combined_size.to_f / (1 + alpha)).ceil
  end

  # Jaccard coefficient of the two feature sets.
  def similarity(x_feature_set, y_feature_set)
    intersection_size = (x_feature_set & y_feature_set).size
    union_size = (x_feature_set | y_feature_set).size
    intersection_size.to_f / union_size
  end
end
141
+
142
# Overlap coefficient over feature sets: |X & Y| / min(|X|, |Y|).
#
# The overlap coefficient does not constrain the size of a matching string, so
# the candidate size interval spans from 1 to the largest feature set stored in
# the database.
class OverlapMeasure < Measure
  # Smallest candidate feature-set size; always 1.
  def min_feature_size(db, query_size, alpha)
    1
  end

  # Largest candidate feature-set size: the largest feature set stored in +db+.
  #
  # db.max_feature_size is nil when the database is empty; 0 is returned in
  # that case so that callers iterating (min..max) get an empty range instead
  # of (1..nil), which is an endless range on Ruby >= 2.6.
  def max_feature_size(db, query_size, alpha)
    db.max_feature_size || 0
  end

  # Minimum overlap tau: ceil(alpha * min(|X|, |Y|)).
  #   query_size - int, number of features in x
  #   y_size     - int, number of features in y
  #   alpha      - double, the similarity threshold
  def minimum_common_feature_count(query_size, y_size, alpha)
    (alpha * [query_size, y_size].min).ceil.to_i
  end

  # Overlap coefficient of the two feature sets.
  def similarity(x_feature_set, y_feature_set)
    (x_feature_set & y_feature_set).size.to_f / [x_feature_set.size, y_feature_set.size].min
  end
end
159
+
160
+
161
# In-memory SimString database: stores the distinct strings added to it and
# maintains two indexes over them,
#   * feature-set size -> Set of strings with that many features
#   * feature          -> Set of strings containing that feature
#
# The indexes are plain hashes filled with `||=` (no default procs) because a
# Hash with a default proc cannot be serialized by Marshal, which #save uses.
class Database
  class << self
    # Restores a database previously written by #save.
    #
    # The file is read in binary mode: Marshal data is a byte stream and
    # text-mode newline translation (e.g. on Windows) would corrupt it.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects. Only load
    # files that come from a trusted source.
    def load(file_path)
      Marshal.load(File.binread(file_path))
    end
  end

  # The extractor used to compute the features of every added string.
  attr_reader :feature_extractor

  # feature_extractor - object responding to #features(string) and returning
  #                     a collection of features (#size, #each).
  def initialize(feature_extractor)
    @strings = Set.new
    @feature_extractor = feature_extractor
    @feature_to_string_map = {}
    @feature_set_size_to_string_map = {}
  end

  # Adds +string+ to the database and indexes it; adding a string that is
  # already present is a no-op. Always returns nil.
  def add(string)
    return nil if @strings.include?(string)

    # Extract first, so a failing extractor cannot leave the string
    # registered but missing from the indexes.
    features = feature_extractor.features(string)
    @strings << string

    (@feature_set_size_to_string_map[features.size] ||= Set.new) << string
    features.each do |feature|
      (@feature_to_string_map[feature] ||= Set.new) << string
    end
    nil
  end

  # Smallest feature-set size among the stored strings (nil when empty).
  def min_feature_size
    @feature_set_size_to_string_map.keys.min
  end

  # Largest feature-set size among the stored strings (nil when empty).
  def max_feature_size
    @feature_set_size_to_string_map.keys.max
  end

  # Set of stored strings whose feature set has exactly +size+ elements;
  # an empty Set when there are none.
  def lookup_strings_by_feature_set_size(size)
    @feature_set_size_to_string_map[size] || Set.new
  end

  # Set of stored strings whose feature set contains +feature+;
  # an empty Set when there are none.
  def lookup_strings_by_feature(feature)
    @feature_to_string_map[feature] || Set.new
  end

  # Serializes the whole database (including its feature extractor) to
  # +file_path+ with Marshal, in binary mode. Returns the number of bytes
  # written.
  def save(file_path)
    File.binwrite(file_path, Marshal.dump(self))
  end
end
217
+
218
+
219
# A search hit: the matched string and its similarity score against the query.
Match = Struct.new(:value, :score)

# Approximate dictionary matcher over a SimString database.
class StringMatcher
  # simstring_db - database exposing #feature_extractor,
  #                #lookup_strings_by_feature_set_size and #lookup_strings_by_feature
  # measure      - default similarity measure used by #search / #ranked_search
  def initialize(simstring_db, measure)
    @db = simstring_db
    @measure = measure
    @feature_extractor = @db.feature_extractor
  end

  # Implements "Algorithm 1: Approximate dictionary matching" described in
  # "Simple and Efficient Algorithm for Approximate Dictionary Matching"
  # (see http://www.aclweb.org/anthology/C10-1096).
  #
  # query_string - the string to look up
  # alpha        - similarity threshold
  # measure      - similarity measure; defaults to the one given to #initialize
  #
  # Returns an array of matching strings (empty when the measure cannot
  # produce a candidate size interval, e.g. for an empty database).
  # Example:
  #   matcher.search("Fooo", 0.5)
  #   => ["Foo", "Food", "Foot"]
  def search(query_string, alpha, measure = @measure)
    feature_set = @feature_extractor.features(query_string)
    feature_set_size = feature_set.size
    min_size = measure.min_feature_size(@db, feature_set_size, alpha)
    max_size = measure.max_feature_size(@db, feature_set_size, alpha)
    # A nil bound would turn the range below into an endless/invalid range.
    return [] if min_size.nil? || max_size.nil?

    (min_size..max_size).flat_map do |candidate_size|
      # Nothing to join against when no stored string has this size.
      next [] if @db.lookup_strings_by_feature_set_size(candidate_size).empty?

      tau = min_overlap(measure, feature_set_size, candidate_size, alpha)
      overlap_join(feature_set, tau, @db, candidate_size)
    end
  end

  # Same as #search, except it returns an array of Match objects holding both
  # the matched string and its similarity score, sorted by descending score.
  # Example:
  #   matcher.ranked_search("Fooo", 0.5)
  #   => [#<struct Match value="Foo", score=0.9128709291752769>,
  #       #<struct Match value="Food", score=0.5>,
  #       #<struct Match value="Foot", score=0.5>]
  def ranked_search(query_string, alpha, measure = @measure)
    feature_set = @feature_extractor.features(query_string)
    matches = search(query_string, alpha, measure).map do |matching_string|
      candidate_features = @feature_extractor.features(matching_string)
      Match.new(matching_string, measure.similarity(feature_set, candidate_features))
    end
    matches.sort_by { |match| -match.score }
  end

  private

  # tau: the number of features a candidate of size y_size must share with the query.
  def min_overlap(measure, query_size, y_size, alpha)
    measure.minimum_common_feature_count(query_size, y_size, alpha)
  end

  # Implements "Algorithm 3: CPMerge algorithm" described in
  # "Simple and Efficient Algorithm for Approximate Dictionary Matching"
  # (see http://www.aclweb.org/anthology/C10-1096).
  #
  # Returns the strings of feature-set size y_size that share at least tau
  # features with query_feature_set.
  #
  # Edge cases handled explicitly:
  #   * tau <= 0: no overlap is required, so every string of that size qualifies
  #     (indexing past the end of the sorted feature list used to raise here).
  #   * tau > |X|: no string can share more features than the query has.
  #   * tau == 1: the verification loop below is empty, so candidates collected
  #     during the counting phase are emitted by the final sweep (they used to
  #     be dropped).
  def overlap_join(query_feature_set, tau, db, y_size)
    query_size = query_feature_set.size
    return db.lookup_strings_by_feature_set_size(y_size).to_a if tau <= 0
    return [] if tau > query_size

    postings = query_feature_set.each_with_object({}) do |feature, memo|
      memo[feature] = get(db, y_size, feature)
    end
    # Rarest features first keeps the candidate set small.
    sorted_features = query_feature_set.sort_by { |feature| postings[feature].size }

    # Phase 1: any match must contain at least one of the first
    # (|X| - tau + 1) features, so these postings yield every candidate.
    signature_size = query_size - tau + 1
    counts = Hash.new(0)
    sorted_features.first(signature_size).each do |feature|
      postings[feature].each { |candidate| counts[candidate] += 1 }
    end

    # Phase 2: verify candidates against the remaining features, pruning those
    # that can no longer reach tau.
    results = []
    (signature_size...query_size).each do |k|
      posting = postings[sorted_features[k]]
      counts.keys.each do |candidate|
        counts[candidate] += 1 if posting.include?(candidate)
        if counts[candidate] >= tau
          results << candidate
          counts.delete(candidate)
        elsif counts[candidate] + (query_size - k - 1) < tau
          counts.delete(candidate)
        end
      end
    end

    # Only non-empty when phase 2 did not run (tau == 1).
    results.concat(counts.select { |_, count| count >= tau }.keys)
  end

  # Returns a Set of strings that each meet the following 2 criteria:
  # 1. the string has a feature set size equal to <y_size>
  # 2. the string's feature set contains the feature <feature>
  def get(db, y_size, feature)
    db.lookup_strings_by_feature_set_size(y_size) & db.lookup_strings_by_feature(feature)
  end
end
304
+
305
+ end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simstring_pure
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - David Ellis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-08 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Ruby implementation of the SimString approximate string matching algorithm.
14
+ email: davidkellis@gmail.com
15
+ executables:
16
+ - simstring
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/simstring
21
+ - lib/simstring_pure.rb
22
+ homepage: https://github.com/davidkellis/simstring
23
+ licenses:
24
+ - MIT
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.4.5.1
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: SimString approximate string matching library.
46
+ test_files: []