simstring_pure 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/simstring +28 -0
  3. data/lib/simstring_pure.rb +305 -0
  4. metadata +46 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9626d0b524a0b66517fb736af6d39d03bf2d61a3
4
+ data.tar.gz: 91817a82fc24bda6d383f8c429b3a81cea0989b5
5
+ SHA512:
6
+ metadata.gz: c2182f5348eeadb1ab1f7a9a04cb63117174041a1aab34a411a17c97bc4f585da96099a1b31cfc097696c2295fce2fe040fdcd291df2497660d449a8931a273e
7
+ data.tar.gz: 9f548a200810181726e2d9a74155222c0de9a2f74d92803933b35a3f626ca8e183b6d24753a85b4860e80b84017b1b8a1bdcc5d7a5cad38a5819db4187c7ab31
data/bin/simstring ADDED
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby

require 'simstring_pure'

# Command-line entry point.
#
# Usage: simstring <dictionary-file> <query> [similarity-threshold]
#
# Builds a trigram SimString database from the lines of <dictionary-file>
# (each line stripped of surrounding whitespace), runs a cosine-similarity
# search for <query>, pretty-prints the matching strings and then reports how
# long the build and the search took. The threshold defaults to 0.7.
def main
  filename, query_string, similarity_threshold = *ARGV

  # Without both positional arguments the run cannot do anything useful;
  # report the expected invocation instead of failing later with a TypeError.
  if filename.nil? || query_string.nil?
    abort "Usage: simstring <dictionary-file> <query> [similarity-threshold]"
  end

  similarity_threshold = (similarity_threshold || 0.7).to_f

  # The monotonic clock is unaffected by wall-clock adjustments, so the
  # differences below are reliable durations.
  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  ngram_builder = SimString::NGramBuilder.new(3)
  db = SimString::Database.new(ngram_builder)

  # Stream the dictionary line by line rather than loading it all at once.
  File.foreach(filename) { |line| db.add(line.strip) }

  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  matcher = SimString::StringMatcher.new(db, SimString::CosineMeasure.new)

  pp matcher.search(query_string, similarity_threshold)

  t3 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  puts "#{t2 - t1} seconds to build database"
  puts "#{t3 - t2} seconds to search"
end

main
@@ -0,0 +1,305 @@
1
+ require 'set'
2
+ require 'pp'
3
+
4
module SimString

  # Abstract base class for feature extractors. A feature extractor turns a
  # string into the Set of features used to index and compare strings.
  class FeatureExtractor
    # Returns a Set of features for +string+.
    # Subclasses must override this method; the base implementation raises.
    def features(string)
      raise "Not implemented."
    end
  end

  # One n-gram feature: the n-gram text (+ngram+) and its 1-based occurrence
  # number (+index+) within the string it was extracted from. Numbering the
  # occurrences keeps repeated n-grams distinct inside a Set.
  NGram = Struct.new(:ngram, :index)

  # Feature extractor that produces numbered character n-grams.
  class NGramBuilder < FeatureExtractor
    # Padding character placed before and after the string so that its first
    # and last characters also appear in n-grams. (The misspelled constant
    # name is kept as-is because it is part of the public interface.)
    SENTINAL_CHAR = "\u00A0" # non-breaking space

    # The n-gram length (e.g. 3 for trigrams).
    attr_accessor :n

    # n is an int - the length of the n-grams to generate
    def initialize(n)
      self.n = n
    end

    # Returns a Set of NGram structs for +string+.
    # The string is padded with (n - 1) sentinel characters on each side and
    # every run of n consecutive characters becomes an n-gram. An n-gram that
    # occurs k times yields k features, numbered 1..k.
    def features(string)
      padding = SENTINAL_CHAR * (n - 1)
      padded_string = padding + string + padding

      occurrence_counts = padded_string.each_char.each_cons(n).each_with_object(Hash.new(0)) do |chars, counts|
        counts[chars.join] += 1
      end

      occurrence_counts.flat_map do |ngram_string, count|
        (1..count).map { |i| NGram.new(ngram_string, i) }
      end.to_set
    end
  end

  # Abstract base class for similarity measures. Every method raises here and
  # must be overridden by a concrete measure.
  class Measure
    # The #min_feature_size and #max_feature_size methods return the lower and upper bounds, respectively, of the range of feature set sizes
    # belonging to the candidate search results.
    # In other words, the only strings in the database that can possibly be considered an approximate search match *must* have a feature set size
    # within the closed interval [min_feature_size(...), max_feature_size(...)]

    # query_size is an int
    # alpha is a double
    def min_feature_size(db, query_size, alpha)
      raise "Not implemented."
    end

    # query_size is an int
    # alpha is a double
    def max_feature_size(db, query_size, alpha)
      raise "Not implemented."
    end

    # This method returns tau, the number of features that two strings, x and y,
    # must have in common in order for their similarity coefficient to be greater than or equal to alpha.
    # Parameters:
    # query_size is an int - the number of features in x
    # y_size is an int - the number of features in y
    # alpha is a double - the similarity threshold
    def minimum_common_feature_count(query_size, y_size, alpha)
      raise "Not implemented."
    end

    # Returns the similarity coefficient of the two feature sets.
    def similarity(x_feature_set, y_feature_set)
      raise "Not implemented."
    end
  end

  # Cosine coefficient: |X & Y| / sqrt(|X| * |Y|)
  class CosineMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      (alpha * alpha * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (query_size.to_f / (alpha * alpha)).floor
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * Math.sqrt(query_size * y_size)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (x_feature_set & y_feature_set).size.to_f / Math.sqrt(x_feature_set.size * y_feature_set.size)
    end
  end

  # Dice coefficient: 2 * |X & Y| / (|X| + |Y|)
  class DiceMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      ((alpha.to_f / (2 - alpha)) * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (((2 - alpha).to_f / alpha) * query_size).floor
    end

    # Solving 2 * tau / (|X| + |Y|) >= alpha for tau gives
    # tau >= 0.5 * alpha * (|X| + |Y|): the bound uses the SUM of the two
    # sizes, not their product.
    def minimum_common_feature_count(query_size, y_size, alpha)
      (0.5 * alpha * (query_size + y_size)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (2 * (x_feature_set & y_feature_set).size).to_f / (x_feature_set.size + y_feature_set.size)
    end
  end

  # Exact match: similarity is 1.0 for identical feature sets, 0.0 otherwise.
  # alpha is ignored by every method of this measure.
  class ExactMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      query_size
    end

    def max_feature_size(db, query_size, alpha)
      query_size
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      query_size
    end

    def similarity(x_feature_set, y_feature_set)
      x_feature_set == y_feature_set ? 1.0 : 0.0
    end
  end

  # Jaccard coefficient: |X & Y| / |X | Y|
  class JaccardMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      (alpha * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (query_size.to_f / alpha).floor
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * (query_size + y_size).to_f / (1 + alpha)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (x_feature_set & y_feature_set).size.to_f / (x_feature_set | y_feature_set).size
    end
  end

  # Overlap coefficient: |X & Y| / min(|X|, |Y|)
  class OverlapMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      1
    end

    # The upper bound is the largest feature set size stored in +db+.
    # An empty database reports nil as its maximum; fall back to 0 so callers
    # get an empty size range (1..0) instead of a range with a nil endpoint.
    def max_feature_size(db, query_size, alpha)
      db.max_feature_size || 0
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * [query_size, y_size].min).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (x_feature_set & y_feature_set).size.to_f / [x_feature_set.size, y_feature_set.size].min
    end
  end


  # In-memory index of strings, searchable by feature set size and by feature.
  class Database
    class << self
      # Restores a Database previously written with #save.
      # file_path is the path of the file to read.
      #
      # NOTE: this uses Marshal.load, which can instantiate arbitrary objects.
      # Only load files that come from a trusted source.
      def load(file_path)
        # Marshal data is binary, so read it without newline/encoding translation.
        Marshal.load(File.binread(file_path))
      end
    end

    attr_reader :feature_extractor

    # feature_extractor is an object responding to #features(string) that
    # returns a Set of features (e.g. an NGramBuilder).
    def initialize(feature_extractor)
      @strings = Set.new
      @feature_extractor = feature_extractor
      # Plain hashes (no default procs) so that the database can be marshaled.
      @feature_to_string_map = {}
      @feature_set_size_to_string_map = {}
    end

    # Indexes +string+ by its feature set size and by each of its features.
    # Adding a string that is already present is a no-op. Always returns nil.
    def add(string)
      # Set#add? returns nil when the string was already in the set.
      return nil unless @strings.add?(string)

      features = feature_extractor.features(string)

      (@feature_set_size_to_string_map[features.size] ||= Set.new) << string

      features.each do |feature|
        (@feature_to_string_map[feature] ||= Set.new) << string
      end

      nil
    end

    # Smallest feature set size among the stored strings (nil when empty).
    def min_feature_size
      @feature_set_size_to_string_map.keys.min
    end

    # Largest feature set size among the stored strings (nil when empty).
    def max_feature_size
      @feature_set_size_to_string_map.keys.max
    end

    # Returns the Set of stored strings whose feature set has exactly +size+
    # elements (an empty Set when there are none).
    def lookup_strings_by_feature_set_size(size)
      @feature_set_size_to_string_map[size] || Set.new
    end

    # Returns the Set of stored strings whose feature set contains +feature+
    # (an empty Set when there are none).
    def lookup_strings_by_feature(feature)
      @feature_to_string_map[feature] || Set.new
    end

    # Serializes this database with Marshal and writes it to +file_path+.
    def save(file_path)
      # Binary write: Marshal output must not go through text-mode translation.
      File.binwrite(file_path, Marshal.dump(self))
    end
  end


  # A search hit: the matched string (+value+) and its similarity (+score+).
  Match = Struct.new(:value, :score)

  # Runs approximate string searches against a Database.
  class StringMatcher
    # simstring_db is the Database to search
    # measure is the default similarity Measure used by the search methods
    def initialize(simstring_db, measure)
      @db = simstring_db
      @measure = measure
      @feature_extractor = @db.feature_extractor
    end

    # Implements "Algorithm 1: Approximate dictionary matching" described in "Simple and Efficient Algorithm for Approximate Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096)
    # Returns an array of matching strings.
    # Example:
    # matcher.search("Fooo", 0.5)
    # => ["Foo", "Food", "Foot"]
    def search(query_string, alpha, measure = @measure)
      feature_set = @feature_extractor.features(query_string)
      feature_set_size = feature_set.size
      smallest_candidate_size = measure.min_feature_size(@db, feature_set_size, alpha)
      largest_candidate_size = measure.max_feature_size(@db, feature_set_size, alpha)

      # Every stored string has exactly one feature set size, so the per-size
      # result lists never overlap and can simply be concatenated.
      (smallest_candidate_size..largest_candidate_size).flat_map do |candidate_size|
        tau = min_overlap(measure, feature_set_size, candidate_size, alpha)
        overlap_join(feature_set, tau, @db, candidate_size)
      end
    end

    # Same as #search, except returns an array of Match objects indicating both the matched string(s) and their corresponding similarity scores.
    # The matches are sorted by descending score.
    # Example:
    # matcher.ranked_search("Fooo", 0.5)
    # => [#<struct Match value="Foo", score=0.9128709291752769>,
    # <struct Match value="Food", score=0.5>,
    # <struct Match value="Foot", score=0.5>]
    def ranked_search(query_string, alpha, measure = @measure)
      feature_set = @feature_extractor.features(query_string)
      matches = search(query_string, alpha, measure).map do |matching_string|
        Match.new(matching_string, measure.similarity(feature_set, @feature_extractor.features(matching_string)))
      end
      matches.sort_by { |match| -match.score }
    end

    private

    # Returns tau: the minimum number of features a candidate with y_size
    # features must share with the query, according to +measure+.
    def min_overlap(measure, query_size, y_size, alpha)
      measure.minimum_common_feature_count(query_size, y_size, alpha)
    end

    # implements "Algorithm 3: CPMerge algorithm" described in "Simple and Efficient Algorithm for Approximate Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096)
    # Returns an array of the strings in +db+ that have exactly +y_size+
    # features and share at least +tau+ of them with +query_feature_set+.
    def overlap_join(query_feature_set, tau, db, y_size)
      query_feature_set_size = query_feature_set.size

      # A candidate cannot share more features than the query has.
      return [] if tau > query_feature_set_size
      # A non-positive requirement is met by every string of the right size.
      return db.lookup_strings_by_feature_set_size(y_size).to_a if tau <= 0

      postings = query_feature_set.each_with_object({}) do |feature, memo|
        memo[feature] = get(db, y_size, feature)
      end
      # Rarest features first, so that phase one produces few candidates.
      sorted_features = query_feature_set.sort_by { |feature| postings[feature].size }

      # Phase one: any string with at least tau common features must contain
      # one of the first (|X| - tau + 1) features. Count occurrences in those.
      match_counts = Hash.new(0)
      sorted_features.first(query_feature_set_size - tau + 1).each do |feature|
        postings[feature].each { |s| match_counts[s] += 1 }
      end

      # With tau == 1 phase one has scanned every feature and phase two has
      # nothing left to iterate over, so every counted candidate already
      # qualifies and must be returned here.
      return match_counts.keys if tau == 1

      # Phase two: verify the candidates against the remaining features,
      # accepting or pruning each one as early as possible.
      results = []
      ((query_feature_set_size - tau + 1)...query_feature_set_size).each do |k|
        posting = postings[sorted_features[k]]
        # Iterate over a snapshot of the keys because entries are deleted below.
        match_counts.keys.each do |s|
          match_counts[s] += 1 if posting.include?(s)
          if match_counts[s] >= tau
            results << s
            match_counts.delete(s)
          elsif match_counts[s] + (query_feature_set_size - k - 1) < tau
            # Even if all remaining features matched, tau would be out of reach.
            match_counts.delete(s)
          end
        end
      end
      results
    end

    # Returns a Set of strings that each meet the following 2 criteria:
    # 1. the string has a feature set size equal to <y_size>
    # 2. the string's feature set contains the feature <feature>
    def get(db, y_size, feature)
      db.lookup_strings_by_feature_set_size(y_size) & db.lookup_strings_by_feature(feature)
    end
  end

end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simstring_pure
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - David Ellis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-08 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Ruby implementation of the SimString approximate string matching algorithm.
14
+ email: davidkellis@gmail.com
15
+ executables:
16
+ - simstring
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/simstring
21
+ - lib/simstring_pure.rb
22
+ homepage: https://github.com/davidkellis/simstring
23
+ licenses:
24
+ - MIT
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 2.4.5.1
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: SimString approximate string matching library.
46
+ test_files: []