simstring_pure 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/simstring +28 -0
- data/lib/simstring_pure.rb +305 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9626d0b524a0b66517fb736af6d39d03bf2d61a3
|
4
|
+
data.tar.gz: 91817a82fc24bda6d383f8c429b3a81cea0989b5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c2182f5348eeadb1ab1f7a9a04cb63117174041a1aab34a411a17c97bc4f585da96099a1b31cfc097696c2295fce2fe040fdcd291df2497660d449a8931a273e
|
7
|
+
data.tar.gz: 9f548a200810181726e2d9a74155222c0de9a2f74d92803933b35a3f626ca8e183b6d24753a85b4860e80b84017b1b8a1bdcc5d7a5cad38a5819db4187c7ab31
|
data/bin/simstring
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'simstring_pure'
|
4
|
+
|
5
|
+
# Command-line driver for SimString.
#
# Usage: simstring <dictionary_file> <query_string> [similarity_threshold]
#
# Builds an in-memory trigram database from the lines of <dictionary_file>
# (each line is stripped of surrounding whitespace), runs a cosine-similarity
# search for <query_string>, pretty-prints the matching strings and then
# reports how long the build and search phases took.
# similarity_threshold defaults to 0.7 when omitted.
def main
  filename, query_string, similarity_threshold = *ARGV

  # Both positional arguments are required. Without this check a missing
  # argument surfaces later as an opaque TypeError (nil passed where a String
  # is expected).
  unless filename && query_string
    abort "usage: simstring <dictionary_file> <query_string> [similarity_threshold]"
  end

  similarity_threshold = (similarity_threshold || 0.7).to_f

  # Durations are taken from the monotonic clock so that wall-clock
  # adjustments cannot skew (or negate) the reported timings.
  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  ngram_builder = SimString::NGramBuilder.new(3)
  db = SimString::Database.new(ngram_builder)

  # Stream the dictionary line by line instead of loading it all at once.
  File.foreach(filename) { |line| db.add(line.strip) }

  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  matcher = SimString::StringMatcher.new(db, SimString::CosineMeasure.new)

  pp matcher.search(query_string, similarity_threshold)

  t3 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  puts "#{t2 - t1} seconds to build database"
  puts "#{t3 - t2} seconds to search"
end
|
27
|
+
|
28
|
+
main
|
@@ -0,0 +1,305 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
# Pure-Ruby implementation of the SimString approximate dictionary matching
# algorithm described in "Simple and Efficient Algorithm for Approximate
# Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096).
module SimString

  # Abstract interface for objects that turn a string into a Set of features.
  class FeatureExtractor
    # Returns a Set of features for +string+.
    # The base implementation always raises; subclasses must override it.
    def features(string)
      raise "Not implemented."
    end
  end

  # A character n-gram together with its 1-based occurrence number inside one
  # string, so that repeated n-grams remain distinct members of a feature Set.
  NGram = Struct.new(:ngram, :index)

  # Extracts character n-gram features from strings.
  class NGramBuilder < FeatureExtractor
    # Padding character placed before and after the string.
    # (The constant name keeps its original spelling for compatibility.)
    SENTINAL_CHAR = "\u00A0".freeze # non-breaking space

    # The n-gram length.
    attr_accessor :n

    # n - the number of characters in each n-gram.
    def initialize(n)
      self.n = n
    end

    # Returns a Set of NGram structs for +string+.
    #
    # The string is padded on both sides with (n - 1) sentinel characters, and
    # the k-th occurrence of the same n-gram is emitted as NGram(ngram, k).
    def features(string)
      padding = SENTINAL_CHAR * (n - 1)
      padded = padding + string + padding

      occurrence_counts = Hash.new(0)
      padded.each_char.each_cons(n) { |chars| occurrence_counts[chars.join] += 1 }

      occurrence_counts.each_with_object(Set.new) do |(ngram_string, count), feature_set|
        (1..count).each { |i| feature_set << NGram.new(ngram_string, i) }
      end
    end
  end

  # Abstract similarity measure.
  #
  # The #min_feature_size and #max_feature_size methods return the lower and
  # upper bounds, respectively, of the range of feature set sizes belonging to
  # the candidate search results. In other words, the only strings in the
  # database that can possibly be considered an approximate search match *must*
  # have a feature set size within the closed interval
  # [min_feature_size(...), max_feature_size(...)].
  class Measure
    # db is the database being searched
    # query_size is an int - the number of features in the query
    # alpha is a double - the similarity threshold
    def min_feature_size(db, query_size, alpha)
      raise "Not implemented."
    end

    # db is the database being searched
    # query_size is an int - the number of features in the query
    # alpha is a double - the similarity threshold
    def max_feature_size(db, query_size, alpha)
      raise "Not implemented."
    end

    # This method returns tau, the number of features that two strings, x and y,
    # must have in common in order for their similarity coefficient to be
    # greater than or equal to alpha.
    # Parameters:
    # query_size is an int - the number of features in x
    # y_size is an int - the number of features in y
    # alpha is a double - the similarity threshold
    def minimum_common_feature_count(query_size, y_size, alpha)
      raise "Not implemented."
    end

    # Returns the similarity coefficient of the two feature sets.
    def similarity(x_feature_set, y_feature_set)
      raise "Not implemented."
    end
  end

  # Cosine similarity: |X & Y| / sqrt(|X| * |Y|)
  class CosineMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      (alpha * alpha * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (query_size.to_f / (alpha * alpha)).floor
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * Math.sqrt(query_size * y_size)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      shared = (x_feature_set & y_feature_set).size
      shared.to_f / Math.sqrt(x_feature_set.size * y_feature_set.size)
    end
  end

  # Dice coefficient: 2 * |X & Y| / (|X| + |Y|)
  class DiceMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      ((alpha.to_f / (2 - alpha)) * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (((2 - alpha).to_f / alpha) * query_size).floor
    end

    # tau = ceil(0.5 * alpha * (|X| + |Y|)).
    # The sizes are *added*: multiplying them makes tau larger than either
    # feature set can satisfy, which rejects every candidate.
    def minimum_common_feature_count(query_size, y_size, alpha)
      (0.5 * alpha * (query_size + y_size)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      shared = (x_feature_set & y_feature_set).size
      (2 * shared).to_f / (x_feature_set.size + y_feature_set.size)
    end
  end

  # Exact matching: similarity is 1.0 for identical feature sets, else 0.0.
  class ExactMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      query_size
    end

    def max_feature_size(db, query_size, alpha)
      query_size
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      query_size
    end

    def similarity(x_feature_set, y_feature_set)
      x_feature_set == y_feature_set ? 1.0 : 0.0
    end
  end

  # Jaccard coefficient: |X & Y| / |X | Y|
  class JaccardMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      (alpha * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (query_size.to_f / alpha).floor
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * (query_size + y_size).to_f / (1 + alpha)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      shared = (x_feature_set & y_feature_set).size
      shared.to_f / (x_feature_set | y_feature_set).size
    end
  end

  # Overlap coefficient: |X & Y| / min(|X|, |Y|)
  class OverlapMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      1
    end

    # The overlap coefficient places no upper bound on the candidate size, so
    # the largest size stored in the database is used. An empty database
    # reports nil; 0 is returned in that case so the caller's size range is
    # empty rather than open-ended.
    def max_feature_size(db, query_size, alpha)
      db.max_feature_size || 0
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * [query_size, y_size].min).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      shared = (x_feature_set & y_feature_set).size
      shared.to_f / [x_feature_set.size, y_feature_set.size].min
    end
  end


  # In-memory index of strings, keyed both by feature and by feature set size.
  class Database
    class << self
      # Loads a Database previously written with #save.
      #
      # WARNING: Marshal.load can instantiate arbitrary objects; only load
      # files that come from a trusted source.
      def load(file_path)
        # Marshal data is binary, so read it without newline/encoding translation.
        Marshal.load(File.binread(file_path))
      end
    end

    attr_reader :feature_extractor

    # feature_extractor - object responding to #features(string) and returning
    #                     a Set of features (e.g. an NGramBuilder).
    def initialize(feature_extractor)
      @strings = Set.new
      @feature_extractor = feature_extractor
      @feature_to_string_map = {}
      @feature_set_size_to_string_map = {}
    end

    # Indexes +string+ unless it is already present. Always returns nil.
    def add(string)
      # Set#add? returns nil when the string was already stored.
      return nil unless @strings.add?(string)

      features = feature_extractor.features(string)

      # update @feature_set_size_to_string_map
      (@feature_set_size_to_string_map[features.size] ||= Set.new) << string

      # update @feature_to_string_map
      features.each do |feature|
        (@feature_to_string_map[feature] ||= Set.new) << string
      end

      nil
    end

    # Smallest feature set size of any stored string (nil when empty).
    def min_feature_size
      @feature_set_size_to_string_map.keys.min
    end

    # Largest feature set size of any stored string (nil when empty).
    def max_feature_size
      @feature_set_size_to_string_map.keys.max
    end

    # Returns the Set of stored strings whose feature set has exactly +size+ members.
    def lookup_strings_by_feature_set_size(size)
      @feature_set_size_to_string_map[size] || Set.new
    end

    # Returns the Set of stored strings whose feature set contains +feature+.
    def lookup_strings_by_feature(feature)
      @feature_to_string_map[feature] || Set.new
    end

    # Serializes this database to +file_path+ with Marshal.
    # Returns the number of bytes written.
    def save(file_path)
      # Binary mode: text mode may translate newlines and corrupt the dump.
      File.binwrite(file_path, Marshal.dump(self))
    end
  end


  # A matched string together with its similarity score.
  Match = Struct.new(:value, :score)

  # Runs approximate searches against a Database using a similarity Measure.
  class StringMatcher
    # simstring_db - the Database to search
    # measure      - the default Measure used by #search and #ranked_search
    def initialize(simstring_db, measure)
      @db = simstring_db
      @measure = measure
      @feature_extractor = @db.feature_extractor
    end

    # Implements "Algorithm 1: Approximate dictionary matching" described in "Simple and Efficient Algorithm for Approximate Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096)
    # Returns an array of matching strings.
    # Example:
    #   matcher.search("Fooo", 0.5)
    #   => ["Foo", "Food", "Foot"]
    def search(query_string, alpha, measure = @measure)
      feature_set = @feature_extractor.features(query_string)
      feature_set_size = feature_set.size

      smallest_candidate_size = measure.min_feature_size(@db, feature_set_size, alpha)
      largest_candidate_size = measure.max_feature_size(@db, feature_set_size, alpha)

      matches = []
      (smallest_candidate_size..largest_candidate_size).each do |candidate_size|
        tau = min_overlap(measure, feature_set_size, candidate_size, alpha)
        matches.concat(overlap_join(feature_set, tau, @db, candidate_size))
      end
      matches
    end

    # Same as #search, except returns an array of Match objects indicating both the matched string(s) and their corresponding similarity scores.
    # The result is ordered by descending score.
    # Example:
    #   matcher.ranked_search("Fooo", 0.5)
    #   => [#<struct Match value="Foo", score=0.9128709291752769>,
    #       #<struct Match value="Food", score=0.5>,
    #       #<struct Match value="Foot", score=0.5>]
    def ranked_search(query_string, alpha, measure = @measure)
      feature_set = @feature_extractor.features(query_string)
      ranked = search(query_string, alpha, measure).map do |matching_string|
        candidate_features = @feature_extractor.features(matching_string)
        Match.new(matching_string, measure.similarity(feature_set, candidate_features))
      end
      ranked.sort_by { |match| -match.score }
    end

    private

    # Returns tau for a query of query_size features against candidates of y_size features.
    def min_overlap(measure, query_size, y_size, alpha)
      measure.minimum_common_feature_count(query_size, y_size, alpha)
    end

    # implements "Algorithm 3: CPMerge algorithm" described in "Simple and Efficient Algorithm for Approximate Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096)
    #
    # Returns an Array of the strings in db that have exactly y_size features
    # and share at least tau of them with query_feature_set.
    def overlap_join(query_feature_set, tau, db, y_size)
      # A non-positive tau is satisfied by any overlap, including none, so
      # every string of the requested size qualifies.
      return db.lookup_strings_by_feature_set_size(y_size).to_a if tau <= 0

      query_size = query_feature_set.size
      # More common features are required than the query has: nothing can match.
      return [] if tau > query_size

      postings = query_feature_set.each_with_object({}) do |feature, memo|
        memo[feature] = get(db, y_size, feature)
      end
      # Rarest features first, so the candidate pool built in phase one is small.
      sorted_features = query_feature_set.sort_by { |feature| postings[feature].size }

      # Phase one: any string sharing >= tau features must contain at least one
      # of the first (query_size - tau + 1) features; count those occurrences.
      overlap_counts = Hash.new(0)
      sorted_features.first(query_size - tau + 1).each do |feature|
        postings[feature].each { |s| overlap_counts[s] += 1 }
      end

      # Phase two: check the remaining features against the surviving
      # candidates, accepting or pruning each one as soon as it is decided.
      results = []
      ((query_size - tau + 1)...query_size).each do |k|
        posting = postings[sorted_features[k]]
        overlap_counts.keys.each do |s|
          overlap_counts[s] += 1 if posting.include?(s)
          if tau <= overlap_counts[s]
            results << s
            overlap_counts.delete(s)
          elsif overlap_counts[s] + (query_size - k - 1) < tau
            # Even matching every remaining feature cannot reach tau.
            overlap_counts.delete(s)
          end
        end
      end

      # When tau == 1 phase two has no features left to examine, so every
      # candidate collected in phase one is still pending; accept those that
      # already meet tau. (For tau >= 2 nothing is left here.)
      overlap_counts.each { |s, count| results << s if tau <= count }

      results
    end

    # Returns a Set of strings that each meet the following 2 criteria:
    # 1. the string has a feature set size equal to <y_size>
    # 2. the string's feature set contains the feature <feature>
    def get(db, y_size, feature)
      db.lookup_strings_by_feature_set_size(y_size) & db.lookup_strings_by_feature(feature)
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simstring_pure
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David Ellis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-03-08 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Ruby implementation of the SimString approximate string matching algorithm.
|
14
|
+
email: davidkellis@gmail.com
|
15
|
+
executables:
|
16
|
+
- simstring
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/simstring
|
21
|
+
- lib/simstring_pure.rb
|
22
|
+
homepage: https://github.com/davidkellis/simstring
|
23
|
+
licenses:
|
24
|
+
- MIT
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.5.1
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: SimString approximate string matching library.
|
46
|
+
test_files: []
|