simstring_pure 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/simstring +28 -0
- data/lib/simstring_pure.rb +305 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9626d0b524a0b66517fb736af6d39d03bf2d61a3
|
4
|
+
data.tar.gz: 91817a82fc24bda6d383f8c429b3a81cea0989b5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c2182f5348eeadb1ab1f7a9a04cb63117174041a1aab34a411a17c97bc4f585da96099a1b31cfc097696c2295fce2fe040fdcd291df2497660d449a8931a273e
|
7
|
+
data.tar.gz: 9f548a200810181726e2d9a74155222c0de9a2f74d92803933b35a3f626ca8e183b6d24753a85b4860e80b84017b1b8a1bdcc5d7a5cad38a5819db4187c7ab31
|
data/bin/simstring
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'simstring_pure'
|
4
|
+
|
5
|
+
# Command-line entry point.
#
# Usage: simstring <dictionary_file> <query_string> [similarity_threshold]
#
# Reads one dictionary entry per line from <dictionary_file>, indexes the
# entries by character trigrams, pretty-prints the entries whose cosine
# similarity to <query_string> is at least the threshold (default 0.7), and
# finally reports how long the build and search phases took.
def main
  filename, query_string, similarity_threshold = *ARGV

  # Without both positional arguments the code below would fail with an
  # unhelpful TypeError, so stop early with a usage message instead.
  unless filename && query_string
    abort "usage: simstring <dictionary_file> <query_string> [similarity_threshold=0.7]"
  end

  similarity_threshold = (similarity_threshold || 0.7).to_f

  # A monotonic clock is not affected by wall-clock adjustments, so the
  # reported durations can never be negative or skewed.
  now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }

  t1 = now.call

  ngram_builder = SimString::NGramBuilder.new(3)
  db = SimString::Database.new(ngram_builder)

  # Stream the dictionary line by line rather than loading it all at once.
  File.foreach(filename) { |line| db.add(line.strip) }

  t2 = now.call

  matcher = SimString::StringMatcher.new(db, SimString::CosineMeasure.new)

  pp matcher.search(query_string, similarity_threshold)

  t3 = now.call

  puts "#{t2 - t1} seconds to build database"
  puts "#{t3 - t2} seconds to search"
end
|
27
|
+
|
28
|
+
main
|
@@ -0,0 +1,305 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
# Pure-Ruby implementation of the SimString approximate dictionary matching
# algorithm described in "Simple and Efficient Algorithm for Approximate
# Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096).
module SimString

  # Abstract base class: turns a string into a Set of features.
  class FeatureExtractor
    # returns a Set of features
    def features(string)
      raise "Not implemented."
    end
  end

  # A character n-gram together with its 1-based occurrence number within the
  # string it was extracted from. Numbering repeated n-grams keeps them
  # distinct inside a Set.
  NGram = Struct.new(:ngram, :index)

  # Extracts numbered character n-grams from a string.
  class NGramBuilder < FeatureExtractor
    SENTINAL_CHAR = "\u00A0" # non-breaking space

    # n is the n-gram length (number of characters per feature)
    attr_accessor :n

    def initialize(n)
      self.n = n
    end

    # Returns a Set of NGram structs for <string>.
    # The string is padded with (n - 1) sentinel characters on each side so
    # that its leading and trailing characters also produce n-grams. The i-th
    # occurrence of a given n-gram is emitted as NGram.new(ngram, i).
    def features(string)
      padding = SENTINAL_CHAR * (n - 1)
      padded_string = padding + string + padding

      occurrence_counts = Hash.new(0)
      padded_string.each_char.each_cons(n) { |chars| occurrence_counts[chars.join] += 1 }

      occurrence_counts.flat_map do |ngram_string, count|
        (1..count).map { |i| NGram.new(ngram_string, i) }
      end.to_set
    end
  end

  # Abstract base class for similarity measures.
  class Measure
    # The #min_feature_size and #max_feature_size methods return the lower and upper bounds, respectively, of the range of feature set sizes
    # belonging to the candidate search results.
    # In other words, the only strings in the database that can possibly be considered an approximate search match *must* have a feature set size
    # within the closed interval [min_feature_size(...), max_feature_size(...)]

    # query_size is an int
    # alpha is a double
    def min_feature_size(db, query_size, alpha)
      raise "Not implemented."
    end

    # query_size is an int
    # alpha is a double
    def max_feature_size(db, query_size, alpha)
      raise "Not implemented."
    end

    # This method returns tau, the number of features that two strings, x and y,
    # must have in common in order for their similarity coefficient to be greater than or equal to alpha.
    # Parameters:
    # query_size is an int - the number of features in x
    # y_size is an int - the number of features in y
    # alpha is a double - the similarity threshold
    def minimum_common_feature_count(query_size, y_size, alpha)
      raise "Not implemented."
    end

    # Returns the similarity coefficient (a Float) of the two feature sets.
    def similarity(x_feature_set, y_feature_set)
      raise "Not implemented."
    end
  end

  # Cosine similarity: |X & Y| / sqrt(|X| * |Y|)
  class CosineMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      (alpha * alpha * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (query_size.to_f / (alpha * alpha)).floor
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * Math.sqrt(query_size * y_size)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (x_feature_set & y_feature_set).size.to_f / Math.sqrt(x_feature_set.size * y_feature_set.size)
    end
  end

  # Dice coefficient: 2 * |X & Y| / (|X| + |Y|)
  class DiceMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      ((alpha.to_f / (2 - alpha)) * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (((2 - alpha).to_f / alpha) * query_size).floor
    end

    # Solving 2 * |X & Y| / (|X| + |Y|) >= alpha for |X & Y| gives
    # 0.5 * alpha * (|X| + |Y|). The bound uses the SUM of the two sizes;
    # using their product would demand more shared features than the query
    # even has and reject every candidate.
    def minimum_common_feature_count(query_size, y_size, alpha)
      (0.5 * alpha * (query_size + y_size)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (2 * (x_feature_set & y_feature_set).size).to_f / (x_feature_set.size + y_feature_set.size)
    end
  end

  # Exact match: 1.0 when both feature sets are equal, 0.0 otherwise.
  # The alpha threshold is ignored by this measure.
  class ExactMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      query_size
    end

    def max_feature_size(db, query_size, alpha)
      query_size
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      query_size
    end

    def similarity(x_feature_set, y_feature_set)
      x_feature_set == y_feature_set ? 1.0 : 0.0
    end
  end

  # Jaccard index: |X & Y| / |X | Y|
  class JaccardMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      (alpha * query_size).ceil
    end

    def max_feature_size(db, query_size, alpha)
      (query_size.to_f / alpha).floor
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * (query_size + y_size).to_f / (1 + alpha)).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (x_feature_set & y_feature_set).size.to_f / (x_feature_set | y_feature_set).size
    end
  end

  # Overlap coefficient: |X & Y| / min(|X|, |Y|)
  class OverlapMeasure < Measure
    def min_feature_size(db, query_size, alpha)
      1
    end

    # The overlap coefficient puts no upper bound on the candidate size, so
    # the largest feature set stored in the database is used. An empty
    # database reports nil; fall back to 0 so that callers get the empty
    # range (1..0) instead of an endless (1..nil) range.
    def max_feature_size(db, query_size, alpha)
      db.max_feature_size || 0
    end

    def minimum_common_feature_count(query_size, y_size, alpha)
      (alpha * [query_size, y_size].min).ceil
    end

    def similarity(x_feature_set, y_feature_set)
      (x_feature_set & y_feature_set).size.to_f / [x_feature_set.size, y_feature_set.size].min
    end
  end


  # In-memory index of strings, keyed both by feature and by feature set size.
  #
  # The internal hashes deliberately have no default procs so that the whole
  # object can be serialized with Marshal (see #save and .load).
  class Database
    # Restores a Database previously written with #save.
    #
    # NOTE(security): Marshal.load can instantiate arbitrary objects; only
    # load files that come from a trusted source.
    def self.load(file_path)
      # Marshal data is binary, so it must be read in binary mode.
      Marshal.load(File.binread(file_path))
    end

    attr_reader :feature_extractor

    # feature_extractor must respond to #features(string) and return a Set.
    def initialize(feature_extractor)
      @strings = Set.new
      @feature_extractor = feature_extractor
      @feature_to_string_map = {}
      @feature_set_size_to_string_map = {}
    end

    # Indexes <string>; strings that are already present are ignored.
    # Always returns nil.
    def add(string)
      # Set#add? returns nil when the element was already in the set.
      return unless @strings.add?(string)

      features = feature_extractor.features(string)

      # update @feature_set_size_to_string_map
      (@feature_set_size_to_string_map[features.size] ||= Set.new) << string

      # update @feature_to_string_map
      features.each do |feature|
        (@feature_to_string_map[feature] ||= Set.new) << string
      end

      nil
    end

    # Smallest feature set size among the indexed strings (nil when empty).
    def min_feature_size
      @feature_set_size_to_string_map.keys.min
    end

    # Largest feature set size among the indexed strings (nil when empty).
    def max_feature_size
      @feature_set_size_to_string_map.keys.max
    end

    # Returns the Set of indexed strings whose feature set has exactly <size> elements.
    def lookup_strings_by_feature_set_size(size)
      @feature_set_size_to_string_map[size] || Set.new
    end

    # Returns the Set of indexed strings whose feature set contains <feature>.
    def lookup_strings_by_feature(feature)
      @feature_to_string_map[feature] || Set.new
    end

    # Serializes this database to <file_path> using Marshal.
    # Returns the number of bytes written.
    def save(file_path)
      # Binary mode prevents newline/encoding translation of the Marshal bytes.
      File.binwrite(file_path, Marshal.dump(self))
    end
  end


  # A search hit: the matched string and its similarity score.
  Match = Struct.new(:value, :score)

  # Runs approximate searches against a Database using a given Measure.
  class StringMatcher
    def initialize(simstring_db, measure)
      @db = simstring_db
      @measure = measure
      @feature_extractor = @db.feature_extractor
    end

    # Implements "Algorithm 1: Approximate dictionary matching" described in "Simple and Efficient Algorithm for Approximate Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096)
    # Returns an array of matching strings.
    # Example:
    # matcher.search("Fooo", 0.5)
    # => ["Foo", "Food", "Foot"]
    def search(query_string, alpha, measure = @measure)
      feature_set = @feature_extractor.features(query_string)
      query_size = feature_set.size
      smallest_candidate_size = measure.min_feature_size(@db, query_size, alpha)
      largest_candidate_size = measure.max_feature_size(@db, query_size, alpha)

      (smallest_candidate_size..largest_candidate_size).flat_map do |candidate_size|
        tau = min_overlap(measure, query_size, candidate_size, alpha)
        overlap_join(feature_set, tau, @db, candidate_size)
      end
    end

    # Same as #search, except returns an array of Match objects indicating both the matched string(s) and their corresponding similarity scores.
    # The matches are ordered from the highest score to the lowest.
    # Example:
    # matcher.ranked_search("Fooo", 0.5)
    # => [#<struct Match value="Foo", score=0.9128709291752769>,
    #     #<struct Match value="Food", score=0.5>,
    #     #<struct Match value="Foot", score=0.5>]
    def ranked_search(query_string, alpha, measure = @measure)
      feature_set = @feature_extractor.features(query_string)
      matches = search(query_string, alpha, measure).map do |matching_string|
        Match.new(matching_string, measure.similarity(feature_set, @feature_extractor.features(matching_string)))
      end
      matches.sort_by { |match| -match.score }
    end

    private

    # Returns tau, the minimum number of features a candidate with y_size
    # features must share with the query.
    def min_overlap(measure, query_size, y_size, alpha)
      measure.minimum_common_feature_count(query_size, y_size, alpha)
    end

    # implements "Algorithm 3: CPMerge algorithm" described in "Simple and Efficient Algorithm for Approximate Dictionary Matching" (see http://www.aclweb.org/anthology/C10-1096)
    #
    # Returns an array of the strings in <db> that have exactly <y_size>
    # features and share at least <tau> of them with <query_feature_set>.
    def overlap_join(query_feature_set, tau, db, y_size)
      # A string that shares no feature with the query cannot be reached
      # through the feature index, so a threshold below 1 behaves like 1.
      tau = 1 if tau < 1
      query_size = query_feature_set.size
      # No candidate can share more features than the query has.
      return [] if tau > query_size
      # Nothing to scan when no indexed string has this feature set size.
      return [] if db.lookup_strings_by_feature_set_size(y_size).empty?

      postings = query_feature_set.each_with_object({}) do |feature, memo|
        memo[feature] = get(db, y_size, feature)
      end
      # Rarest features first keeps the candidate set small.
      sorted_features = query_feature_set.sort_by { |feature| postings[feature].size }

      # Phase 1: any string with at least tau common features must contain at
      # least one of the (query_size - tau + 1) rarest features, so collecting
      # from those posting lists yields every possible candidate.
      overlap_counts = Hash.new(0)
      sorted_features.first(query_size - tau + 1).each do |feature|
        postings[feature].each { |s| overlap_counts[s] += 1 }
      end

      # With tau == 1 phase 1 has already scanned every feature, and every
      # collected candidate shares at least one feature: all of them match.
      return overlap_counts.keys if tau == 1

      # Phase 2: check the remaining features against the surviving candidates.
      results = []
      ((query_size - tau + 1)...query_size).each do |k|
        posting = postings[sorted_features[k]]
        # Hash#keys is a snapshot, so deleting entries while looping is safe.
        overlap_counts.keys.each do |s|
          overlap_counts[s] += 1 if posting.include?(s)
          if tau <= overlap_counts[s]
            results << s
            overlap_counts.delete(s)
          elsif overlap_counts[s] + (query_size - k - 1) < tau
            # Even matching every remaining feature could not reach tau.
            overlap_counts.delete(s)
          end
        end
      end
      results
    end

    # Returns a Set of strings that each meet the following 2 criteria:
    # 1. the string has a feature set size equal to <y_size>
    # 2. the string's feature set contains the feature <feature>
    def get(db, y_size, feature)
      db.lookup_strings_by_feature_set_size(y_size) & db.lookup_strings_by_feature(feature)
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simstring_pure
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David Ellis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-03-08 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Ruby implementation of the SimString approximate string matching algorithm.
|
14
|
+
email: davidkellis@gmail.com
|
15
|
+
executables:
|
16
|
+
- simstring
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/simstring
|
21
|
+
- lib/simstring_pure.rb
|
22
|
+
homepage: https://github.com/davidkellis/simstring
|
23
|
+
licenses:
|
24
|
+
- MIT
|
25
|
+
metadata: {}
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 2.4.5.1
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: SimString approximate string matching library.
|
46
|
+
test_files: []
|