fast_fuzzy_matcher 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +66 -0
- data/Rakefile +8 -0
- data/ext/.idea/.gitignore +8 -0
- data/ext/.idea/ext.iml +9 -0
- data/ext/.idea/modules.xml +8 -0
- data/ext/.idea/vcs.xml +6 -0
- data/ext/fuzzy.go +416 -0
- data/ext/fuzzy.h +87 -0
- data/ext/fuzzy.so +0 -0
- data/ext/go.mod +5 -0
- data/ext/go.sum +2 -0
- data/fast_fuzzy_matcher.gemspec +41 -0
- data/lib/fuzzy_matcher/version.rb +5 -0
- data/lib/fuzzy_matcher.rb +58 -0
- data/spec/fuzzy_matcher_spec.rb +27 -0
- data/spec/spec_helper.rb +15 -0
- metadata +84 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e10b6a2b5f47c25d479a1a3623db229ecbc24f79176c3d6ae879ce6767e931b0
|
4
|
+
data.tar.gz: c4c910d1e4a462fab8819f2d406d2affe1c872d58f2943c7be196d25c439a7b6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3cef0e3e2a233385a0daf2c882002be8af28e38df4c58c500bbee29fd3b319d904b9ee918f6fbeb44c3557602562b7b0f3d1f53c3b03c1c197f2824f6caf81fd
|
7
|
+
data.tar.gz: 78c6671ffaad4807137977cc90100be0214e241b0895ab71955baf96500db286b235b62e121c9c13f00134a86f3138b92e652af597a37ed9bb0dfca559215069
|
data/.rspec
ADDED
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2023 wowinter13
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# FuzzyMatch
|
2
|
+
|
3
|
+
This library is a work in progress.
|
4
|
+
|
5
|
+
The fastest Fuzzy Matcher in the wild west. FFI-based.
|
6
|
+
|
7
|
+
Find a needle in a haystack based on string similarity and regular expression rules.
|
8
|
+
|
9
|
+
|
10
|
+
### Basic usage
|
11
|
+
|
12
|
+
Just pass an array of strings to the matcher and it will return the best match(es) for the given needle.
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
require 'fast_fuzzy_matcher'
|
16
|
+
|
17
|
+
FuzzyMatcher.find("whl", ["cartwheel", "foobar", "wheel", "baz"])
|
18
|
+
# => ["cartwheel", "wheel"]
|
19
|
+
|
20
|
+
```
|
21
|
+
|
22
|
+
### Advanced usage
|
23
|
+
|
24
|
+
Better documentation is coming soon. For now, please refer to the specs.
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
# Benchmarks
|
29
|
+
|
30
|
+
To be done.
|
31
|
+
|
32
|
+
Approximately 10-60x faster than the fastest Ruby implementation. The difference is more pronounced for longer strings and larger dictionaries.
|
33
|
+
|
34
|
+
|
35
|
+
## Documentation
|
36
|
+
|
37
|
+
Detailed documentation is available at [rubydoc](https://rubydoc.info/gems/fast_fuzzy_matcher).
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
|
41
|
+
fast_fuzzy_matcher is available as a gem. To install it, run:
|
42
|
+
|
43
|
+
gem install fast_fuzzy_matcher
|
44
|
+
|
45
|
+
If you're using Bundler, add the gem to Gemfile.
|
46
|
+
|
47
|
+
gem 'fast_fuzzy_matcher'
|
48
|
+
|
49
|
+
Run `bundle install`.
|
50
|
+
|
51
|
+
## Running tests
|
52
|
+
|
53
|
+
bundle exec rspec spec/
|
54
|
+
|
55
|
+
|
56
|
+
## Contributing
|
57
|
+
|
58
|
+
1. Fork it ( https://github.com/wowinter13/fast_fuzzy_matcher/fork )
|
59
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
60
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
61
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
62
|
+
5. Create a new Pull Request
|
63
|
+
|
64
|
+
## License
|
65
|
+
|
66
|
+
MIT License. See LICENSE for details.
|
data/Rakefile
ADDED
data/ext/.idea/ext.iml
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<module type="WEB_MODULE" version="4">
|
3
|
+
<component name="Go" enabled="true" />
|
4
|
+
<component name="NewModuleRootManager">
|
5
|
+
<content url="file://$MODULE_DIR$" />
|
6
|
+
<orderEntry type="inheritedJdk" />
|
7
|
+
<orderEntry type="sourceFolder" forTests="false" />
|
8
|
+
</component>
|
9
|
+
</module>
|
data/ext/.idea/vcs.xml
ADDED
data/ext/fuzzy.go
ADDED
@@ -0,0 +1,416 @@
|
|
1
|
+
// Fuzzy searching allows for flexibly matching a string with partial input,
|
2
|
+
// useful for filtering data very quickly based on lightweight user input.
|
3
|
+
package main
|
4
|
+
|
5
|
+
import (
|
6
|
+
"unicode"
|
7
|
+
"unicode/utf8"
|
8
|
+
"unsafe"
|
9
|
+
|
10
|
+
"golang.org/x/text/runes"
|
11
|
+
"golang.org/x/text/transform"
|
12
|
+
"golang.org/x/text/unicode/norm"
|
13
|
+
)
|
14
|
+
|
15
|
+
/*
|
16
|
+
#include <stdlib.h> // for C.free
|
17
|
+
*/
|
18
|
+
import "C"
|
19
|
+
|
20
|
+
// noopTransformer returns the transformer used for exact (case- and
// accent-sensitive) matching. stringTransform recognizes its concrete type
// and returns the input string unchanged without running it.
func noopTransformer() transform.Transformer {
	return nopTransformer{}
}
|
23
|
+
|
24
|
+
// foldTransformer returns the transformer used for case-insensitive
// matching; it lower-cases every rune with unicode.ToLower.
func foldTransformer() transform.Transformer {
	return unicodeFoldTransformer{}
}
|
27
|
+
|
28
|
+
// normalizeTransformer returns a transformer chain that decomposes text to
// NFD, removes every rune in the unicode.Mn category (nonspacing marks) and
// recomposes the remainder to NFC.
func normalizeTransformer() transform.Transformer {
	return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
}
|
31
|
+
|
32
|
+
// normalizedFoldTransformer returns a transformer chain that first applies
// normalizeTransformer and then foldTransformer.
func normalizedFoldTransformer() transform.Transformer {
	return transform.Chain(normalizeTransformer(), foldTransformer())
}
|
35
|
+
|
36
|
+
// Match returns true if source matches target using a fuzzy-searching
|
37
|
+
// algorithm. Note that it doesn't implement Levenshtein distance (see
|
38
|
+
// RankMatch instead), but rather a simplified version where there's no
|
39
|
+
// approximation. The method will return true only if each character in the
|
40
|
+
// source can be found in the target and occurs after the preceding matches.
|
41
|
+
|
42
|
+
// TODO: export Match
|
43
|
+
func Match(source, target string) bool {
|
44
|
+
return match(source, target, noopTransformer())
|
45
|
+
}
|
46
|
+
|
47
|
+
// MatchFold is a case-insensitive version of Match.
|
48
|
+
|
49
|
+
// TODO: export MatchFold
|
50
|
+
func MatchFold(source, target string) bool {
|
51
|
+
return match(source, target, foldTransformer())
|
52
|
+
}
|
53
|
+
|
54
|
+
// MatchNormalized is a unicode-normalized version of Match.
|
55
|
+
|
56
|
+
// TODO: export MatchNormalized
|
57
|
+
func MatchNormalized(source, target string) bool {
|
58
|
+
return match(source, target, normalizeTransformer())
|
59
|
+
}
|
60
|
+
|
61
|
+
// MatchNormalizedFold is a unicode-normalized and case-insensitive version of Match.
|
62
|
+
|
63
|
+
// TODO: export MatchNormalizedFold
|
64
|
+
func MatchNormalizedFold(source, target string) bool {
|
65
|
+
return match(source, target, normalizedFoldTransformer())
|
66
|
+
}
|
67
|
+
|
68
|
+
func match(source, target string, transformer transform.Transformer) bool {
|
69
|
+
sourceT := stringTransform(source, transformer)
|
70
|
+
targetT := stringTransform(target, transformer)
|
71
|
+
return matchTransformed(sourceT, targetT)
|
72
|
+
}
|
73
|
+
|
74
|
+
// matchTransformed reports whether every rune of source occurs in target in
// the same order (not necessarily contiguously). Both strings are expected
// to have been transformed already.
//
// Invalid UTF-8 bytes decode to utf8.RuneError and are compared as such; the
// scan advances by the number of bytes actually decoded, so malformed input
// can never slice past the end of target.
func matchTransformed(source, target string) bool {
	lenDiff := len(target) - len(source)

	// A source longer than the target cannot be contained in it.
	if lenDiff < 0 {
		return false
	}

	if lenDiff == 0 && source == target {
		return true
	}

Outer:
	for _, r1 := range source {
		for i := 0; i < len(target); {
			r2, size := utf8.DecodeRuneInString(target[i:])
			i += size
			if r1 == r2 {
				// Continue searching after the rune that was just matched.
				target = target[i:]
				continue Outer
			}
		}
		return false
	}

	return true
}
|
98
|
+
|
99
|
+
// Find will return a list of strings in targets that fuzzy matches source.
|
100
|
+
//
|
101
|
+
// The returned list will be ordered by best matches first, or nil if there are
|
102
|
+
// no matches. The algorithm is optimized for matches where the target string
|
103
|
+
// has a prefix that matches the source.
|
104
|
+
//
|
105
|
+
// FFI note: the returned list is allocated using malloc and must be freed using
|
106
|
+
// free_cstrings.
|
107
|
+
//
|
108
|
+
//export Find
|
109
|
+
func Find(source *C.char, targets **C.char, targetsLen C.int) **C.char {
|
110
|
+
goSource := C.GoString(source)
|
111
|
+
sliceHeaders := (*[1 << 30]*C.char)(unsafe.Pointer(targets))[:targetsLen:targetsLen]
|
112
|
+
|
113
|
+
goTargets := make([]string, int(targetsLen))
|
114
|
+
for i := 0; i < int(targetsLen); i++ {
|
115
|
+
goTargets[i] = C.GoString(sliceHeaders[i])
|
116
|
+
}
|
117
|
+
|
118
|
+
results := find(goSource, goTargets, noopTransformer())
|
119
|
+
|
120
|
+
cResults := C.malloc(C.size_t(targetsLen) * C.size_t(unsafe.Sizeof(uintptr(0))))
|
121
|
+
cArray := (*[1 << 30]*C.char)(cResults)
|
122
|
+
|
123
|
+
for i := 0; i < int(targetsLen); i++ {
|
124
|
+
cArray[i] = C.CString("")
|
125
|
+
}
|
126
|
+
|
127
|
+
for i, s := range results {
|
128
|
+
C.free(unsafe.Pointer(cArray[i]))
|
129
|
+
cArray[i] = C.CString(s)
|
130
|
+
}
|
131
|
+
|
132
|
+
return (**C.char)(cResults)
|
133
|
+
}
|
134
|
+
|
135
|
+
//export free_cstrings
|
136
|
+
func free_cstrings(strs **C.char, len C.int) {
|
137
|
+
slice := (*[1 << 30]*C.char)(unsafe.Pointer(strs))[:len:len]
|
138
|
+
for i := 0; i < int(len); i++ {
|
139
|
+
C.free(unsafe.Pointer(slice[i]))
|
140
|
+
}
|
141
|
+
C.free(unsafe.Pointer(strs))
|
142
|
+
}
|
143
|
+
|
144
|
+
// FindFold is a case-insensitive version of Find.
|
145
|
+
|
146
|
+
// TODO: export FindFold
|
147
|
+
func FindFold(source string, targets []string) []string {
|
148
|
+
return find(source, targets, foldTransformer())
|
149
|
+
}
|
150
|
+
|
151
|
+
// FindNormalized is a unicode-normalized version of Find.
|
152
|
+
|
153
|
+
// TODO: export FindNormalized
|
154
|
+
func FindNormalized(source string, targets []string) []string {
|
155
|
+
return find(source, targets, normalizeTransformer())
|
156
|
+
}
|
157
|
+
|
158
|
+
// FindNormalizedFold is a unicode-normalized and case-insensitive version of Find.
|
159
|
+
|
160
|
+
// TODO: export FindNormalizedFold
|
161
|
+
func FindNormalizedFold(source string, targets []string) []string {
|
162
|
+
return find(source, targets, normalizedFoldTransformer())
|
163
|
+
}
|
164
|
+
|
165
|
+
func find(source string, targets []string, transformer transform.Transformer) []string {
|
166
|
+
sourceT := stringTransform(source, transformer)
|
167
|
+
|
168
|
+
var matches []string
|
169
|
+
|
170
|
+
for _, target := range targets {
|
171
|
+
targetT := stringTransform(target, transformer)
|
172
|
+
if matchTransformed(sourceT, targetT) {
|
173
|
+
matches = append(matches, target)
|
174
|
+
}
|
175
|
+
}
|
176
|
+
|
177
|
+
return matches
|
178
|
+
}
|
179
|
+
|
180
|
+
// RankMatch is similar to Match except it will measure the Levenshtein
|
181
|
+
// distance between the source and the target and return its result. If there
|
182
|
+
// was no match, it will return -1.
|
183
|
+
// Given the requirements of match, RankMatch only needs to perform a subset of
|
184
|
+
// the Levenshtein calculation, only deletions need be considered, required
|
185
|
+
// additions and substitutions would fail the match test.
|
186
|
+
|
187
|
+
// TODO: export RankMatch
|
188
|
+
func RankMatch(source, target string) int {
|
189
|
+
return rank(source, target, noopTransformer())
|
190
|
+
}
|
191
|
+
|
192
|
+
// RankMatchFold is a case-insensitive version of RankMatch.
|
193
|
+
|
194
|
+
// TODO: export RankMatchFold
|
195
|
+
func RankMatchFold(source, target string) int {
|
196
|
+
return rank(source, target, foldTransformer())
|
197
|
+
}
|
198
|
+
|
199
|
+
// RankMatchNormalized is a unicode-normalized version of RankMatch.
|
200
|
+
|
201
|
+
// TODO: export RankMatchNormalized
|
202
|
+
func RankMatchNormalized(source, target string) int {
|
203
|
+
return rank(source, target, normalizeTransformer())
|
204
|
+
}
|
205
|
+
|
206
|
+
// RankMatchNormalizedFold is a unicode-normalized and case-insensitive version of RankMatch.
|
207
|
+
|
208
|
+
// TODO: export RankMatchNormalizedFold
|
209
|
+
func RankMatchNormalizedFold(source, target string) int {
|
210
|
+
return rank(source, target, normalizedFoldTransformer())
|
211
|
+
}
|
212
|
+
|
213
|
+
func rank(source, target string, transformer transform.Transformer) int {
|
214
|
+
lenDiff := len(target) - len(source)
|
215
|
+
|
216
|
+
if lenDiff < 0 {
|
217
|
+
return -1
|
218
|
+
}
|
219
|
+
|
220
|
+
source = stringTransform(source, transformer)
|
221
|
+
target = stringTransform(target, transformer)
|
222
|
+
|
223
|
+
if lenDiff == 0 && source == target {
|
224
|
+
return 0
|
225
|
+
}
|
226
|
+
|
227
|
+
runeDiff := 0
|
228
|
+
|
229
|
+
Outer:
|
230
|
+
for _, r1 := range source {
|
231
|
+
for i, r2 := range target {
|
232
|
+
if r1 == r2 {
|
233
|
+
target = target[i+utf8.RuneLen(r2):]
|
234
|
+
continue Outer
|
235
|
+
} else {
|
236
|
+
runeDiff++
|
237
|
+
}
|
238
|
+
}
|
239
|
+
return -1
|
240
|
+
}
|
241
|
+
|
242
|
+
// Count up remaining char
|
243
|
+
runeDiff += utf8.RuneCountInString(target)
|
244
|
+
|
245
|
+
return runeDiff
|
246
|
+
}
|
247
|
+
|
248
|
+
// RankFind is similar to Find, except it will also rank all matches using
|
249
|
+
// Levenshtein distance.
|
250
|
+
|
251
|
+
// TODO: export RankFind
|
252
|
+
func RankFind(source string, targets []string) Ranks {
|
253
|
+
return rankFind(source, targets, noopTransformer())
|
254
|
+
}
|
255
|
+
|
256
|
+
// RankFindFold is a case-insensitive version of RankFind.
|
257
|
+
|
258
|
+
// TODO: export RankFindFold
|
259
|
+
func RankFindFold(source string, targets []string) Ranks {
|
260
|
+
return rankFind(source, targets, foldTransformer())
|
261
|
+
}
|
262
|
+
|
263
|
+
// RankFindNormalized is a unicode-normalized version of RankFind.
|
264
|
+
|
265
|
+
// TODO: export RankFindNormalized
|
266
|
+
func RankFindNormalized(source string, targets []string) Ranks {
|
267
|
+
return rankFind(source, targets, normalizeTransformer())
|
268
|
+
}
|
269
|
+
|
270
|
+
// RankFindNormalizedFold is a unicode-normalized and case-insensitive version of RankFind.
|
271
|
+
|
272
|
+
// TODO: export RankFindNormalizedFold
|
273
|
+
func RankFindNormalizedFold(source string, targets []string) Ranks {
|
274
|
+
return rankFind(source, targets, normalizedFoldTransformer())
|
275
|
+
}
|
276
|
+
|
277
|
+
func rankFind(source string, targets []string, transformer transform.Transformer) Ranks {
|
278
|
+
sourceT := stringTransform(source, transformer)
|
279
|
+
|
280
|
+
var r Ranks
|
281
|
+
|
282
|
+
for index, target := range targets {
|
283
|
+
targetT := stringTransform(target, transformer)
|
284
|
+
if matchTransformed(sourceT, targetT) {
|
285
|
+
distance := LevenshteinDistance(source, target)
|
286
|
+
r = append(r, Rank{source, target, distance, index})
|
287
|
+
}
|
288
|
+
}
|
289
|
+
return r
|
290
|
+
}
|
291
|
+
|
292
|
+
// Rank describes a single fuzzy match produced by the RankFind family of
// functions.
type Rank struct {
	// Source is used as the source for matching.
	Source string

	// Target is the word matched against.
	Target string

	// Distance is the Levenshtein distance between Source and Target.
	Distance int

	// OriginalIndex is the position of Target in the list that was searched.
	OriginalIndex int
}
|
305
|
+
|
306
|
+
// Ranks is a list of Rank values. Its Len, Swap and Less methods have the
// signatures required by sort.Interface, ordering by ascending Distance.
type Ranks []Rank

// Len returns the number of ranks in the list.
func (r Ranks) Len() int {
	return len(r)
}

// Swap exchanges the ranks at positions i and j.
func (r Ranks) Swap(i, j int) {
	r[i], r[j] = r[j], r[i]
}

// Less reports whether the rank at i has a smaller Distance than the one at j.
func (r Ranks) Less(i, j int) bool {
	return r[i].Distance < r[j].Distance
}
|
319
|
+
|
320
|
+
func stringTransform(s string, t transform.Transformer) (transformed string) {
|
321
|
+
// Fast path for the nop transformer to prevent unnecessary allocations.
|
322
|
+
if _, ok := t.(nopTransformer); ok {
|
323
|
+
return s
|
324
|
+
}
|
325
|
+
|
326
|
+
var err error
|
327
|
+
transformed, _, err = transform.String(t, s)
|
328
|
+
if err != nil {
|
329
|
+
transformed = s
|
330
|
+
}
|
331
|
+
|
332
|
+
return
|
333
|
+
}
|
334
|
+
|
335
|
+
type unicodeFoldTransformer struct{ transform.NopResetter }
|
336
|
+
|
337
|
+
func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
338
|
+
// Converting src to a string allocates.
|
339
|
+
// In theory, it need not; see https://go.dev/issue/27148.
|
340
|
+
// It is possible to write this loop using utf8.DecodeRune
|
341
|
+
// and thereby avoid allocations, but it is noticeably slower.
|
342
|
+
// So just let's wait for the compiler to get smarter.
|
343
|
+
for _, r := range string(src) {
|
344
|
+
if r == utf8.RuneError {
|
345
|
+
// Go spec for ranging over a string says:
|
346
|
+
// If the iteration encounters an invalid UTF-8 sequence,
|
347
|
+
// the second value will be 0xFFFD, the Unicode replacement character,
|
348
|
+
// and the next iteration will advance a single byte in the string.
|
349
|
+
nSrc++
|
350
|
+
} else {
|
351
|
+
nSrc += utf8.RuneLen(r)
|
352
|
+
}
|
353
|
+
r = unicode.ToLower(r)
|
354
|
+
x := utf8.RuneLen(r)
|
355
|
+
if x > len(dst[nDst:]) {
|
356
|
+
err = transform.ErrShortDst
|
357
|
+
break
|
358
|
+
}
|
359
|
+
nDst += utf8.EncodeRune(dst[nDst:], r)
|
360
|
+
}
|
361
|
+
return nDst, nSrc, err
|
362
|
+
}
|
363
|
+
|
364
|
+
type nopTransformer struct{ transform.NopResetter }
|
365
|
+
|
366
|
+
func (nopTransformer) Transform(dst []byte, src []byte, atEOF bool) (int, int, error) {
|
367
|
+
return 0, len(src), nil
|
368
|
+
}
|
369
|
+
|
370
|
+
// LevenshteinDistance measures the difference between two strings.
// The Levenshtein distance between two words is the minimum number of
// single-character edits (i.e. insertions, deletions or substitutions)
// required to change one word into the other. Characters are compared as
// runes, not bytes.
//
// This implementation uses O(min(m,n)) space, where m and n are the rune
// counts of s and t, and is based on the optimized C version found here:
// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C
//
// TODO: export LevenshteinDistance
func LevenshteinDistance(s, t string) int {
	r1, r2 := []rune(s), []rune(t)

	// The distance is symmetric, so keep the shorter string along the column
	// to bound the working memory by the shorter input.
	if len(r1) > len(r2) {
		r1, r2 = r2, r1
	}

	// column[y] holds the distance between the first y runes of r1 and the
	// prefix of r2 processed so far; initially that prefix is empty.
	column := make([]int, len(r1)+1)
	for y := range column {
		column[y] = y
	}

	for x := 1; x <= len(r2); x++ {
		column[0] = x

		// lastDiag is the value column[y-1] had before this pass over r2[x-1].
		lastDiag := x - 1
		for y := 1; y <= len(r1); y++ {
			oldDiag := column[y]

			// Substitution, free when the runes already agree.
			best := lastDiag
			if r1[y-1] != r2[x-1] {
				best++
			}
			if v := column[y] + 1; v < best {
				best = v
			}
			if v := column[y-1] + 1; v < best {
				best = v
			}

			column[y] = best
			lastDiag = oldDiag
		}
	}

	return column[len(r1)]
}
|
404
|
+
|
405
|
+
// min2 returns the smaller of a and b.
func min2(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
411
|
+
|
412
|
+
// min returns the smallest of a, b and c.
func min(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}
|
415
|
+
|
416
|
+
// main is intentionally empty: package main needs it to build, while the
// functionality is reached through the functions exported with cgo.
func main() {}
|
data/ext/fuzzy.h
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
/* Code generated by cmd/cgo; DO NOT EDIT. */
|
2
|
+
|
3
|
+
/* package command-line-arguments */
|
4
|
+
|
5
|
+
|
6
|
+
#line 1 "cgo-builtin-export-prolog"
|
7
|
+
|
8
|
+
#include <stddef.h>
|
9
|
+
|
10
|
+
#ifndef GO_CGO_EXPORT_PROLOGUE_H
|
11
|
+
#define GO_CGO_EXPORT_PROLOGUE_H
|
12
|
+
|
13
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
14
|
+
typedef struct { const char *p; ptrdiff_t n; } _GoString_;
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#endif
|
18
|
+
|
19
|
+
/* Start of preamble from import "C" comments. */
|
20
|
+
|
21
|
+
|
22
|
+
#line 15 "fuzzy.go"
|
23
|
+
|
24
|
+
#include <stdlib.h> // for C.free
|
25
|
+
|
26
|
+
#line 1 "cgo-generated-wrapper"
|
27
|
+
|
28
|
+
|
29
|
+
/* End of preamble from import "C" comments. */
|
30
|
+
|
31
|
+
|
32
|
+
/* Start of boilerplate cgo prologue. */
|
33
|
+
#line 1 "cgo-gcc-export-header-prolog"
|
34
|
+
|
35
|
+
#ifndef GO_CGO_PROLOGUE_H
|
36
|
+
#define GO_CGO_PROLOGUE_H
|
37
|
+
|
38
|
+
typedef signed char GoInt8;
|
39
|
+
typedef unsigned char GoUint8;
|
40
|
+
typedef short GoInt16;
|
41
|
+
typedef unsigned short GoUint16;
|
42
|
+
typedef int GoInt32;
|
43
|
+
typedef unsigned int GoUint32;
|
44
|
+
typedef long long GoInt64;
|
45
|
+
typedef unsigned long long GoUint64;
|
46
|
+
typedef GoInt64 GoInt;
|
47
|
+
typedef GoUint64 GoUint;
|
48
|
+
typedef size_t GoUintptr;
|
49
|
+
typedef float GoFloat32;
|
50
|
+
typedef double GoFloat64;
|
51
|
+
#ifdef _MSC_VER
|
52
|
+
#include <complex.h>
|
53
|
+
typedef _Fcomplex GoComplex64;
|
54
|
+
typedef _Dcomplex GoComplex128;
|
55
|
+
#else
|
56
|
+
typedef float _Complex GoComplex64;
|
57
|
+
typedef double _Complex GoComplex128;
|
58
|
+
#endif
|
59
|
+
|
60
|
+
/*
|
61
|
+
static assertion to make sure the file is being used on architecture
|
62
|
+
at least with matching size of GoInt.
|
63
|
+
*/
|
64
|
+
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
|
65
|
+
|
66
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
67
|
+
typedef _GoString_ GoString;
|
68
|
+
#endif
|
69
|
+
typedef void *GoMap;
|
70
|
+
typedef void *GoChan;
|
71
|
+
typedef struct { void *t; void *v; } GoInterface;
|
72
|
+
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
|
73
|
+
|
74
|
+
#endif
|
75
|
+
|
76
|
+
/* End of boilerplate cgo prologue. */
|
77
|
+
|
78
|
+
#ifdef __cplusplus
|
79
|
+
extern "C" {
|
80
|
+
#endif
|
81
|
+
|
82
|
+
extern char** Find(char* source, char** targets, int targetsLen);
|
83
|
+
extern void free_cstrings(char** strs, int len);
|
84
|
+
|
85
|
+
#ifdef __cplusplus
|
86
|
+
}
|
87
|
+
#endif
|
data/ext/fuzzy.so
ADDED
Binary file
|
data/ext/go.mod
ADDED
data/ext/go.sum
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/fuzzy_matcher/version"
|
4
|
+
|
5
|
+
# Gem specification for fast_fuzzy_matcher.
Gem::Specification.new do |spec|
  spec.name = "fast_fuzzy_matcher"
  spec.version = FuzzyMatcher::VERSION
  spec.authors = ["Vlad Dyachenko"]
  spec.email = ["vla-dy@yandex.ru"]

  spec.summary = "fast_fuzzy_matcher is the fastest fuzzy search library for Ruby."
  # Adjacent literals are concatenated verbatim, so every fragment except the
  # last must end with a space.
  spec.description = "A tiny and blazing-fast fuzzy search in pure Ruby with FFI bindings to Go. " \
                     "Fuzzy searching allows for flexibly matching a string with partial input, " \
                     "useful for filtering data very quickly based on lightweight user input."
  spec.homepage = "https://github.com/wowinter13/fast_fuzzy_matcher"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  spec.metadata = {
    'bug_tracker_uri' => 'https://github.com/wowinter13/fast_fuzzy_matcher/issues',
    'changelog_uri' => "https://github.com/wowinter13/fast_fuzzy_matcher/blob/master/CHANGELOG.md",
    'documentation_uri' => "https://www.rubydoc.info/github/wowinter13/fast_fuzzy_matcher",
    'source_code_uri' => "https://github.com/wowinter13/fast_fuzzy_matcher"
  }

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor Gemfile])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.test_files = Dir['spec/**/*']

  spec.add_dependency "ffi"
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "fuzzy_matcher/version"
|
4
|
+
|
5
|
+
require 'ffi'
|
6
|
+
|
7
|
+
# Ruby front end for the Go fuzzy matcher in ext/fuzzy.go, reached through FFI.
module FuzzyMatcher
  class Error < StandardError; end

  # find() will return a list of strings in targets that fuzzy matches source.
  #
  # @param [String] source The string to match against.
  # @param [Array<String>] targets The strings to match.
  #
  # @return [Array<String>] The strings in targets that fuzzy match source.
  #   Targets that are empty strings are never returned, because the native
  #   side pads unused result slots with empty strings.
  #
  # @example
  #   require 'fast_fuzzy_matcher'
  #   FuzzyMatcher.find("whl", ["cartwheel", "foobar", "wheel", "baz"])
  #   # => ["cartwheel", "wheel"]
  #
  # @note This method possibly is not thread safe.
  # @note This method is case sensitive. For case insensitive matching, downcase source and targets before calling it.
  #
  # @see ext/fuzzy.go#Find for the implementation of this method.
  def self.find(source, targets)
    # Nothing to search, so skip the FFI round trip.
    return [] if targets.empty?

    # Copy every target into native memory and build the char** array that
    # Find expects.
    pointers = targets.map { |t| FFI::MemoryPointer.from_string(t) }
    targets_ptr = FFI::MemoryPointer.new(:pointer, targets.size)
    targets_ptr.write_array_of_pointer(pointers)

    result_ptr = FuzzyBinding.Find(source, targets_ptr, targets.size)

    return [] if result_ptr.null?

    begin
      result_ptr.read_array_of_pointer(targets.size).each_with_object([]) do |ptr, matches|
        next if ptr.nil? || ptr.null?

        value = ptr.read_string_to_null
        matches << value unless value.nil? || value == ""
      end
    ensure
      # The result array and its strings are malloc'ed by the Go side; release
      # them exactly once, even if reading the strings raised.
      FuzzyBinding.free_cstrings(result_ptr, targets.size)
    end
  end

  # Raw FFI bindings to the shared library built from ext/fuzzy.go.
  module FuzzyBinding
    extend FFI::Library
    ffi_lib File.expand_path("../ext/fuzzy.so", File.dirname(__FILE__))

    attach_function :Find, [:string, :pointer, :int], :pointer
    attach_function :free_cstrings, [:pointer, :int], :void
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Examples for the public FuzzyMatcher API.
RSpec.describe FuzzyMatcher do
  it "has a version number" do
    expect(FuzzyMatcher::VERSION).not_to be nil
  end

  describe "#find" do
    it "responds with an empty array when no matches are found" do
      expect(FuzzyMatcher.find("foo", ["bar", "baz"])).to eq([])
    end

    it "responds with an empty array when no targets are given" do
      expect(FuzzyMatcher.find("foo", [])).to eq([])
    end

    # "whl" is not a contiguous substring of either match; its characters
    # appear in order within "cartwheel" and "wheel".
    it "responds with matches when the source is a substring of a target" do
      expect(FuzzyMatcher.find("whl", ["cartwheel", "foobar", "wheel", "baz"])).to eq(["cartwheel", "wheel"])
    end

    # Matching is case sensitive, so an upper-case source finds nothing here.
    it "does not respond with matches when the source is a substring of a target and the source is uppercase" do
      expect(FuzzyMatcher.find("WHL", ["cartwheel", "foobar", "wheel", "baz"])).to eq([])
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "fuzzy_matcher"
|
4
|
+
|
5
|
+
# Shared RSpec configuration for the spec suite.
RSpec.configure do |config|
  # Enable flags like --only-failures and --next-failure
  config.example_status_persistence_file_path = ".rspec_status"

  # Disable RSpec exposing methods globally on `Module` and `main`
  config.disable_monkey_patching!

  # Allow only the `expect` syntax in examples.
  config.expect_with :rspec do |c|
    c.syntax = :expect
  end
end
|
metadata
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fast_fuzzy_matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vlad Dyachenko
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-01-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ffi
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description: A tiny and blazing-fast fuzzy search in pure Ruby with FFI bindings to
|
28
|
+
Go.Fuzzy searching allows for flexibly matching a string with partial input, useful
|
29
|
+
for filtering data very quickly based on lightweight user input.
|
30
|
+
email:
|
31
|
+
- vla-dy@yandex.ru
|
32
|
+
executables: []
|
33
|
+
extensions: []
|
34
|
+
extra_rdoc_files: []
|
35
|
+
files:
|
36
|
+
- ".rspec"
|
37
|
+
- CHANGELOG.md
|
38
|
+
- LICENSE.txt
|
39
|
+
- README.md
|
40
|
+
- Rakefile
|
41
|
+
- ext/.idea/.gitignore
|
42
|
+
- ext/.idea/ext.iml
|
43
|
+
- ext/.idea/modules.xml
|
44
|
+
- ext/.idea/vcs.xml
|
45
|
+
- ext/fuzzy.go
|
46
|
+
- ext/fuzzy.h
|
47
|
+
- ext/fuzzy.so
|
48
|
+
- ext/go.mod
|
49
|
+
- ext/go.sum
|
50
|
+
- fast_fuzzy_matcher.gemspec
|
51
|
+
- lib/fuzzy_matcher.rb
|
52
|
+
- lib/fuzzy_matcher/version.rb
|
53
|
+
- spec/fuzzy_matcher_spec.rb
|
54
|
+
- spec/spec_helper.rb
|
55
|
+
homepage: https://github.com/wowinter13/fast_fuzzy_matcher
|
56
|
+
licenses:
|
57
|
+
- MIT
|
58
|
+
metadata:
|
59
|
+
bug_tracker_uri: https://github.com/wowinter13/fast_fuzzy_matcher/issues
|
60
|
+
changelog_uri: https://github.com/wowinter13/fast_fuzzy_matcher/blob/master/CHANGELOG.md
|
61
|
+
documentation_uri: https://www.rubydoc.info/github/wowinter13/fast_fuzzy_matcher
|
62
|
+
source_code_uri: https://github.com/wowinter13/fast_fuzzy_matcher
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options: []
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: 2.6.0
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
requirements: []
|
78
|
+
rubygems_version: 3.4.10
|
79
|
+
signing_key:
|
80
|
+
specification_version: 4
|
81
|
+
summary: fast_fuzzy_matcher is the fastest fuzzy search library for Ruby.
|
82
|
+
test_files:
|
83
|
+
- spec/fuzzy_matcher_spec.rb
|
84
|
+
- spec/spec_helper.rb
|