fuzzy-string 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +3 -0
- data/README.md +27 -0
- data/ext/fuzzy-string/extconf.rb +8 -0
- data/ext/fuzzy-string/fuzzy-string.c +247 -0
- data/ext/fuzzy-string/version.h +1 -0
- data/lib/fuzzy-string.rb +1 -0
- data/test/helper.rb +3 -0
- data/test/test_fuzzy.rb +55 -0
- metadata +84 -0
data/CHANGELOG
ADDED
data/README.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Fuzzy
|
2
|
+
|
3
|
+
A compilation of functions that allow fuzzy string matching.
|
4
|
+
|
5
|
+
## API
|
6
|
+
|
7
|
+
```
|
8
|
+
FuzzyString
|
9
|
+
.stem(word, language = "en") #=> snowball stem of word
|
10
|
+
.stem_languages #=> list of languages supported by the stemmer
|
11
|
+
.soundex(word) #=> soundex code of english word
|
12
|
+
.jaro_winkler_distance(string1, string2) #=> numeric Jaro-Winkler distance of two strings (0-1) 1: same 0: different
|
13
|
+
.levenstein_distance(string1, string2) #=> integer Levenshtein (edit) distance between two strings; -1 if either string is empty
|
14
|
+
|
15
|
+
```
|
16
|
+
|
17
|
+
## Example
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
|
21
|
+
require 'fuzzy-string'
|
22
|
+
FuzzyString.jaro_winkler_distance("apples", "apple")
|
23
|
+
```
|
24
|
+
|
25
|
+
## License
|
26
|
+
|
27
|
+
[Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
|
@@ -0,0 +1,247 @@
|
|
1
|
+
/*
|
2
|
+
(c) Bharanee Rathna 2011
|
3
|
+
|
4
|
+
CC BY-SA 3.0
|
5
|
+
http://creativecommons.org/licenses/by-sa/3.0/
|
6
|
+
|
7
|
+
Free for every type of use. The author cannot be legally held responsible for
|
8
|
+
any damages resulting from the use of this work. All modifications or derivatives
|
9
|
+
need to be attributed.
|
10
|
+
*/
|
11
|
+
|
12
|
+
#include <ctype.h>
|
13
|
+
#include <libstemmer.h>
|
14
|
+
#include <ruby/ruby.h>
|
15
|
+
#include <ruby/encoding.h>
|
16
|
+
#include "version.h"
|
17
|
+
|
18
|
+
/* Type-generic comparison helpers. Every argument and the whole expansion are
   parenthesized so that operands such as `x | 1` or `a - b` group as written.
   Arguments may be evaluated more than once: do not pass expressions with
   side effects. */
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
#define min3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))

/* Calls #to_s on a Ruby object and yields the resulting VALUE. */
#define TO_S(v) rb_funcall((v), rb_intern("to_s"), 0)
/* Pointer to the bytes of v.to_s. The pointer is only usable while the string
   returned by #to_s is still alive. */
#define CSTRING(v) RSTRING_PTR(TO_S(v))
|
24
|
+
|
25
|
+
/* fuzzy_default_language: Ruby String used as the stemmer language when the caller passes none.
   mFuzzy: the FuzzyString module object.
   Both are assigned in Init_fuzzy_string(). */
VALUE fuzzy_default_language, mFuzzy;
|
26
|
+
|
27
|
+
// shamelessly stolen from https://github.com/kiyoka/fuzzy-string-match.git
|
28
|
+
/*
 * Jaro-Winkler score of two NUL-terminated strings, compared byte by byte
 * (multi-byte characters are not treated as units).
 *
 * Returns a value in [0, 1]: 1.0 for identical non-empty strings, 0.0 when the
 * strings share no matching bytes or when either string is empty. The prefix
 * bonus is only applied when the plain Jaro score is at least 0.7, and it is
 * scaled by min(0.1, 1 / longer_length) per common leading byte.
 *
 * Scratch space is taken from the heap so that long inputs cannot exhaust the
 * stack; if that allocation fails the function returns -1.0.
 */
double c_jaro_winkler_distance(char *s1, char *s2) {
    const size_t len1 = strlen(s1);
    const size_t len2 = strlen(s2);

    /* On equal lengths s2 plays the "longer" role. */
    const int first_is_longer = len1 > len2;
    const char *longer  = first_is_longer ? s1 : s2;
    const char *shorter = first_is_longer ? s2 : s1;
    const size_t long_len  = first_is_longer ? len1 : len2;
    const size_t short_len = first_is_longer ? len2 : len1;

    /* Nothing can match against an empty string. */
    if (short_len == 0)
        return 0.0;

    /* Matching window: max(long_len / 2 - 1, 0) positions on either side. */
    const size_t range = long_len / 2 >= 1 ? long_len / 2 - 1 : 0;

    /* One zeroed buffer holding two flag arrays: which bytes of the shorter
       string found a partner, and which bytes of the longer string are taken. */
    unsigned char *short_hit = calloc(short_len + long_len, 1);
    if (short_hit == NULL)
        return -1.0;
    unsigned char *long_hit = short_hit + short_len;

    size_t matches = 0;
    for (size_t mi = 0; mi < short_len; mi++) {
        const size_t lo = mi > range ? mi - range : 0;
        const size_t hi = mi + range + 1 < long_len ? mi + range + 1 : long_len;
        for (size_t xi = lo; xi < hi; xi++) {
            if (!long_hit[xi] && shorter[mi] == longer[xi]) {
                short_hit[mi] = 1;
                long_hit[xi] = 1;
                matches++;
                break;
            }
        }
    }

    if (matches == 0) {
        free(short_hit);
        return 0.0;
    }

    /* Walk both matched subsequences in order and count positions that differ.
       Both sides hold exactly `matches` flagged bytes, so `li` stays in range. */
    size_t transpositions = 0;
    size_t li = 0;
    for (size_t mi = 0; mi < short_len; mi++) {
        if (!short_hit[mi])
            continue;
        while (!long_hit[li])
            li++;
        if (shorter[mi] != longer[li])
            transpositions++;
        li++;
    }
    free(short_hit);

    /* Length of the common prefix of the two inputs. */
    size_t prefix = 0;
    while (prefix < short_len && s1[prefix] == s2[prefix])
        prefix++;

    const double m = (double) matches;
    const double t = (double) (transpositions / 2);
    const double j = ((m / len1 + m / len2 + (m - t) / m)) / 3;
    const double scale = min(0.1, 1.0 / long_len);

    return j < 0.7 ? j : j + scale * prefix * (1 - j);
}
|
110
|
+
|
111
|
+
/*
 * Levenshtein (edit) distance between two NUL-terminated strings, compared
 * byte by byte: the minimum number of single-byte insertions, deletions and
 * substitutions that turn one string into the other.
 *
 * Returns -1 when one or both strings are empty (the historical contract of
 * this function), and also -1 if the working buffer cannot be allocated.
 *
 * Only one row of the dynamic-programming table is kept, sized by the shorter
 * string, so memory use is O(min(len)) instead of O(len_s * len_t).
 */
int c_levenstein_distance(char *s, char *t) {
    size_t cols = strlen(s);
    size_t rows = strlen(t);

    if (cols == 0 || rows == 0)
        return -1;

    /* The distance is symmetric, so put the shorter string along the row. */
    const char *col_str = s;
    const char *row_str = t;
    if (cols > rows) {
        const char *swap_str = col_str;
        size_t swap_len = cols;
        col_str = row_str;
        row_str = swap_str;
        cols = rows;
        rows = swap_len;
    }

    /* calloc checks the element-count multiplication for overflow. */
    int *row = calloc(cols + 1, sizeof *row);
    if (row == NULL)
        return -1;

    /* Row 0: turning the empty prefix into col_str[0..i) costs i insertions. */
    for (size_t i = 0; i <= cols; i++)
        row[i] = (int) i;

    for (size_t j = 1; j <= rows; j++) {
        int diagonal = row[0];          /* value of cell (j - 1, i - 1) */
        row[0] = (int) j;
        for (size_t i = 1; i <= cols; i++) {
            const int above = row[i];   /* value of cell (j - 1, i) */
            const int cost = col_str[i - 1] == row_str[j - 1] ? 0 : 1;

            int best = diagonal + cost; /* substitute or keep */
            if (above + 1 < best)
                best = above + 1;       /* delete */
            if (row[i - 1] + 1 < best)
                best = row[i - 1] + 1;  /* insert */

            row[i] = best;
            diagonal = above;
        }
    }

    const int distance = row[cols];
    free(row);
    return distance;
}
|
143
|
+
|
144
|
+
/*
 * FuzzyString.jaro_winkler_distance(s1, s2)
 *
 * Converts both arguments with #to_s and returns the result of
 * c_jaro_winkler_distance() on their bytes as a Ruby Float.
 */
VALUE fuzzy_jaro_winkler_distance(VALUE self, VALUE s1, VALUE s2) {
    /* Keep the converted strings in named locals. For a non-String argument
       #to_s builds a temporary object, and the second #to_s call may run the
       garbage collector while the first temporary's bytes are still needed. */
    VALUE str1 = TO_S(s1);
    VALUE str2 = TO_S(s2);
    double score = c_jaro_winkler_distance(RSTRING_PTR(str1), RSTRING_PTR(str2));

    /* Tell the compiler (and thus the conservative GC) that both strings are
       live up to this point. */
    RB_GC_GUARD(str1);
    RB_GC_GUARD(str2);
    return DBL2NUM(score);
}
|
147
|
+
|
148
|
+
/*
 * FuzzyString.levenstein_distance(s1, s2)
 *
 * Converts both arguments with #to_s and returns the result of
 * c_levenstein_distance() on their bytes as a Ruby Integer.
 */
VALUE fuzzy_levenstein_distance(VALUE self, VALUE s1, VALUE s2) {
    /* Keep the converted strings in named locals. For a non-String argument
       #to_s builds a temporary object, and the second #to_s call may run the
       garbage collector while the first temporary's bytes are still needed. */
    VALUE str1 = TO_S(s1);
    VALUE str2 = TO_S(s2);
    int distance = c_levenstein_distance(RSTRING_PTR(str1), RSTRING_PTR(str2));

    /* Tell the compiler (and thus the conservative GC) that both strings are
       live up to this point. */
    RB_GC_GUARD(str1);
    RB_GC_GUARD(str2);
    return INT2NUM(distance);
}
|
151
|
+
|
152
|
+
/*
 * FuzzyString.stem(word, language = fuzzy_default_language)
 *
 * Stems `word` with the Snowball stemmer for `language` (converted with #to_s).
 * The stemmer is always created for "UTF_8" input; the result string is tagged
 * with the encoding of `word`.
 *
 * Returns the stem as a new String, or nil when sb_stemmer_new() yields no
 * stemmer for the requested language.
 * Raises ArgumentError when `word` is not a String, and NoMemoryError when the
 * stemmer reports an allocation failure while stemming.
 */
VALUE fuzzy_snowball(int argc, VALUE * argv, VALUE self) {
    VALUE word, language, result = Qnil;

    rb_scan_args(argc, argv, "11", &word, &language);
    if (NIL_P(language))
        language = fuzzy_default_language;

    if (TYPE(word) != T_STRING)
        rb_raise(rb_eArgError, "invalid word, expect string");

    struct sb_stemmer *stemmer = sb_stemmer_new(CSTRING(language), "UTF_8");
    if (!stemmer)
        return result;

    const sb_symbol *stem = sb_stemmer_stem(stemmer, (const sb_symbol *) RSTRING_PTR(word), (int) RSTRING_LEN(word));
    if (!stem) {
        /* sb_stemmer_stem() returns NULL on out-of-memory; release the stemmer
           before raising because rb_memerror() does not return. */
        sb_stemmer_delete(stemmer);
        rb_memerror();
    }

    /* The stem buffer belongs to the stemmer, so copy it into a Ruby string
       before the stemmer is deleted. */
    result = rb_enc_str_new((const char *) stem, sb_stemmer_length(stemmer), rb_enc_get(word));
    sb_stemmer_delete(stemmer);

    return result;
}
|
172
|
+
|
173
|
+
/*
 * FuzzyString.stem_languages
 *
 * Returns an Array holding every name reported by sb_stemmer_list(), in the
 * order given, except "porter", which exists only for backwards compatibility.
 */
VALUE fuzzy_snowball_languages(VALUE self) {
    VALUE names = rb_ary_new();
    const char **entry;

    /* The list is terminated by a NULL entry. */
    for (entry = sb_stemmer_list(); *entry; entry++) {
        if (strcmp(*entry, "porter") == 0)
            continue;
        rb_ary_push(names, rb_str_new2(*entry));
    }

    return names;
}
|
185
|
+
|
186
|
+
// adapted from http://en.literateprograms.org/Soundex_(C)
|
187
|
+
/* Index (0-25) of an ASCII letter in either case, or -1 for any other byte.
   Used instead of <ctype.h> so that bytes >= 0x80 and the current locale can
   never produce an index outside the 26-entry code table. */
static int soundex_letter_index(char c) {
    if (c >= 'a' && c <= 'z')
        return c - 'a';
    if (c >= 'A' && c <= 'Z')
        return c - 'A';
    return -1;
}

/*
 * FuzzyString.soundex(string)
 *
 * Builds a four-character code: the first ASCII letter of `string`, uppercased,
 * followed by up to three digits taken from the code table below, padded with
 * '0'. Non-letters are skipped, adjacent letters with the same code are folded
 * together, and letters with code 0 only act as separators.
 *
 * Returns "Z000" when the string contains no ASCII letter. Scanning stops at
 * the end of the string or at the first NUL byte, whichever comes first.
 * Raises ArgumentError when the argument is not a String.
 */
VALUE fuzzy_soundex(VALUE self, VALUE string) {
    if (TYPE(string) != T_STRING)
        rb_raise(rb_eArgError, "invalid argument, expect string");

    static const int code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 };
                             /* a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z */

    /* Default key, complete with trailing '0's. A plain local is enough
       because rb_str_new2() copies it. */
    char key[5] = "Z000";
    const char *cursor = RSTRING_PTR(string);
    const char *end = cursor + RSTRING_LEN(string);
    int index, last, count;

    /* Advance to the first letter; without one the default key is returned. */
    while (cursor < end && *cursor && soundex_letter_index(*cursor) < 0)
        ++cursor;
    if (cursor == end || *cursor == 0)
        return rb_str_new2(key);

    /* The first letter is stored uppercased and seeds the "previous code". */
    index = soundex_letter_index(*cursor);
    key[0] = (char) ('A' + index);
    last = code[index];
    ++cursor;

    /* Scan the rest until the key is full or the input ends. */
    for (count = 1; count < 4 && cursor < end && *cursor; ++cursor) {
        index = soundex_letter_index(*cursor);
        if (index < 0)
            continue;                   /* non-letters are ignored altogether */

        /* Fold together adjacent letters sharing the same code. */
        if (last != code[index]) {
            last = code[index];
            /* Code 0 letters are never emitted; they only reset `last`. */
            if (last != 0)
                key[count++] = (char) ('0' + last);
        }
    }

    return rb_str_new2(key);
}
|
233
|
+
|
234
|
+
void Init_fuzzy_string() {
|
235
|
+
mFuzzy = rb_define_module("FuzzyString");
|
236
|
+
|
237
|
+
fuzzy_default_language = rb_str_new2("en");
|
238
|
+
rb_global_variable(&fuzzy_default_language);
|
239
|
+
|
240
|
+
rb_define_module_function(mFuzzy, "jaro_winkler_distance", RUBY_METHOD_FUNC(fuzzy_jaro_winkler_distance), 2);
|
241
|
+
rb_define_module_function(mFuzzy, "levenstein_distance", RUBY_METHOD_FUNC(fuzzy_levenstein_distance), 2);
|
242
|
+
rb_define_module_function(mFuzzy, "stem", RUBY_METHOD_FUNC(fuzzy_snowball), -1);
|
243
|
+
rb_define_module_function(mFuzzy, "stem_languages", RUBY_METHOD_FUNC(fuzzy_snowball_languages), 0);
|
244
|
+
rb_define_module_function(mFuzzy, "soundex", RUBY_METHOD_FUNC(fuzzy_soundex), 1);
|
245
|
+
|
246
|
+
rb_define_const(mFuzzy, "VERSION", rb_str_new2(RUBY_FUZZY_VERSION));
|
247
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
/* Gem version string; exposed to Ruby as FuzzyString::VERSION by Init_fuzzy_string(). */
#define RUBY_FUZZY_VERSION "0.1.0"
|
data/lib/fuzzy-string.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Loads fuzzy-string/fuzzy_string -- presumably the compiled C extension that
# defines the FuzzyString module (confirm against ext/fuzzy-string/extconf.rb).
require 'fuzzy-string/fuzzy_string'
|
data/test/helper.rb
ADDED
data/test/test_fuzzy.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# encoding: utf-8
require 'helper'

# Short alias used throughout the specs.
FS = FuzzyString

describe 'fuzzy' do
  describe 'soundex' do
    it 'should raise exception for nil' do
      assert_raises(ArgumentError) do
        FS.soundex(nil)
      end
    end

    it 'should default to Z000 for empty string' do
      code = FS.soundex('')
      assert_equal 'Z000', code
    end

    it 'should generate valid soundex codes' do
      # [expected code, input word]
      [
        ["A140", "apple"],
        ["A140", "appel"],
        ["A142", "apples"],
        ["P200", "peach"],
      ].each do |expected, input|
        assert_equal expected, FS.soundex(input)
      end
    end
  end

  describe 'stem' do
    it 'should stem english words' do
      # Both inflections reduce to the same stem with the default language.
      %w(apples apple).each do |input|
        assert_equal "appl", FS.stem(input)
      end
    end

    it 'should stem spanish words' do
      stem = FS.stem("manza", "es")
      assert_equal "manz", stem
    end

    it 'should return a list of languages' do
      languages = FS.stem_languages
      assert_kind_of Array, languages
    end
  end

  describe 'jaro winkler distance' do
    it 'should work' do
      # Exact scores: identical strings and strings with nothing in common.
      assert_equal 1, FS.jaro_winkler_distance("dean", "dean")
      assert_equal 0, FS.jaro_winkler_distance("dean", "mike")

      # Approximate scores, checked with the default delta.
      [[0.8333, "sean"], [0.6666, "teen"]].each do |expected, other|
        assert_in_delta expected, FS.jaro_winkler_distance("dean", other)
      end
    end
  end

  describe 'levenstein distance' do
    it 'should work' do
      # [expected distance, other word], always measured against "apple"
      [[0, "apple"], [1, "apples"], [5, "orange"]].each do |expected, other|
        assert_equal expected, FS.levenstein_distance("apple", other)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fuzzy-string
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Bharanee Rathna
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2012-02-15 00:00:00 +11:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rake
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :development
|
32
|
+
version_requirements: *id001
|
33
|
+
description: A collection of functions for fuzzy string matching.
|
34
|
+
email:
|
35
|
+
- deepfryed@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions:
|
39
|
+
- ext/fuzzy-string/extconf.rb
|
40
|
+
extra_rdoc_files: []
|
41
|
+
|
42
|
+
files:
|
43
|
+
- ext/fuzzy-string/fuzzy-string.c
|
44
|
+
- ext/fuzzy-string/version.h
|
45
|
+
- ext/fuzzy-string/extconf.rb
|
46
|
+
- test/helper.rb
|
47
|
+
- test/test_fuzzy.rb
|
48
|
+
- lib/fuzzy-string.rb
|
49
|
+
- README.md
|
50
|
+
- CHANGELOG
|
51
|
+
has_rdoc: true
|
52
|
+
homepage: http://github.com/deepfryed/fuzzy-string
|
53
|
+
licenses: []
|
54
|
+
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
segments:
|
74
|
+
- 0
|
75
|
+
version: "0"
|
76
|
+
requirements: []
|
77
|
+
|
78
|
+
rubyforge_project:
|
79
|
+
rubygems_version: 1.3.7
|
80
|
+
signing_key:
|
81
|
+
specification_version: 3
|
82
|
+
summary: Fuzzy string matching
|
83
|
+
test_files: []
|
84
|
+
|