fuzzy-string 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ === 0.1.0 (2012-02-15)
2
+
3
+ * Initial Version.
@@ -0,0 +1,27 @@
1
+ # Fuzzy
2
+
3
+ A compilation of functions that allow fuzzy string matching.
4
+
5
+ ## API
6
+
7
+ ```
8
+ FuzzyString
9
+ .stem(word, language = "en") #=> snowball stem of word
10
+ .stem_languages #=> list of languages supported by stemmer
11
+ .soundex(word) #=> soundex code of english word
12
+ .jaro_winkler_distance(string1, string2) #=> numeric Jaro-Winkler distance of two strings (0-1) 1: same 0: different
13
+ .levenstein_distance(string1, string2) #=> numeric Levenshtein (edit) distance of two strings; -1 if either string is empty
14
+
15
+ ```
16
+
17
+ ## Example
18
+
19
+ ```ruby
20
+
21
+ require 'fuzzy-string'
22
+ FuzzyString.jaro_winkler_distance("apples", "apple")
23
+ ```
24
+
25
+ ## License
26
+
27
+ [Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby

# Build configuration for the fuzzy_string C extension.

require 'mkmf'

# Append to $CFLAGS instead of assigning to it, so that the compiler flags
# mkmf has already collected are kept and only the C99 switch is added.
$CFLAGS << ' -std=c99'

# The extension links against the snowball stemmer library.
have_library('stemmer') or raise "please install snowball stemmer (debian package: libstemmer-dev)"
create_makefile 'fuzzy-string/fuzzy_string'
@@ -0,0 +1,247 @@
1
+ /*
2
+ (c) Bharanee Rathna 2011
3
+
4
+ CC BY-SA 3.0
5
+ http://creativecommons.org/licenses/by-sa/3.0/
6
+
7
+ Free for every type of use. The author cannot be legally held responsible for
8
+ any damages resulting from the use of this work. All modifications or derivatives
9
+ need to be attributed.
10
+ */
11
+
12
+ #include <ctype.h>
13
+ #include <libstemmer.h>
14
+ #include <ruby/ruby.h>
15
+ #include <ruby/encoding.h>
16
+ #include "version.h"
17
+
18
/*
 * Generic comparison helpers. Every argument and every expansion is
 * parenthesized so that compound arguments (e.g. `c ? x : y`) expand as the
 * caller intended. Arguments may be evaluated more than once, so do not pass
 * expressions with side effects.
 */
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
#define min3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))

/* TO_S calls the Ruby method `to_s` on v. */
#define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
/*
 * CSTRING yields RSTRING_PTR of the value returned by TO_S(v).
 * NOTE(review): the value returned by to_s is not stored in a VALUE variable
 * here; confirm it stays reachable for as long as the pointer is used.
 */
#define CSTRING(v) RSTRING_PTR(TO_S(v))
24
+
25
+ VALUE fuzzy_default_language, mFuzzy;
26
+
27
+ // shamelessly stolen from https://github.com/kiyoka/fuzzy-string-match.git
28
/*
 * Jaro-Winkler similarity of two NUL-terminated byte strings.
 *
 * Returns a value between 0.0 (nothing in common) and 1.0 (identical).
 * Comparison is byte-wise. 0.0 is returned when either string is empty, when
 * no characters match, or when the scratch buffer cannot be allocated.
 *
 * The scratch space lives on the heap: its size depends on the input length,
 * and zero-length or very large variable-length arrays are not safe.
 */
double c_jaro_winkler_distance(char *s1, char *s2) {
    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    /* An empty string cannot match anything. */
    if (len1 == 0 || len2 == 0)
        return 0.0;

    const char *longer, *shorter;
    size_t long_len, short_len;
    if (len1 > len2) {
        longer = s1;  long_len = len1;
        shorter = s2; short_len = len2;
    }
    else {
        longer = s2;  long_len = len2;
        shorter = s1; short_len = len1;
    }

    /* Characters only match when their positions differ by at most `range`. */
    size_t range = long_len / 2 > 0 ? long_len / 2 - 1 : 0;

    /* One zeroed "already matched" flag per character of each string. */
    unsigned char *matched_short = calloc(short_len + long_len, 1);
    if (!matched_short)
        return 0.0;
    unsigned char *matched_long = matched_short + short_len;

    size_t matches = 0;
    for (size_t mi = 0; mi < short_len; mi++) {
        size_t lo = mi > range ? mi - range : 0;
        size_t hi = mi + range + 1;
        if (hi > long_len)
            hi = long_len;
        for (size_t xi = lo; xi < hi; xi++) {
            if (!matched_long[xi] && shorter[mi] == longer[xi]) {
                matched_short[mi] = 1;
                matched_long[xi] = 1;
                matches++;
                break;
            }
        }
    }

    if (matches == 0) {
        free(matched_short);
        return 0.0;
    }

    /* Walk the matched characters of both strings in order; every position
       where they differ counts as half a transposition. Both strings have
       exactly `matches` flagged characters, so xi stays in bounds. */
    size_t transpositions = 0;
    for (size_t mi = 0, xi = 0; mi < short_len; mi++) {
        if (!matched_short[mi])
            continue;
        while (!matched_long[xi])
            xi++;
        if (shorter[mi] != longer[xi])
            transpositions++;
        xi++;
    }
    free(matched_short);

    /* Length of the common prefix, bounded by the shorter string. */
    size_t prefix = 0;
    while (prefix < short_len && s1[prefix] == s2[prefix])
        prefix++;

    double m = (double) matches;
    double t = (double) (transpositions / 2);
    double jaro = (m / len1 + m / len2 + (m - t) / m) / 3;

    /* The prefix bonus is only applied to already-similar strings. */
    if (jaro < 0.7)
        return jaro;

    double scale = 1.0 / long_len;
    if (0.1 < scale)
        scale = 0.1;
    return jaro + scale * prefix * (1 - jaro);
}
110
+
111
/*
 * Levenshtein (edit) distance between two NUL-terminated byte strings:
 * the minimum number of single-byte insertions, deletions and substitutions
 * that turn s into t.
 *
 * Returns the distance, or -1 when no distance was computed: one or both
 * strings are empty, or the work buffer could not be allocated.
 *
 * Only one row of the dynamic-programming table is kept, so memory use grows
 * with the length of s rather than with the product of both lengths.
 * NOTE(review): assumes both lengths fit in an int, as the return type does.
 */
int c_levenstein_distance(char *s, char *t) {
    size_t s_len = strlen(s);
    size_t t_len = strlen(t);

    /* a negative return value means that one or both strings are empty. */
    if (s_len == 0 || t_len == 0)
        return -1;

    /* calloc checks the element-count multiplication for overflow. */
    int *row = calloc(s_len + 1, sizeof *row);
    if (!row)
        return -1;

    /* Row 0: turning a prefix of s into the empty string costs its length. */
    for (size_t i = 0; i <= s_len; i++)
        row[i] = (int) i;

    for (size_t j = 1; j <= t_len; j++) {
        /* `diagonal` holds the previous row's value at column i - 1. */
        int diagonal = row[0];
        row[0] = (int) j;
        for (size_t i = 1; i <= s_len; i++) {
            int above = row[i];
            int cost = (s[i - 1] == t[j - 1]) ? 0 : 1;

            int best = diagonal + cost;      /* substitution or match */
            if (above + 1 < best)
                best = above + 1;            /* step from the previous row */
            if (row[i - 1] + 1 < best)
                best = row[i - 1] + 1;       /* step from the previous column */

            row[i] = best;
            diagonal = above;
        }
    }

    int distance = row[s_len];
    free(row);
    return distance;
}
143
+
144
+ VALUE fuzzy_jaro_winkler_distance(VALUE self, VALUE s1, VALUE s2) {
145
+ return DBL2NUM(c_jaro_winkler_distance(CSTRING(s1), CSTRING(s2)));
146
+ }
147
+
148
+ VALUE fuzzy_levenstein_distance(VALUE self, VALUE s1, VALUE s2) {
149
+ return INT2NUM(c_levenstein_distance(CSTRING(s1), CSTRING(s2)));
150
+ }
151
+
152
+ VALUE fuzzy_snowball(int argc, VALUE * argv, VALUE self) {
153
+ VALUE word, language, result = Qnil;
154
+
155
+ rb_scan_args(argc, argv, "11", &word, &language);
156
+ if (NIL_P(language))
157
+ language = fuzzy_default_language;
158
+
159
+ if (TYPE(word) != T_STRING)
160
+ rb_raise(rb_eArgError, "invalid word, expect string");
161
+
162
+ struct sb_stemmer *stemmer = sb_stemmer_new(CSTRING(language), "UTF_8");
163
+ if (stemmer) {
164
+ const sb_symbol *stem = sb_stemmer_stem(stemmer, RSTRING_PTR(word), RSTRING_LEN(word));
165
+ uint32_t stem_len = sb_stemmer_length(stemmer);
166
+ result = rb_enc_str_new(stem, stem_len, rb_enc_get(word));
167
+ sb_stemmer_delete(stemmer);
168
+ }
169
+
170
+ return result;
171
+ }
172
+
173
+ VALUE fuzzy_snowball_languages(VALUE self) {
174
+ VALUE languages = rb_ary_new();
175
+ const char **list = sb_stemmer_list();
176
+ while (*list) {
177
+ // ignore 'porter' - it's only for backwards compatibility.
178
+ if (strcmp(*list, "porter"))
179
+ rb_ary_push(languages, rb_str_new2(*list));
180
+ list++;
181
+ }
182
+
183
+ return languages;
184
+ }
185
+
186
+ // adapted from http://en.literateprograms.org/Soundex_(C)
187
+ VALUE fuzzy_soundex(VALUE self, VALUE string) {
188
+ if (TYPE(string) != T_STRING)
189
+ rb_raise(rb_eArgError, "invalid argument, expect string");
190
+
191
+ static int code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 };
192
+ /* a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z */
193
+ static char key[5];
194
+ register char ch;
195
+ register int last;
196
+ register int count;
197
+ const char *cstring = RSTRING_PTR(string);
198
+
199
+ /* Set up default key, complete with trailing '0's */
200
+ strcpy(key, "Z000");
201
+
202
+ /* Advance to the first letter. If none present,
203
+ return default key */
204
+ while (*cstring && !isalpha(*cstring))
205
+ ++cstring;
206
+ if (*cstring == 0)
207
+ return rb_str_new2(key);
208
+
209
+ /* Pull out the first letter, uppercase it, and
210
+ set up for main loop */
211
+ key[0] = toupper(*cstring);
212
+ last = code[key[0] - 'A'];
213
+ ++cstring;
214
+
215
+ /* Scan rest of string, stop at end of string or
216
+ when the key is full */
217
+ for (count = 1; count < 4 && *cstring; ++cstring) {
218
+ /* If non-alpha, ignore the character altogether */
219
+ if (isalpha(*cstring)) {
220
+ ch = tolower(*cstring);
221
+ /* Fold together adjacent letters sharing the same code */
222
+ if (last != code[ch - 'a']) {
223
+ last = code[ch - 'a'];
224
+ /* Ignore code==0 letters except as separators */
225
+ if (last != 0)
226
+ key[count++] = '0' + last;
227
+ }
228
+ }
229
+ }
230
+
231
+ return rb_str_new2(key);
232
+ }
233
+
234
+ void Init_fuzzy_string() {
235
+ mFuzzy = rb_define_module("FuzzyString");
236
+
237
+ fuzzy_default_language = rb_str_new2("en");
238
+ rb_global_variable(&fuzzy_default_language);
239
+
240
+ rb_define_module_function(mFuzzy, "jaro_winkler_distance", RUBY_METHOD_FUNC(fuzzy_jaro_winkler_distance), 2);
241
+ rb_define_module_function(mFuzzy, "levenstein_distance", RUBY_METHOD_FUNC(fuzzy_levenstein_distance), 2);
242
+ rb_define_module_function(mFuzzy, "stem", RUBY_METHOD_FUNC(fuzzy_snowball), -1);
243
+ rb_define_module_function(mFuzzy, "stem_languages", RUBY_METHOD_FUNC(fuzzy_snowball_languages), 0);
244
+ rb_define_module_function(mFuzzy, "soundex", RUBY_METHOD_FUNC(fuzzy_soundex), 1);
245
+
246
+ rb_define_const(mFuzzy, "VERSION", rb_str_new2(RUBY_FUZZY_VERSION));
247
+ }
@@ -0,0 +1 @@
1
+ #define RUBY_FUZZY_VERSION "0.1.0"
@@ -0,0 +1 @@
1
+ require 'fuzzy-string/fuzzy_string'
@@ -0,0 +1,3 @@
1
+ require 'fuzzy-string'
2
+ require 'minitest/spec'
3
+ require 'minitest/autorun'
@@ -0,0 +1,55 @@
1
+ # encoding: utf-8
2
+ require 'helper'
3
+
4
FS = FuzzyString

describe 'fuzzy' do
  describe 'soundex' do
    it 'should raise exception for nil' do
      assert_raises(ArgumentError) do
        FS.soundex(nil)
      end
    end

    it 'should default to Z000 for empty string' do
      assert_equal 'Z000', FS.soundex('')
    end

    it 'should generate valid soundex codes' do
      # [word, expected soundex code]
      [
        ["apple",  "A140"],
        ["appel",  "A140"],
        ["apples", "A142"],
        ["peach",  "P200"],
      ].each do |word, code|
        assert_equal code, FS.soundex(word)
      end
    end
  end

  describe 'stem' do
    it 'should stem english words' do
      %w(apples apple).each do |word|
        assert_equal "appl", FS.stem(word)
      end
    end

    it 'should stem spanish words' do
      assert_equal "manz", FS.stem("manza", "es")
    end

    it 'should return a list of languages' do
      assert_kind_of Array, FS.stem_languages
    end
  end

  describe 'jaro winkler distance' do
    it 'should work' do
      # Exact results: identical strings and strings with nothing in common.
      assert_equal 1, FS.jaro_winkler_distance("dean", "dean")
      assert_equal 0, FS.jaro_winkler_distance("dean", "mike")

      # Approximate results: [other word, expected distance from "dean"]
      [["sean", 0.8333], ["teen", 0.6666]].each do |other, expected|
        assert_in_delta expected, FS.jaro_winkler_distance("dean", other)
      end
    end
  end

  describe 'levenstein distance' do
    it 'should work' do
      # [other word, expected edit distance from "apple"]
      [["apple", 0], ["apples", 1], ["orange", 5]].each do |other, expected|
        assert_equal expected, FS.levenstein_distance("apple", other)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fuzzy-string
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Bharanee Rathna
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2012-02-15 00:00:00 +11:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rake
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :development
32
+ version_requirements: *id001
33
+ description: A collection of functions for fuzzy string matching.
34
+ email:
35
+ - deepfryed@gmail.com
36
+ executables: []
37
+
38
+ extensions:
39
+ - ext/fuzzy-string/extconf.rb
40
+ extra_rdoc_files: []
41
+
42
+ files:
43
+ - ext/fuzzy-string/fuzzy-string.c
44
+ - ext/fuzzy-string/version.h
45
+ - ext/fuzzy-string/extconf.rb
46
+ - test/helper.rb
47
+ - test/test_fuzzy.rb
48
+ - lib/fuzzy-string.rb
49
+ - README.md
50
+ - CHANGELOG
51
+ has_rdoc: true
52
+ homepage: http://github.com/deepfryed/fuzzy-string
53
+ licenses: []
54
+
55
+ post_install_message:
56
+ rdoc_options: []
57
+
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ requirements: []
77
+
78
+ rubyforge_project:
79
+ rubygems_version: 1.3.7
80
+ signing_key:
81
+ specification_version: 3
82
+ summary: Fuzzy string matching
83
+ test_files: []
84
+