fuzzy-string 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ === 0.1.0 (2012-02-15)
2
+
3
+ * Initial Version.
@@ -0,0 +1,27 @@
1
+ # Fuzzy
2
+
3
+ A compilation of functions that allow fuzzy string matching.
4
+
5
+ ## API
6
+
7
+ ```
8
+ FuzzyString
9
+ .stem(word, language = "english") #=> snowball stem of word
10
+ .stem_languages #=> list of languages supported by the stemmer
11
+ .soundex(word) #=> soundex code of english word
12
+ .jaro_winkler_distance(string1, string2) #=> numeric Jaro-Winkler distance of two strings (0-1) 1: same 0: different
13
+ .levenstein_distance(string1, string2) #=> numeric Levenshtein distance of two strings (edit distance)
14
+
15
+ ```
16
+
17
+ ## Example
18
+
19
+ ```ruby
20
+
21
+ require 'fuzzy-string'
22
+ FuzzyString.jaro_winkler_distance("apples", "apple")
23
+ ```
24
+
25
+ ## License
26
+
27
+ [Creative Commons Attribution - CC BY](http://creativecommons.org/licenses/by/3.0)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby

# Build configuration for the fuzzy_string C extension.

require 'mkmf'

# Append to the flags mkmf derived from the Ruby build configuration instead of
# replacing them; assigning a fresh string would drop every flag that was
# already present in $CFLAGS.
$CFLAGS = "#{$CFLAGS} -std=c99"

# The extension links against the snowball stemmer; abort early with an
# actionable message when it is not installed.
have_library('stemmer') or raise "please install snowball stemmer (debian package: libstemmer-dev)"
create_makefile 'fuzzy-string/fuzzy_string'
@@ -0,0 +1,247 @@
1
+ /*
2
+ (c) Bharanee Rathna 2011
3
+
4
+ CC BY-SA 3.0
5
+ http://creativecommons.org/licenses/by-sa/3.0/
6
+
7
+ Free for every type of use. The author cannot be legally held responsible for
8
+ any damages resulting from the use of this work. All modifications or derivatives
9
+ need to be attributed.
10
+ */
11
+
12
+ #include <ctype.h>
13
+ #include <libstemmer.h>
14
+ #include <ruby/ruby.h>
15
+ #include <ruby/encoding.h>
16
+ #include "version.h"
17
+
18
/*
 * Small comparison helpers.
 *
 * Every argument and the whole expansion are parenthesized so that operands
 * containing operators of lower precedence than the comparison (for example
 * `a | b` or `c ? x : y`) expand as the caller wrote them. Arguments can still
 * be evaluated more than once, so do not pass expressions with side effects.
 */
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
#define min3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))

/* Calls #to_s on a Ruby object and yields the resulting VALUE. */
#define TO_S(v) rb_funcall((v), rb_intern("to_s"), 0)
/*
 * Yields a pointer to the bytes of v.to_s. The pointer refers to memory owned
 * by the String that to_s returned, so it must not be used once that object
 * may have been garbage collected.
 */
#define CSTRING(v) RSTRING_PTR(TO_S(v))
24
+
25
+ VALUE fuzzy_default_language, mFuzzy;
26
+
27
+ // shamelessly stolen from https://github.com/kiyoka/fuzzy-string-match.git
28
/*
 * Computes the Jaro-Winkler similarity of two NUL-terminated strings.
 *
 * The strings are compared byte by byte. The common-prefix bonus is scaled by
 * min(0.1, 1 / longer_length) and is applied only when the plain Jaro score is
 * at least 0.7.
 *
 * Returns a score where 1.0 means identical and 0.0 means that no characters
 * matched; 0.0 is also returned when either string is empty. Returns -1.0 if
 * the temporary working buffer cannot be allocated.
 */
double c_jaro_winkler_distance(const char *s1, const char *s2) {
    const size_t len1 = strlen(s1);
    const size_t len2 = strlen(s2);

    /* On equal lengths s2 plays the role of the longer string. */
    const int first_is_longer = len1 > len2;
    const char *longer = first_is_longer ? s1 : s2;
    const char *shorter = first_is_longer ? s2 : s1;
    const size_t long_len = first_is_longer ? len1 : len2;
    const size_t short_len = first_is_longer ? len2 : len1;

    /* Nothing can match an empty string; also avoids a zero-sized buffer. */
    if (short_len == 0) {
        return 0.0;
    }

    /* A character may only match within `range` positions of its own index. */
    const size_t half = long_len / 2;
    const size_t range = half > 0 ? half - 1 : 0;

    /*
     * One zeroed heap buffer holds the "already matched" flags of both
     * strings, so arbitrarily long inputs cannot exhaust the stack.
     */
    unsigned char *short_hit = calloc(short_len + long_len, 1);
    if (short_hit == NULL) {
        return -1.0;
    }
    unsigned char *long_hit = short_hit + short_len;

    size_t matches = 0;
    for (size_t si = 0; si < short_len; si++) {
        const size_t lo = si > range ? si - range : 0;
        const size_t hi = si + range + 1 < long_len ? si + range + 1 : long_len;
        for (size_t li = lo; li < hi; li++) {
            if (!long_hit[li] && shorter[si] == longer[li]) {
                short_hit[si] = 1;
                long_hit[li] = 1;
                matches++;
                break;
            }
        }
    }

    if (matches == 0) {
        free(short_hit);
        return 0.0;
    }

    /*
     * Walk the matched characters of both strings in order and count the
     * positions where they disagree. Both strings have exactly `matches`
     * flagged characters, so the inner scan always stays in bounds.
     */
    size_t mismatched = 0;
    size_t li = 0;
    for (size_t si = 0; si < short_len; si++) {
        if (!short_hit[si]) {
            continue;
        }
        while (!long_hit[li]) {
            li++;
        }
        if (shorter[si] != longer[li]) {
            mismatched++;
        }
        li++;
    }
    free(short_hit);

    /* Length of the common prefix, bounded by the shorter string. */
    size_t prefix = 0;
    while (prefix < short_len && s1[prefix] == s2[prefix]) {
        prefix++;
    }

    const double m = (double) matches;
    const double t = (double) (mismatched / 2);
    const double j = (m / (double) len1 + m / (double) len2 + (m - t) / m) / 3;
    if (j < 0.7) {
        return j;
    }

    const double inverse_length = 1.0 / (double) long_len;
    const double scale = 0.1 < inverse_length ? 0.1 : inverse_length;
    return j + scale * (double) prefix * (1 - j);
}
110
+
111
/*
 * Computes the Levenshtein (edit) distance between two NUL-terminated strings,
 * comparing them byte by byte.
 *
 * Only one row of the dynamic-programming table is kept, so memory use is
 * proportional to the length of `s` rather than to the product of both
 * lengths, and no index arithmetic can overflow an int.
 *
 * Returns the number of single-byte insertions, deletions and substitutions
 * needed to turn one string into the other. A negative value (-1) means no
 * distance was computed: one or both strings are empty, or the working row
 * could not be allocated.
 */
int c_levenstein_distance(const char *s, const char *t) {
    const size_t s_len = strlen(s);
    const size_t t_len = strlen(t);

    if (s_len == 0 || t_len == 0) {
        return -1;
    }

    /* calloc rejects an element count whose byte size would overflow. */
    size_t *row = calloc(s_len + 1, sizeof *row);
    if (row == NULL) {
        return -1;
    }

    /* Distance from each prefix of s to the empty prefix of t. */
    for (size_t i = 0; i <= s_len; i++) {
        row[i] = i;
    }

    for (size_t j = 1; j <= t_len; j++) {
        /* `diagonal` carries the previous row's value at column i - 1. */
        size_t diagonal = row[0];
        row[0] = j;
        for (size_t i = 1; i <= s_len; i++) {
            const size_t above = row[i];
            size_t best = diagonal + (s[i - 1] == t[j - 1] ? 0 : 1);
            if (above + 1 < best) {
                best = above + 1;
            }
            if (row[i - 1] + 1 < best) {
                best = row[i - 1] + 1;
            }
            row[i] = best;
            diagonal = above;
        }
    }

    const int distance = (int) row[s_len];
    free(row);
    return distance;
}
143
+
144
+ VALUE fuzzy_jaro_winkler_distance(VALUE self, VALUE s1, VALUE s2) {
145
+ return DBL2NUM(c_jaro_winkler_distance(CSTRING(s1), CSTRING(s2)));
146
+ }
147
+
148
+ VALUE fuzzy_levenstein_distance(VALUE self, VALUE s1, VALUE s2) {
149
+ return INT2NUM(c_levenstein_distance(CSTRING(s1), CSTRING(s2)));
150
+ }
151
+
152
+ VALUE fuzzy_snowball(int argc, VALUE * argv, VALUE self) {
153
+ VALUE word, language, result = Qnil;
154
+
155
+ rb_scan_args(argc, argv, "11", &word, &language);
156
+ if (NIL_P(language))
157
+ language = fuzzy_default_language;
158
+
159
+ if (TYPE(word) != T_STRING)
160
+ rb_raise(rb_eArgError, "invalid word, expect string");
161
+
162
+ struct sb_stemmer *stemmer = sb_stemmer_new(CSTRING(language), "UTF_8");
163
+ if (stemmer) {
164
+ const sb_symbol *stem = sb_stemmer_stem(stemmer, RSTRING_PTR(word), RSTRING_LEN(word));
165
+ uint32_t stem_len = sb_stemmer_length(stemmer);
166
+ result = rb_enc_str_new(stem, stem_len, rb_enc_get(word));
167
+ sb_stemmer_delete(stemmer);
168
+ }
169
+
170
+ return result;
171
+ }
172
+
173
+ VALUE fuzzy_snowball_languages(VALUE self) {
174
+ VALUE languages = rb_ary_new();
175
+ const char **list = sb_stemmer_list();
176
+ while (*list) {
177
+ // ignore 'porter' - it's only for backwards compatibility.
178
+ if (strcmp(*list, "porter"))
179
+ rb_ary_push(languages, rb_str_new2(*list));
180
+ list++;
181
+ }
182
+
183
+ return languages;
184
+ }
185
+
186
+ // adapted from http://en.literateprograms.org/Soundex_(C)
187
+ VALUE fuzzy_soundex(VALUE self, VALUE string) {
188
+ if (TYPE(string) != T_STRING)
189
+ rb_raise(rb_eArgError, "invalid argument, expect string");
190
+
191
+ static int code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 };
192
+ /* a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z */
193
+ static char key[5];
194
+ register char ch;
195
+ register int last;
196
+ register int count;
197
+ const char *cstring = RSTRING_PTR(string);
198
+
199
+ /* Set up default key, complete with trailing '0's */
200
+ strcpy(key, "Z000");
201
+
202
+ /* Advance to the first letter. If none present,
203
+ return default key */
204
+ while (*cstring && !isalpha(*cstring))
205
+ ++cstring;
206
+ if (*cstring == 0)
207
+ return rb_str_new2(key);
208
+
209
+ /* Pull out the first letter, uppercase it, and
210
+ set up for main loop */
211
+ key[0] = toupper(*cstring);
212
+ last = code[key[0] - 'A'];
213
+ ++cstring;
214
+
215
+ /* Scan rest of string, stop at end of string or
216
+ when the key is full */
217
+ for (count = 1; count < 4 && *cstring; ++cstring) {
218
+ /* If non-alpha, ignore the character altogether */
219
+ if (isalpha(*cstring)) {
220
+ ch = tolower(*cstring);
221
+ /* Fold together adjacent letters sharing the same code */
222
+ if (last != code[ch - 'a']) {
223
+ last = code[ch - 'a'];
224
+ /* Ignore code==0 letters except as separators */
225
+ if (last != 0)
226
+ key[count++] = '0' + last;
227
+ }
228
+ }
229
+ }
230
+
231
+ return rb_str_new2(key);
232
+ }
233
+
234
+ void Init_fuzzy_string() {
235
+ mFuzzy = rb_define_module("FuzzyString");
236
+
237
+ fuzzy_default_language = rb_str_new2("en");
238
+ rb_global_variable(&fuzzy_default_language);
239
+
240
+ rb_define_module_function(mFuzzy, "jaro_winkler_distance", RUBY_METHOD_FUNC(fuzzy_jaro_winkler_distance), 2);
241
+ rb_define_module_function(mFuzzy, "levenstein_distance", RUBY_METHOD_FUNC(fuzzy_levenstein_distance), 2);
242
+ rb_define_module_function(mFuzzy, "stem", RUBY_METHOD_FUNC(fuzzy_snowball), -1);
243
+ rb_define_module_function(mFuzzy, "stem_languages", RUBY_METHOD_FUNC(fuzzy_snowball_languages), 0);
244
+ rb_define_module_function(mFuzzy, "soundex", RUBY_METHOD_FUNC(fuzzy_soundex), 1);
245
+
246
+ rb_define_const(mFuzzy, "VERSION", rb_str_new2(RUBY_FUZZY_VERSION));
247
+ }
@@ -0,0 +1 @@
1
+ #define RUBY_FUZZY_VERSION "0.1.0"
@@ -0,0 +1 @@
1
+ require 'fuzzy-string/fuzzy_string'
@@ -0,0 +1,3 @@
1
+ require 'fuzzy-string'
2
+ require 'minitest/spec'
3
+ require 'minitest/autorun'
@@ -0,0 +1,55 @@
1
# encoding: utf-8
require 'helper'

FS = FuzzyString

# Specs for the FuzzyString module functions. Expected values are kept in
# small tables and checked in order.
describe 'fuzzy' do
  describe 'soundex' do
    it 'should raise exception for nil' do
      assert_raises(ArgumentError) { FS.soundex(nil) }
    end

    it 'should default to Z000 for empty string' do
      assert_equal 'Z000', FS.soundex('')
    end

    it 'should generate valid soundex codes' do
      [
        ["apple",  "A140"],
        ["appel",  "A140"],
        ["apples", "A142"],
        ["peach",  "P200"],
      ].each do |word, code|
        assert_equal code, FS.soundex(word)
      end
    end
  end

  describe 'stem' do
    it 'should stem english words' do
      %w(apples apple).each do |word|
        assert_equal "appl", FS.stem(word)
      end
    end

    it 'should stem spanish words' do
      assert_equal "manz", FS.stem("manza", "es")
    end

    it 'should return a list of languages' do
      assert_kind_of Array, FS.stem_languages
    end
  end

  describe 'jaro winkler distance' do
    it 'should work' do
      # Exact scores first, then approximate ones (default delta).
      [["dean", 1], ["mike", 0]].each do |other, score|
        assert_equal score, FS.jaro_winkler_distance("dean", other)
      end
      [["sean", 0.8333], ["teen", 0.6666]].each do |other, score|
        assert_in_delta score, FS.jaro_winkler_distance("dean", other)
      end
    end
  end

  describe 'levenstein distance' do
    it 'should work' do
      [["apple", 0], ["apples", 1], ["orange", 5]].each do |other, distance|
        assert_equal distance, FS.levenstein_distance("apple", other)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fuzzy-string
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Bharanee Rathna
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2012-02-15 00:00:00 +11:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rake
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :development
32
+ version_requirements: *id001
33
+ description: A collection of functions for fuzzy string matching.
34
+ email:
35
+ - deepfryed@gmail.com
36
+ executables: []
37
+
38
+ extensions:
39
+ - ext/fuzzy-string/extconf.rb
40
+ extra_rdoc_files: []
41
+
42
+ files:
43
+ - ext/fuzzy-string/fuzzy-string.c
44
+ - ext/fuzzy-string/version.h
45
+ - ext/fuzzy-string/extconf.rb
46
+ - test/helper.rb
47
+ - test/test_fuzzy.rb
48
+ - lib/fuzzy-string.rb
49
+ - README.md
50
+ - CHANGELOG
51
+ has_rdoc: true
52
+ homepage: http://github.com/deepfryed/fuzzy-string
53
+ licenses: []
54
+
55
+ post_install_message:
56
+ rdoc_options: []
57
+
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ requirements: []
77
+
78
+ rubyforge_project:
79
+ rubygems_version: 1.3.7
80
+ signing_key:
81
+ specification_version: 3
82
+ summary: Fuzzy string matching
83
+ test_files: []
84
+