rubyfish 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +9 -3
- data/ROADMAP.md +1 -2
- data/lib/rubyfish.rb +1 -2
- data/lib/rubyfish/double_metaphone.rb +342 -0
- data/lib/rubyfish/version.rb +1 -1
- metadata +5 -5
- data/lib/rubyfish/mra.rb +0 -3
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
|
2
|
+
RubyFish
|
3
3
|
=========
|
4
4
|
|
5
5
|
RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for approximate and phonetic matching of strings.
|
@@ -27,6 +27,10 @@ String comparison:
|
|
27
27
|
* Longest Substring
|
28
28
|
* Longest Subsequence
|
29
29
|
|
30
|
+
Phonetic encoding:
|
31
|
+
|
32
|
+
* Double Metaphone
|
33
|
+
|
30
34
|
Example Usage
|
31
35
|
=============
|
32
36
|
|
@@ -36,4 +40,6 @@ Example Usage
|
|
36
40
|
ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
|
37
41
|
=> 0.7268518518518519
|
38
42
|
ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
|
39
|
-
|
43
|
+
=> 1
|
44
|
+
|
45
|
+
|
data/ROADMAP.md
CHANGED
data/lib/rubyfish.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
#require "rubyfish/awesome"
|
2
|
-
|
3
1
|
module RubyFish
|
4
2
|
autoload :Hamming, 'rubyfish/hamming'
|
5
3
|
autoload :Levenshtein, 'rubyfish/levenshtein'
|
@@ -8,5 +6,6 @@ module RubyFish
|
|
8
6
|
autoload :LongestSubsequence, 'rubyfish/longest_subsequence'
|
9
7
|
autoload :Jaro, 'rubyfish/jaro'
|
10
8
|
autoload :JaroWinkler, 'rubyfish/jaro_winkler'
|
9
|
+
autoload :DoubleMetaphone, 'rubyfish/double_metaphone'
|
11
10
|
autoload :MMatrix, 'rubyfish/mmatrix'
|
12
11
|
end
|
@@ -0,0 +1,342 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
|
3
|
+
# stolen from http://english.rubyforge.org/
|
4
|
+
# Nested definition (rather than `module RubyFish::DoubleMetaphone`) so this
# file can also be loaded on its own, before the RubyFish namespace exists.
module RubyFish
  # Double Metaphone phonetic encoding.
  #
  # The module extends itself, so the encoder can be called directly:
  #
  #   RubyFish::DoubleMetaphone.phonetic_code("Smith") # => ["SM0", "XMT"]
  #
  # It can also be mixed into a class; only +phonetic_code+ is public.
  module DoubleMetaphone
    extend self

    # Encodes +string+ into its primary and secondary phonetic codes.
    #
    # string - any object; it is converted with +to_s+ and upcased.
    #
    # Returns a two-element Array: the primary code (at most 4 characters)
    # and the secondary code, or +nil+ when it equals the primary code.
    def phonetic_code(string)
      # The trailing space is a sentinel: look-ahead slices taken by #lookup
      # near the end of the word see ' ' instead of running off the string.
      original = "#{string.to_s} ".upcase
      # Measure the upcased word, not the raw input: upcasing may change the
      # number of characters (e.g. "ß" becomes "SS"), and every index used
      # below refers to +original+.
      length = original.length - 1
      last = length - 1

      primary = ''
      secondary = ''
      current = 0

      # These initial pairs skip their first letter.
      current += 1 if /\A(?:GN|KN|PN|WR|PS)\z/ =~ original[0, 2]

      # A leading X is encoded as S.
      if 'X' == original[0, 1]
        primary << 'S'
        secondary << 'S'
        current += 1
      end

      while (primary.length < 4 || secondary.length < 4) && current <= length
        pri, sec, step = lookup(original, current, length, last)
        primary << pri if pri
        secondary << sec if sec
        current += step if step
      end

      primary = primary[0, 4]
      secondary = secondary[0, 4]
      [primary, (primary == secondary ? nil : secondary)]
    end

    private

    # Truthy when +str+ contains W, K, CZ or WITZ anywhere.
    def slavo_germanic?(str)
      /W|K|CZ|WITZ/ =~ str
    end

    # Truthy when +str+ is exactly one of A, E, I, O, U, Y.
    # Returns nil for nil or the empty string.
    def vowel?(str)
      /\A[AEIOUY]\z/ =~ str
    end

    # Encodes the character of +str+ at index +pos+.
    #
    # str    - the upcased word followed by a single sentinel space.
    # pos    - index of the character to encode.
    # length - number of characters in the word (sentinel excluded).
    # last   - index of the last character of the word.
    #
    # Returns three values: the fragment to append to the primary code, the
    # fragment to append to the secondary code (either may be nil), and the
    # number of characters consumed.
    #
    # Note: negative offsets such as str[pos - 2, n] wrap around to the end of
    # the string when pos is small; the rules rely on those slices simply not
    # matching.
    def lookup(str, pos, length, last)
      case str[pos, 1]
      when /\A[AEIOUY]\z/
        # Vowels are only encoded (as 'A') at the start of the word.
        if 0 == pos
          return 'A', 'A', 1
        else
          return nil, nil, 1
        end
      when 'B'
        return 'P', 'P', ('B' == str[pos + 1, 1] ? 2 : 1)
      when 'Ç'
        return 'S', 'S', 1
      when 'C'
        if pos > 1 &&
           !vowel?(str[pos - 2, 1]) &&
           'ACH' == str[pos - 1, 3] &&
           str[pos + 2, 1] != 'I' && (
             str[pos + 2, 1] != 'E' ||
             str[pos - 2, 6] =~ /\A[BM]ACHER\z/
           )
          return 'K', 'K', 2
        elsif 0 == pos && 'CAESAR' == str[pos, 6]
          return 'S', 'S', 2
        elsif 'CHIA' == str[pos, 4]
          return 'K', 'K', 2
        elsif 'CH' == str[pos, 2]
          if 0 == pos && 'CHAE' == str[pos, 4]
            return 'K', 'X', 2
          elsif 0 == pos && (
                  ['HARAC', 'HARIS'].include?(str[pos + 1, 5]) ||
                  ['HOR', 'HYM', 'HIA', 'HEM'].include?(str[pos + 1, 3])
                ) && str[0, 5] != 'CHORE'
            return 'K', 'K', 2
          elsif ['VAN ', 'VON '].include?(str[0, 4]) ||
                'SCH' == str[0, 3] ||
                ['ORCHES', 'ARCHIT', 'ORCHID'].include?(str[pos - 2, 6]) ||
                ['T', 'S'].include?(str[pos + 2, 1]) || (
                  ((0 == pos) || ['A', 'O', 'U', 'E'].include?(str[pos - 1, 1])) &&
                  ['L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', ' '].include?(str[pos + 2, 1])
                )
            return 'K', 'K', 2
          elsif pos > 0
            return ('MC' == str[0, 2] ? 'K' : 'X'), 'K', 2
          else
            return 'X', 'X', 2
          end
        elsif 'CZ' == str[pos, 2] && 'WICZ' != str[pos - 2, 4]
          return 'S', 'X', 2
        elsif 'CIA' == str[pos + 1, 3]
          # Deliberately looks at the three letters AFTER the C.
          return 'X', 'X', 3
        elsif 'CC' == str[pos, 2] && !(1 == pos && 'M' == str[0, 1])
          if /\A[IEH]\z/ =~ str[pos + 2, 1] && 'HU' != str[pos + 2, 2]
            if (1 == pos && 'A' == str[pos - 1, 1]) ||
               /\AUCCE[ES]\z/ =~ str[pos - 1, 5]
              return 'KS', 'KS', 3
            else
              return 'X', 'X', 3
            end
          else
            return 'K', 'K', 2
          end
        elsif /\AC[KGQ]\z/ =~ str[pos, 2]
          return 'K', 'K', 2
        elsif /\AC[IEY]\z/ =~ str[pos, 2]
          return 'S', (/\ACI[OEA]\z/ =~ str[pos, 3] ? 'X' : 'S'), 2
        elsif /\A [CQG]\z/ =~ str[pos + 1, 2]
          return 'K', 'K', 3
        else
          return 'K', 'K', (/\A[CKQ]\z/ =~ str[pos + 1, 1] && !(['CE', 'CI'].include?(str[pos + 1, 2])) ? 2 : 1)
        end
      when 'D'
        if 'DG' == str[pos, 2]
          if /\A[IEY]\z/ =~ str[pos + 2, 1]
            return 'J', 'J', 3
          else
            return 'TK', 'TK', 2
          end
        else
          return 'T', 'T', (/\AD[TD]\z/ =~ str[pos, 2] ? 2 : 1)
        end
      when 'F'
        return 'F', 'F', ('F' == str[pos + 1, 1] ? 2 : 1)
      when 'G'
        if 'H' == str[pos + 1, 1]
          if pos > 0 && !vowel?(str[pos - 1, 1])
            return 'K', 'K', 2
          elsif 0 == pos
            if 'I' == str[pos + 2, 1]
              return 'J', 'J', 2
            else
              return 'K', 'K', 2
            end
          elsif (pos > 1 && /\A[BHD]\z/ =~ str[pos - 2, 1]) ||
                (pos > 2 && /\A[BHD]\z/ =~ str[pos - 3, 1]) ||
                (pos > 3 && /\A[BH]\z/ =~ str[pos - 4, 1])
            return nil, nil, 2
          elsif pos > 2 && 'U' == str[pos - 1, 1] && /\A[CGLRT]\z/ =~ str[pos - 3, 1]
            return 'F', 'F', 2
          elsif pos > 0 && 'I' != str[pos - 1, 1]
            return 'K', 'K', 2
          else
            return nil, nil, 2
          end
        elsif 'N' == str[pos + 1, 1]
          if 1 == pos && vowel?(str[0, 1]) && !slavo_germanic?(str)
            return 'KN', 'N', 2
          elsif 'EY' != str[pos + 2, 2] && 'Y' != str[pos + 1, 1] && !slavo_germanic?(str)
            return 'N', 'KN', 2
          else
            return 'KN', 'KN', 2
          end
        elsif 'LI' == str[pos + 1, 2] && !slavo_germanic?(str)
          return 'KL', 'L', 2
        elsif 0 == pos && ('Y' == str[pos + 1, 1] || /\A(?:E[SPBLYIR]|I[BLNE])\z/ =~ str[pos + 1, 2])
          return 'K', 'J', 2
        elsif ('ER' == str[pos + 1, 2] || 'Y' == str[pos + 1, 1]) &&
              /\A[DRM]ANGER\z/ !~ str[0, 6] &&
              /\A[EI]\z/ !~ str[pos - 1, 1] &&
              /\A[RO]GY\z/ !~ str[pos - 1, 3]
          return 'K', 'J', 2
        elsif /\A[EIY]\z/ =~ str[pos + 1, 1] || /\A[AO]GGI\z/ =~ str[pos - 1, 4]
          if /\AV[AO]N \z/ =~ str[0, 4] || 'SCH' == str[0, 3] || 'ET' == str[pos + 1, 2]
            return 'K', 'K', 2
          elsif 'IER ' == str[pos + 1, 4]
            return 'J', 'J', 2
          else
            return 'J', 'K', 2
          end
        elsif 'G' == str[pos + 1, 1]
          return 'K', 'K', 2
        else
          return 'K', 'K', 1
        end
      when 'H'
        # H is only kept at the start of the word or between two vowels.
        if (0 == pos || vowel?(str[pos - 1, 1])) && vowel?(str[pos + 1, 1])
          return 'H', 'H', 2
        else
          return nil, nil, 1
        end
      when 'J'
        if 'JOSE' == str[pos, 4] || 'SAN ' == str[0, 4]
          if (0 == pos && ' ' == str[pos + 4, 1]) || 'SAN ' == str[0, 4]
            return 'H', 'H', 1
          else
            return 'J', 'H', 1
          end
        else
          step = ('J' == str[pos + 1, 1] ? 2 : 1)

          if 0 == pos && 'JOSE' != str[pos, 4]
            return 'J', 'A', step
          elsif vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /\A[AO]\z/ =~ str[pos + 1, 1]
            return 'J', 'H', step
          elsif last == pos
            return 'J', nil, step
          elsif /\A[LTKSNMBZ]\z/ !~ str[pos + 1, 1] && /\A[SKL]\z/ !~ str[pos - 1, 1]
            return 'J', 'J', step
          else
            return nil, nil, step
          end
        end
      when 'K'
        return 'K', 'K', ('K' == str[pos + 1, 1] ? 2 : 1)
      when 'L'
        if 'L' == str[pos + 1, 1]
          if ((length - 3) == pos && /\A(?:ILL[OA]|ALLE)\z/ =~ str[pos - 1, 4]) ||
             ((/\A[AO]S\z/ =~ str[last - 1, 2] || /\A[AO]\z/ =~ str[last, 1]) && 'ALLE' == str[pos - 1, 4])
            return 'L', nil, 2
          else
            return 'L', 'L', 2
          end
        else
          return 'L', 'L', 1
        end
      when 'M'
        if ('UMB' == str[pos - 1, 3] &&
            ((last - 1) == pos || 'ER' == str[pos + 2, 2])) || 'M' == str[pos + 1, 1]
          return 'M', 'M', 2
        else
          return 'M', 'M', 1
        end
      when 'N'
        return 'N', 'N', ('N' == str[pos + 1, 1] ? 2 : 1)
      when 'Ñ'
        return 'N', 'N', 1
      when 'P'
        if 'H' == str[pos + 1, 1]
          return 'F', 'F', 2
        else
          return 'P', 'P', (/\A[PB]\z/ =~ str[pos + 1, 1] ? 2 : 1)
        end
      when 'Q'
        return 'K', 'K', ('Q' == str[pos + 1, 1] ? 2 : 1)
      when 'R'
        step = ('R' == str[pos + 1, 1] ? 2 : 1)

        if last == pos && !slavo_germanic?(str) && 'IE' == str[pos - 2, 2] && /\AM[EA]\z/ !~ str[pos - 4, 2]
          return nil, 'R', step
        else
          return 'R', 'R', step
        end
      when 'S'
        if /\A[IY]SL\z/ =~ str[pos - 1, 3]
          return nil, nil, 1
        elsif 0 == pos && 'SUGAR' == str[pos, 5]
          return 'X', 'S', 1
        elsif 'SH' == str[pos, 2]
          if /\AH(?:EIM|OEK|OLM|OLZ)\z/ =~ str[pos + 1, 4]
            return 'S', 'S', 2
          else
            return 'X', 'X', 2
          end
        elsif /\ASI[OA]\z/ =~ str[pos, 3] || 'SIAN' == str[pos, 4]
          return 'S', (slavo_germanic?(str) ? 'S' : 'X'), 3
        elsif (0 == pos && /\A[MNLW]\z/ =~ str[pos + 1, 1]) || 'Z' == str[pos + 1, 1]
          return 'S', 'X', ('Z' == str[pos + 1, 1] ? 2 : 1)
        elsif 'SC' == str[pos, 2]
          if 'H' == str[pos + 2, 1]
            if /\A(?:OO|ER|EN|UY|ED|EM)\z/ =~ str[pos + 3, 2]
              return (/\AE[RN]\z/ =~ str[pos + 3, 2] ? 'X' : 'SK'), 'SK', 3
            else
              return 'X', ((0 == pos && !vowel?(str[3, 1]) && ('W' != str[pos + 3, 1])) ? 'S' : 'X'), 3
            end
          elsif /\A[IEY]\z/ =~ str[pos + 2, 1]
            return 'S', 'S', 3
          else
            return 'SK', 'SK', 3
          end
        else
          return (last == pos && /\A[AO]I\z/ =~ str[pos - 2, 2] ? nil : 'S'), 'S', (/\A[SZ]\z/ =~ str[pos + 1, 1] ? 2 : 1)
        end
      when 'T'
        if 'TION' == str[pos, 4]
          return 'X', 'X', 3
        elsif /\AT(?:IA|CH)\z/ =~ str[pos, 3]
          return 'X', 'X', 3
        elsif 'TH' == str[pos, 2] || 'TTH' == str[pos, 3]
          if /\A[OA]M\z/ =~ str[pos + 2, 2] || /\AV[AO]N \z/ =~ str[0, 4] || 'SCH' == str[0, 3]
            return 'T', 'T', 2
          else
            # '0' (zero) is the primary code emitted for TH.
            return '0', 'T', 2
          end
        else
          return 'T', 'T', (/\A[TD]\z/ =~ str[pos + 1, 1] ? 2 : 1)
        end
      when 'V'
        return 'F', 'F', ('V' == str[pos + 1, 1] ? 2 : 1)
      when 'W'
        if 'WR' == str[pos, 2]
          return 'R', 'R', 2
        end

        pri = nil
        sec = nil
        if 0 == pos && (vowel?(str[pos + 1, 1]) || 'WH' == str[pos, 2])
          pri = 'A'
          sec = vowel?(str[pos + 1, 1]) ? 'F' : 'A'
        end

        # The interpolations below turn a nil pri/sec into an empty prefix.
        if (last == pos && vowel?(str[pos - 1, 1])) || 'SCH' == str[0, 3] ||
           /\A(?:EWSKI|EWSKY|OWSKI|OWSKY)\z/ =~ str[pos - 1, 5]
          return pri, "#{sec}F", 1
        elsif /\AWI[CT]Z\z/ =~ str[pos, 4]
          return "#{pri}TS", "#{sec}FX", 4
        else
          return pri, sec, 1
        end
      when 'X'
        step = (/\A[CX]\z/ =~ str[pos + 1, 1] ? 2 : 1)

        # A final X after IAU, EAU, AU or OU produces no code.
        if last == pos && (/\A[IE]AU\z/ =~ str[pos - 3, 3] || /\A[AO]U\z/ =~ str[pos - 2, 2])
          return nil, nil, step
        else
          return 'KS', 'KS', step
        end
      when 'Z'
        if 'H' == str[pos + 1, 1]
          return 'J', 'J', 2
        else
          step = ('Z' == str[pos + 1, 1] ? 2 : 1)

          if /\AZ[OIA]\z/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && 'T' != str[pos - 1, 1]))
            return 'S', 'TS', step
          else
            return 'S', 'S', step
          end
        end
      else
        # Any other character (including the sentinel space) is skipped.
        return nil, nil, 1
      end
    end
  end
end
|
data/lib/rubyfish/version.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Yury Korolev
|
@@ -29,6 +29,7 @@ extra_rdoc_files: []
|
|
29
29
|
|
30
30
|
files:
|
31
31
|
- lib/rubyfish/damerau_levenshtein.rb
|
32
|
+
- lib/rubyfish/double_metaphone.rb
|
32
33
|
- lib/rubyfish/hamming.rb
|
33
34
|
- lib/rubyfish/jaro.rb
|
34
35
|
- lib/rubyfish/jaro_winkler.rb
|
@@ -36,7 +37,6 @@ files:
|
|
36
37
|
- lib/rubyfish/longest_subsequence.rb
|
37
38
|
- lib/rubyfish/longest_substring.rb
|
38
39
|
- lib/rubyfish/mmatrix.rb
|
39
|
-
- lib/rubyfish/mra.rb
|
40
40
|
- lib/rubyfish/version.rb
|
41
41
|
- lib/rubyfish.rb
|
42
42
|
- LICENSE
|
@@ -57,7 +57,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
57
57
|
requirements:
|
58
58
|
- - ">="
|
59
59
|
- !ruby/object:Gem::Version
|
60
|
-
hash:
|
60
|
+
hash: 3683456947558493036
|
61
61
|
segments:
|
62
62
|
- 0
|
63
63
|
version: "0"
|
@@ -77,6 +77,6 @@ rubyforge_project: rubyfish
|
|
77
77
|
rubygems_version: 1.3.7
|
78
78
|
signing_key:
|
79
79
|
specification_version: 3
|
80
|
-
summary:
|
80
|
+
summary: Library for doing approximate and phonetic matching of strings
|
81
81
|
test_files: []
|
82
82
|
|
data/lib/rubyfish/mra.rb
DELETED