rubyfish 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
- =========
2
- jellyfish
1
+
2
+ RubyFish
3
3
  =========
4
4
 
5
5
  RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for doing approximate and phonetic matching of strings.
@@ -27,6 +27,10 @@ String comparison:
27
27
  * Longest Substring
28
28
  * Longest Subsequence
29
29
 
30
+ Phonetic encoding:
31
+
32
+ * Double Metaphone
33
+
30
34
  Example Usage
31
35
  =============
32
36
 
@@ -36,4 +40,6 @@ Example Usage
36
40
  ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
37
41
  => 0.7268518518518519
38
42
  ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
39
- => 1
43
+ => 1
44
+
45
+
data/ROADMAP.md CHANGED
@@ -1,3 +1,2 @@
1
1
  - Port MRA
2
- - Port NYSIIS
3
- - Add Double Metaphone
2
+ - Port NYSIIS
@@ -1,5 +1,3 @@
1
- #require "rubyfish/awesome"
2
-
3
1
  module RubyFish
4
2
  autoload :Hamming, 'rubyfish/hamming'
5
3
  autoload :Levenshtein, 'rubyfish/levenshtein'
@@ -8,5 +6,6 @@ module RubyFish
8
6
  autoload :LongestSubsequence, 'rubyfish/longest_subsequence'
9
7
  autoload :Jaro, 'rubyfish/jaro'
10
8
  autoload :JaroWinkler, 'rubyfish/jaro_winkler'
9
+ autoload :DoubleMetaphone, 'rubyfish/double_metaphone'
11
10
  autoload :MMatrix, 'rubyfish/mmatrix'
12
11
  end
@@ -0,0 +1,342 @@
1
+ #encoding: utf-8
2
+
3
+ # stolen from http://english.rubyforge.org/
4
+ module RubyFish::DoubleMetaphone
5
+
6
# Builds the Double Metaphone encoding of +string+.
#
# The input is converted with +to_s+, upcased and padded with one trailing
# space before being scanned. Characters are handed to +lookup+ one position
# at a time until both codes are at least four characters long or the scan
# position has moved past the end of the word.
#
# Returns a two-element array: the primary code (at most four characters) and
# the secondary code, or +nil+ in the second slot when both codes are equal.
def phonetic_code(string)
  word = string.to_s
  size = word.length
  final_index = size - 1
  padded = "#{word} ".upcase

  main_code = ''
  alt_code = ''
  cursor = 0

  # Words starting with one of these letter pairs skip their first letter.
  cursor += 1 if /^GN|KN|PN|WR|PS$/ =~ padded[0, 2]

  # A leading 'X' is encoded as 'S' in both codes.
  if padded[0, 1] == 'X'
    main_code << 'S'
    alt_code << 'S'
    cursor += 1
  end

  until main_code.length >= 4 && alt_code.length >= 4
    break if cursor > size

    first, second, step = lookup(padded, cursor, size, final_index)
    main_code << first if first
    alt_code << second if second
    cursor += step if step
  end

  main_code = main_code[0, 4]
  alt_code = alt_code[0, 4]
  [main_code, (main_code == alt_code ? nil : alt_code)]
end
28
+
29
+ private
30
+
31
# Reports whether +str+ contains any of the markers W, K, CZ or WITZ.
#
# Returns the index of the first marker found (truthy) or +nil+ when none
# is present.
def slavo_germanic?(str)
  marker = /W|K|CZ|WITZ/
  marker =~ str
end
34
+
35
# Reports whether +str+ is exactly one of the letters A, E, I, O, U or Y.
#
# The alternation is grouped and anchored with \A / \z so the anchors apply
# to every alternative. Without the group, "^" bound only to "A" and "$"
# only to "Y", which let any longer string containing E, I, O or U (or
# merely starting with A) count as a vowel.
#
# Returns 0 (truthy) for a single vowel character and +nil+ otherwise,
# including for +nil+ input.
def vowel?(str)
  /\A(?:A|E|I|O|U|Y)\z/ =~ str
end
38
+
39
# Encodes the character of +str+ at index +pos+.
#
# Returns three values: the text to append to the primary code (or nil), the
# text to append to the secondary code (or nil), and the number of characters
# the caller should advance by.
#
# +str+ is the word being scanned; phonetic_code passes it upcased with one
# trailing space. +length+ and +last+ are the length and last index that the
# caller computed from the unpadded word.
#
# NOTE(review): several branches slice at pos - n. When that index is negative
# Ruby counts from the end of the string instead of returning nothing;
# confirm this wrap-around is harmless for very short words.
+ def lookup(str, pos, length, last)
40
+ case str[pos, 1]
41
# Vowels produce a code ('A') only at the very start of the word.
+ when /^A|E|I|O|U|Y$/
42
+ if 0 == pos
43
+ return 'A', 'A', 1
44
+ else
45
+ return nil, nil, 1
46
+ end
47
+ when 'B'
48
+ return 'P', 'P', ('B' == str[pos + 1, 1] ? 2 : 1)
49
+ when 'Ç'
50
+ return 'S', 'S', 1
51
+ when 'C'
52
+ if pos > 1 &&
53
+ !vowel?(str[pos - 2, 1]) &&
54
+ 'ACH' == str[pos - 1, 3] &&
55
+ str[pos + 2, 1] != 'I' && (
56
+ str[pos + 2, 1] != 'E' ||
57
+ str[pos - 2, 6] =~ /^(B|M)ACHER$/
58
+ ) then
59
+ return 'K', 'K', 2
60
+ elsif 0 == pos && 'CAESAR' == str[pos, 6]
61
+ return 'S', 'S', 2
62
+ elsif 'CHIA' == str[pos, 4]
63
+ return 'K', 'K', 2
64
+ elsif 'CH' == str[pos, 2]
65
+ if 0 == pos && 'CHAE' == str[pos, 4]
66
+ return 'K', 'X', 2
67
+ elsif 0 == pos && (
68
+ ['HARAC', 'HARIS'].include?(str[pos + 1, 5]) ||
69
+ ['HOR', 'HYM', 'HIA', 'HEM'].include?(str[pos + 1, 3])
70
+ ) && str[0, 5] != 'CHORE' then
71
+ return 'K', 'K', 2
72
+ elsif ['VAN ','VON '].include?(str[0, 4]) ||
73
+ 'SCH' == str[0, 3] ||
74
+ ['ORCHES','ARCHIT','ORCHID'].include?(str[pos - 2, 6]) ||
75
+ ['T','S'].include?(str[pos + 2, 1]) || (
76
+ ((0 == pos) || ['A','O','U','E'].include?(str[pos - 1, 1])) &&
77
+ ['L','R','N','M','B','H','F','V','W',' '].include?(str[pos + 2, 1])
78
+ ) then
79
+ return 'K', 'K', 2
80
+ elsif pos > 0
81
+ return ('MC' == str[0, 2] ? 'K' : 'X'), 'K', 2
82
+ else
83
+ return 'X', 'X', 2
84
+ end
85
+ elsif 'CZ' == str[pos, 2] && 'WICZ' != str[pos - 2, 4]
86
+ return 'S', 'X', 2
87
# Checks the three characters AFTER the current 'C' (offset pos + 1).
+ elsif 'CIA' == str[pos + 1, 3]
88
+ return 'X', 'X', 3
89
+ elsif 'CC' == str[pos, 2] && !(1 == pos && 'M' == str[0, 1])
90
+ if /^I|E|H$/ =~ str[pos + 2, 1] && 'HU' != str[pos + 2, 2]
91
+ if (1 == pos && 'A' == str[pos - 1, 1]) ||
92
+ /^UCCE(E|S)$/ =~ str[pos - 1, 5] then
93
+ return 'KS', 'KS', 3
94
+ else
95
+ return 'X', 'X', 3
96
+ end
97
+ else
98
+ return 'K', 'K', 2
99
+ end
100
+ elsif /^C(K|G|Q)$/ =~ str[pos, 2]
101
+ return 'K', 'K', 2
102
+ elsif /^C(I|E|Y)$/ =~ str[pos, 2]
103
+ return 'S', (/^CI(O|E|A)$/ =~ str[pos, 3] ? 'X' : 'S'), 2
104
+ else
105
+ if /^ (C|Q|G)$/ =~ str[pos + 1, 2]
106
+ return 'K', 'K', 3
107
+ else
108
+ return 'K', 'K', (/^C|K|Q$/ =~ str[pos + 1, 1] && !(['CE','CI'].include?(str[pos + 1, 2])) ? 2 : 1)
109
+ end
110
+ end
111
+ when 'D'
112
+ if 'DG' == str[pos, 2]
113
+ if /^I|E|Y$/ =~ str[pos + 2, 1]
114
+ return 'J', 'J', 3
115
+ else
116
+ return 'TK', 'TK', 2
117
+ end
118
+ else
119
+ return 'T', 'T', (/^D(T|D)$/ =~ str[pos, 2] ? 2 : 1)
120
+ end
121
+ when 'F'
122
+ return 'F', 'F', ('F' == str[pos + 1, 1] ? 2 : 1)
123
+ when 'G'
124
+ if 'H' == str[pos + 1, 1]
125
+ if pos > 0 && !vowel?(str[pos - 1, 1])
126
+ return 'K', 'K', 2
127
+ elsif 0 == pos
128
+ if 'I' == str[pos + 2, 1]
129
+ return 'J', 'J', 2
130
+ else
131
+ return 'K', 'K', 2
132
+ end
133
+ elsif (pos > 1 && /^B|H|D$/ =~ str[pos - 2, 1]) ||
134
+ (pos > 2 && /^B|H|D$/ =~ str[pos - 3, 1]) ||
135
+ (pos > 3 && /^B|H$/ =~ str[pos - 4, 1])
136
+ return nil, nil, 2
137
+ else
138
+ if (pos > 2 && 'U' == str[pos - 1, 1] && /^C|G|L|R|T$/ =~ str[pos - 3, 1])
139
+ return 'F', 'F', 2
140
+ elsif pos > 0 && 'I' != str[pos - 1, 1]
141
+ return 'K', 'K', 2
142
+ else
143
+ return nil, nil, 2
144
+ end
145
+ end
146
+ elsif 'N' == str[pos + 1, 1]
147
+ if 1 == pos && vowel?(str[0, 1]) && !slavo_germanic?(str)
148
+ return 'KN', 'N', 2
149
+ else
150
+ if 'EY' != str[pos + 2, 2] && 'Y' != str[pos + 1, 1] && !slavo_germanic?(str)
151
+ return 'N', 'KN', 2
152
+ else
153
+ return 'KN', 'KN', 2
154
+ end
155
+ end
156
+ elsif 'LI' == str[pos + 1, 2] && !slavo_germanic?(str)
157
+ return 'KL', 'L', 2
158
+ elsif 0 == pos && ('Y' == str[pos + 1, 1] || /^(E(S|P|B|L|Y|I|R)|I(B|L|N|E))$/ =~ str[pos + 1, 2])
159
+ return 'K', 'J', 2
160
+ elsif (('ER' == str[pos + 1, 2] || 'Y' == str[pos + 1, 1]) &&
161
+ /^(D|R|M)ANGER$/ !~ str[0, 6] &&
162
+ /^E|I$/ !~ str[pos - 1, 1] &&
163
+ /^(R|O)GY$/ !~ str[pos - 1, 3])
164
+ return 'K', 'J', 2
165
+ elsif /^E|I|Y$/ =~ str[pos + 1, 1] || /^(A|O)GGI$/ =~ str[pos - 1, 4]
166
+ if (/^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]) || 'ET' == str[pos + 1, 2]
167
+ return 'K', 'K', 2
168
+ else
169
+ if 'IER ' == str[pos + 1, 4]
170
+ return 'J', 'J', 2
171
+ else
172
+ return 'J', 'K', 2
173
+ end
174
+ end
175
+ elsif 'G' == str[pos + 1, 1]
176
+ return 'K', 'K', 2
177
+ else
178
+ return 'K', 'K', 1
179
+ end
180
+ when 'H'
181
# 'H' is encoded only when a vowel follows and it is word-initial or preceded by a vowel.
+ if (0 == pos || vowel?(str[pos - 1, 1])) && vowel?(str[pos + 1, 1])
182
+ return 'H', 'H', 2
183
+ else
184
+ return nil, nil, 1
185
+ end
186
+ when 'J'
187
+ if 'JOSE' == str[pos, 4] || 'SAN ' == str[0, 4]
188
+ if (0 == pos && ' ' == str[pos + 4, 1]) || 'SAN ' == str[0, 4]
189
+ return 'H', 'H', 1
190
+ else
191
+ return 'J', 'H', 1
192
+ end
193
+ else
194
# `current` here is a local step size (2 for a doubled 'JJ'), not the caller's scan position.
+ current = ('J' == str[pos + 1, 1] ? 2 : 1)
195
+
196
+ if 0 == pos && 'JOSE' != str[pos, 4]
197
+ return 'J', 'A', current
198
+ else
199
+ if vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /^A|O$/ =~ str[pos + 1, 1]
200
+ return 'J', 'H', current
201
+ else
202
+ if last == pos
203
+ return 'J', nil, current
204
+ else
205
+ if /^L|T|K|S|N|M|B|Z$/ !~ str[pos + 1, 1] && /^S|K|L$/ !~ str[pos - 1, 1]
206
+ return 'J', 'J', current
207
+ else
208
+ return nil, nil, current
209
+ end
210
+ end
211
+ end
212
+ end
213
+ end
214
+ when 'K'
215
+ return 'K', 'K', ('K' == str[pos + 1, 1] ? 2 : 1)
216
+ when 'L'
217
+ if 'L' == str[pos + 1, 1]
218
+ if (((length - 3) == pos && /^(ILL(O|A)|ALLE)$/ =~ str[pos - 1, 4]) ||
219
+ ((/^(A|O)S$/ =~ str[last - 1, 2] || /^A|O$/ =~ str[last, 1]) && 'ALLE' == str[pos - 1, 4]))
220
+ return 'L', nil, 2
221
+ else
222
+ return 'L', 'L', 2
223
+ end
224
+ else
225
+ return 'L', 'L', 1
226
+ end
227
+ when 'M'
228
+ if ('UMB' == str[pos - 1, 3] &&
229
+ ((last - 1) == pos || 'ER' == str[pos + 2, 2])) || 'M' == str[pos + 1, 1]
230
+ return 'M', 'M', 2
231
+ else
232
+ return 'M', 'M', 1
233
+ end
234
+ when 'N'
235
+ return 'N', 'N', ('N' == str[pos + 1, 1] ? 2 : 1)
236
+ when 'Ñ'
237
+ return 'N', 'N', 1
238
+ when 'P'
239
+ if 'H' == str[pos + 1, 1]
240
+ return 'F', 'F', 2
241
+ else
242
+ return 'P', 'P', (/^P|B$/ =~ str[pos + 1, 1] ? 2 : 1)
243
+ end
244
+ when 'Q'
245
+ return 'K', 'K', ('Q' == str[pos + 1, 1] ? 2 : 1)
246
+ when 'R'
247
+ current = ('R' == str[pos + 1, 1] ? 2 : 1)
248
+
249
+ if last == pos && !slavo_germanic?(str) && 'IE' == str[pos - 2, 2] && /^M(E|A)$/ !~ str[pos - 4, 2]
250
+ return nil, 'R', current
251
+ else
252
+ return 'R', 'R', current
253
+ end
254
+ when 'S'
255
+ if /^(I|Y)SL$/ =~ str[pos - 1, 3]
256
+ return nil, nil, 1
257
+ elsif 0 == pos && 'SUGAR' == str[pos, 5]
258
+ return 'X', 'S', 1
259
+ elsif 'SH' == str[pos, 2]
260
+ if /^H(EIM|OEK|OLM|OLZ)$/ =~ str[pos + 1, 4]
261
+ return 'S', 'S', 2
262
+ else
263
+ return 'X', 'X', 2
264
+ end
265
+ elsif /^SI(O|A)$/ =~ str[pos, 3] || 'SIAN' == str[pos, 4]
266
+ return 'S', (slavo_germanic?(str) ? 'S' : 'X'), 3
267
+ elsif (0 == pos && /^M|N|L|W$/ =~ str[pos + 1, 1]) || 'Z' == str[pos + 1, 1]
268
+ return 'S', 'X', ('Z' == str[pos + 1, 1] ? 2 : 1)
269
+ elsif 'SC' == str[pos, 2]
270
+ if 'H' == str[pos + 2, 1]
271
+ if /^OO|ER|EN|UY|ED|EM$/ =~ str[pos + 3, 2]
272
+ return (/^E(R|N)$/ =~ str[pos + 3, 2] ? 'X' : 'SK'), 'SK', 3
273
+ else
274
+ return 'X', ((0 == pos && !vowel?(str[3, 1]) && ('W' != str[pos + 3, 1])) ? 'S' : 'X'), 3
275
+ end
276
+ elsif /^I|E|Y$/ =~ str[pos + 2, 1]
277
+ return 'S', 'S', 3
278
+ else
279
+ return 'SK', 'SK', 3
280
+ end
281
+ else
282
+ return (last == pos && /^(A|O)I$/ =~ str[pos - 2, 2] ? nil : 'S'), 'S', (/^S|Z$/ =~ str[pos + 1, 1] ? 2 : 1)
283
+ end
284
+ when 'T'
285
+ if 'TION' == str[pos, 4]
286
+ return 'X', 'X', 3
287
+ elsif /^T(IA|CH)$/ =~ str[pos, 3]
288
+ return 'X', 'X', 3
289
+ elsif 'TH' == str[pos, 2] || 'TTH' == str[pos, 3]
290
+ if /^(O|A)M$/ =~ str[pos + 2, 2] || /^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]
291
+ return 'T', 'T', 2
292
+ else
293
# The primary code here is the digit zero ('0'), not the letter O.
+ return '0', 'T', 2
294
+ end
295
+ else
296
+ return 'T', 'T', (/^T|D$/ =~ str[pos + 1, 1] ? 2 : 1)
297
+ end
298
+ when 'V'
299
+ return 'F', 'F', ('V' == str[pos + 1, 1] ? 2 : 1)
300
+ when 'W'
301
+ if 'WR' == str[pos, 2]
302
+ return 'R', 'R', 2
303
+ end
304
# pri/sec stay nil unless the word-initial rule below sets them; the
# interpolations further down turn a nil into an empty prefix.
+ pri, sec = nil, nil
305
+ if 0 == pos && (vowel?(str[pos + 1, 1]) || 'WH' == str[pos, 2])
306
+ pri = 'A'
307
+ sec = vowel?(str[pos + 1, 1]) ? 'F' : 'A'
308
+ end
309
+ if (last == pos && vowel?(str[pos - 1, 1])) || 'SCH' == str[0, 3] ||
310
+ /^EWSKI|EWSKY|OWSKI|OWSKY$/ =~ str[pos - 1, 5]
311
+ return pri, "#{sec}F", 1
312
+ elsif /^WI(C|T)Z$/ =~ str[pos, 4]
313
+ return "#{pri}TS", "#{sec}FX", 4
314
+ else
315
+ return pri, sec, 1
316
+ end
317
+ when 'X'
318
+ current = (/^C|X$/ =~ str[pos + 1, 1] ? 2 : 1)
319
+ if !(last == pos && (/^(I|E)AU$/ =~ str[pos - 3, 3] || /^(A|O)U$/ =~ str[pos - 2, 2]))
320
+ return 'KS', 'KS', current
321
+ else
322
+ return nil, nil, current
323
+ end
324
+ when 'Z'
325
+ if 'H' == str[pos + 1, 1]
326
+ return 'J', 'J', 2
327
+ else
328
+ current = ('Z' == str[pos + 1, 1] ? 2 : 1)
329
+ if /^Z(O|I|A)$/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && 'T' != str[pos - 1, 1]))
330
+ return 'S', 'TS', current
331
+ else
332
+ return 'S', 'S', current
333
+ end
334
+ end
335
# Any other character (including the padding space) adds nothing and is skipped.
+ else
336
+ return nil, nil, 1
337
+ end
338
+ end
339
+
340
+ extend self
341
+
342
+ end
@@ -1,3 +1,3 @@
1
1
  module RubyFish
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 2
9
- version: 0.0.2
8
+ - 3
9
+ version: 0.0.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - Yury Korolev
@@ -29,6 +29,7 @@ extra_rdoc_files: []
29
29
 
30
30
  files:
31
31
  - lib/rubyfish/damerau_levenshtein.rb
32
+ - lib/rubyfish/double_metaphone.rb
32
33
  - lib/rubyfish/hamming.rb
33
34
  - lib/rubyfish/jaro.rb
34
35
  - lib/rubyfish/jaro_winkler.rb
@@ -36,7 +37,6 @@ files:
36
37
  - lib/rubyfish/longest_subsequence.rb
37
38
  - lib/rubyfish/longest_substring.rb
38
39
  - lib/rubyfish/mmatrix.rb
39
- - lib/rubyfish/mra.rb
40
40
  - lib/rubyfish/version.rb
41
41
  - lib/rubyfish.rb
42
42
  - LICENSE
@@ -57,7 +57,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - ">="
59
59
  - !ruby/object:Gem::Version
60
- hash: 663674839144737507
60
+ hash: 3683456947558493036
61
61
  segments:
62
62
  - 0
63
63
  version: "0"
@@ -77,6 +77,6 @@ rubyforge_project: rubyfish
77
77
  rubygems_version: 1.3.7
78
78
  signing_key:
79
79
  specification_version: 3
80
- summary: A new gem templates
80
+ summary: Library for doing approximate and phonetic matching of strings
81
81
  test_files: []
82
82
 
@@ -1,3 +0,0 @@
1
- module RubyFish::MRA
2
-
3
- end