rubyfish 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
- =========
2
- jellyfish
1
+
2
+ RubyFish
3
3
  =========
4
4
 
5
5
  RubyFish is a Ruby port of the Python library jellyfish (http://github.com/sunlightlabs/jellyfish) for approximate and phonetic matching of strings.
@@ -27,6 +27,10 @@ String comparison:
27
27
  * Longest Substring
28
28
  * Longest Subsequence
29
29
 
30
+ Phonetic encoding:
31
+
32
+ * Double Metaphone
33
+
30
34
  Example Usage
31
35
  =============
32
36
 
@@ -36,4 +40,6 @@ Example Usage
36
40
  ruby-1.9.2-p0 > RubyFish::Jaro.distance("jellyfish", "rubyfish")
37
41
  => 0.7268518518518519
38
42
  ruby-1.9.2-p0 > RubyFish::DamerauLevenshtein.distance("rubyfish", "rubyfihs")
39
- => 1
43
+ => 1
44
+
45
+
data/ROADMAP.md CHANGED
@@ -1,3 +1,2 @@
1
1
  - Port MRA
2
- - Port NYSIIS
3
- - Add Double Metaphone
2
+ - Port NYSIIS
@@ -1,5 +1,3 @@
1
- #require "rubyfish/awesome"
2
-
3
1
  module RubyFish
4
2
  autoload :Hamming, 'rubyfish/hamming'
5
3
  autoload :Levenshtein, 'rubyfish/levenshtein'
@@ -8,5 +6,6 @@ module RubyFish
8
6
  autoload :LongestSubsequence, 'rubyfish/longest_subsequence'
9
7
  autoload :Jaro, 'rubyfish/jaro'
10
8
  autoload :JaroWinkler, 'rubyfish/jaro_winkler'
9
+ autoload :DoubleMetaphone, 'rubyfish/double_metaphone'
11
10
  autoload :MMatrix, 'rubyfish/mmatrix'
12
11
  end
@@ -0,0 +1,342 @@
1
+ #encoding: utf-8
2
+
3
+ # stolen from http://english.rubyforge.org/
4
module RubyFish
  # Double Metaphone phonetic encoding.
  #
  # The module extends itself, so it can be called directly
  # (RubyFish::DoubleMetaphone.phonetic_code("Smith")) or mixed into a class.
  # Everything except #phonetic_code is private.
  module DoubleMetaphone
    # Matches a string that is exactly one of the letters treated as vowels.
    VOWEL = /\A[AEIOUY]\z/

    # Computes the phonetic codes of +string+.
    #
    # string - any object; it is converted with #to_s and upcased.
    #
    # Returns a two-element Array [primary, secondary]. Each code is at most
    # four characters long. secondary is nil when it equals primary.
    def phonetic_code(string)
      str = string.to_s
      length = str.length
      last = length - 1
      # A trailing space is appended so that look-ahead slices near the end of
      # the word (and negative look-behind indexes, which wrap around to the
      # end) hit a blank instead of a letter.
      original = "#{str} ".upcase
      # Unfrozen buffers: they are appended to with << below.
      primary = ''.dup
      secondary = ''.dup
      current = 0

      # The first letter of these initial pairs is skipped.
      current += 1 if /\A(?:GN|KN|PN|WR|PS)\z/ =~ original[0, 2]

      # An initial X is encoded as S.
      if original[0, 1] == 'X'
        primary << 'S'
        secondary << 'S'
        current += 1
      end

      while primary.length < 4 || secondary.length < 4
        break if current > length

        pri, sec, step = lookup(original, current, length, last)
        primary << pri if pri
        secondary << sec if sec
        current += step if step
      end

      primary = primary[0, 4]
      secondary = secondary[0, 4]
      [primary, (primary == secondary ? nil : secondary)]
    end

    private

    # Truthy when +str+ contains W, K or CZ anywhere (the WITZ alternative is
    # subsumed by the bare W).
    def slavo_germanic?(str)
      /W|K|CZ|WITZ/ =~ str
    end

    # Truthy when +str+ is exactly one vowel letter (A, E, I, O, U or Y).
    def vowel?(str)
      VOWEL =~ str
    end

    # Returns 2 when the character after +pos+ repeats the one at +pos+,
    # otherwise 1.
    def skip_double(str, pos)
      str[pos + 1, 1] == str[pos, 1] ? 2 : 1
    end

    # Encodes the character of +str+ at +pos+.
    #
    # str    - the upcased word followed by one space
    # pos    - index of the character to encode
    # length - length of the word without the trailing space
    # last   - index of the last character of the word (length - 1)
    #
    # Returns [primary_fragment, secondary_fragment, step]; a fragment may be
    # nil (nothing to append) and step is the number of characters consumed.
    def lookup(str, pos, length, last)
      nxt = str[pos + 1, 1]

      case str[pos, 1]
      when VOWEL
        # Vowels are only encoded at the start of the word.
        pos.zero? ? ['A', 'A', 1] : [nil, nil, 1]
      when 'B'
        ['P', 'P', skip_double(str, pos)]
      when 'Ç'
        ['S', 'S', 1]
      when 'C'
        lookup_c(str, pos)
      when 'D'
        lookup_d(str, pos)
      when 'F'
        ['F', 'F', skip_double(str, pos)]
      when 'G'
        lookup_g(str, pos)
      when 'H'
        if (pos.zero? || vowel?(str[pos - 1, 1])) && vowel?(nxt)
          ['H', 'H', 2]
        else
          [nil, nil, 1]
        end
      when 'J'
        lookup_j(str, pos, last)
      when 'K'
        ['K', 'K', skip_double(str, pos)]
      when 'L'
        lookup_l(str, pos, length, last)
      when 'M'
        if (str[pos - 1, 3] == 'UMB' && (last - 1 == pos || str[pos + 2, 2] == 'ER')) || nxt == 'M'
          ['M', 'M', 2]
        else
          ['M', 'M', 1]
        end
      when 'N'
        ['N', 'N', skip_double(str, pos)]
      when 'Ñ'
        ['N', 'N', 1]
      when 'P'
        if nxt == 'H'
          ['F', 'F', 2]
        else
          ['P', 'P', ((/\A[PB]\z/ =~ nxt) ? 2 : 1)]
        end
      when 'Q'
        ['K', 'K', skip_double(str, pos)]
      when 'R'
        step = skip_double(str, pos)
        if last == pos && !slavo_germanic?(str) && str[pos - 2, 2] == 'IE' && /\AM[EA]\z/ !~ str[pos - 4, 2]
          [nil, 'R', step]
        else
          ['R', 'R', step]
        end
      when 'S'
        lookup_s(str, pos, last)
      when 'T'
        lookup_t(str, pos)
      when 'V'
        ['F', 'F', skip_double(str, pos)]
      when 'W'
        lookup_w(str, pos, last)
      when 'X'
        lookup_x(str, pos, last)
      when 'Z'
        lookup_z(str, pos)
      else
        # Anything else (including the trailing space) is skipped.
        [nil, nil, 1]
      end
    end

    # Encodes a C at +pos+.
    def lookup_c(str, pos)
      if pos > 1 &&
         !vowel?(str[pos - 2, 1]) &&
         str[pos - 1, 3] == 'ACH' &&
         str[pos + 2, 1] != 'I' &&
         (str[pos + 2, 1] != 'E' || /\A[BM]ACHER\z/ =~ str[pos - 2, 6])
        ['K', 'K', 2]
      elsif pos.zero? && str[pos, 6] == 'CAESAR'
        ['S', 'S', 2]
      elsif str[pos, 4] == 'CHIA'
        ['K', 'K', 2]
      elsif str[pos, 2] == 'CH'
        lookup_ch(str, pos)
      elsif str[pos, 2] == 'CZ' && str[pos - 2, 4] != 'WICZ'
        ['S', 'X', 2]
      elsif str[pos + 1, 3] == 'CIA'
        ['X', 'X', 3]
      elsif str[pos, 2] == 'CC' && !(pos == 1 && str[0, 1] == 'M')
        if /\A[IEH]\z/ =~ str[pos + 2, 1] && str[pos + 2, 2] != 'HU'
          if (pos == 1 && str[pos - 1, 1] == 'A') || /\AUCCE[ES]\z/ =~ str[pos - 1, 5]
            ['KS', 'KS', 3]
          else
            ['X', 'X', 3]
          end
        else
          ['K', 'K', 2]
        end
      elsif /\AC[KGQ]\z/ =~ str[pos, 2]
        ['K', 'K', 2]
      elsif /\AC[IEY]\z/ =~ str[pos, 2]
        ['S', ((/\ACI[OEA]\z/ =~ str[pos, 3]) ? 'X' : 'S'), 2]
      elsif /\A [CQG]\z/ =~ str[pos + 1, 2]
        ['K', 'K', 3]
      else
        swallow_next = /\A[CKQ]\z/ =~ str[pos + 1, 1] && !%w[CE CI].include?(str[pos + 1, 2])
        ['K', 'K', (swallow_next ? 2 : 1)]
      end
    end

    # Encodes the pair CH starting at +pos+.
    def lookup_ch(str, pos)
      if pos.zero? && str[pos, 4] == 'CHAE'
        ['K', 'X', 2]
      elsif pos.zero? &&
            (%w[HARAC HARIS].include?(str[pos + 1, 5]) || %w[HOR HYM HIA HEM].include?(str[pos + 1, 3])) &&
            str[0, 5] != 'CHORE'
        ['K', 'K', 2]
      elsif ['VAN ', 'VON '].include?(str[0, 4]) ||
            str[0, 3] == 'SCH' ||
            %w[ORCHES ARCHIT ORCHID].include?(str[pos - 2, 6]) ||
            %w[T S].include?(str[pos + 2, 1]) ||
            ((pos.zero? || %w[A O U E].include?(str[pos - 1, 1])) &&
             ['L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', ' '].include?(str[pos + 2, 1]))
        ['K', 'K', 2]
      elsif pos > 0
        [(str[0, 2] == 'MC' ? 'K' : 'X'), 'K', 2]
      else
        ['X', 'X', 2]
      end
    end

    # Encodes a D at +pos+.
    def lookup_d(str, pos)
      if str[pos, 2] == 'DG'
        if /\A[IEY]\z/ =~ str[pos + 2, 1]
          ['J', 'J', 3]
        else
          ['TK', 'TK', 2]
        end
      else
        ['T', 'T', ((/\AD[TD]\z/ =~ str[pos, 2]) ? 2 : 1)]
      end
    end

    # Encodes a G at +pos+.
    def lookup_g(str, pos)
      nxt = str[pos + 1, 1]

      if nxt == 'H'
        lookup_gh(str, pos)
      elsif nxt == 'N'
        if pos == 1 && vowel?(str[0, 1]) && !slavo_germanic?(str)
          ['KN', 'N', 2]
        elsif str[pos + 2, 2] != 'EY' && !slavo_germanic?(str)
          # The source also tested that the next character is not 'Y'; that is
          # always true in this branch because the next character is 'N'.
          ['N', 'KN', 2]
        else
          ['KN', 'KN', 2]
        end
      elsif str[pos + 1, 2] == 'LI' && !slavo_germanic?(str)
        ['KL', 'L', 2]
      elsif pos.zero? && (nxt == 'Y' || /\A(?:E[SPBLYIR]|I[BLNE])\z/ =~ str[pos + 1, 2])
        ['K', 'J', 2]
      elsif (str[pos + 1, 2] == 'ER' || nxt == 'Y') &&
            /\A[DRM]ANGER\z/ !~ str[0, 6] &&
            /\A[EI]\z/ !~ str[pos - 1, 1] &&
            /\A[RO]GY\z/ !~ str[pos - 1, 3]
        ['K', 'J', 2]
      elsif /\A[EIY]\z/ =~ nxt || /\A[AO]GGI\z/ =~ str[pos - 1, 4]
        if /\AV[AO]N \z/ =~ str[0, 4] || str[0, 3] == 'SCH' || str[pos + 1, 2] == 'ET'
          ['K', 'K', 2]
        elsif str[pos + 1, 4] == 'IER '
          ['J', 'J', 2]
        else
          ['J', 'K', 2]
        end
      elsif nxt == 'G'
        ['K', 'K', 2]
      else
        ['K', 'K', 1]
      end
    end

    # Encodes the pair GH starting at +pos+.
    def lookup_gh(str, pos)
      if pos > 0 && !vowel?(str[pos - 1, 1])
        ['K', 'K', 2]
      elsif pos.zero?
        str[pos + 2, 1] == 'I' ? ['J', 'J', 2] : ['K', 'K', 2]
      elsif (pos > 1 && /\A[BHD]\z/ =~ str[pos - 2, 1]) ||
            (pos > 2 && /\A[BHD]\z/ =~ str[pos - 3, 1]) ||
            (pos > 3 && /\A[BH]\z/ =~ str[pos - 4, 1])
        [nil, nil, 2]
      elsif pos > 2 && str[pos - 1, 1] == 'U' && /\A[CGLRT]\z/ =~ str[pos - 3, 1]
        ['F', 'F', 2]
      elsif pos > 0 && str[pos - 1, 1] != 'I'
        ['K', 'K', 2]
      else
        [nil, nil, 2]
      end
    end

    # Encodes a J at +pos+.
    def lookup_j(str, pos, last)
      if str[pos, 4] == 'JOSE' || str[0, 4] == 'SAN '
        if (pos.zero? && str[pos + 4, 1] == ' ') || str[0, 4] == 'SAN '
          ['H', 'H', 1]
        else
          ['J', 'H', 1]
        end
      else
        step = skip_double(str, pos)

        # JOSE was ruled out above, so only the position matters here.
        if pos.zero?
          ['J', 'A', step]
        elsif vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /\A[AO]\z/ =~ str[pos + 1, 1]
          ['J', 'H', step]
        elsif last == pos
          ['J', nil, step]
        elsif /\A[LTKSNMBZ]\z/ !~ str[pos + 1, 1] && /\A[SKL]\z/ !~ str[pos - 1, 1]
          ['J', 'J', step]
        else
          [nil, nil, step]
        end
      end
    end

    # Encodes an L at +pos+.
    def lookup_l(str, pos, length, last)
      return ['L', 'L', 1] unless str[pos + 1, 1] == 'L'

      # For these word shapes the double L is dropped from the secondary code.
      drop_secondary =
        (length - 3 == pos && /\A(?:ILL[OA]|ALLE)\z/ =~ str[pos - 1, 4]) ||
        ((/\A[AO]S\z/ =~ str[last - 1, 2] || /\A[AO]\z/ =~ str[last, 1]) && str[pos - 1, 4] == 'ALLE')

      ['L', (drop_secondary ? nil : 'L'), 2]
    end

    # Encodes an S at +pos+.
    def lookup_s(str, pos, last)
      nxt = str[pos + 1, 1]

      if /\A[IY]SL\z/ =~ str[pos - 1, 3]
        [nil, nil, 1]
      elsif pos.zero? && str[pos, 5] == 'SUGAR'
        ['X', 'S', 1]
      elsif str[pos, 2] == 'SH'
        if /\AH(?:EIM|OEK|OLM|OLZ)\z/ =~ str[pos + 1, 4]
          ['S', 'S', 2]
        else
          ['X', 'X', 2]
        end
      elsif /\ASI[OA]\z/ =~ str[pos, 3] || str[pos, 4] == 'SIAN'
        ['S', (slavo_germanic?(str) ? 'S' : 'X'), 3]
      elsif (pos.zero? && /\A[MNLW]\z/ =~ nxt) || nxt == 'Z'
        ['S', 'X', (nxt == 'Z' ? 2 : 1)]
      elsif str[pos, 2] == 'SC'
        lookup_sc(str, pos)
      else
        silent = last == pos && /\A[AO]I\z/ =~ str[pos - 2, 2]
        [(silent ? nil : 'S'), 'S', ((/\A[SZ]\z/ =~ nxt) ? 2 : 1)]
      end
    end

    # Encodes the pair SC starting at +pos+.
    def lookup_sc(str, pos)
      if str[pos + 2, 1] == 'H'
        if /\A(?:OO|ER|EN|UY|ED|EM)\z/ =~ str[pos + 3, 2]
          [((/\AE[RN]\z/ =~ str[pos + 3, 2]) ? 'X' : 'SK'), 'SK', 3]
        else
          plain_s = pos.zero? && !vowel?(str[3, 1]) && str[pos + 3, 1] != 'W'
          ['X', (plain_s ? 'S' : 'X'), 3]
        end
      elsif /\A[IEY]\z/ =~ str[pos + 2, 1]
        ['S', 'S', 3]
      else
        ['SK', 'SK', 3]
      end
    end

    # Encodes a T at +pos+.
    def lookup_t(str, pos)
      if str[pos, 4] == 'TION' || /\AT(?:IA|CH)\z/ =~ str[pos, 3]
        ['X', 'X', 3]
      elsif str[pos, 2] == 'TH' || str[pos, 3] == 'TTH'
        if /\A[OA]M\z/ =~ str[pos + 2, 2] || /\AV[AO]N \z/ =~ str[0, 4] || str[0, 3] == 'SCH'
          ['T', 'T', 2]
        else
          # The primary code uses the digit zero for this TH.
          ['0', 'T', 2]
        end
      else
        ['T', 'T', ((/\A[TD]\z/ =~ str[pos + 1, 1]) ? 2 : 1)]
      end
    end

    # Encodes a W at +pos+.
    def lookup_w(str, pos, last)
      return ['R', 'R', 2] if str[pos, 2] == 'WR'

      pri = nil
      sec = nil
      if pos.zero? && (vowel?(str[pos + 1, 1]) || str[pos, 2] == 'WH')
        pri = 'A'
        sec = vowel?(str[pos + 1, 1]) ? 'F' : 'A'
      end

      if (last == pos && vowel?(str[pos - 1, 1])) || str[0, 3] == 'SCH' ||
         /\A(?:EWSKI|EWSKY|OWSKI|OWSKY)\z/ =~ str[pos - 1, 5]
        # Interpolation turns a nil prefix into an empty string.
        [pri, "#{sec}F", 1]
      elsif /\AWI[CT]Z\z/ =~ str[pos, 4]
        ["#{pri}TS", "#{sec}FX", 4]
      else
        [pri, sec, 1]
      end
    end

    # Encodes an X at +pos+ (an initial X is handled by #phonetic_code).
    def lookup_x(str, pos, last)
      step = (/\A[CX]\z/ =~ str[pos + 1, 1]) ? 2 : 1
      silent = last == pos && (/\A[IE]AU\z/ =~ str[pos - 3, 3] || /\A[AO]U\z/ =~ str[pos - 2, 2])

      silent ? [nil, nil, step] : ['KS', 'KS', step]
    end

    # Encodes a Z at +pos+.
    def lookup_z(str, pos)
      return ['J', 'J', 2] if str[pos + 1, 1] == 'H'

      step = skip_double(str, pos)
      if /\AZ[OIA]\z/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && str[pos - 1, 1] != 'T'))
        ['S', 'TS', step]
      else
        ['S', 'S', step]
      end
    end

    extend self
  end
end
@@ -1,3 +1,3 @@
1
1
  module RubyFish
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 2
9
- version: 0.0.2
8
+ - 3
9
+ version: 0.0.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - Yury Korolev
@@ -29,6 +29,7 @@ extra_rdoc_files: []
29
29
 
30
30
  files:
31
31
  - lib/rubyfish/damerau_levenshtein.rb
32
+ - lib/rubyfish/double_metaphone.rb
32
33
  - lib/rubyfish/hamming.rb
33
34
  - lib/rubyfish/jaro.rb
34
35
  - lib/rubyfish/jaro_winkler.rb
@@ -36,7 +37,6 @@ files:
36
37
  - lib/rubyfish/longest_subsequence.rb
37
38
  - lib/rubyfish/longest_substring.rb
38
39
  - lib/rubyfish/mmatrix.rb
39
- - lib/rubyfish/mra.rb
40
40
  - lib/rubyfish/version.rb
41
41
  - lib/rubyfish.rb
42
42
  - LICENSE
@@ -57,7 +57,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - ">="
59
59
  - !ruby/object:Gem::Version
60
- hash: 663674839144737507
60
+ hash: 3683456947558493036
61
61
  segments:
62
62
  - 0
63
63
  version: "0"
@@ -77,6 +77,6 @@ rubyforge_project: rubyfish
77
77
  rubygems_version: 1.3.7
78
78
  signing_key:
79
79
  specification_version: 3
80
- summary: A new gem templates
80
+ summary: Library for doing approximate and phonetic matching of strings
81
81
  test_files: []
82
82
 
@@ -1,3 +0,0 @@
1
- module RubyFish::MRA
2
-
3
- end