phonetic 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +1 -1
- data/README.md +7 -0
- data/lib/phonetic.rb +1 -0
- data/lib/phonetic/core_ext/string/dm_soundex.rb +12 -0
- data/lib/phonetic/dm_soundex.rb +82 -0
- data/lib/phonetic/dm_soundex_map.rb +233 -0
- data/lib/phonetic/double_metaphone.rb +519 -569
- data/lib/phonetic/metaphone.rb +43 -69
- data/lib/phonetic/version.rb +1 -1
- data/spec/phonetic/core_ext/string/dm_soundex_spec.rb +9 -0
- data/spec/phonetic/dm_soundex_spec.rb +13 -0
- data/spec/support/dm_soundex_data.rb +259 -0
- data/spec/support/double_metaphone_data.rb +30 -0
- metadata +11 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a20da7ce0b4dab68d7671088098226a035c64b05
|
4
|
+
data.tar.gz: 2b721bc986d8e23ba6780bb7cab92059e6a7652b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14325fa3846251dd1a1cbc59b38c12a32471291b45b07074387747fa9331b5ad98b1b0afaa8dbbac62872f9bf959d5e622742e5ec673f3e1294807f91b5fdc85
|
7
|
+
data.tar.gz: ad80a4c26cae46cbc516cc6cfebbfea39be69fbfe86426737cd94c065da57f4448ff3ffe9f892bd60af94fd834ae122c472cdc09b5fe683344ef479d1f31f90c
|
data/.rspec
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
--color
|
2
|
-
--format doc
|
2
|
+
--format doc
|
data/README.md
CHANGED
@@ -93,6 +93,13 @@ or use alias:
|
|
93
93
|
'Bonnie'.nysiis # => 'BANY'
|
94
94
|
```
|
95
95
|
|
96
|
+
### Daitch–Mokotoff Soundex (D–M Soundex)
|
97
|
+
```ruby
|
98
|
+
'Anja'.dm_soundex # => ['060000', '064000']
|
99
|
+
'Schwarz'.dm_soundex # => ['474000', '479400']
|
100
|
+
'Schtolteheim'.dm_soundex # => ['283560']
|
101
|
+
```
|
102
|
+
|
96
103
|
## Contributing
|
97
104
|
|
98
105
|
1. Fork it
|
data/lib/phonetic.rb
CHANGED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/dm_soundex'
|
2
|
+
|
3
|
+
class String
  # Daitch-Mokotoff Soundex codes of this string.
  #
  # Delegates to Phonetic::DMSoundex.encode, passing the receiver and the
  # given options through unchanged.
  #
  # @param options [Hash] forwarded as-is to Phonetic::DMSoundex.encode
  # @return whatever Phonetic::DMSoundex.encode returns (an array of code
  #   strings in the examples below)
  # @example
  #   'Anja'.dm_soundex         # => ['060000', '064000']
  #   'Schwarz'.dm_soundex      # => ['474000', '479400']
  #   'Schtolteheim'.dm_soundex # => ['283560']
  def dm_soundex(options = {})
    Phonetic::DMSoundex.encode(self, options)
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'phonetic/algorithm'
|
2
|
+
require 'phonetic/dm_soundex_map'
|
3
|
+
|
4
|
+
module Phonetic
  # Daitch–Mokotoff Soundex (D–M Soundex) is a phonetic algorithm invented
  # in 1985 by Jewish genealogists Gary Mokotoff and Randy Daitch.
  #
  # A word can produce more than one code, because some letter groups in MAP
  # carry two alternative codings; every combination is returned.
  #
  # @example
  #    Phonetic::DMSoundex.encode('Anja') # => ['060000', '064000']
  #    Phonetic::DMSoundex.encode('Schwarz') # => ['474000', '479400']
  #    Phonetic::DMSoundex.encode('Schtolteheim') # => ['283560']
  class DMSoundex < Algorithm
    # Encode a string to its D-M Soundex codes.
    #
    # @param str [String] text to encode
    # @param options [Hash] passed through to encode_word
    # @return [Array<String>] unique six-character codes
    def self.encode(str, options = {})
      encode_word(str, options)
    end

    # Encode word to its D-M Soundex codes.
    #
    # The word is upcased and everything outside A-Z is removed before the
    # lookup. The word is then scanned left to right; at each position the
    # longest letter group known to MAP is consumed and one of its three
    # codes is appended: the "start of word" code for the first coded group,
    # the "before a vowel" code when the following letter is one of
    # A E I O U J Y, and the "any other situation" code otherwise.
    #
    # @param word [String] word to encode
    # @param options [Hash] accepted for interface compatibility; not read here
    # @return [Array<String>] unique six-character codes
    def self.encode_word(word, options = {})
      w = word.strip.upcase.gsub(/[^A-Z]+/, '')
      code = init_code
      # Tracks whether the next coded group is the first one of the word.
      # Testing `i == 0` instead is wrong for words that begin with a doubled
      # letter ("Aaron"): index 0 is skipped as a duplicate, so the group
      # starting at index 1 must still receive the start-of-word code.
      at_start = true
      i = 0
      while i < w.size
        # The first letter of a doubled pair is skipped so the pair is coded once.
        if w[i] != w[i + 1]
          c = find_code(MAP, w, i)
          if c
            extra = c[3] # letters consumed beyond the one at position i
            if at_start
              code.add c[0]
              at_start = false
            elsif w[i + extra + 1] =~ /[AEIOUJY]/
              code.add c[1]
            else
              code.add c[2]
            end
            i += extra
          end
        end
        i += 1
      end
      code.result
    end

    # Build the accumulator used by encode_word.
    #
    # The accumulator is an array of candidate codes; each candidate is an
    # array of code fragments. Two singleton methods are attached to it:
    #
    # * add(a)  - appends fragment +a+ to every candidate, unless it equals the
    #   candidate's last fragment. When +a+ is a two-element Array the
    #   candidates fork: the existing ones receive a[0] and copies of them
    #   (taken before that change) receive a[1].
    # * result  - joins each candidate, keeps the first six characters, pads
    #   with '0' to six characters and removes duplicates.
    #
    # @return [Array<Array<String>>] accumulator with #add and #result
    def self.init_code
      code = [[]]
      def code.add(a)
        case a
        when Array
          # Copies must be computed before the in-place update below.
          c = self.map{|w| w.last != a[1] ? w + [a[1]] : w}
          self.map!{|w| w.last != a[0] ? w + [a[0]] : w}
          self.push(*c)
        else
          self.map!{|w| w.last != a ? w + [a] : w}
        end
      end
      def code.result
        self.map{|w| w.join[0..5].ljust(6, '0')}.uniq
      end
      code
    end

    # Find the coding of the longest letter group of +w+ starting at +i+.
    #
    # Walks the nested +map+ one letter at a time. The returned array is a
    # fresh four-element array [start code, before-vowel code, other code,
    # extra letters consumed]; the shared table is never written to.
    #
    # @param map [Hash] current level of the coding table
    # @param w [String] normalized word
    # @param i [Integer] index of the letter to look up at this level
    # @param last [Array, nil] best match found on shallower levels
    # @param count [Integer] depth of this level (letters already matched)
    # @return [Array, nil] coding with consumed-letter count, or nil if none
    def self.find_code(map, w, i, last = nil, count = 0)
      elem = map[w[i]]
      case elem
      when Array
        elem[0, 3] + [count]
      when Hash
        best = last
        own = elem['self']
        # A 'self' entry means the prefix matched so far is itself a valid
        # group; remember it in case no longer group matches.
        best = own[0, 3] + [count] if own
        find_code(elem, w, i + 1, best, count + 1)
      when nil
        last
      end
    end

    # A bare `private` does not apply to `def self.` methods, so the helpers
    # are hidden explicitly.
    private_class_method :init_code, :find_code
  end
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'phonetic/algorithm'
|
3
|
+
|
4
|
+
module Phonetic
  class DMSoundex < Algorithm
    # Coding table for D-M Soundex, organised as a letter trie.
    #
    # Each key is a single letter. A value is either
    #
    # * an Array leaf [start-of-word code, before-a-vowel code, other code],
    #   where an element that is itself a two-element Array lists two
    #   alternative codes, or
    # * a Hash holding longer letter groups that begin with the letters
    #   matched so far; its optional 'self' key holds the coding of the
    #   group matched so far.
    #
    # The trailing comment on every entry spells out the letter group it codes.
    MAP = {
      'A' => {
        'self' => ['0', '', ''], # A
        'I' => ['0', '1', ''], # AI
        'J' => ['0', '1', ''], # AJ
        'Y' => ['0', '1', ''], # AY
        'U' => ['0', '7', ''] # AU
      },
      # NOTE(review): encode_word removes everything outside A-Z before the
      # lookup, so this entry is not reached through encode_word.
      'Ą' => ['', '', ['6', '']],
      'E' => {
        'self' => ['0', '', ''], # E
        'I' => ['0', '1', ''], # EI
        'Y' => ['0', '1', ''], # EY
        'J' => ['0', '1', ''], # EJ
        'U' => ['1', '1', ''] # EU
      },
      'O' => {
        'self' => ['0', '', ''], # O
        'I' => ['0', '1', ''], # OI
        'J' => ['0', '1', ''], # OJ
        'Y' => ['0', '1', ''] # OY
      },
      'U' => {
        'self' => ['0', '', ''], # U
        'I' => ['0', '1', ''], # UI
        'J' => ['0', '1', ''], # UJ
        'Y' => ['0', '1', ''], # UY
        'E' => ['0', '', ''] # UE
      },
      'I' => {
        'self' => ['0', '', ''], # I
        'A' => ['1', '', ''], # IA
        'E' => ['1', '', ''], # IE
        'O' => ['1', '', ''], # IO
        'U' => ['1', '', ''] # IU
      },
      'Y' => ['1', '', ''], # Y
      'J' => [['1', '4'], ['', '4'], ['', '4']], # J
      'B' => ['7', '7', '7'], # B
      'C' => {
        'self' => [['5', '4'], ['5', '4'], ['5', '4']], # C
        'H' => {
          'self' => [['5', '4'], ['5', '4'], ['5', '4']], # CH
          'S' => ['5', '54', '54'] # CHS
        },
        'K' => [['5', '45'], ['5', '45'], ['5', '45']], # CK
        'S' => {
          'self' => ['4', '4', '4'], # CS
          'Z' => ['4', '4', '4'] # CSZ
        },
        'Z' => {
          'self' => ['4', '4', '4'], # CZ
          'S' => ['4', '4', '4'] # CZS
        }
      },
      'D' => {
        'self' => ['3', '3', '3'], # D
        'R' => { # DR
          'S' => ['4', '4', '4'], # DRS
          'Z' => ['4', '4', '4'] # DRZ
        },
        'S' => {
          'self' => ['4', '4', '4'], # DS
          'H' => ['4', '4', '4'] # DSH
        },
        'T' => ['3', '3', '3'], # DT
        'Z' => {
          'self' => ['4', '4', '4'], # DZ
          'H' => ['4', '4', '4'], # DZH
          'S' => ['4', '4', '4'] # DZS
        }
      },
      'F' => {
        'self' => ['7', '7', '7'], # F
        'B' => ['7', '7', '7'] # FB
      },
      'G' => ['5', '5', '5'], # G
      'H' => ['5', '5', ''], # H
      'K' => {
        'self' => ['5', '5', '5'], # K
        'H' => ['5', '5', '5'], # KH
        'S' => ['5', '54', '54'] # KS
      },
      'L' => ['8', '8', '8'], # L
      'M' => {
        'self' => ['6', '6', '6'], # M
        'N' => ['', '66', '66'] # MN
      },
      'N' => {
        'self' => ['6', '6', '6'], # N
        'M' => ['', '66', '66'] # NM
      },
      'P' => {
        'self' => ['7', '7', '7'], # P
        'F' => ['7', '7', '7'], # PF
        'H' => ['7', '7', '7'] # PH
      },
      'R' => {
        'self' => ['9', '9', '9'], # R
        'S' => [['94', '4'], ['94', '4'], ['94', '4']], # RS
        'Z' => [['4', '94'], ['4', '94'], ['4', '94']] # RZ
      },
      'Q' => ['5', '5', '5'], # Q
      'S' => {
        'self' => ['4', '4', '4'], # S
        'C' => {
          'self' => ['2', '4', '4'], # SC
          'H' => {
            'self' => ['4', '4', '4'], # SCH
            # SCHD belongs here, next to SCHT. It used to sit under
            # S -> T -> S, where the letters actually spelled "STSD".
            'D' => ['2', '43', '43'], # SCHD
            'T' => {
              'self' => ['2', '43', '43'], # SCHT
              'S' => { # SCHTS
                'C' => { # SCHTSC
                  'H' => ['2', '4', '4'] # SCHTSCH
                },
                'H' => ['2', '4', '4'] # SCHTSH
              },
              'C' => { # SCHTC
                'H' => ['2', '4', '4'] # SCHTCH
              }
            }
          }
        },
        'D' => ['2', '43', '43'], # SD
        'H' => {
          'self' => ['4', '4', '4'], # SH
          'C' => { # SHC
            'H' => ['2', '4', '4'] # SHCH
          },
          'D' => ['2', '43', '43'], # SHD
          'T' => {
            'self' => ['2', '43', '43'], # SHT
            'C' => { # SHTC
              'H' => ['2', '4', '4'] # SHTCH
            },
            'S' => { # SHTS
              'H' => ['2', '4', '4'] # SHTSH
            }
          }
        },
        'T' => {
          'self' => ['2', '43', '43'], # ST
          'C' => { # STC
            'H' => ['2', '4', '4'] # STCH
          },
          'S' => { # STS
            'C' => { # STSC
              'H' => ['2', '4', '4'] # STSCH
            },
            'H' => ['2', '4', '4'] # STSH
          },
          'R' => { # STR
            'S' => ['2', '4', '4'], # STRS
            'Z' => ['2', '4', '4'] # STRZ
          }
        },
        'Z' => {
          'self' => ['4', '4', '4'], # SZ
          'C' => { # SZC
            'S' => ['2', '4', '4'], # SZCS
            'Z' => ['2', '4', '4'] # SZCZ
          },
          'D' => ['2', '43', '43'], # SZD
          'T' => ['2', '43', '43'] # SZT
        }
      },
      'T' => {
        'self' => ['3', '3', '3'], # T
        'C' => {
          'self' => ['4', '4', '4'], # TC
          'H' => ['4', '4', '4'] # TCH
        },
        'H' => ['3', '3', '3'], # TH
        'R' => { # TR
          'C' => { # TRC
            'H' => ['4', '4', '4'] # TRCH
          },
          'S' => ['4', '4', '4'], # TRS
          'Z' => ['4', '4', '4'] # TRZ
        },
        'S' => {
          'self' => ['4', '4', '4'], # TS
          'H' => ['4', '4', '4'], # TSH
          'C' => { # TSC
            'H' => ['4', '4', '4'] # TSCH
          },
          'Z' => ['4', '4', '4'] # TSZ
        },
        'T' => { # TT
          'C' => { # TTC
            'H' => ['4', '4', '4'] # TTCH
          },
          'S' => {
            'self' => ['4', '4', '4'], # TTS
            'C' => { # TTSC
              'H' => ['4', '4', '4'] # TTSCH
            },
            'Z' => ['4', '4', '4'] # TTSZ
          },
          'Z' => ['4', '4', '4'] # TTZ
        },
        'Z' => {
          'self' => ['4', '4', '4'], # TZ
          'S' => ['4', '4', '4'] # TZS
        }
      },
      'X' => ['5', '54', '54'], # X
      'V' => ['7', '7', '7'], # V
      'W' => ['7', '7', '7'], # W
      # NOTE(review): the published D-M chart appears to list further Z groups
      # (e.g. ZD, ZHD, ZDZ) that have no entry here - confirm against the chart.
      'Z' => {
        'self' => ['4', '4', '4'], # Z
        'H' => {
          'self' => ['4', '4', '4'], # ZH
          'S' => { # ZHS
            'H' => ['4', '4', '4'] # ZHSH
          }
        },
        'S' => {
          'self' => ['4', '4', '4'], # ZS
          'C' => { # ZSC
            'H' => ['4', '4', '4'] # ZSCH
          }
        }
      }
    }
  end
end
|
@@ -9,8 +9,10 @@ module Phonetic
|
|
9
9
|
#
|
10
10
|
# This implementation based on the PHP implementation by Stephen Woodbridge
|
11
11
|
# and contains modifications of algorithm by Kevin Atkinson.
|
12
|
-
# @see http://swoodbridge.com/DoubleMetaPhone/
|
13
|
-
#
|
12
|
+
# @see http://swoodbridge.com/DoubleMetaPhone/
|
13
|
+
# PHP implementation by Stephen Woodbridge
|
14
|
+
# @see http://aspell.net/metaphone/dmetaph.cpp
|
15
|
+
# C++ implementation with modifications by Kevin Atkinson
|
14
16
|
# @example
|
15
17
|
# Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
|
16
18
|
# Phonetic::DoubleMetaphone.encode('dumb') # => ['TM', 'TM']
|
@@ -20,605 +22,73 @@ module Phonetic
|
|
20
22
|
# Phonetic::Metaphone2.encode('dumb') # => ['TM', 'TM']
|
21
23
|
# Phonetic::Metaphone2.encode('edgar') # => ['ATKR', 'ATKR']
|
22
24
|
class DoubleMetaphone < Algorithm
|
23
|
-
VOWELS = 'AEIOUY'
|
24
|
-
|
25
25
|
# Encode word to its Double Metaphone code.
|
26
26
|
def self.encode_word(word, options = { size: 4 })
|
27
27
|
code_size = options[:size] || 4
|
28
28
|
w = word.strip.upcase
|
29
|
-
|
30
|
-
|
29
|
+
code = ['', '']
|
30
|
+
def code.add(primary, secondary)
|
31
|
+
self[0] += primary
|
32
|
+
self[1] += secondary
|
33
|
+
end
|
31
34
|
i = 0
|
32
35
|
len = w.size
|
33
36
|
last = len - 1
|
34
37
|
# pad the original string so that we can index beyond the edge of the world
|
35
38
|
w += ' ' * 5
|
36
|
-
|
37
|
-
i
|
38
|
-
# initial 'X' is pronounced 'Z' e.g. 'Xavier'
|
39
|
-
if w[0] == 'X'
|
40
|
-
primary += 'S'
|
41
|
-
secondary += 'S'
|
42
|
-
i += 1
|
43
|
-
end
|
44
|
-
while i < len && (primary.size < code_size || primary.size < code_size)
|
39
|
+
i += encode_start_of_word(w, code)
|
40
|
+
while i < len && (code.first.size < code_size || code.last.size < code_size)
|
45
41
|
case w[i]
|
46
42
|
when 'A', 'E', 'I', 'O', 'U', 'Y'
|
47
|
-
if i == 0
|
48
|
-
# all init vowels now map to 'A'
|
49
|
-
primary += 'A'
|
50
|
-
secondary += 'A'
|
51
|
-
end
|
52
43
|
i += 1
|
53
44
|
when 'B'
|
54
45
|
# "-mb", e.g", "dumb", already skipped over...
|
55
|
-
|
56
|
-
secondary += 'P'
|
57
|
-
i += (w[i + 1] == 'B') ? 2 : 1
|
46
|
+
i += gen_encode(w, i, 'P', 'P', code)
|
58
47
|
when 'Ç', 'ç'
|
59
|
-
|
60
|
-
secondary += 'S'
|
48
|
+
code.add 'S', 'S'
|
61
49
|
i += 1
|
62
50
|
when 'C'
|
63
|
-
|
64
|
-
if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
|
65
|
-
(w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
|
66
|
-
primary += 'K'
|
67
|
-
secondary += 'K'
|
68
|
-
i += 2
|
69
|
-
# special case 'caesar'
|
70
|
-
elsif i == 0 && w[i, 6] == 'CAESAR'
|
71
|
-
primary += 'S'
|
72
|
-
secondary += 'S'
|
73
|
-
i += 2
|
74
|
-
# italian 'chianti'
|
75
|
-
elsif w[i, 4] == 'CHIA'
|
76
|
-
primary += 'K'
|
77
|
-
secondary += 'K'
|
78
|
-
i += 2
|
79
|
-
elsif w[i, 2] == 'CH'
|
80
|
-
# find 'michael'
|
81
|
-
if i > 0 && w[i, 4] == 'CHAE'
|
82
|
-
primary += 'K'
|
83
|
-
secondary += 'X'
|
84
|
-
i += 2
|
85
|
-
# greek roots e.g. 'chemistry', 'chorus'
|
86
|
-
elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
|
87
|
-
w[0, 5] != 'CHORE'
|
88
|
-
primary += 'K'
|
89
|
-
secondary += 'K'
|
90
|
-
i += 2
|
91
|
-
else
|
92
|
-
# germanic, greek, or otherwise 'ch' for 'kh' sound
|
93
|
-
if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
|
94
|
-
# 'architect but not 'arch', 'orchestra', 'orchid'
|
95
|
-
(i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
|
96
|
-
(w[i + 2] =~ /[TS]/) ||
|
97
|
-
((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
|
98
|
-
# e.g., 'wachtler', 'wechsler', but not 'tichner'
|
99
|
-
(w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
|
100
|
-
primary += 'K'
|
101
|
-
secondary += 'K'
|
102
|
-
else
|
103
|
-
if i > 0
|
104
|
-
if w[0, 2] == 'MC'
|
105
|
-
# e.g., "McHugh"
|
106
|
-
primary += 'K'
|
107
|
-
secondary += 'K'
|
108
|
-
else
|
109
|
-
primary += 'X'
|
110
|
-
secondary += 'K'
|
111
|
-
end
|
112
|
-
else
|
113
|
-
primary += 'X'
|
114
|
-
secondary += 'X'
|
115
|
-
end
|
116
|
-
end
|
117
|
-
i += 2
|
118
|
-
end
|
119
|
-
elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
|
120
|
-
# e.g, 'czerny'
|
121
|
-
primary += 'S'
|
122
|
-
secondary += 'X'
|
123
|
-
i += 2
|
124
|
-
elsif w[i + 1, 3] == 'CIA'
|
125
|
-
# e.g., 'focaccia'
|
126
|
-
primary += 'X'
|
127
|
-
secondary += 'X'
|
128
|
-
i += 3
|
129
|
-
# double 'C', but not if e.g. 'McClellan'
|
130
|
-
elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
|
131
|
-
# 'bellocchio' but not 'bacchus'
|
132
|
-
if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
|
133
|
-
# 'accident', 'accede' 'succeed'
|
134
|
-
if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
|
135
|
-
# 'bacci', 'bertucci', other italian
|
136
|
-
primary += 'KS'
|
137
|
-
secondary += 'KS'
|
138
|
-
else
|
139
|
-
primary += 'X'
|
140
|
-
secondary += 'X'
|
141
|
-
end
|
142
|
-
i += 3
|
143
|
-
else
|
144
|
-
# Pierce's rule
|
145
|
-
primary += 'K'
|
146
|
-
secondary += 'K'
|
147
|
-
i += 2
|
148
|
-
end
|
149
|
-
elsif w[i, 2] =~ /CK|CG|CQ/
|
150
|
-
primary += 'K'
|
151
|
-
secondary += 'K'
|
152
|
-
i += 2
|
153
|
-
elsif w[i, 2] =~ /CI|CE|CY/
|
154
|
-
# italian vs. english
|
155
|
-
if w[i, 3] =~ /CIO|CIE|CIA/
|
156
|
-
primary += 'S'
|
157
|
-
secondary += 'X'
|
158
|
-
else
|
159
|
-
primary += 'S'
|
160
|
-
secondary += 'S'
|
161
|
-
end
|
162
|
-
i += 2
|
163
|
-
else
|
164
|
-
primary += 'K'
|
165
|
-
secondary += 'K'
|
166
|
-
# name sent in 'mac caffrey', 'mac gregor'
|
167
|
-
if w[i + 1, 2] =~ /\s[CQG]/
|
168
|
-
i += 3
|
169
|
-
else
|
170
|
-
if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
|
171
|
-
i += 2
|
172
|
-
else
|
173
|
-
i += 1
|
174
|
-
end
|
175
|
-
end
|
176
|
-
end
|
51
|
+
i += encode_c(w, i, len, code)
|
177
52
|
when 'D'
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
primary += 'J'
|
182
|
-
secondary += 'J'
|
183
|
-
i += 3
|
184
|
-
else
|
185
|
-
# e.g. 'edgar'
|
186
|
-
primary += 'TK'
|
187
|
-
secondary += 'TK'
|
188
|
-
i += 2
|
189
|
-
end
|
190
|
-
elsif w[i, 2] =~ /DT|DD/
|
191
|
-
primary += 'T'
|
192
|
-
secondary += 'T'
|
193
|
-
i += 2
|
194
|
-
else
|
195
|
-
primary += 'T'
|
196
|
-
secondary += 'T'
|
197
|
-
i += 1
|
198
|
-
end
|
199
|
-
when 'F'
|
200
|
-
if w[i + 1] == 'F'
|
201
|
-
i += 2
|
202
|
-
else
|
203
|
-
i += 1
|
204
|
-
end
|
205
|
-
primary += 'F'
|
206
|
-
secondary += 'F'
|
53
|
+
i += encode_d(w, i, len, code)
|
54
|
+
when 'F', 'K', 'N'
|
55
|
+
i += gen_encode(w, i, w[i], w[i], code)
|
207
56
|
when 'G'
|
208
|
-
|
209
|
-
if i > 0 && !vowel?(w[i - 1])
|
210
|
-
primary += 'K'
|
211
|
-
secondary += 'K'
|
212
|
-
i += 2
|
213
|
-
elsif i == 0
|
214
|
-
# ghislane, ghiradelli
|
215
|
-
if w[i + 2] == 'I'
|
216
|
-
primary += 'J'
|
217
|
-
secondary += 'J'
|
218
|
-
else
|
219
|
-
primary += 'K'
|
220
|
-
secondary += 'K'
|
221
|
-
end
|
222
|
-
i += 2
|
223
|
-
# Parker's rule (with some further refinements) - e.g., 'hugh'
|
224
|
-
elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
|
225
|
-
# e.g., 'bough'
|
226
|
-
(i > 2 && w[i - 3] =~ /[BHD]/) ||
|
227
|
-
# e.g., 'broughton'
|
228
|
-
(i > 3 && w[i - 4] =~ /[BH]/)
|
229
|
-
i += 2
|
230
|
-
else
|
231
|
-
# e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
|
232
|
-
if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
|
233
|
-
primary += 'F'
|
234
|
-
secondary += 'F'
|
235
|
-
else
|
236
|
-
if i > 0 && w[i - 1] != 'I'
|
237
|
-
primary += 'K'
|
238
|
-
secondary += 'K'
|
239
|
-
end
|
240
|
-
end
|
241
|
-
i += 2
|
242
|
-
end
|
243
|
-
elsif w[i + 1] == 'N'
|
244
|
-
if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
|
245
|
-
primary += 'KN'
|
246
|
-
secondary += 'N'
|
247
|
-
else
|
248
|
-
# not e.g. 'cagney'
|
249
|
-
if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
|
250
|
-
primary += 'N'
|
251
|
-
secondary += 'KN'
|
252
|
-
else
|
253
|
-
primary += 'KN'
|
254
|
-
secondary += 'KN'
|
255
|
-
end
|
256
|
-
end
|
257
|
-
i += 2
|
258
|
-
# 'tagliaro'
|
259
|
-
elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
|
260
|
-
primary += 'KL'
|
261
|
-
secondary += 'L'
|
262
|
-
i += 2
|
263
|
-
# -ges-,-gep-,-gel-, -gie- at beginning
|
264
|
-
elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
|
265
|
-
primary += 'K'
|
266
|
-
secondary += 'J'
|
267
|
-
i += 2
|
268
|
-
# -ger-, -gy-
|
269
|
-
elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
|
270
|
-
!(w[0, 6] =~ /[DRM]ANGER/) &&
|
271
|
-
!(i > 0 && w[i - 1] =~ /[EI]/) &&
|
272
|
-
!(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
|
273
|
-
primary += 'K'
|
274
|
-
secondary += 'J'
|
275
|
-
i += 2
|
276
|
-
# italian e.g, 'biaggi'
|
277
|
-
elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
|
278
|
-
if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
|
279
|
-
primary += 'K'
|
280
|
-
secondary += 'K'
|
281
|
-
else
|
282
|
-
if w[i + 1, 4] =~ /IER\s/
|
283
|
-
primary += 'J'
|
284
|
-
secondary += 'J'
|
285
|
-
else
|
286
|
-
primary += 'J'
|
287
|
-
secondary += 'K'
|
288
|
-
end
|
289
|
-
end
|
290
|
-
i += 2
|
291
|
-
else
|
292
|
-
if w[i + 1] == 'G'
|
293
|
-
i += 2
|
294
|
-
else
|
295
|
-
i += 1
|
296
|
-
end
|
297
|
-
primary += 'K'
|
298
|
-
secondary += 'K'
|
299
|
-
end
|
57
|
+
i += encode_g(w, i, len, code)
|
300
58
|
when 'H'
|
301
|
-
|
302
|
-
if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
|
303
|
-
primary += 'H'
|
304
|
-
secondary += 'H'
|
305
|
-
i += 2
|
306
|
-
else # also takes care of 'HH'
|
307
|
-
i += 1
|
308
|
-
end
|
59
|
+
i += encode_h(w, i, len, code)
|
309
60
|
when 'J'
|
310
|
-
|
311
|
-
if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
|
312
|
-
if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
|
313
|
-
primary += 'H'
|
314
|
-
secondary += 'H'
|
315
|
-
else
|
316
|
-
primary += 'J'
|
317
|
-
secondary += 'H'
|
318
|
-
end
|
319
|
-
i += 1
|
320
|
-
else
|
321
|
-
if i == 0 && w[i, 4] != 'JOSE'
|
322
|
-
primary += 'J'
|
323
|
-
secondary += 'A'
|
324
|
-
# Yankelovich/Jankelowicz
|
325
|
-
else
|
326
|
-
# spanish pron. of e.g. 'bajador'
|
327
|
-
if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
|
328
|
-
primary += 'J'
|
329
|
-
secondary += 'H'
|
330
|
-
else
|
331
|
-
if i == last
|
332
|
-
primary += 'J'
|
333
|
-
#secondary += ' '
|
334
|
-
else
|
335
|
-
if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
|
336
|
-
primary += 'J'
|
337
|
-
secondary += 'J'
|
338
|
-
end
|
339
|
-
end
|
340
|
-
end
|
341
|
-
end
|
342
|
-
if w[i + 1] == 'J'
|
343
|
-
i += 2
|
344
|
-
else
|
345
|
-
i += 1
|
346
|
-
end
|
347
|
-
end
|
348
|
-
when 'K'
|
349
|
-
if w[i + 1] == 'K'
|
350
|
-
i += 2
|
351
|
-
else
|
352
|
-
i += 1
|
353
|
-
end
|
354
|
-
primary += 'K'
|
355
|
-
secondary += 'K'
|
61
|
+
i += encode_j(w, i, len, code)
|
356
62
|
when 'L'
|
357
|
-
|
358
|
-
# spanish e.g. 'cabrillo', 'gallegos'
|
359
|
-
if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
|
360
|
-
((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
|
361
|
-
(i > 0 && w[i - 1, 4] == 'ALLE'))
|
362
|
-
primary += 'L'
|
363
|
-
i += 2
|
364
|
-
next
|
365
|
-
end
|
366
|
-
i += 2
|
367
|
-
else
|
368
|
-
i += 1
|
369
|
-
end
|
370
|
-
primary += 'L'
|
371
|
-
secondary += 'L'
|
63
|
+
i += encode_l(w, i, len, code)
|
372
64
|
when 'M'
|
373
|
-
|
374
|
-
# 'dumb','thumb'
|
375
|
-
w[i + 1] == 'M'
|
376
|
-
i += 2
|
377
|
-
else
|
378
|
-
i += 1
|
379
|
-
end
|
380
|
-
primary += 'M'
|
381
|
-
secondary += 'M'
|
382
|
-
when 'N'
|
383
|
-
if w[i + 1] == 'N'
|
384
|
-
i += 2
|
385
|
-
else
|
386
|
-
i += 1
|
387
|
-
end
|
388
|
-
primary += 'N'
|
389
|
-
secondary += 'N'
|
65
|
+
i += encode_m(w, i, len, code)
|
390
66
|
when 'Ñ', 'ñ'
|
391
|
-
|
392
|
-
|
393
|
-
secondary += 'N'
|
67
|
+
code.add 'N', 'N'
|
68
|
+
i += 1
|
394
69
|
when 'P'
|
395
|
-
|
396
|
-
primary += 'F'
|
397
|
-
secondary += 'F'
|
398
|
-
i += 2
|
399
|
-
else
|
400
|
-
# also account for "campbell", "raspberry"
|
401
|
-
if w[i + 1] =~ /[PB]/
|
402
|
-
i += 2
|
403
|
-
else
|
404
|
-
i += 1
|
405
|
-
end
|
406
|
-
primary += 'P'
|
407
|
-
secondary += 'P'
|
408
|
-
end
|
70
|
+
i += encode_p(w, i, len, code)
|
409
71
|
when 'Q'
|
410
|
-
|
411
|
-
i += 2
|
412
|
-
else
|
413
|
-
i += 1
|
414
|
-
end
|
415
|
-
primary += 'K'
|
416
|
-
secondary += 'K'
|
72
|
+
i += gen_encode(w, i, 'K', 'K', code)
|
417
73
|
when 'R'
|
418
|
-
|
419
|
-
if i == last && !slavo_germanic?(w) &&
|
420
|
-
(i > 1 && w[i - 2, 2] == "IE") &&
|
421
|
-
!(i > 3 && w[i - 4, 2] =~ /M[EA]/)
|
422
|
-
secondary += 'R'
|
423
|
-
else
|
424
|
-
primary += 'R'
|
425
|
-
secondary += 'R'
|
426
|
-
end
|
427
|
-
if w[i + 1] == 'R'
|
428
|
-
i += 2
|
429
|
-
else
|
430
|
-
i += 1
|
431
|
-
end
|
74
|
+
i += encode_r(w, i, len, code)
|
432
75
|
when 'S'
|
433
|
-
|
434
|
-
if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
|
435
|
-
i += 1
|
436
|
-
# special case 'sugar-'
|
437
|
-
elsif i == 0 && w[i, 5] == 'SUGAR'
|
438
|
-
primary += 'X'
|
439
|
-
secondary += 'S'
|
440
|
-
i += 1
|
441
|
-
elsif w[i, 2] == 'SH'
|
442
|
-
# germanic
|
443
|
-
if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
|
444
|
-
primary += 'S'
|
445
|
-
secondary += 'S'
|
446
|
-
else
|
447
|
-
primary += 'X'
|
448
|
-
secondary += 'X'
|
449
|
-
end
|
450
|
-
i += 2
|
451
|
-
# italian & armenian
|
452
|
-
elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
|
453
|
-
if !slavo_germanic?(w)
|
454
|
-
primary += 'S'
|
455
|
-
secondary += 'X'
|
456
|
-
else
|
457
|
-
primary += 'S'
|
458
|
-
secondary += 'S'
|
459
|
-
end
|
460
|
-
i += 3
|
461
|
-
# german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
|
462
|
-
# also, -sz- in slavic language altho in hungarian it is pronounced 's'
|
463
|
-
elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
|
464
|
-
primary += 'S'
|
465
|
-
secondary += 'X'
|
466
|
-
if w[i + 1] == 'Z'
|
467
|
-
i += 2
|
468
|
-
else
|
469
|
-
i += 1
|
470
|
-
end
|
471
|
-
elsif w[i, 2] == 'SC'
|
472
|
-
# Schlesinger's rule
|
473
|
-
if w[i + 2] == 'H'
|
474
|
-
# dutch origin, e.g. 'school', 'schooner'
|
475
|
-
if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
|
476
|
-
# 'schermerhorn', 'schenker'
|
477
|
-
if w[i + 3, 2] =~ /ER|EN/
|
478
|
-
primary += 'X'
|
479
|
-
secondary += 'SK'
|
480
|
-
else
|
481
|
-
primary += 'SK'
|
482
|
-
secondary += 'SK'
|
483
|
-
end
|
484
|
-
i += 3
|
485
|
-
else
|
486
|
-
if i == 0 && !vowel?(w[3]) && w[3] != 'W'
|
487
|
-
primary += 'X'
|
488
|
-
secondary += 'S'
|
489
|
-
else
|
490
|
-
primary += 'X'
|
491
|
-
secondary += 'X'
|
492
|
-
end
|
493
|
-
i += 3
|
494
|
-
end
|
495
|
-
elsif w[i + 2, 1] =~ /[IEY]/
|
496
|
-
primary += 'S'
|
497
|
-
secondary += 'S'
|
498
|
-
i += 3
|
499
|
-
else
|
500
|
-
primary += 'SK'
|
501
|
-
secondary += 'SK'
|
502
|
-
i += 3
|
503
|
-
end
|
504
|
-
else
|
505
|
-
# french e.g. 'resnais', 'artois'
|
506
|
-
if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
|
507
|
-
secondary += 'S'
|
508
|
-
else
|
509
|
-
primary += 'S'
|
510
|
-
secondary += 'S'
|
511
|
-
end
|
512
|
-
if w[i + 1] =~ /[SZ]/
|
513
|
-
i += 2
|
514
|
-
else
|
515
|
-
i += 1
|
516
|
-
end
|
517
|
-
end
|
76
|
+
i += encode_s(w, i, len, code)
|
518
77
|
when 'T'
|
519
|
-
|
520
|
-
primary += 'X'
|
521
|
-
secondary += 'X'
|
522
|
-
i += 3
|
523
|
-
elsif w[i, 3] =~ /TIA|TCH/
|
524
|
-
primary += 'X'
|
525
|
-
secondary += 'X'
|
526
|
-
i += 3
|
527
|
-
elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
|
528
|
-
# special case 'thomas', 'thames' or germanic
|
529
|
-
if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /VAN|VON\s/ || w[0, 3] == 'SCH'
|
530
|
-
primary += 'T'
|
531
|
-
secondary += 'T'
|
532
|
-
else
|
533
|
-
primary += '0'
|
534
|
-
secondary += 'T'
|
535
|
-
end
|
536
|
-
i += 2
|
537
|
-
else
|
538
|
-
if w[i + 1] =~ /[TD]/
|
539
|
-
i += 2
|
540
|
-
else
|
541
|
-
i += 1
|
542
|
-
end
|
543
|
-
primary += 'T'
|
544
|
-
secondary += 'T'
|
545
|
-
end
|
78
|
+
i += encode_t(w, i, len, code)
|
546
79
|
when 'V'
|
547
|
-
|
548
|
-
i += 2
|
549
|
-
else
|
550
|
-
i += 1
|
551
|
-
end
|
552
|
-
primary += 'F'
|
553
|
-
secondary += 'F'
|
80
|
+
i += gen_encode(w, i, 'F', 'F', code)
|
554
81
|
when 'W'
|
555
|
-
|
556
|
-
if w[i, 2] == 'WR'
|
557
|
-
primary += 'R'
|
558
|
-
secondary += 'R'
|
559
|
-
i += 2
|
560
|
-
else
|
561
|
-
if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
|
562
|
-
# Wasserman should match Vasserman
|
563
|
-
if vowel?(w[i + 1])
|
564
|
-
primary += 'A'
|
565
|
-
secondary += 'F'
|
566
|
-
else
|
567
|
-
# need Uomo to match Womo
|
568
|
-
primary += 'A'
|
569
|
-
secondary += 'A'
|
570
|
-
end
|
571
|
-
end
|
572
|
-
# Arnow should match Arnoff
|
573
|
-
if i == last && i > 0 && vowel?(w[i - 1]) ||
|
574
|
-
(i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
|
575
|
-
secondary += 'F'
|
576
|
-
i += 1
|
577
|
-
elsif w[i, 4] =~ /WICZ|WITZ/
|
578
|
-
# polish e.g. 'filipowicz'
|
579
|
-
primary += 'TS'
|
580
|
-
secondary += 'FX'
|
581
|
-
i += 4
|
582
|
-
else
|
583
|
-
i += 1
|
584
|
-
end
|
585
|
-
end
|
82
|
+
i += encode_w(w, i, len, code)
|
586
83
|
when 'X'
|
587
|
-
|
588
|
-
if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
|
589
|
-
primary += 'KS'
|
590
|
-
secondary += 'KS'
|
591
|
-
end
|
592
|
-
if w[i + 1] =~ /[CX]/
|
593
|
-
i += 2
|
594
|
-
else
|
595
|
-
i += 1
|
596
|
-
end
|
84
|
+
i += encode_x(w, i, len, code)
|
597
85
|
when 'Z'
|
598
|
-
|
599
|
-
if w[i + 1] == 'H'
|
600
|
-
primary += 'J'
|
601
|
-
secondary += 'J'
|
602
|
-
i += 2
|
603
|
-
else
|
604
|
-
if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
|
605
|
-
primary += 'S'
|
606
|
-
secondary += 'TS';
|
607
|
-
else
|
608
|
-
primary += 'S'
|
609
|
-
secondary += 'S';
|
610
|
-
end
|
611
|
-
if w[i + 1] == 'Z'
|
612
|
-
i += 2
|
613
|
-
else
|
614
|
-
i += 1
|
615
|
-
end
|
616
|
-
end
|
86
|
+
i += encode_z(w, i, len, code)
|
617
87
|
else
|
618
88
|
i += 1
|
619
89
|
end
|
620
90
|
end
|
621
|
-
[
|
91
|
+
[code.first[0, code_size], code.last[0, code_size]]
|
622
92
|
end
|
623
93
|
|
624
94
|
def self.encode(str, options = { size: 4 })
|
@@ -627,14 +97,494 @@ module Phonetic
|
|
627
97
|
|
628
98
|
private
|
629
99
|
|
630
|
-
def self.
|
631
|
-
|
100
|
+
def self.encode_start_of_word(w, code)
  # Handles the special cases that only apply to the beginning of a word.
  # Returns how many leading characters were dealt with here (0 or 1) and
  # appends to +code+ when the opening letter produces output.

  # GN, KN, PN, WR, PS: the first letter is skipped without output
  return 1 if w[0, 2] =~ /[GKP]N|WR|PS/

  initial = w[0]
  if initial == 'X'
    # initial 'X' is pronounced 'Z' e.g. 'Xavier'
    code.add 'S', 'S'
  elsif initial =~ /[AEIOUY]/
    # all init vowels now map to 'A'
    code.add 'A', 'A'
  elsif w[0, 6] == 'CAESAR'
    # special case 'caesar'
    code.add 'S', 'S'
  else
    return 0
  end
  1
end
|
633
118
|
|
634
|
-
def self.
|
635
|
-
|
636
|
-
|
119
|
+
def self.gen_encode(w, i, primary, secondary, code)
  # Generic rule: emit the given primary/secondary codes for the letter at
  # +i+ and report how far to advance - 2 when the same letter follows
  # immediately (a doubled letter is consumed as one), otherwise 1.
  code.add(primary, secondary)
  return 2 if w[i] == w[i + 1]
  1
end
|
638
123
|
|
124
|
+
def self.encode_c(w, i, len, code)
  # Encodes the 'C' at position +i+ of the padded, upcased word +w+,
  # appending to +code+. Returns the number of characters consumed.
  pair = w[i, 2]
  if c_germanic?(w, i)
    # various germanic
    code.add 'K', 'K'
    2
  elsif pair == 'CH'
    encode_ch(w, i, len, code)
    2
  elsif pair == 'CZ' && (i <= 1 || w[i - 2, 4] != 'WICZ')
    # e.g, 'czerny'
    code.add 'S', 'X'
    2
  elsif w[i + 1, 3] == 'CIA'
    # e.g., 'focaccia'
    code.add 'X', 'X'
    3
  elsif pair == 'CC' && !(i == 1 && w[0] == 'M')
    # double 'C', but not if e.g. 'McClellan'
    encode_cc(w, i, code) + 2
  elsif pair =~ /C[KGQ]/
    code.add 'K', 'K'
    2
  elsif pair =~ /C[IEY]/
    # italian vs. english
    secondary = w[i, 3] =~ /CI[OEA]/ ? 'X' : 'S'
    code.add 'S', secondary
    2
  else
    code.add 'K', 'K'
    if w[i + 1, 2] =~ /\s[CQG]/
      # name sent in 'mac caffrey', 'mac gregor'
      3
    elsif w[i + 1] =~ /[CKQ]/ && w[i + 1, 2] !~ /C[EI]/
      2
    else
      1
    end
  end
end
|
167
|
+
|
168
|
+
def self.encode_d(w, i, len, code)
  # Encodes the 'D' at position +i+ of +w+, appending to +code+.
  # Returns the number of characters consumed.
  pair = w[i, 2]
  if pair == 'DG'
    if w[i + 2] =~ /[IEY]/
      # e.g. 'edge'
      code.add 'J', 'J'
      return 3
    end
    # e.g. 'edgar'
    code.add 'TK', 'TK'
    return 2
  end
  # Plain 'D' as well as 'DT' / 'DD' all produce a single 'T';
  # the two-letter forms consume both letters.
  code.add 'T', 'T'
  pair =~ /D[TD]/ ? 2 : 1
end
|
188
|
+
|
189
|
+
# Encodes the letter 'G' at position +i+ of +w+.
#
# Every rule steps by 2 except the final fallback, which steps by 1 unless
# the 'G' is doubled. 'GH' and 'GN' are delegated to encode_gh/encode_gn.
# +len+ is accepted for signature parity and not used.
def self.encode_g(w, i, len, code)
  nxt = w[i + 1]

  if nxt == 'H'
    encode_gh(w, i, code)
  elsif nxt == 'N'
    encode_gn(w, i, code)
  elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
    # 'tagliaro'
    code.add('KL', 'L')
  elsif (i == 0 && w[1, 2] =~ /^Y|E[SPBLYIR]|I[BLNE]/) || g_ger_or_gy?(w, i)
    # -ges-, -gep-, -gel-, -gie- at beginning, or -ger-, -gy-
    code.add('K', 'J')
  elsif nxt =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
    # italian e.g. 'biaggi'
    sounds =
      if w[0, 4] =~ /^(VAN |VON |SCH)/ || w[i + 1, 2] == 'ET'
        %w[K K]
      elsif w[i + 1, 4] =~ /IER\s/
        %w[J J]
      else
        %w[J K]
      end
    code.add(*sounds)
  else
    code.add('K', 'K')
    return nxt == 'G' ? 2 : 1
  end

  2
end
|
219
|
+
|
220
|
+
# Encodes the letter 'H' at position +i+ of +w+.
#
# 'H' is only kept when it starts the word before a vowel or sits between
# two vowels; then 'H' is recorded and the step is 2. Otherwise nothing is
# recorded and the step is 1. +len+ is not used.
def self.encode_h(w, i, len, code)
  leading_or_after_vowel = i == 0 || (i > 0 && vowel?(w[i - 1]))
  return 1 unless leading_or_after_vowel && vowel?(w[i + 1])

  code.add('H', 'H')
  2
end
|
229
|
+
|
230
|
+
# Encodes the letter 'J' at position +i+ of +w+.
#
# Obvious Spanish contexts ('JOSE', a leading 'SAN ') always step by 1.
# All other cases step by 2 when the 'J' is doubled and 1 otherwise; note
# that some of them record no code at all.
def self.encode_j(w, i, len, code)
  san_prefix = w[0, 4] =~ /SAN\s/

  # obvious spanish, 'jose', 'san jacinto'
  if w[i, 4] == 'JOSE' || san_prefix
    spanish_only = (i == 0 && w[i + 4] == ' ') || san_prefix
    code.add(spanish_only ? 'H' : 'J', 'H')
    return 1
  end

  if i == 0
    # Yankelovich/Jankelowicz ('JOSE' was already ruled out above)
    code.add('J', 'A')
  elsif j_spanish_pron?(w, i)
    # spanish pron. of e.g. 'bajador'
    code.add('J', 'H')
  elsif i == len - 1
    code.add('J', '')
  elsif w[i + 1] !~ /[LTKSNMBZ]/ && !(i > 0 && w[i - 1] =~ /[SKL]/)
    code.add('J', 'J')
  end

  w[i + 1] == 'J' ? 2 : 1
end
|
258
|
+
|
259
|
+
# Encodes the letter 'L' at position +i+ of +w+.
#
# A single 'L' records 'L'/'L' and steps by 1. A double 'L' steps by 2 and
# drops the secondary code when ll_spanish? recognises a Spanish pattern
# (e.g. 'cabrillo', 'gallegos').
def self.encode_l(w, i, len, code)
  unless w[i + 1] == 'L'
    code.add('L', 'L')
    return 1
  end

  secondary = ll_spanish?(w, i, len) ? '' : 'L'
  code.add('L', secondary)
  2
end
|
274
|
+
|
275
|
+
# Encodes the letter 'M' at position +i+ of +w+.
#
# Always records 'M'/'M'. The step is 2 when the following letter is
# swallowed: a silent 'B' in 'UMB ' / 'UMBER' ('dumb', 'thumb') or a
# doubled 'M'; otherwise 1. The 'UMB ' pattern relies on a space following
# the word (presumably the caller pads +w+ -- confirm).
def self.encode_m(w, i, len, code)
  silent_b = i > 0 && w[i - 1, 5] =~ /UMB( |ER)/
  step = (silent_b || w[i + 1] == 'M') ? 2 : 1
  code.add('M', 'M')
  step
end
|
282
|
+
|
283
|
+
# Encodes the letter 'P' at position +i+ of +w+.
#
# 'PH' is recorded as 'F' (step 2). Otherwise 'P' is recorded and the step
# is 2 when a 'P' or 'B' follows ("campbell", "raspberry"), else 1.
def self.encode_p(w, i, len, code)
  follower = w[i + 1]

  if follower == 'H'
    code.add('F', 'F')
    return 2
  end

  code.add('P', 'P')
  follower =~ /[PB]/ ? 2 : 1
end
|
295
|
+
|
296
|
+
# Encodes the letter 'R' at position +i+ of +w+.
#
# A French-style final 'R' (see r_french?, e.g. 'rogier') is dropped from
# the primary code only; otherwise 'R' goes to both. Steps by 2 for a
# doubled 'R', else 1.
def self.encode_r(w, i, len, code)
  primary = r_french?(w, i, len - 1) ? '' : 'R'
  code.add(primary, 'R')
  w[i + 1] == 'R' ? 2 : 1
end
|
306
|
+
|
307
|
+
# Encodes the letter 'S' at position +i+ of +w+.
#
# Rules are tried in order; each records zero or one code pair and yields
# the step (1..3) for this letter. 'SC' is delegated to encode_sc.
def self.encode_s(w, i, len, code)
  nxt = w[i + 1]

  # special cases 'island', 'isle', 'carlisle', 'carlysle': silent 'S'
  return 1 if i > 0 && w[i - 1, 3] =~ /[IY]SL/

  if i == 0 && w[i, 5] == 'SUGAR'
    # special case 'sugar-'
    code.add('X', 'S')
    1
  elsif w[i, 2] == 'SH'
    # germanic names keep a plain 'S'
    sound = w[i + 1, 4] =~ /H(EIM|OEK|OL[MZ])/ ? 'S' : 'X'
    code.add(sound, sound)
    2
  elsif w[i, 3] =~ /SI[OA]/
    # italian & armenian
    code.add('S', slavo_germanic?(w) ? 'S' : 'X')
    3
  elsif (i == 0 && nxt =~ /[MNLW]/) || nxt == 'Z'
    # german & anglicisations, e.g. 'smith' match 'schmidt',
    # 'snider' match 'schneider'; also -sz- in slavic languages, although
    # in hungarian it is pronounced 's'
    code.add('S', 'X')
    nxt == 'Z' ? 2 : 1
  elsif w[i, 2] == 'SC'
    encode_sc(w, i, code)
    3
  else
    # french e.g. 'resnais', 'artois': final 'S' silent in the primary code
    silent = i == len - 1 && i > 1 && w[i - 2, 2] =~ /[AO]I/
    code.add(silent ? '' : 'S', 'S')
    nxt =~ /[SZ]/ ? 2 : 1
  end
end
|
351
|
+
|
352
|
+
# Encodes the letter 'T' at position +i+ of +w+.
#
# 'TION', 'TIA' and 'TCH' become 'X' (step 3). 'TH'/'TTH' become '0'
# (theta) with a 'T' alternate, or plain 'T' for 'thomas'/'thames' and
# germanic prefixes (step 2). Anything else is 'T', stepping by 2 when a
# 'T' or 'D' follows.
def self.encode_t(w, i, len, code)
  if w[i, 4] =~ /^(TION|TIA|TCH)/
    code.add('X', 'X')
    3
  elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
    # special case 'thomas', 'thames' or germanic
    hard_t = w[i + 2, 2] =~ /[OA]M/ || w[0, 4] =~ /^(VAN |VON |SCH)/
    code.add(hard_t ? 'T' : '0', 'T')
    2
  else
    code.add('T', 'T')
    w[i + 1] =~ /[TD]/ ? 2 : 1
  end
end
|
371
|
+
|
372
|
+
# Encodes the letter 'W' at position +i+ of +w+.
#
# 'WR' is 'R' (step 2). A leading 'W' before a vowel or in 'WH' records an
# 'A' primary. Independently of that, a word-final 'W' after a vowel, the
# '-EWSKI/-OWSKI' family, or a word starting with 'SCH' adds an 'F'
# alternate, and 'WICZ'/'WITZ' records 'TS'/'FX' (step 4). Everything else
# steps by 1.
def self.encode_w(w, i, len, code)
  # can also be in middle of word
  if w[i, 2] == 'WR'
    code.add('R', 'R')
    return 2
  end

  if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
    # Wasserman should match Vasserman; need Uomo to match Womo
    code.add('A', vowel?(w[i + 1]) ? 'F' : 'A')
  end

  # Arnow should match Arnoff
  final_after_vowel = i == len - 1 && i > 0 && vowel?(w[i - 1])
  slavic_suffix = i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/

  if final_after_vowel || slavic_suffix || w[0, 3] == 'SCH'
    code.add('', 'F')
    1
  elsif w[i, 4] =~ /WICZ|WITZ/
    # polish e.g. 'filipowicz'
    code.add('TS', 'FX')
    4
  else
    1
  end
end
|
402
|
+
|
403
|
+
# Encodes the letter 'X' at position +i+ of +w+.
#
# Records 'KS' unless x_french? flags a silent French ending (e.g.
# 'breaux'). Steps by 2 when a 'C' or 'X' follows, else 1.
def self.encode_x(w, i, len, code)
  silent = x_french?(w, i, len - 1)
  code.add('KS', 'KS') unless silent
  w[i + 1] =~ /[CX]/ ? 2 : 1
end
|
408
|
+
|
409
|
+
# Encodes the letter 'Z' at position +i+ of +w+.
#
# 'ZH' is 'J' (chinese pinyin e.g. 'zhao', step 2). Otherwise the primary
# code is 'S'; the secondary is 'TS' before 'ZO'/'ZI'/'ZA' or in
# slavo-germanic words where the 'Z' is not first and does not follow 'T',
# and 'S' otherwise. Steps by 2 for a doubled 'Z', else 1.
def self.encode_z(w, i, len, code)
  nxt = w[i + 1]

  if nxt == 'H'
    code.add('J', 'J')
    return 2
  end

  affricate = w[i + 1, 2] =~ /Z[OIA]/ ||
              (slavo_germanic?(w) && i > 0 && w[i - 1] != 'T')
  code.add('S', affricate ? 'TS' : 'S')
  nxt == 'Z' ? 2 : 1
end
|
426
|
+
|
427
|
+
# Records the codes for a 'CH' digraph starting at position +i+ of +w+.
# Only records codes; the step for the digraph is computed by the caller.
def self.encode_ch(w, i, len, code)
  quad = w[i, 4]

  if i > 0 && quad == 'CHAE'
    # find 'michael'
    code.add('K', 'X')
  elsif quad == 'CHIA' || ch_greek_roots?(w, i) || ch_germanic_or_greek?(w, i, len)
    # italian 'chianti'; greek roots e.g. 'chemistry', 'chorus';
    # germanic, greek, or otherwise 'ch' for 'kh' sound
    code.add('K', 'K')
  elsif i == 0
    code.add('X', 'X')
  elsif w[0, 2] == 'MC'
    # e.g. "McHugh"
    code.add('K', 'K')
  else
    code.add('X', 'K')
  end
end
|
450
|
+
|
451
|
+
# Records the codes for a double 'C' starting at position +i+ of +w+.
#
# Returns 1 when the letter after the pair (I, E, or H not followed by U)
# is part of the sound, and 0 otherwise; the caller adds this to its own
# step.
def self.encode_cc(w, i, code)
  # 'bellocchio' but not 'bacchus'
  unless w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
    # Pierce's rule
    code.add 'K', 'K'
    return 0
  end

  # The i > 0 guard keeps w[i - 1, 5] from wrapping around to the end of
  # the string when the pair starts the word.
  if (i == 1 && w[0] == 'A') || (i > 0 && w[i - 1, 5] =~ /UCCEE|UCCES/)
    # 'accident', 'accede', 'succeed'
    code.add 'KS', 'KS'
  else
    # 'bacci', 'bertucci', other italian
    code.add 'X', 'X'
  end
  1
end
|
469
|
+
|
470
|
+
# Records the codes for a 'GH' digraph starting at position +i+ of +w+.
# Some contexts record nothing (silent 'GH').
def self.encode_gh(w, i, code)
  if i == 0
    # ghislane, ghiradelli
    sound = w[i + 2] == 'I' ? 'J' : 'K'
    return code.add(sound, sound)
  end

  # after a consonant 'GH' is a hard 'K'
  return code.add('K', 'K') if i > 0 && !vowel?(w[i - 1])

  # Parker's rule (with some further refinements)
  silent = (i > 1 && w[i - 2] =~ /[BHD]/) || # e.g., 'hugh'
           (i > 2 && w[i - 3] =~ /[BHD]/) || # e.g., 'bough'
           (i > 3 && w[i - 4] =~ /[BH]/)     # e.g., 'broughton'
  return if silent

  if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
    # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
    code.add('F', 'F')
  elsif i > 0 && w[i - 1] != 'I'
    code.add('K', 'K')
  end
end
|
492
|
+
|
493
|
+
# Records the codes for a 'GN' digraph starting at position +i+ of +w+.
#
# Slavo-germanic words always get 'KN'/'KN'. Otherwise a 'GN' right after
# a leading vowel is 'KN'/'N', and any other 'GN' not followed by 'EY'
# (not e.g. 'cagney') is 'N'/'KN'.
def self.encode_gn(w, i, code)
  return code.add('KN', 'KN') if slavo_germanic?(w)

  if i == 1 && vowel?(w[0])
    code.add('KN', 'N')
  elsif w[i + 2, 2] != 'EY' && w[i + 1] != 'Y'
    code.add('N', 'KN')
  else
    code.add('KN', 'KN')
  end
end
|
503
|
+
|
504
|
+
# Records the codes for an 'SC' cluster starting at position +i+ of +w+.
def self.encode_sc(w, i, code)
  third = w[i + 2]

  primary, secondary =
    if third == 'H'
      # Schlesinger's rule
      tail = w[i + 3, 2]
      if tail =~ /OO|UY|E[DM]/
        # dutch origin, e.g. 'school', 'schooner'
        %w[SK SK]
      elsif tail =~ /E[RN]/
        # 'schermerhorn', 'schenker'
        %w[X SK]
      elsif i == 0 && !vowel?(w[3]) && w[3] != 'W'
        %w[X S]
      else
        %w[X X]
      end
    elsif third =~ /[IEY]/
      %w[S S]
    else
      %w[SK SK]
    end

  code.add(primary, secondary)
end
|
524
|
+
|
525
|
+
# Truthy (the match index) when +w+ contains 'W', 'K', 'CZ' or 'WITZ'
# anywhere; nil otherwise.
def self.slavo_germanic?(w)
  /W|K|CZ|WITZ/ =~ w
end
|
528
|
+
|
529
|
+
# Truthy when +c+ contains one of A, E, I, O, U, Y (upper case only);
# nil otherwise, including when +c+ is nil.
def self.vowel?(c)
  /[AEIOUY]/ =~ c
end
|
532
|
+
|
533
|
+
# Various germanic: truthy when the 'C' at +i+ belongs to an 'ACH' that
# follows a non-vowel and is not followed by I/E, with 'BACHER'/'MACHER'
# as the allowed exception.
def self.c_germanic?(w, i)
  return false unless i > 1
  return false if vowel?(w[i - 2])
  return false unless w[i - 1, 3] == 'ACH'

  w[i + 2] !~ /[IE]/ || w[i - 2, 6] =~ /[BM]ACHER/
end
|
540
|
+
|
541
|
+
# Greek roots e.g. 'chemistry', 'chorus': truthy only for a word-initial
# 'CH' followed by one of the listed roots, excluding 'CHORE'.
def self.ch_greek_roots?(w, i)
  return false unless i == 0

  w[1, 5] =~ /^H(ARAC|ARIS|OR|YM|IA|EM)/ && w[0, 5] != 'CHORE'
end
|
545
|
+
|
546
|
+
# Germanic, greek, or otherwise 'ch' for 'kh' sound. Truthy when any of
# these holds for the 'CH' at +i+:
#   * the word starts with 'VAN ', 'VON ' or 'SCH';
#   * the 'CH' sits inside 'ORCHES', 'ARCHIT' or 'ORCHID';
#   * the letter after 'CH' is 'T' or 'S';
#   * the 'CH' is word-initial or follows A/O/U/E, and is followed by one
#     of L R N M B H F V W, a space, or the end of the word
#     (e.g. 'wachtler', 'wechsler', but not 'tichner').
def self.ch_germanic_or_greek?(w, i, len)
  follower = w[i + 2]

  germanic_prefix = w[0, 4] =~ /^(V[AO]N\s|SCH)/
  greek_cluster = i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/
  before_t_or_s = follower =~ /[TS]/
  position_ok = (i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0
  kh_follower = follower =~ /[LRNMBHFVW ]/ || i + 2 >= len

  germanic_prefix || greek_cluster || before_t_or_s || (position_ok && kh_follower)
end
|
556
|
+
|
557
|
+
# -ger-, -gy-: truthy when the 'G' at +i+ is followed by 'ER' or 'Y',
# the word does not start with 'DANGER'/'RANGER'/'MANGER', the preceding
# letter is not E or I, and the 'G' is not part of 'RGY'/'OGY'.
def self.g_ger_or_gy?(w, i)
  return unless w[i + 1, 2] =~ /^(ER|Y)/
  return false if w[0, 6] =~ /[DRM]ANGER/
  return false if i > 0 && w[i - 1] =~ /[EI]/

  !(i > 0 && w[i - 1, 3] =~ /[RO]GY/)
end
|
564
|
+
|
565
|
+
# Spanish pronunciation of e.g. 'bajador': truthy when the 'J' at +i+
# follows a vowel, the word is not slavo-germanic, and an 'A' or 'O'
# comes next.
def self.j_spanish_pron?(w, i)
  return false unless i > 0

  after_vowel = vowel?(w[i - 1])
  return after_vowel unless after_vowel
  return false if slavo_germanic?(w)

  w[i + 1] =~ /[AO]/
end
|
569
|
+
|
570
|
+
# Spanish e.g. 'cabrillo', 'gallegos': truthy when the 'LL' starting at
# +i+ is part of a word-final 'ILLO'/'ILLA'/'ALLE', or is part of 'ALLE'
# in a word ending in 'AS', 'OS', 'A' or 'O'.
def self.ll_spanish?(w, i, len)
  final = len - 1

  word_final = i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILL[OA]|ALLE/
  return word_final if word_final

  spanish_ending = (final > 0 && w[final - 1, 2] =~ /[AO]S/) || w[final] =~ /[AO]/
  spanish_ending && (i > 0 && w[i - 1, 4] == 'ALLE')
end
|
577
|
+
|
578
|
+
# French e.g. 'rogier', but exclude 'hochmeier': true when the 'R' at +i+
# is the last letter, the word is not slavo-germanic, the 'R' follows
# 'IE', and that 'IE' is not itself preceded by 'ME' or 'MA'.
def self.r_french?(w, i, last)
  return false unless i == last
  return false if slavo_germanic?(w)
  return false unless i > 1
  return false unless w[i - 2, 2] == 'IE'

  !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
end
|
584
|
+
|
585
|
+
# French e.g. 'breaux': truthy when the 'X' at +i+ is the last letter and
# follows 'IAU'/'EAU' or 'AU'/'OU'.
def self.x_french?(w, i, last)
  return false unless i == last

  (i > 2 && w[i - 3, 3] =~ /[IE]AU/) || (i > 1 && w[i - 2, 2] =~ /[AO]U/)
end
|
639
589
|
end
|
640
590
|
end
|