phonetic 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: af5149abea885ede20731d2ddf269a57588ca5fa
4
- data.tar.gz: 2ebb72004c4fc667801a2b7087766c8cecb1ff1a
3
+ metadata.gz: a20da7ce0b4dab68d7671088098226a035c64b05
4
+ data.tar.gz: 2b721bc986d8e23ba6780bb7cab92059e6a7652b
5
5
  SHA512:
6
- metadata.gz: 359113efab060b09395e6805bbd2e4b69aee42177b070a03361daad56c39fa6348ab836e77900a14b54236299b1c3b2f24b2b28cc546294bed5065961c465c72
7
- data.tar.gz: b8879f75acc85d2b24b705ca1cc48265702bafaff41f4f5de0783c1950abb62e160f41e5412c48fec7ff0ba29e2a820fa96a295435eed3228fba28cec0d8cbc9
6
+ metadata.gz: 14325fa3846251dd1a1cbc59b38c12a32471291b45b07074387747fa9331b5ad98b1b0afaa8dbbac62872f9bf959d5e622742e5ec673f3e1294807f91b5fdc85
7
+ data.tar.gz: ad80a4c26cae46cbc516cc6cfebbfea39be69fbfe86426737cd94c065da57f4448ff3ffe9f892bd60af94fd834ae122c472cdc09b5fe683344ef479d1f31f90c
data/.rspec CHANGED
@@ -1,2 +1,2 @@
1
1
  --color
2
- --format doc
2
+ --format doc
data/README.md CHANGED
@@ -93,6 +93,13 @@ or use alias:
93
93
  'Bonnie'.nysiis # => 'BANY'
94
94
  ```
95
95
 
96
+ ### Daitch–Mokotoff Soundex (D–M Soundex)
97
+ ```ruby
98
+ 'Anja'.dm_soundex # => ['060000', '064000']
99
+ 'Schwarz'.dm_soundex # => ['474000', '479400']
100
+ 'Schtolteheim'.dm_soundex # => ['283560']
101
+ ```
102
+
96
103
  ## Contributing
97
104
 
98
105
  1. Fork it
data/lib/phonetic.rb CHANGED
@@ -7,4 +7,5 @@ require 'phonetic/double_metaphone'
7
7
  require 'phonetic/metaphone2'
8
8
  require 'phonetic/caverphone'
9
9
  require 'phonetic/caverphone2'
10
+ require 'phonetic/dm_soundex'
10
11
  require 'phonetic/core_ext/string'
@@ -0,0 +1,12 @@
1
+ require 'phonetic/dm_soundex'
2
+
3
+ class String
4
+ # D-M Soundex values of string.
5
+ # @example
6
+ # 'Anja'.dm_soundex # => ['060000', '064000']
7
+ # 'Schwarz'.dm_soundex # => ['474000', '479400']
8
+ # 'Schtolteheim'.dm_soundex # => ['283560']
9
+ def dm_soundex(options = {})
10
+ Phonetic::DMSoundex.encode(self, options)
11
+ end
12
+ end
@@ -0,0 +1,82 @@
1
+ require 'phonetic/algorithm'
2
+ require 'phonetic/dm_soundex_map'
3
+
4
+ module Phonetic
5
+ # Daitch–Mokotoff Soundex (D–M Soundex) is a phonetic algorithm invented
6
+ # in 1985 by Jewish genealogists Gary Mokotoff and Randy Daitch.
7
+ #
8
+ # @example
9
+ # Phonetic::DMSoundex.encode('Anja') # => ['060000', '064000']
10
+ # Phonetic::DMSoundex.encode('Schwarz') # => ['474000', '479400']
11
+ # Phonetic::DMSoundex.encode('Schtolteheim') # => ['283560']
12
+ class DMSoundex < Algorithm
13
+
14
+ def self.encode(str, options = {})
15
+ encode_word(str, options)
16
+ end
17
+
18
+ # Encode word to its D-M Soundex codes.
19
+ def self.encode_word(word, options = {})
20
+ w = word.strip.upcase.gsub(/[^A-Z]+/, '')
21
+ i = 0
22
+ code = init_code()
23
+ while i < w.size
24
+ if w[i] != w[i + 1]
25
+ c = find_code(MAP, w, i)
26
+ if c
27
+ len = c[3] + 1
28
+ if i == 0
29
+ code.add c[0]
30
+ elsif w[i + len] =~ /[AEIOUJY]/
31
+ code.add c[1]
32
+ else
33
+ code.add c[2]
34
+ end
35
+ i += c[3]
36
+ end
37
+ end
38
+ i += 1
39
+ end
40
+ code.result
41
+ end
42
+
43
+ private
44
+
45
+ def self.init_code
46
+ code = [[]]
47
+ def code.add(a)
48
+ case a
49
+ when Array
50
+ c = self.map{|w| w.last != a[1] ? w + [a[1]] : w}
51
+ self.map!{|w| w.last != a[0] ? w + [a[0]] : w}
52
+ self.push(*c)
53
+ else
54
+ self.map!{|w| w.last != a ? w + [a] : w}
55
+ end
56
+ end
57
+ def code.result
58
+ self.map{|w| w.join[0..5].ljust(6, '0')}.uniq
59
+ end
60
+ code
61
+ end
62
+
63
+ def self.find_code(map, w, i, last = nil, count = 0)
64
+ elem = map[w[i]]
65
+ r = case elem
66
+ when Array
67
+ elem[3] = count
68
+ elem
69
+ when Hash
70
+ _last = last
71
+ if elem['self']
72
+ _last = elem['self']
73
+ _last[3] = count
74
+ end
75
+ find_code(elem, w, i + 1, _last, count + 1)
76
+ when nil
77
+ last
78
+ end
79
+ r
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,233 @@
1
+ # encoding: utf-8
2
+ require 'phonetic/algorithm'
3
+
4
+ module Phonetic
5
+ class DMSoundex < Algorithm
6
+ MAP = {
7
+ 'A' => {
8
+ 'self' => ['0', '', ''], # A
9
+ 'I' => ['0', '1', ''], # AI
10
+ 'J' => ['0', '1', ''], # AJ
11
+ 'Y' => ['0', '1', ''], # AY
12
+ 'U' => ['0', '7', ''] # AU
13
+ },
14
+ 'Ą' => ['', '', ['6', '']],
15
+ 'E' => {
16
+ 'self' => ['0', '', ''], # E
17
+ 'I' => ['0', '1', ''], # EI
18
+ 'Y' => ['0', '1', ''], # EY
19
+ 'J' => ['0', '1', ''], # EJ
20
+ 'U' => ['1', '1', ''] # EU
21
+ },
22
+ 'O' => {
23
+ 'self' => ['0', '', ''], # O
24
+ 'I' => ['0', '1', ''], # OI
25
+ 'J' => ['0', '1', ''], # OJ
26
+ 'Y' => ['0', '1', ''] # OY
27
+ },
28
+ 'U' => {
29
+ 'self' => ['0', '', ''], # U
30
+ 'I' => ['0', '1', ''], # UI
31
+ 'J' => ['0', '1', ''], # UJ
32
+ 'Y' => ['0', '1', ''], # UY
33
+ 'E' => ['0', '', ''] # UE
34
+ },
35
+ 'I' => {
36
+ 'self' => ['0', '', ''], # I
37
+ 'A' => ['1', '', ''], # IA
38
+ 'E' => ['1', '', ''], # IE
39
+ 'O' => ['1', '', ''], # IO
40
+ 'U' => ['1', '', ''] # IU
41
+ },
42
+ 'Y' => ['1', '', ''], # Y
43
+ 'J' => [['1', '4'], ['', '4'], ['', '4']], # J
44
+ 'B' => ['7', '7', '7'], # B
45
+ 'C' => {
46
+ 'self' => [['5', '4'], ['5', '4'], ['5', '4']], # C
47
+ 'H' => {
48
+ 'self' => [['5', '4'], ['5', '4'], ['5', '4']], # CH
49
+ 'S' => ['5', '54', '54'] # CHS
50
+ },
51
+ 'K' => [['5', '45'], ['5', '45'], ['5', '45']], # CK
52
+ 'S' => {
53
+ 'self' => ['4', '4', '4'], # CS
54
+ 'Z' => ['4', '4', '4'] # CSZ
55
+ },
56
+ 'Z' => {
57
+ 'self' => ['4', '4', '4'], # CZ
58
+ 'S' => ['4', '4', '4'] # CZS
59
+ }
60
+ },
61
+ 'D' => {
62
+ 'self' => ['3', '3', '3'], # D
63
+ 'R' => { # DR
64
+ 'S' => ['4', '4', '4'], # DRS
65
+ 'Z' => ['4', '4', '4'] # DRZ
66
+ },
67
+ 'S' => {
68
+ 'self' => ['4', '4', '4'], # DS
69
+ 'H' => ['4', '4', '4'] # DSH
70
+ },
71
+ 'T' => ['3', '3', '3'], # DT
72
+ 'Z' => {
73
+ 'self' =>['4', '4', '4'], # DZ
74
+ 'H' => ['4', '4', '4'], # DZH
75
+ 'S' => ['4', '4', '4'] # DZS
76
+ }
77
+ },
78
+ 'F' => {
79
+ 'self' => ['7', '7', '7'], # F
80
+ 'B' => ['7', '7', '7'] # FB
81
+ },
82
+ 'G' => ['5', '5', '5'], # G
83
+ 'H' => ['5', '5', ''], # H
84
+ 'K' => {
85
+ 'self' => ['5', '5', '5'], # K
86
+ 'H' => ['5', '5', '5'], # KH
87
+ 'S' => ['5', '54', '54'] # KS
88
+ },
89
+ 'L' => ['8', '8', '8'], # L
90
+ 'M' => {
91
+ 'self' => ['6', '6', '6'], # M
92
+ 'N' => ['', '66', '66'] # MN
93
+ },
94
+ 'N' => {
95
+ 'self' => ['6', '6', '6'], # N
96
+ 'M' => ['', '66', '66'] # NM
97
+ },
98
+ 'P' => {
99
+ 'self' => ['7', '7', '7'], # P
100
+ 'F' => ['7', '7', '7'], # PF
101
+ 'H' => ['7', '7', '7'] # PH
102
+ },
103
+ 'R' => {
104
+ 'self' => ['9', '9', '9'], # R
105
+ 'S' => [['94', '4'], ['94', '4'], ['94', '4']], # RS
106
+ 'Z' => [['4', '94'], ['4', '94'], ['4', '94']] # RZ
107
+ },
108
+ 'Q' => ['5', '5', '5'], # Q
109
+ 'S' => {
110
+ 'self' => ['4', '4', '4'], # S
111
+ 'C' => {
112
+ 'self' => ['2', '4', '4'], # SC
113
+ 'H' => {
114
+ 'self' => ['4', '4', '4'], # SCH
115
+ 'T' => {
116
+ 'self' => ['2', '43', '43'], # SCHT
117
+ 'S' => { # SCHTS
118
+ 'C' => { # SCHTSC
119
+ 'H' => ['2', '4', '4'] # SCHTSCH
120
+ },
121
+ 'H' => ['2', '4', '4'] # SCHTSH
122
+ },
123
+ 'C' => { # SCHTC
124
+ 'H' => ['2', '4', '4'] # SCHTCH
125
+ }
126
+ }
127
+ }
128
+ },
129
+ 'D' => ['2', '43', '43'], # SD
130
+ 'H' => {
131
+ 'self' => ['4', '4', '4'], # SH
132
+ 'C' => { # SHC
133
+ 'H' => ['2', '4', '4'] # SHCH
134
+ },
135
+ 'D' => ['2', '43', '43'], # SHD
136
+ 'T' => {
137
+ 'self' => ['2', '43', '43'], # SHT
138
+ 'C' => { # SHTC
139
+ 'H' => ['2', '4', '4'] # SHTCH
140
+ },
141
+ 'S' => { # SHTS
142
+ 'H' => ['2', '4', '4'] # SHTSH
143
+ }
144
+ }
145
+ },
146
+ 'T' => {
147
+ 'self' => ['2', '43', '43'], # ST
148
+ 'C' => { # STC
149
+ 'H' => ['2', '4', '4'] # STCH
150
+ },
151
+ 'S' => { # STS
152
+ 'C' => { # STSC
153
+ 'H' => ['2', '4', '4'] # STSCH
154
+ },
155
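+ 'D' => ['2', '43', '43'], # SCHD -- NOTE(review): keyed under S-T-S here, so this entry matches "STSD"; "SCHD" would belong under S-C-H -- confirm placement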
156
+ 'H' => ['2', '4', '4'] # STSH
157
+ },
158
+ 'R' => { # STR
159
+ 'S' => ['2', '4', '4'], # STRS
160
+ 'Z' => ['2', '4', '4'] # STRZ
161
+ }
162
+ },
163
+ 'Z' => {
164
+ 'self' => ['4', '4', '4'], # SZ
165
+ 'C' => { # SZC
166
+ 'S' => ['2', '4', '4'], # SZCS
167
+ 'Z' => ['2', '4', '4'] # SZCZ
168
+ },
169
+ 'D' => ['2', '43', '43'], # SZD
170
+ 'T' => ['2', '43', '43'] # SZT
171
+ }
172
+ },
173
+ 'T' => {
174
+ 'self' => ['3', '3', '3'], # T
175
+ 'C' => {
176
+ 'self' => ['4', '4', '4'], # TC
177
+ 'H' => ['4', '4', '4'] # TCH
178
+ },
179
+ 'H' => ['3', '3', '3'], # TH
180
+ 'R' => { # TR
181
+ 'C' => { # TRC
182
+ 'H' => ['4', '4', '4'] # TRCH
183
+ },
184
+ 'S' => ['4', '4', '4'], # TRS
185
+ 'Z' => ['4', '4', '4'] # TRZ
186
+ },
187
+ 'S' => {
188
+ 'self' => ['4', '4', '4'], # TS
189
+ 'H' => ['4', '4', '4'], # TSH
190
+ 'C' => { # TSC
191
+ 'H' => ['4', '4', '4'] # TSCH
192
+ },
193
+ 'Z' => ['4', '4', '4'] # TSZ
194
+ },
195
+ 'T' => { # TT
196
+ 'C' => { # TTC
197
+ 'H' => ['4', '4', '4'] # TTCH
198
+ },
199
+ 'S' => {
200
+ 'self' => ['4', '4', '4'], # TTS
201
+ 'C' => { # TTSC
202
+ 'H' => ['4', '4', '4'] # TTSCH
203
+ },
204
+ 'Z' => ['4', '4', '4'] # TTSZ
205
+ },
206
+ 'Z' => ['4', '4', '4'] # TTZ
207
+ },
208
+ 'Z' => {
209
+ 'self' => ['4', '4', '4'], # TZ
210
+ 'S' => ['4', '4', '4'] # TZS
211
+ }
212
+ },
213
+ 'X' => ['5', '54', '54'], # X
214
+ 'V' => ['7', '7', '7'], # V
215
+ 'W' => ['7', '7', '7'], # W
216
+ 'Z' => {
217
+ 'self' => ['4', '4', '4'], # Z
218
+ 'H' => {
219
+ 'self' => ['4', '4', '4'], # ZH
220
+ 'S' => { # ZHS
221
+ 'H' => ['4', '4', '4'] # ZHSH
222
+ }
223
+ },
224
+ 'S' => {
225
+ 'self' => ['4', '4', '4'], # ZS
226
+ 'C' => { # ZSC
227
+ 'H' => ['4', '4', '4'] # ZSCH
228
+ }
229
+ }
230
+ }
231
+ }
232
+ end
233
+ end
@@ -9,8 +9,10 @@ module Phonetic
9
9
  #
10
10
  # This implementation is based on the PHP implementation by Stephen Woodbridge
11
11
  # and contains modifications of the algorithm by Kevin Atkinson.
12
- # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
13
- # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
12
+ # @see http://swoodbridge.com/DoubleMetaPhone/
13
+ # PHP implementation by Stephen Woodbridge
14
+ # @see http://aspell.net/metaphone/dmetaph.cpp
15
+ # C++ implementation with modifications by Kevin Atkinson
14
16
  # @example
15
17
  # Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
16
18
  # Phonetic::DoubleMetaphone.encode('dumb') # => ['TM', 'TM']
@@ -20,605 +22,73 @@ module Phonetic
20
22
  # Phonetic::Metaphone2.encode('dumb') # => ['TM', 'TM']
21
23
  # Phonetic::Metaphone2.encode('edgar') # => ['ATKR', 'ATKR']
22
24
  class DoubleMetaphone < Algorithm
23
- VOWELS = 'AEIOUY'
24
-
25
25
  # Encode word to its Double Metaphone code.
26
26
  def self.encode_word(word, options = { size: 4 })
27
27
  code_size = options[:size] || 4
28
28
  w = word.strip.upcase
29
- primary = ''
30
- secondary = ''
29
+ code = ['', '']
30
+ def code.add(primary, secondary)
31
+ self[0] += primary
32
+ self[1] += secondary
33
+ end
31
34
  i = 0
32
35
  len = w.size
33
36
  last = len - 1
34
37
  # pad the original string so that we can index beyond the edge of the world
35
38
  w += ' ' * 5
36
- # skip these when at start of word
37
- i += 1 if ['GN','KN','PN','WR','PS'].include? w[0, 2]
38
- # initial 'X' is pronounced 'Z' e.g. 'Xavier'
39
- if w[0] == 'X'
40
- primary += 'S'
41
- secondary += 'S'
42
- i += 1
43
- end
44
- while i < len && (primary.size < code_size || primary.size < code_size)
39
+ i += encode_start_of_word(w, code)
40
+ while i < len && (code.first.size < code_size || code.last.size < code_size)
45
41
  case w[i]
46
42
  when 'A', 'E', 'I', 'O', 'U', 'Y'
47
- if i == 0
48
- # all init vowels now map to 'A'
49
- primary += 'A'
50
- secondary += 'A'
51
- end
52
43
  i += 1
53
44
  when 'B'
54
45
  # "-mb", e.g., "dumb", already skipped over...
55
- primary += 'P'
56
- secondary += 'P'
57
- i += (w[i + 1] == 'B') ? 2 : 1
46
+ i += gen_encode(w, i, 'P', 'P', code)
58
47
  when 'Ç', 'ç'
59
- primary += 'S'
60
- secondary += 'S'
48
+ code.add 'S', 'S'
61
49
  i += 1
62
50
  when 'C'
63
- # various germanic
64
- if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
65
- (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
66
- primary += 'K'
67
- secondary += 'K'
68
- i += 2
69
- # special case 'caesar'
70
- elsif i == 0 && w[i, 6] == 'CAESAR'
71
- primary += 'S'
72
- secondary += 'S'
73
- i += 2
74
- # italian 'chianti'
75
- elsif w[i, 4] == 'CHIA'
76
- primary += 'K'
77
- secondary += 'K'
78
- i += 2
79
- elsif w[i, 2] == 'CH'
80
- # find 'michael'
81
- if i > 0 && w[i, 4] == 'CHAE'
82
- primary += 'K'
83
- secondary += 'X'
84
- i += 2
85
- # greek roots e.g. 'chemistry', 'chorus'
86
- elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
87
- w[0, 5] != 'CHORE'
88
- primary += 'K'
89
- secondary += 'K'
90
- i += 2
91
- else
92
- # germanic, greek, or otherwise 'ch' for 'kh' sound
93
- if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
94
- # 'architect but not 'arch', 'orchestra', 'orchid'
95
- (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
96
- (w[i + 2] =~ /[TS]/) ||
97
- ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
98
- # e.g., 'wachtler', 'wechsler', but not 'tichner'
99
- (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
100
- primary += 'K'
101
- secondary += 'K'
102
- else
103
- if i > 0
104
- if w[0, 2] == 'MC'
105
- # e.g., "McHugh"
106
- primary += 'K'
107
- secondary += 'K'
108
- else
109
- primary += 'X'
110
- secondary += 'K'
111
- end
112
- else
113
- primary += 'X'
114
- secondary += 'X'
115
- end
116
- end
117
- i += 2
118
- end
119
- elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
120
- # e.g, 'czerny'
121
- primary += 'S'
122
- secondary += 'X'
123
- i += 2
124
- elsif w[i + 1, 3] == 'CIA'
125
- # e.g., 'focaccia'
126
- primary += 'X'
127
- secondary += 'X'
128
- i += 3
129
- # double 'C', but not if e.g. 'McClellan'
130
- elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
131
- # 'bellocchio' but not 'bacchus'
132
- if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
133
- # 'accident', 'accede' 'succeed'
134
- if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
135
- # 'bacci', 'bertucci', other italian
136
- primary += 'KS'
137
- secondary += 'KS'
138
- else
139
- primary += 'X'
140
- secondary += 'X'
141
- end
142
- i += 3
143
- else
144
- # Pierce's rule
145
- primary += 'K'
146
- secondary += 'K'
147
- i += 2
148
- end
149
- elsif w[i, 2] =~ /CK|CG|CQ/
150
- primary += 'K'
151
- secondary += 'K'
152
- i += 2
153
- elsif w[i, 2] =~ /CI|CE|CY/
154
- # italian vs. english
155
- if w[i, 3] =~ /CIO|CIE|CIA/
156
- primary += 'S'
157
- secondary += 'X'
158
- else
159
- primary += 'S'
160
- secondary += 'S'
161
- end
162
- i += 2
163
- else
164
- primary += 'K'
165
- secondary += 'K'
166
- # name sent in 'mac caffrey', 'mac gregor'
167
- if w[i + 1, 2] =~ /\s[CQG]/
168
- i += 3
169
- else
170
- if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
171
- i += 2
172
- else
173
- i += 1
174
- end
175
- end
176
- end
51
+ i += encode_c(w, i, len, code)
177
52
  when 'D'
178
- if w[i, 2] == 'DG'
179
- if w[i + 2] =~ /[IEY]/
180
- # e.g. 'edge'
181
- primary += 'J'
182
- secondary += 'J'
183
- i += 3
184
- else
185
- # e.g. 'edgar'
186
- primary += 'TK'
187
- secondary += 'TK'
188
- i += 2
189
- end
190
- elsif w[i, 2] =~ /DT|DD/
191
- primary += 'T'
192
- secondary += 'T'
193
- i += 2
194
- else
195
- primary += 'T'
196
- secondary += 'T'
197
- i += 1
198
- end
199
- when 'F'
200
- if w[i + 1] == 'F'
201
- i += 2
202
- else
203
- i += 1
204
- end
205
- primary += 'F'
206
- secondary += 'F'
53
+ i += encode_d(w, i, len, code)
54
+ when 'F', 'K', 'N'
55
+ i += gen_encode(w, i, w[i], w[i], code)
207
56
  when 'G'
208
- if w[i + 1] == 'H'
209
- if i > 0 && !vowel?(w[i - 1])
210
- primary += 'K'
211
- secondary += 'K'
212
- i += 2
213
- elsif i == 0
214
- # ghislane, ghiradelli
215
- if w[i + 2] == 'I'
216
- primary += 'J'
217
- secondary += 'J'
218
- else
219
- primary += 'K'
220
- secondary += 'K'
221
- end
222
- i += 2
223
- # Parker's rule (with some further refinements) - e.g., 'hugh'
224
- elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
225
- # e.g., 'bough'
226
- (i > 2 && w[i - 3] =~ /[BHD]/) ||
227
- # e.g., 'broughton'
228
- (i > 3 && w[i - 4] =~ /[BH]/)
229
- i += 2
230
- else
231
- # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
232
- if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
233
- primary += 'F'
234
- secondary += 'F'
235
- else
236
- if i > 0 && w[i - 1] != 'I'
237
- primary += 'K'
238
- secondary += 'K'
239
- end
240
- end
241
- i += 2
242
- end
243
- elsif w[i + 1] == 'N'
244
- if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
245
- primary += 'KN'
246
- secondary += 'N'
247
- else
248
- # not e.g. 'cagney'
249
- if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
250
- primary += 'N'
251
- secondary += 'KN'
252
- else
253
- primary += 'KN'
254
- secondary += 'KN'
255
- end
256
- end
257
- i += 2
258
- # 'tagliaro'
259
- elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
260
- primary += 'KL'
261
- secondary += 'L'
262
- i += 2
263
- # -ges-,-gep-,-gel-, -gie- at beginning
264
- elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
265
- primary += 'K'
266
- secondary += 'J'
267
- i += 2
268
- # -ger-, -gy-
269
- elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
270
- !(w[0, 6] =~ /[DRM]ANGER/) &&
271
- !(i > 0 && w[i - 1] =~ /[EI]/) &&
272
- !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
273
- primary += 'K'
274
- secondary += 'J'
275
- i += 2
276
- # italian e.g, 'biaggi'
277
- elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
278
- if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
279
- primary += 'K'
280
- secondary += 'K'
281
- else
282
- if w[i + 1, 4] =~ /IER\s/
283
- primary += 'J'
284
- secondary += 'J'
285
- else
286
- primary += 'J'
287
- secondary += 'K'
288
- end
289
- end
290
- i += 2
291
- else
292
- if w[i + 1] == 'G'
293
- i += 2
294
- else
295
- i += 1
296
- end
297
- primary += 'K'
298
- secondary += 'K'
299
- end
57
+ i += encode_g(w, i, len, code)
300
58
  when 'H'
301
- # only keep if first & before vowel or btw. 2 vowels
302
- if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
303
- primary += 'H'
304
- secondary += 'H'
305
- i += 2
306
- else # also takes care of 'HH'
307
- i += 1
308
- end
59
+ i += encode_h(w, i, len, code)
309
60
  when 'J'
310
- # obvious spanish, 'jose', 'san jacinto'
311
- if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
312
- if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
313
- primary += 'H'
314
- secondary += 'H'
315
- else
316
- primary += 'J'
317
- secondary += 'H'
318
- end
319
- i += 1
320
- else
321
- if i == 0 && w[i, 4] != 'JOSE'
322
- primary += 'J'
323
- secondary += 'A'
324
- # Yankelovich/Jankelowicz
325
- else
326
- # spanish pron. of e.g. 'bajador'
327
- if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
328
- primary += 'J'
329
- secondary += 'H'
330
- else
331
- if i == last
332
- primary += 'J'
333
- #secondary += ' '
334
- else
335
- if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
336
- primary += 'J'
337
- secondary += 'J'
338
- end
339
- end
340
- end
341
- end
342
- if w[i + 1] == 'J'
343
- i += 2
344
- else
345
- i += 1
346
- end
347
- end
348
- when 'K'
349
- if w[i + 1] == 'K'
350
- i += 2
351
- else
352
- i += 1
353
- end
354
- primary += 'K'
355
- secondary += 'K'
61
+ i += encode_j(w, i, len, code)
356
62
  when 'L'
357
- if w[i + 1] == 'L'
358
- # spanish e.g. 'cabrillo', 'gallegos'
359
- if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
360
- ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
361
- (i > 0 && w[i - 1, 4] == 'ALLE'))
362
- primary += 'L'
363
- i += 2
364
- next
365
- end
366
- i += 2
367
- else
368
- i += 1
369
- end
370
- primary += 'L'
371
- secondary += 'L'
63
+ i += encode_l(w, i, len, code)
372
64
  when 'M'
373
- if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == "ER")) ||
374
- # 'dumb','thumb'
375
- w[i + 1] == 'M'
376
- i += 2
377
- else
378
- i += 1
379
- end
380
- primary += 'M'
381
- secondary += 'M'
382
- when 'N'
383
- if w[i + 1] == 'N'
384
- i += 2
385
- else
386
- i += 1
387
- end
388
- primary += 'N'
389
- secondary += 'N'
65
+ i += encode_m(w, i, len, code)
390
66
  when 'Ñ', 'ñ'
391
- i += 1;
392
- primary += 'N'
393
- secondary += 'N'
67
+ code.add 'N', 'N'
68
+ i += 1
394
69
  when 'P'
395
- if w[i + 1] == 'H'
396
- primary += 'F'
397
- secondary += 'F'
398
- i += 2
399
- else
400
- # also account for "campbell", "raspberry"
401
- if w[i + 1] =~ /[PB]/
402
- i += 2
403
- else
404
- i += 1
405
- end
406
- primary += 'P'
407
- secondary += 'P'
408
- end
70
+ i += encode_p(w, i, len, code)
409
71
  when 'Q'
410
- if w[i + 1] == 'Q'
411
- i += 2
412
- else
413
- i += 1
414
- end
415
- primary += 'K'
416
- secondary += 'K'
72
+ i += gen_encode(w, i, 'K', 'K', code)
417
73
  when 'R'
418
- # french e.g. 'rogier', but exclude 'hochmeier'
419
- if i == last && !slavo_germanic?(w) &&
420
- (i > 1 && w[i - 2, 2] == "IE") &&
421
- !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
422
- secondary += 'R'
423
- else
424
- primary += 'R'
425
- secondary += 'R'
426
- end
427
- if w[i + 1] == 'R'
428
- i += 2
429
- else
430
- i += 1
431
- end
74
+ i += encode_r(w, i, len, code)
432
75
  when 'S'
433
- # special cases 'island', 'isle', 'carlisle', 'carlysle'
434
- if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
435
- i += 1
436
- # special case 'sugar-'
437
- elsif i == 0 && w[i, 5] == 'SUGAR'
438
- primary += 'X'
439
- secondary += 'S'
440
- i += 1
441
- elsif w[i, 2] == 'SH'
442
- # germanic
443
- if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
444
- primary += 'S'
445
- secondary += 'S'
446
- else
447
- primary += 'X'
448
- secondary += 'X'
449
- end
450
- i += 2
451
- # italian & armenian
452
- elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
453
- if !slavo_germanic?(w)
454
- primary += 'S'
455
- secondary += 'X'
456
- else
457
- primary += 'S'
458
- secondary += 'S'
459
- end
460
- i += 3
461
- # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
462
- # also, -sz- in slavic language altho in hungarian it is pronounced 's'
463
- elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
464
- primary += 'S'
465
- secondary += 'X'
466
- if w[i + 1] == 'Z'
467
- i += 2
468
- else
469
- i += 1
470
- end
471
- elsif w[i, 2] == 'SC'
472
- # Schlesinger's rule
473
- if w[i + 2] == 'H'
474
- # dutch origin, e.g. 'school', 'schooner'
475
- if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
476
- # 'schermerhorn', 'schenker'
477
- if w[i + 3, 2] =~ /ER|EN/
478
- primary += 'X'
479
- secondary += 'SK'
480
- else
481
- primary += 'SK'
482
- secondary += 'SK'
483
- end
484
- i += 3
485
- else
486
- if i == 0 && !vowel?(w[3]) && w[3] != 'W'
487
- primary += 'X'
488
- secondary += 'S'
489
- else
490
- primary += 'X'
491
- secondary += 'X'
492
- end
493
- i += 3
494
- end
495
- elsif w[i + 2, 1] =~ /[IEY]/
496
- primary += 'S'
497
- secondary += 'S'
498
- i += 3
499
- else
500
- primary += 'SK'
501
- secondary += 'SK'
502
- i += 3
503
- end
504
- else
505
- # french e.g. 'resnais', 'artois'
506
- if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
507
- secondary += 'S'
508
- else
509
- primary += 'S'
510
- secondary += 'S'
511
- end
512
- if w[i + 1] =~ /[SZ]/
513
- i += 2
514
- else
515
- i += 1
516
- end
517
- end
76
+ i += encode_s(w, i, len, code)
518
77
  when 'T'
519
- if w[i, 4] == 'TION'
520
- primary += 'X'
521
- secondary += 'X'
522
- i += 3
523
- elsif w[i, 3] =~ /TIA|TCH/
524
- primary += 'X'
525
- secondary += 'X'
526
- i += 3
527
- elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
528
- # special case 'thomas', 'thames' or germanic
529
- if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /VAN|VON\s/ || w[0, 3] == 'SCH'
530
- primary += 'T'
531
- secondary += 'T'
532
- else
533
- primary += '0'
534
- secondary += 'T'
535
- end
536
- i += 2
537
- else
538
- if w[i + 1] =~ /[TD]/
539
- i += 2
540
- else
541
- i += 1
542
- end
543
- primary += 'T'
544
- secondary += 'T'
545
- end
78
+ i += encode_t(w, i, len, code)
546
79
  when 'V'
547
- if w[i + 1] == 'V'
548
- i += 2
549
- else
550
- i += 1
551
- end
552
- primary += 'F'
553
- secondary += 'F'
80
+ i += gen_encode(w, i, 'F', 'F', code)
554
81
  when 'W'
555
- # can also be in middle of word
556
- if w[i, 2] == 'WR'
557
- primary += 'R'
558
- secondary += 'R'
559
- i += 2
560
- else
561
- if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
562
- # Wasserman should match Vasserman
563
- if vowel?(w[i + 1])
564
- primary += 'A'
565
- secondary += 'F'
566
- else
567
- # need Uomo to match Womo
568
- primary += 'A'
569
- secondary += 'A'
570
- end
571
- end
572
- # Arnow should match Arnoff
573
- if i == last && i > 0 && vowel?(w[i - 1]) ||
574
- (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
575
- secondary += 'F'
576
- i += 1
577
- elsif w[i, 4] =~ /WICZ|WITZ/
578
- # polish e.g. 'filipowicz'
579
- primary += 'TS'
580
- secondary += 'FX'
581
- i += 4
582
- else
583
- i += 1
584
- end
585
- end
82
+ i += encode_w(w, i, len, code)
586
83
  when 'X'
587
- # french e.g. breaux
588
- if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
589
- primary += 'KS'
590
- secondary += 'KS'
591
- end
592
- if w[i + 1] =~ /[CX]/
593
- i += 2
594
- else
595
- i += 1
596
- end
84
+ i += encode_x(w, i, len, code)
597
85
  when 'Z'
598
- # chinese pinyin e.g. 'zhao'
599
- if w[i + 1] == 'H'
600
- primary += 'J'
601
- secondary += 'J'
602
- i += 2
603
- else
604
- if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
605
- primary += 'S'
606
- secondary += 'TS';
607
- else
608
- primary += 'S'
609
- secondary += 'S';
610
- end
611
- if w[i + 1] == 'Z'
612
- i += 2
613
- else
614
- i += 1
615
- end
616
- end
86
+ i += encode_z(w, i, len, code)
617
87
  else
618
88
  i += 1
619
89
  end
620
90
  end
621
- [primary[0, code_size], secondary[0, code_size]]
91
+ [code.first[0, code_size], code.last[0, code_size]]
622
92
  end
623
93
 
624
94
  def self.encode(str, options = { size: 4 })
@@ -627,14 +97,494 @@ module Phonetic
627
97
 
628
98
  private
629
99
 
630
- def self.slavo_germanic?(str)
631
- !!(str[/W|K|CZ|WITZ/])
100
+ def self.encode_start_of_word(w, code)
101
+ i = 0
102
+ # skip these when at start of word
103
+ if w[0, 2] =~ /[GKP]N|WR|PS/
104
+ i = 1
105
+ # initial 'X' is pronounced 'Z' e.g. 'Xavier'
106
+ elsif w[0] == 'X'
107
+ code.add 'S', 'S'
108
+ i = 1
109
+ elsif w[0] =~ /[AEIOUY]/
110
+ code.add 'A', 'A' # all init vowels now map to 'A'
111
+ i = 1
112
+ elsif w[0, 6] == 'CAESAR' # special case 'caesar'
113
+ code.add 'S', 'S'
114
+ i = 1
115
+ end
116
+ i
632
117
  end
633
118
 
634
- def self.vowel?(char)
635
- c = VOWELS[char.to_s]
636
- !c.nil? && !c.empty?
119
+ def self.gen_encode(w, i, primary, secondary, code)
120
+ code.add primary, secondary
121
+ w[i + 1] == w[i] ? 2 : 1
637
122
  end
638
123
 
124
+ def self.encode_c(w, i, len, code)
125
+ r = 1
126
+ case
127
+ # various germanic
128
+ when c_germanic?(w, i)
129
+ code.add 'K', 'K'
130
+ r += 1
131
+ when w[i, 2] == 'CH'
132
+ encode_ch(w, i, len, code)
133
+ r += 1
134
+ when w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
135
+ # e.g, 'czerny'
136
+ code.add 'S', 'X'
137
+ r += 1
138
+ when w[i + 1, 3] == 'CIA'
139
+ # e.g., 'focaccia'
140
+ code.add 'X', 'X'
141
+ r += 2
142
+ # double 'C', but not if e.g. 'McClellan'
143
+ when w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
144
+ r += encode_cc(w, i, code) + 1
145
+ when w[i, 2] =~ /C[KGQ]/
146
+ code.add 'K', 'K'
147
+ r += 1
148
+ when w[i, 2] =~ /C[IEY]/
149
+ # italian vs. english
150
+ if w[i, 3] =~ /CI[OEA]/
151
+ code.add 'S', 'X'
152
+ else
153
+ code.add 'S', 'S'
154
+ end
155
+ r += 1
156
+ else
157
+ code.add 'K', 'K'
158
+ # name sent in 'mac caffrey', 'mac gregor'
159
+ if w[i + 1, 2] =~ /\s[CQG]/
160
+ r += 2
161
+ elsif w[i + 1] =~ /[CKQ]/ && w[i + 1, 2] !~ /C[EI]/
162
+ r += 1
163
+ end
164
+ end
165
+ r
166
+ end
167
+
168
+ def self.encode_d(w, i, len, code)
169
+ r = 1
170
+ if w[i, 2] == 'DG'
171
+ if w[i + 2] =~ /[IEY]/
172
+ # e.g. 'edge'
173
+ code.add 'J', 'J'
174
+ r += 2
175
+ else
176
+ # e.g. 'edgar'
177
+ code.add 'TK', 'TK'
178
+ r += 1
179
+ end
180
+ elsif w[i, 2] =~ /D[TD]/
181
+ code.add 'T', 'T'
182
+ r += 1
183
+ else
184
+ code.add 'T', 'T'
185
+ end
186
+ r
187
+ end
188
+
189
+ def self.encode_g(w, i, len, code)
190
+ r = 2
191
+ if w[i + 1] == 'H'
192
+ encode_gh(w, i, code)
193
+ elsif w[i + 1] == 'N'
194
+ encode_gn(w, i, code)
195
+ # 'tagliaro'
196
+ elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
197
+ code.add 'KL', 'L'
198
+ # -ges-, -gep-, -gel-, -gie- at beginning
199
+ elsif i == 0 && w[1, 2] =~ /^Y|E[SPBLYIR]|I[BLNE]/
200
+ code.add 'K', 'J'
201
+ # -ger-, -gy-
202
+ elsif g_ger_or_gy?(w, i)
203
+ code.add 'K', 'J'
204
+ # italian e.g, 'biaggi'
205
+ elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
206
+ if w[0, 4] =~ /^(VAN |VON |SCH)/ || w[i + 1, 2] == 'ET'
207
+ code.add 'K', 'K'
208
+ elsif w[i + 1, 4] =~ /IER\s/
209
+ code.add 'J', 'J'
210
+ else
211
+ code.add 'J', 'K'
212
+ end
213
+ else
214
+ r -= 1 if w[i + 1] != 'G'
215
+ code.add 'K', 'K'
216
+ end
217
+ r
218
+ end
219
+
220
+ def self.encode_h(w, i, len, code)
221
+ r = 1
222
+ # only keep if first & before vowel or btw. 2 vowels
223
+ if (i == 0 || i > 0 && vowel?(w[i - 1])) && vowel?(w[i + 1])
224
+ code.add 'H', 'H'
225
+ r += 1
226
+ end
227
+ r
228
+ end
229
+
230
+ def self.encode_j(w, i, len, code)
231
+ r = 1
232
+ last = len - 1
233
+ # obvious spanish, 'jose', 'san jacinto'
234
+ if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
235
+ if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
236
+ code.add 'H', 'H'
237
+ else
238
+ code.add 'J', 'H'
239
+ end
240
+ else
241
+ if i == 0 && w[i, 4] != 'JOSE'
242
+ code.add 'J', 'A'
243
+ # Yankelovich/Jankelowicz
244
+ else
245
+ # spanish pron. of e.g. 'bajador'
246
+ if j_spanish_pron?(w, i)
247
+ code.add 'J', 'H'
248
+ elsif i == last
249
+ code.add 'J', ''
250
+ elsif w[i + 1] !~ /[LTKSNMBZ]/ && !(i > 0 && w[i - 1] =~ /[SKL]/)
251
+ code.add 'J', 'J'
252
+ end
253
+ end
254
+ r += 1 if w[i + 1] == 'J'
255
+ end
256
+ r
257
+ end
258
+
259
+ def self.encode_l(w, i, len, code)
260
+ r = 1
261
+ if w[i + 1] == 'L'
262
+ # spanish e.g. 'cabrillo', 'gallegos'
263
+ if ll_spanish?(w, i, len)
264
+ code.add 'L', ''
265
+ else
266
+ code.add 'L', 'L'
267
+ end
268
+ r += 1
269
+ else
270
+ code.add 'L', 'L'
271
+ end
272
+ r
273
+ end
274
+
275
+ def self.encode_m(w, i, len, code)
276
+ r = 1
277
+ # 'dumb','thumb'
278
+ r += 1 if i > 0 && w[i - 1, 5] =~ /UMB( |ER)/ || w[i + 1] == 'M'
279
+ code.add 'M', 'M'
280
+ r
281
+ end
282
+
283
+ def self.encode_p(w, i, len, code)
284
+ r = 1
285
+ if w[i + 1] == 'H'
286
+ code.add 'F', 'F'
287
+ r += 1
288
+ else
289
+ # also account for "campbell", "raspberry"
290
+ r += 1 if w[i + 1] =~ /[PB]/
291
+ code.add 'P', 'P'
292
+ end
293
+ r
294
+ end
295
+
296
+ def self.encode_r(w, i, len, code)
297
+ last = len - 1
298
+ # french e.g. 'rogier', but exclude 'hochmeier'
299
+ if r_french?(w, i, last)
300
+ code.add '', 'R'
301
+ else
302
+ code.add 'R', 'R'
303
+ end
304
+ w[i + 1] == 'R' ? 2 : 1
305
+ end
306
+
307
+ def self.encode_s(w, i, len, code)
308
+ r = 1
309
+ last = len - 1
310
+ # special cases 'island', 'isle', 'carlisle', 'carlysle'
311
+ if i > 0 && w[i - 1, 3] =~ /[IY]SL/
312
+ # special case 'sugar-'
313
+ elsif i == 0 && w[i, 5] == 'SUGAR'
314
+ code.add 'X', 'S'
315
+ elsif w[i, 2] == 'SH'
316
+ # germanic
317
+ if w[i + 1, 4] =~ /H(EIM|OEK|OL[MZ])/
318
+ code.add 'S', 'S'
319
+ else
320
+ code.add 'X', 'X'
321
+ end
322
+ r += 1
323
+ # italian & armenian
324
+ elsif w[i, 3] =~ /SI[OA]/
325
+ if !slavo_germanic?(w)
326
+ code.add 'S', 'X'
327
+ else
328
+ code.add 'S', 'S'
329
+ end
330
+ r += 2
331
+ # german & anglicisations, e.g. 'smith' match 'schmidt',
332
+ # 'snider' match 'schneider' also, -sz- in slavic language altho in
333
+ # hungarian it is pronounced 's'
334
+ elsif i == 0 && w[i + 1] =~ /[MNLW]/ || w[i + 1] == 'Z'
335
+ code.add 'S', 'X'
336
+ r += 1 if w[i + 1] == 'Z'
337
+ elsif w[i, 2] == 'SC'
338
+ encode_sc(w, i, code)
339
+ r += 2
340
+ # french e.g. 'resnais', 'artois'
341
+ else
342
+ if i == last && i > 1 && w[i - 2, 2] =~ /[AO]I/
343
+ code.add '', 'S'
344
+ else
345
+ code.add 'S', 'S'
346
+ end
347
+ r += 1 if w[i + 1] =~ /[SZ]/
348
+ end
349
+ r
350
+ end
351
+
352
+ def self.encode_t(w, i, len, code)
353
+ r = 1
354
+ if w[i, 4] =~ /^(TION|TIA|TCH)/
355
+ code.add 'X', 'X'
356
+ r += 2
357
+ elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
358
+ # special case 'thomas', 'thames' or germanic
359
+ if w[i + 2, 2] =~ /[OA]M/ || w[0, 4] =~ /^(VAN |VON |SCH)/
360
+ code.add 'T', 'T'
361
+ else
362
+ code.add '0', 'T'
363
+ end
364
+ r += 1
365
+ else
366
+ r += 1 if w[i + 1] =~ /[TD]/
367
+ code.add 'T', 'T'
368
+ end
369
+ r
370
+ end
371
+
372
+ def self.encode_w(w, i, len, code)
373
+ last = len - 1
374
+ r = 1
375
+ # can also be in middle of word
376
+ if w[i, 2] == 'WR'
377
+ code.add 'R', 'R'
378
+ r += 1
379
+ else
380
+ if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
381
+ # Wasserman should match Vasserman
382
+ if vowel?(w[i + 1])
383
+ code.add 'A', 'F'
384
+ else
385
+ # need Uomo to match Womo
386
+ code.add 'A', 'A'
387
+ end
388
+ end
389
+ # Arnow should match Arnoff
390
+ if i == last && i > 0 && vowel?(w[i - 1]) ||
391
+ i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/ ||
392
+ w[0, 3] == 'SCH'
393
+ code.add '', 'F'
394
+ elsif w[i, 4] =~ /WICZ|WITZ/
395
+ # polish e.g. 'filipowicz'
396
+ code.add 'TS', 'FX'
397
+ r += 3
398
+ end
399
+ end
400
+ r
401
+ end
402
+
403
+ def self.encode_x(w, i, len, code)
404
+ # french e.g. breaux
405
+ code.add 'KS', 'KS' unless x_french?(w, i, len - 1)
406
+ w[i + 1] =~ /[CX]/ ? 2 : 1
407
+ end
408
+
409
+ def self.encode_z(w, i, len, code)
410
+ r = 1
411
+ # chinese pinyin e.g. 'zhao'
412
+ if w[i + 1] == 'H'
413
+ code.add 'J', 'J'
414
+ r += 1
415
+ else
416
+ if w[i + 1, 2] =~ /Z[OIA]/ ||
417
+ slavo_germanic?(w) && i > 0 && w[i - 1] != 'T'
418
+ code.add 'S', 'TS';
419
+ else
420
+ code.add 'S', 'S';
421
+ end
422
+ r += 1 if w[i + 1] == 'Z'
423
+ end
424
+ r
425
+ end
426
+
427
+ def self.encode_ch(w, i, len, code)
428
+ case
429
+ # italian 'chianti'
430
+ when w[i, 4] == 'CHIA'
431
+ code.add 'K', 'K'
432
+ # find 'michael'
433
+ when i > 0 && w[i, 4] == 'CHAE'
434
+ code.add 'K', 'X'
435
+ # greek roots e.g. 'chemistry', 'chorus'
436
+ when ch_greek_roots?(w, i)
437
+ code.add 'K', 'K'
438
+ # germanic, greek, or otherwise 'ch' for 'kh' sound
439
+ when ch_germanic_or_greek?(w, i, len)
440
+ code.add 'K', 'K'
441
+ when i == 0
442
+ code.add 'X', 'X'
443
+ when w[0, 2] == 'MC'
444
+ # e.g., "McHugh"
445
+ code.add 'K', 'K'
446
+ else
447
+ code.add 'X', 'K'
448
+ end
449
+ end
450
+
451
+ def self.encode_cc(w, i, code)
452
+ r = 0
453
+ # 'bellocchio' but not 'bacchus'
454
+ if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
455
+ # 'accident', 'accede' 'succeed'
456
+ if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
457
+ # 'bacci', 'bertucci', other italian
458
+ code.add 'KS', 'KS'
459
+ else
460
+ code.add 'X', 'X'
461
+ end
462
+ r = 1
463
+ else
464
+ # Pierce's rule
465
+ code.add 'K', 'K'
466
+ end
467
+ r
468
+ end
469
+
470
+ def self.encode_gh(w, i, code)
471
+ if i > 0 && !vowel?(w[i - 1])
472
+ code.add 'K', 'K'
473
+ elsif i == 0
474
+ # ghislane, ghiradelli
475
+ if w[i + 2] == 'I'
476
+ code.add 'J', 'J'
477
+ else
478
+ code.add 'K', 'K'
479
+ end
480
+ # Parker's rule (with some further refinements)
481
+ elsif !(i > 1 && w[i - 2] =~ /[BHD]/ || # e.g., 'hugh'
482
+ i > 2 && w[i - 3] =~ /[BHD]/ || # e.g., 'bough'
483
+ i > 3 && w[i - 4] =~ /[BH]/) # e.g., 'broughton'
484
+ # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
485
+ if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
486
+ code.add 'F', 'F'
487
+ elsif i > 0 && w[i - 1] != 'I'
488
+ code.add 'K', 'K'
489
+ end
490
+ end
491
+ end
492
+
493
+ def self.encode_gn(w, i, code)
494
+ if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
495
+ code.add 'KN', 'N'
496
+ # not e.g. 'cagney'
497
+ elsif w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
498
+ code.add 'N', 'KN'
499
+ else
500
+ code.add 'KN', 'KN'
501
+ end
502
+ end
503
+
504
+ def self.encode_sc(w, i, code)
505
+ # Schlesinger's rule
506
+ if w[i + 2] == 'H'
507
+ # dutch origin, e.g. 'school', 'schooner'
508
+ if w[i + 3, 2] =~ /OO|UY|E[DM]/
509
+ code.add 'SK', 'SK'
510
+ # 'schermerhorn', 'schenker'
511
+ elsif w[i + 3, 2] =~ /E[RN]/
512
+ code.add 'X', 'SK'
513
+ elsif i == 0 && !vowel?(w[3]) && w[3] != 'W'
514
+ code.add 'X', 'S'
515
+ else
516
+ code.add 'X', 'X'
517
+ end
518
+ elsif w[i + 2] =~ /[IEY]/
519
+ code.add 'S', 'S'
520
+ else
521
+ code.add 'SK', 'SK'
522
+ end
523
+ end
524
+
525
+ def self.slavo_germanic?(w)
526
+ w =~ /W|K|CZ|WITZ/
527
+ end
528
+
529
+ def self.vowel?(c)
530
+ c =~ /[AEIOUY]/
531
+ end
532
+
533
+ def self.c_germanic?(w, i)
534
+ # various germanic
535
+ i > 1 &&
536
+ !vowel?(w[i - 2]) &&
537
+ w[i - 1, 3] == 'ACH' &&
538
+ (w[i + 2] !~ /[IE]/ || w[i - 2, 6] =~ /[BM]ACHER/)
539
+ end
540
+
541
+ def self.ch_greek_roots?(w, i)
542
+ # greek roots e.g. 'chemistry', 'chorus'
543
+ i == 0 && w[1, 5] =~ /^H(ARAC|ARIS|OR|YM|IA|EM)/ && w[0, 5] != 'CHORE'
544
+ end
545
+
546
+ def self.ch_germanic_or_greek?(w, i, len)
547
+ # germanic, greek, or otherwise 'ch' for 'kh' sound
548
+ w[0, 4] =~ /^(V[AO]N\s|SCH)/ ||
549
+ # 'architect but not 'arch', 'orchestra', 'orchid'
550
+ i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/ ||
551
+ (w[i + 2] =~ /[TS]/) ||
552
+ (i > 0 && w[i - 1] =~ /[AOUE]/ || i == 0) &&
553
+ # e.g., 'wachtler', 'wechsler', but not 'tichner'
554
+ (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
555
+ end
556
+
557
+ def self.g_ger_or_gy?(w, i)
558
+ # -ger-, -gy-
559
+ w[i + 1, 2] =~ /^(ER|Y)/ &&
560
+ w[0, 6] !~ /[DRM]ANGER/ &&
561
+ !(i > 0 && w[i - 1] =~ /[EI]/) &&
562
+ !(i > 0 && w[i - 1, 3] =~ /[RO]GY/)
563
+ end
564
+
565
+ def self.j_spanish_pron?(w, i)
566
+ # spanish pron. of e.g. 'bajador'
567
+ i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && w[i + 1] =~ /[AO]/
568
+ end
569
+
570
+ def self.ll_spanish?(w, i, len)
571
+ last = len - 1
572
+ # spanish e.g. 'cabrillo', 'gallegos'
573
+ (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILL[OA]|ALLE/) ||
574
+ (last > 0 && w[last - 1, 2] =~ /[AO]S/ || w[last] =~ /[AO]/) &&
575
+ (i > 0 && w[i - 1, 4] == 'ALLE')
576
+ end
577
+
578
+ def self.r_french?(w, i, last)
579
+ # french e.g. 'rogier', but exclude 'hochmeier'
580
+ i == last && !slavo_germanic?(w) &&
581
+ i > 1 && w[i - 2, 2] == 'IE' &&
582
+ !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
583
+ end
584
+
585
+ def self.x_french?(w, i, last)
586
+ # french e.g. breaux
587
+ i == last && (i > 2 && w[i - 3, 3] =~ /[IE]AU/ || i > 1 && w[i - 2, 2] =~ /[AO]U/)
588
+ end
639
589
  end
640
590
  end