phonetic 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: af5149abea885ede20731d2ddf269a57588ca5fa
4
- data.tar.gz: 2ebb72004c4fc667801a2b7087766c8cecb1ff1a
3
+ metadata.gz: a20da7ce0b4dab68d7671088098226a035c64b05
4
+ data.tar.gz: 2b721bc986d8e23ba6780bb7cab92059e6a7652b
5
5
  SHA512:
6
- metadata.gz: 359113efab060b09395e6805bbd2e4b69aee42177b070a03361daad56c39fa6348ab836e77900a14b54236299b1c3b2f24b2b28cc546294bed5065961c465c72
7
- data.tar.gz: b8879f75acc85d2b24b705ca1cc48265702bafaff41f4f5de0783c1950abb62e160f41e5412c48fec7ff0ba29e2a820fa96a295435eed3228fba28cec0d8cbc9
6
+ metadata.gz: 14325fa3846251dd1a1cbc59b38c12a32471291b45b07074387747fa9331b5ad98b1b0afaa8dbbac62872f9bf959d5e622742e5ec673f3e1294807f91b5fdc85
7
+ data.tar.gz: ad80a4c26cae46cbc516cc6cfebbfea39be69fbfe86426737cd94c065da57f4448ff3ffe9f892bd60af94fd834ae122c472cdc09b5fe683344ef479d1f31f90c
data/.rspec CHANGED
@@ -1,2 +1,2 @@
1
1
  --color
2
- --format doc
2
+ --format doc
data/README.md CHANGED
@@ -93,6 +93,13 @@ or use alias:
93
93
  'Bonnie'.nysiis # => 'BANY'
94
94
  ```
95
95
 
96
+ ### Daitch–Mokotoff Soundex (D–M Soundex)
97
+ ```ruby
98
+ 'Anja'.dm_soundex # => ['060000', '064000']
99
+ 'Schwarz'.dm_soundex # => ['474000', '479400']
100
+ 'Schtolteheim'.dm_soundex # => ['283560']
101
+ ```
102
+
96
103
  ## Contributing
97
104
 
98
105
  1. Fork it
data/lib/phonetic.rb CHANGED
@@ -7,4 +7,5 @@ require 'phonetic/double_metaphone'
7
7
  require 'phonetic/metaphone2'
8
8
  require 'phonetic/caverphone'
9
9
  require 'phonetic/caverphone2'
10
+ require 'phonetic/dm_soundex'
10
11
  require 'phonetic/core_ext/string'
@@ -0,0 +1,12 @@
1
+ require 'phonetic/dm_soundex'
2
+
3
class String
  # Daitch-Mokotoff Soundex codes of this string.
  #
  # Thin convenience wrapper: the receiver and the options hash are handed
  # unchanged to Phonetic::DMSoundex.encode and its result is returned.
  #
  # @param options [Hash] passed through to Phonetic::DMSoundex.encode
  # @return the value produced by Phonetic::DMSoundex.encode
  # @example
  #   'Anja'.dm_soundex         # => ['060000', '064000']
  #   'Schwarz'.dm_soundex      # => ['474000', '479400']
  #   'Schtolteheim'.dm_soundex # => ['283560']
  def dm_soundex(options = {})
    encoder = Phonetic::DMSoundex
    encoder.encode(self, options)
  end
end
@@ -0,0 +1,82 @@
1
+ require 'phonetic/algorithm'
2
+ require 'phonetic/dm_soundex_map'
3
+
4
module Phonetic
  # Daitch–Mokotoff Soundex (D–M Soundex) is a phonetic algorithm invented
  # in 1985 by Jewish genealogists Gary Mokotoff and Randy Daitch.
  #
  # A word is encoded into one or more six-digit codes; several codes are
  # produced when a letter sequence has alternative pronunciations.
  #
  # @example
  #   Phonetic::DMSoundex.encode('Anja')         # => ['060000', '064000']
  #   Phonetic::DMSoundex.encode('Schwarz')      # => ['474000', '479400']
  #   Phonetic::DMSoundex.encode('Schtolteheim') # => ['283560']
  class DMSoundex < Algorithm

    # Encode a string. Simply forwards to {encode_word}.
    #
    # @param str [String] text to encode
    # @param options [Hash] accepted for interface compatibility; not read here
    # @return [Array<String>] unique six-character codes
    def self.encode(str, options = {})
      encode_word(str, options)
    end

    # Encode word to its D-M Soundex codes.
    #
    # The word is stripped, upcased and reduced to the letters A-Z. It is then
    # scanned left to right; at each position the longest letter sequence
    # known to MAP is looked up and one of its three codes is chosen:
    # index 0 for the first encoded element of the name, index 1 when the
    # sequence is followed by one of A, E, I, O, U, J, Y, index 2 otherwise.
    # The first letter of a doubled pair is skipped.
    #
    # A word with no A-Z letters yields ['000000'].
    #
    # @param word [String] word to encode
    # @param options [Hash] accepted for interface compatibility; not read here
    # @return [Array<String>] unique six-character codes
    def self.encode_word(word, options = {})
      w = word.strip.upcase.gsub(/[^A-Z]+/, '')
      i = 0
      code = init_code
      # True until the first element has been encoded. Testing `i == 0`
      # instead would miss the start-of-name rule whenever the word begins
      # with a doubled letter (e.g. "Aaron"), because the first letter of the
      # pair is skipped and encoding really starts at index 1.
      at_start = true
      while i < w.size
        if w[i] != w[i + 1]
          c = find_code(MAP, w, i)
          if c
            # c[3] is the number of letters consumed beyond the first one.
            len = c[3] + 1
            if at_start
              code.add c[0]
            elsif w[i + len] =~ /[AEIOUJY]/
              code.add c[1]
            else
              code.add c[2]
            end
            at_start = false
            i += c[3]
          end
        end
        i += 1
      end
      code.result
    end

    # NOTE: a bare `private` does not change the visibility of methods
    # defined with `def self.`; the helpers below remain callable from
    # outside. Visibility is intentionally left as it was.
    private

    # Build the accumulator used by {encode_word}.
    #
    # It is an Array of "branches", each branch being an Array of code
    # fragments, extended with two singleton methods:
    # * add(a)  - append fragment a to every branch unless it equals the
    #   branch's last fragment; when a is a two-element Array every branch is
    #   forked, one copy per alternative.
    # * result  - join each branch, cut/pad it to six characters with '0' and
    #   drop duplicates.
    #
    # @return [Array<Array<String>>] accumulator starting with one empty branch
    def self.init_code
      code = [[]]
      def code.add(a)
        case a
        when Array
          # Compute the forks for the second alternative before the in-place
          # update for the first alternative changes the branches.
          forks = map { |branch| branch.last != a[1] ? branch + [a[1]] : branch }
          map! { |branch| branch.last != a[0] ? branch + [a[0]] : branch }
          push(*forks)
        else
          map! { |branch| branch.last != a ? branch + [a] : branch }
        end
      end
      def code.result
        map { |branch| branch.join[0..5].ljust(6, '0') }.uniq
      end
      code
    end

    # Find the entry for the longest letter sequence of +w+ starting at +i+.
    #
    # Walks the nested +map+ one letter at a time. A nested Hash may carry a
    # 'self' entry holding the codes for the sequence that ends at that
    # level; the deepest such entry seen is remembered and returned when the
    # walk cannot continue.
    #
    # The returned Array is a fresh copy ([code0, code1, code2, extra]) so the
    # shared table is never modified by a lookup.
    #
    # @param map [Hash] (sub)table to search
    # @param w [String] normalized word
    # @param i [Integer] index of the letter to look up in +map+
    # @param last [Array, nil] best match found so far
    # @param count [Integer] letters consumed beyond the first one
    # @return [Array, nil] three codes followed by the extra-letter count,
    #   or nil when nothing matched
    def self.find_code(map, w, i, last = nil, count = 0)
      elem = map[w[i]]
      case elem
      when Array
        elem.first(3) << count
      when Hash
        own = elem['self']
        best = own ? own.first(3) << count : last
        find_code(elem, w, i + 1, best, count + 1)
      when nil
        last
      end
    end
  end
end
@@ -0,0 +1,233 @@
1
+ # encoding: utf-8
2
+ require 'phonetic/algorithm'
3
+
4
module Phonetic
  class DMSoundex < Algorithm
    # D-M Soundex coding table, stored as a tree keyed by upper-case letters.
    #
    # A value is either
    # * an Array of three codes for the letter sequence spelled by the path
    #   to it: [at the start of a name, before a vowel, anywhere else], or
    # * a Hash of longer sequences that continue with the next letter; its
    #   optional 'self' key holds the three codes of the sequence that ends
    #   at that level.
    #
    # A code that is itself a two-element Array lists two alternative codes
    # (the encoder then produces more than one result). An empty string
    # means "not coded" in that position.
    MAP = {
      'A' => {
        'self' => ['0', '', ''], # A
        'I' => ['0', '1', ''], # AI
        'J' => ['0', '1', ''], # AJ
        'Y' => ['0', '1', ''], # AY
        'U' => ['0', '7', ''] # AU
      },
      'Ą' => ['', '', ['6', '']],
      'E' => {
        'self' => ['0', '', ''], # E
        'I' => ['0', '1', ''], # EI
        'Y' => ['0', '1', ''], # EY
        'J' => ['0', '1', ''], # EJ
        'U' => ['1', '1', ''] # EU
      },
      'O' => {
        'self' => ['0', '', ''], # O
        'I' => ['0', '1', ''], # OI
        'J' => ['0', '1', ''], # OJ
        'Y' => ['0', '1', ''] # OY
      },
      'U' => {
        'self' => ['0', '', ''], # U
        'I' => ['0', '1', ''], # UI
        'J' => ['0', '1', ''], # UJ
        'Y' => ['0', '1', ''], # UY
        'E' => ['0', '', ''] # UE
      },
      'I' => {
        'self' => ['0', '', ''], # I
        'A' => ['1', '', ''], # IA
        'E' => ['1', '', ''], # IE
        'O' => ['1', '', ''], # IO
        'U' => ['1', '', ''] # IU
      },
      'Y' => ['1', '', ''], # Y
      'J' => [['1', '4'], ['', '4'], ['', '4']], # J
      'B' => ['7', '7', '7'], # B
      'C' => {
        'self' => [['5', '4'], ['5', '4'], ['5', '4']], # C
        'H' => {
          'self' => [['5', '4'], ['5', '4'], ['5', '4']], # CH
          'S' => ['5', '54', '54'] # CHS
        },
        'K' => [['5', '45'], ['5', '45'], ['5', '45']], # CK
        'S' => {
          'self' => ['4', '4', '4'], # CS
          'Z' => ['4', '4', '4'] # CSZ
        },
        'Z' => {
          'self' => ['4', '4', '4'], # CZ
          'S' => ['4', '4', '4'] # CZS
        }
      },
      'D' => {
        'self' => ['3', '3', '3'], # D
        'R' => { # DR
          'S' => ['4', '4', '4'], # DRS
          'Z' => ['4', '4', '4'] # DRZ
        },
        'S' => {
          'self' => ['4', '4', '4'], # DS
          'H' => ['4', '4', '4'] # DSH
        },
        'T' => ['3', '3', '3'], # DT
        'Z' => {
          'self' => ['4', '4', '4'], # DZ
          'H' => ['4', '4', '4'], # DZH
          'S' => ['4', '4', '4'] # DZS
        }
      },
      'F' => {
        'self' => ['7', '7', '7'], # F
        'B' => ['7', '7', '7'] # FB
      },
      'G' => ['5', '5', '5'], # G
      'H' => ['5', '5', ''], # H
      'K' => {
        'self' => ['5', '5', '5'], # K
        'H' => ['5', '5', '5'], # KH
        'S' => ['5', '54', '54'] # KS
      },
      'L' => ['8', '8', '8'], # L
      'M' => {
        'self' => ['6', '6', '6'], # M
        'N' => ['', '66', '66'] # MN
      },
      'N' => {
        'self' => ['6', '6', '6'], # N
        'M' => ['', '66', '66'] # NM
      },
      'P' => {
        'self' => ['7', '7', '7'], # P
        'F' => ['7', '7', '7'], # PF
        'H' => ['7', '7', '7'] # PH
      },
      'R' => {
        'self' => ['9', '9', '9'], # R
        'S' => [['94', '4'], ['94', '4'], ['94', '4']], # RS
        'Z' => [['4', '94'], ['4', '94'], ['4', '94']] # RZ
      },
      'Q' => ['5', '5', '5'], # Q
      'S' => {
        'self' => ['4', '4', '4'], # S
        'C' => {
          'self' => ['2', '4', '4'], # SC
          'H' => {
            'self' => ['4', '4', '4'], # SCH
            # SCHD belongs here, next to SCHT; it used to be filed under
            # S-T-S, where it could only ever match the sequence "STSD".
            'D' => ['2', '43', '43'], # SCHD
            'T' => {
              'self' => ['2', '43', '43'], # SCHT
              'S' => { # SCHTS
                'C' => { # SCHTSC
                  'H' => ['2', '4', '4'] # SCHTSCH
                },
                'H' => ['2', '4', '4'] # SCHTSH
              },
              'C' => { # SCHTC
                'H' => ['2', '4', '4'] # SCHTCH
              }
            }
          }
        },
        'D' => ['2', '43', '43'], # SD
        'H' => {
          'self' => ['4', '4', '4'], # SH
          'C' => { # SHC
            'H' => ['2', '4', '4'] # SHCH
          },
          'D' => ['2', '43', '43'], # SHD
          'T' => {
            'self' => ['2', '43', '43'], # SHT
            'C' => { # SHTC
              'H' => ['2', '4', '4'] # SHTCH
            },
            'S' => { # SHTS
              'H' => ['2', '4', '4'] # SHTSH
            }
          }
        },
        'T' => {
          'self' => ['2', '43', '43'], # ST
          'C' => { # STC
            'H' => ['2', '4', '4'] # STCH
          },
          'S' => { # STS
            'C' => { # STSC
              'H' => ['2', '4', '4'] # STSCH
            },
            'H' => ['2', '4', '4'] # STSH
          },
          'R' => { # STR
            'S' => ['2', '4', '4'], # STRS
            'Z' => ['2', '4', '4'] # STRZ
          }
        },
        'Z' => {
          'self' => ['4', '4', '4'], # SZ
          'C' => { # SZC
            'S' => ['2', '4', '4'], # SZCS
            'Z' => ['2', '4', '4'] # SZCZ
          },
          'D' => ['2', '43', '43'], # SZD
          'T' => ['2', '43', '43'] # SZT
        }
      },
      'T' => {
        'self' => ['3', '3', '3'], # T
        'C' => {
          'self' => ['4', '4', '4'], # TC
          'H' => ['4', '4', '4'] # TCH
        },
        'H' => ['3', '3', '3'], # TH
        'R' => { # TR
          'C' => { # TRC
            'H' => ['4', '4', '4'] # TRCH
          },
          'S' => ['4', '4', '4'], # TRS
          'Z' => ['4', '4', '4'] # TRZ
        },
        'S' => {
          'self' => ['4', '4', '4'], # TS
          'H' => ['4', '4', '4'], # TSH
          'C' => { # TSC
            'H' => ['4', '4', '4'] # TSCH
          },
          'Z' => ['4', '4', '4'] # TSZ
        },
        'T' => { # TT
          'C' => { # TTC
            'H' => ['4', '4', '4'] # TTCH
          },
          'S' => {
            'self' => ['4', '4', '4'], # TTS
            'C' => { # TTSC
              'H' => ['4', '4', '4'] # TTSCH
            },
            'Z' => ['4', '4', '4'] # TTSZ
          },
          'Z' => ['4', '4', '4'] # TTZ
        },
        'Z' => {
          'self' => ['4', '4', '4'], # TZ
          'S' => ['4', '4', '4'] # TZS
        }
      },
      'X' => ['5', '54', '54'], # X
      'V' => ['7', '7', '7'], # V
      'W' => ['7', '7', '7'], # W
      'Z' => {
        'self' => ['4', '4', '4'], # Z
        'H' => {
          'self' => ['4', '4', '4'], # ZH
          'S' => { # ZHS
            'H' => ['4', '4', '4'] # ZHSH
          }
        },
        'S' => {
          'self' => ['4', '4', '4'], # ZS
          'C' => { # ZSC
            'H' => ['4', '4', '4'] # ZSCH
          }
        }
      }
    }
  end
end
@@ -9,8 +9,10 @@ module Phonetic
9
9
  #
10
10
  # This implementation based on the PHP implementation by Stephen Woodbridge
11
11
  # and contains modifications of algorithm by Kevin Atkinson.
12
- # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
13
- # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
12
+ # @see http://swoodbridge.com/DoubleMetaPhone/
13
+ # PHP implementation by Stephen Woodbridge
14
+ # @see http://aspell.net/metaphone/dmetaph.cpp
15
+ # C++ implementation with modifications by Kevin Atkinson
14
16
  # @example
15
17
  # Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
16
18
  # Phonetic::DoubleMetaphone.encode('dumb') # => ['TM', 'TM']
@@ -20,605 +22,73 @@ module Phonetic
20
22
  # Phonetic::Metaphone2.encode('dumb') # => ['TM', 'TM']
21
23
  # Phonetic::Metaphone2.encode('edgar') # => ['ATKR', 'ATKR']
22
24
  class DoubleMetaphone < Algorithm
23
- VOWELS = 'AEIOUY'
24
-
25
25
  # Encode word to its Double Metaphone code.
26
26
  def self.encode_word(word, options = { size: 4 })
27
27
  code_size = options[:size] || 4
28
28
  w = word.strip.upcase
29
- primary = ''
30
- secondary = ''
29
+ code = ['', '']
30
+ def code.add(primary, secondary)
31
+ self[0] += primary
32
+ self[1] += secondary
33
+ end
31
34
  i = 0
32
35
  len = w.size
33
36
  last = len - 1
34
37
  # pad the original string so that we can index beyond the edge of the world
35
38
  w += ' ' * 5
36
- # skip these when at start of word
37
- i += 1 if ['GN','KN','PN','WR','PS'].include? w[0, 2]
38
- # initial 'X' is pronounced 'Z' e.g. 'Xavier'
39
- if w[0] == 'X'
40
- primary += 'S'
41
- secondary += 'S'
42
- i += 1
43
- end
44
- while i < len && (primary.size < code_size || primary.size < code_size)
39
+ i += encode_start_of_word(w, code)
40
+ while i < len && (code.first.size < code_size || code.last.size < code_size)
45
41
  case w[i]
46
42
  when 'A', 'E', 'I', 'O', 'U', 'Y'
47
- if i == 0
48
- # all init vowels now map to 'A'
49
- primary += 'A'
50
- secondary += 'A'
51
- end
52
43
  i += 1
53
44
  when 'B'
54
45
  # "-mb", e.g", "dumb", already skipped over...
55
- primary += 'P'
56
- secondary += 'P'
57
- i += (w[i + 1] == 'B') ? 2 : 1
46
+ i += gen_encode(w, i, 'P', 'P', code)
58
47
  when 'Ç', 'ç'
59
- primary += 'S'
60
- secondary += 'S'
48
+ code.add 'S', 'S'
61
49
  i += 1
62
50
  when 'C'
63
- # various germanic
64
- if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
65
- (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
66
- primary += 'K'
67
- secondary += 'K'
68
- i += 2
69
- # special case 'caesar'
70
- elsif i == 0 && w[i, 6] == 'CAESAR'
71
- primary += 'S'
72
- secondary += 'S'
73
- i += 2
74
- # italian 'chianti'
75
- elsif w[i, 4] == 'CHIA'
76
- primary += 'K'
77
- secondary += 'K'
78
- i += 2
79
- elsif w[i, 2] == 'CH'
80
- # find 'michael'
81
- if i > 0 && w[i, 4] == 'CHAE'
82
- primary += 'K'
83
- secondary += 'X'
84
- i += 2
85
- # greek roots e.g. 'chemistry', 'chorus'
86
- elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
87
- w[0, 5] != 'CHORE'
88
- primary += 'K'
89
- secondary += 'K'
90
- i += 2
91
- else
92
- # germanic, greek, or otherwise 'ch' for 'kh' sound
93
- if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
94
- # 'architect but not 'arch', 'orchestra', 'orchid'
95
- (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
96
- (w[i + 2] =~ /[TS]/) ||
97
- ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
98
- # e.g., 'wachtler', 'wechsler', but not 'tichner'
99
- (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
100
- primary += 'K'
101
- secondary += 'K'
102
- else
103
- if i > 0
104
- if w[0, 2] == 'MC'
105
- # e.g., "McHugh"
106
- primary += 'K'
107
- secondary += 'K'
108
- else
109
- primary += 'X'
110
- secondary += 'K'
111
- end
112
- else
113
- primary += 'X'
114
- secondary += 'X'
115
- end
116
- end
117
- i += 2
118
- end
119
- elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
120
- # e.g, 'czerny'
121
- primary += 'S'
122
- secondary += 'X'
123
- i += 2
124
- elsif w[i + 1, 3] == 'CIA'
125
- # e.g., 'focaccia'
126
- primary += 'X'
127
- secondary += 'X'
128
- i += 3
129
- # double 'C', but not if e.g. 'McClellan'
130
- elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
131
- # 'bellocchio' but not 'bacchus'
132
- if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
133
- # 'accident', 'accede' 'succeed'
134
- if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
135
- # 'bacci', 'bertucci', other italian
136
- primary += 'KS'
137
- secondary += 'KS'
138
- else
139
- primary += 'X'
140
- secondary += 'X'
141
- end
142
- i += 3
143
- else
144
- # Pierce's rule
145
- primary += 'K'
146
- secondary += 'K'
147
- i += 2
148
- end
149
- elsif w[i, 2] =~ /CK|CG|CQ/
150
- primary += 'K'
151
- secondary += 'K'
152
- i += 2
153
- elsif w[i, 2] =~ /CI|CE|CY/
154
- # italian vs. english
155
- if w[i, 3] =~ /CIO|CIE|CIA/
156
- primary += 'S'
157
- secondary += 'X'
158
- else
159
- primary += 'S'
160
- secondary += 'S'
161
- end
162
- i += 2
163
- else
164
- primary += 'K'
165
- secondary += 'K'
166
- # name sent in 'mac caffrey', 'mac gregor'
167
- if w[i + 1, 2] =~ /\s[CQG]/
168
- i += 3
169
- else
170
- if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
171
- i += 2
172
- else
173
- i += 1
174
- end
175
- end
176
- end
51
+ i += encode_c(w, i, len, code)
177
52
  when 'D'
178
- if w[i, 2] == 'DG'
179
- if w[i + 2] =~ /[IEY]/
180
- # e.g. 'edge'
181
- primary += 'J'
182
- secondary += 'J'
183
- i += 3
184
- else
185
- # e.g. 'edgar'
186
- primary += 'TK'
187
- secondary += 'TK'
188
- i += 2
189
- end
190
- elsif w[i, 2] =~ /DT|DD/
191
- primary += 'T'
192
- secondary += 'T'
193
- i += 2
194
- else
195
- primary += 'T'
196
- secondary += 'T'
197
- i += 1
198
- end
199
- when 'F'
200
- if w[i + 1] == 'F'
201
- i += 2
202
- else
203
- i += 1
204
- end
205
- primary += 'F'
206
- secondary += 'F'
53
+ i += encode_d(w, i, len, code)
54
+ when 'F', 'K', 'N'
55
+ i += gen_encode(w, i, w[i], w[i], code)
207
56
  when 'G'
208
- if w[i + 1] == 'H'
209
- if i > 0 && !vowel?(w[i - 1])
210
- primary += 'K'
211
- secondary += 'K'
212
- i += 2
213
- elsif i == 0
214
- # ghislane, ghiradelli
215
- if w[i + 2] == 'I'
216
- primary += 'J'
217
- secondary += 'J'
218
- else
219
- primary += 'K'
220
- secondary += 'K'
221
- end
222
- i += 2
223
- # Parker's rule (with some further refinements) - e.g., 'hugh'
224
- elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
225
- # e.g., 'bough'
226
- (i > 2 && w[i - 3] =~ /[BHD]/) ||
227
- # e.g., 'broughton'
228
- (i > 3 && w[i - 4] =~ /[BH]/)
229
- i += 2
230
- else
231
- # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
232
- if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
233
- primary += 'F'
234
- secondary += 'F'
235
- else
236
- if i > 0 && w[i - 1] != 'I'
237
- primary += 'K'
238
- secondary += 'K'
239
- end
240
- end
241
- i += 2
242
- end
243
- elsif w[i + 1] == 'N'
244
- if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
245
- primary += 'KN'
246
- secondary += 'N'
247
- else
248
- # not e.g. 'cagney'
249
- if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
250
- primary += 'N'
251
- secondary += 'KN'
252
- else
253
- primary += 'KN'
254
- secondary += 'KN'
255
- end
256
- end
257
- i += 2
258
- # 'tagliaro'
259
- elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
260
- primary += 'KL'
261
- secondary += 'L'
262
- i += 2
263
- # -ges-,-gep-,-gel-, -gie- at beginning
264
- elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
265
- primary += 'K'
266
- secondary += 'J'
267
- i += 2
268
- # -ger-, -gy-
269
- elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
270
- !(w[0, 6] =~ /[DRM]ANGER/) &&
271
- !(i > 0 && w[i - 1] =~ /[EI]/) &&
272
- !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
273
- primary += 'K'
274
- secondary += 'J'
275
- i += 2
276
- # italian e.g, 'biaggi'
277
- elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
278
- if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
279
- primary += 'K'
280
- secondary += 'K'
281
- else
282
- if w[i + 1, 4] =~ /IER\s/
283
- primary += 'J'
284
- secondary += 'J'
285
- else
286
- primary += 'J'
287
- secondary += 'K'
288
- end
289
- end
290
- i += 2
291
- else
292
- if w[i + 1] == 'G'
293
- i += 2
294
- else
295
- i += 1
296
- end
297
- primary += 'K'
298
- secondary += 'K'
299
- end
57
+ i += encode_g(w, i, len, code)
300
58
  when 'H'
301
- # only keep if first & before vowel or btw. 2 vowels
302
- if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
303
- primary += 'H'
304
- secondary += 'H'
305
- i += 2
306
- else # also takes care of 'HH'
307
- i += 1
308
- end
59
+ i += encode_h(w, i, len, code)
309
60
  when 'J'
310
- # obvious spanish, 'jose', 'san jacinto'
311
- if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
312
- if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
313
- primary += 'H'
314
- secondary += 'H'
315
- else
316
- primary += 'J'
317
- secondary += 'H'
318
- end
319
- i += 1
320
- else
321
- if i == 0 && w[i, 4] != 'JOSE'
322
- primary += 'J'
323
- secondary += 'A'
324
- # Yankelovich/Jankelowicz
325
- else
326
- # spanish pron. of e.g. 'bajador'
327
- if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
328
- primary += 'J'
329
- secondary += 'H'
330
- else
331
- if i == last
332
- primary += 'J'
333
- #secondary += ' '
334
- else
335
- if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
336
- primary += 'J'
337
- secondary += 'J'
338
- end
339
- end
340
- end
341
- end
342
- if w[i + 1] == 'J'
343
- i += 2
344
- else
345
- i += 1
346
- end
347
- end
348
- when 'K'
349
- if w[i + 1] == 'K'
350
- i += 2
351
- else
352
- i += 1
353
- end
354
- primary += 'K'
355
- secondary += 'K'
61
+ i += encode_j(w, i, len, code)
356
62
  when 'L'
357
- if w[i + 1] == 'L'
358
- # spanish e.g. 'cabrillo', 'gallegos'
359
- if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
360
- ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
361
- (i > 0 && w[i - 1, 4] == 'ALLE'))
362
- primary += 'L'
363
- i += 2
364
- next
365
- end
366
- i += 2
367
- else
368
- i += 1
369
- end
370
- primary += 'L'
371
- secondary += 'L'
63
+ i += encode_l(w, i, len, code)
372
64
  when 'M'
373
- if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == "ER")) ||
374
- # 'dumb','thumb'
375
- w[i + 1] == 'M'
376
- i += 2
377
- else
378
- i += 1
379
- end
380
- primary += 'M'
381
- secondary += 'M'
382
- when 'N'
383
- if w[i + 1] == 'N'
384
- i += 2
385
- else
386
- i += 1
387
- end
388
- primary += 'N'
389
- secondary += 'N'
65
+ i += encode_m(w, i, len, code)
390
66
  when 'Ñ', 'ñ'
391
- i += 1;
392
- primary += 'N'
393
- secondary += 'N'
67
+ code.add 'N', 'N'
68
+ i += 1
394
69
  when 'P'
395
- if w[i + 1] == 'H'
396
- primary += 'F'
397
- secondary += 'F'
398
- i += 2
399
- else
400
- # also account for "campbell", "raspberry"
401
- if w[i + 1] =~ /[PB]/
402
- i += 2
403
- else
404
- i += 1
405
- end
406
- primary += 'P'
407
- secondary += 'P'
408
- end
70
+ i += encode_p(w, i, len, code)
409
71
  when 'Q'
410
- if w[i + 1] == 'Q'
411
- i += 2
412
- else
413
- i += 1
414
- end
415
- primary += 'K'
416
- secondary += 'K'
72
+ i += gen_encode(w, i, 'K', 'K', code)
417
73
  when 'R'
418
- # french e.g. 'rogier', but exclude 'hochmeier'
419
- if i == last && !slavo_germanic?(w) &&
420
- (i > 1 && w[i - 2, 2] == "IE") &&
421
- !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
422
- secondary += 'R'
423
- else
424
- primary += 'R'
425
- secondary += 'R'
426
- end
427
- if w[i + 1] == 'R'
428
- i += 2
429
- else
430
- i += 1
431
- end
74
+ i += encode_r(w, i, len, code)
432
75
  when 'S'
433
- # special cases 'island', 'isle', 'carlisle', 'carlysle'
434
- if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
435
- i += 1
436
- # special case 'sugar-'
437
- elsif i == 0 && w[i, 5] == 'SUGAR'
438
- primary += 'X'
439
- secondary += 'S'
440
- i += 1
441
- elsif w[i, 2] == 'SH'
442
- # germanic
443
- if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
444
- primary += 'S'
445
- secondary += 'S'
446
- else
447
- primary += 'X'
448
- secondary += 'X'
449
- end
450
- i += 2
451
- # italian & armenian
452
- elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
453
- if !slavo_germanic?(w)
454
- primary += 'S'
455
- secondary += 'X'
456
- else
457
- primary += 'S'
458
- secondary += 'S'
459
- end
460
- i += 3
461
- # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
462
- # also, -sz- in slavic language altho in hungarian it is pronounced 's'
463
- elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
464
- primary += 'S'
465
- secondary += 'X'
466
- if w[i + 1] == 'Z'
467
- i += 2
468
- else
469
- i += 1
470
- end
471
- elsif w[i, 2] == 'SC'
472
- # Schlesinger's rule
473
- if w[i + 2] == 'H'
474
- # dutch origin, e.g. 'school', 'schooner'
475
- if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
476
- # 'schermerhorn', 'schenker'
477
- if w[i + 3, 2] =~ /ER|EN/
478
- primary += 'X'
479
- secondary += 'SK'
480
- else
481
- primary += 'SK'
482
- secondary += 'SK'
483
- end
484
- i += 3
485
- else
486
- if i == 0 && !vowel?(w[3]) && w[3] != 'W'
487
- primary += 'X'
488
- secondary += 'S'
489
- else
490
- primary += 'X'
491
- secondary += 'X'
492
- end
493
- i += 3
494
- end
495
- elsif w[i + 2, 1] =~ /[IEY]/
496
- primary += 'S'
497
- secondary += 'S'
498
- i += 3
499
- else
500
- primary += 'SK'
501
- secondary += 'SK'
502
- i += 3
503
- end
504
- else
505
- # french e.g. 'resnais', 'artois'
506
- if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
507
- secondary += 'S'
508
- else
509
- primary += 'S'
510
- secondary += 'S'
511
- end
512
- if w[i + 1] =~ /[SZ]/
513
- i += 2
514
- else
515
- i += 1
516
- end
517
- end
76
+ i += encode_s(w, i, len, code)
518
77
  when 'T'
519
- if w[i, 4] == 'TION'
520
- primary += 'X'
521
- secondary += 'X'
522
- i += 3
523
- elsif w[i, 3] =~ /TIA|TCH/
524
- primary += 'X'
525
- secondary += 'X'
526
- i += 3
527
- elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
528
- # special case 'thomas', 'thames' or germanic
529
- if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /VAN|VON\s/ || w[0, 3] == 'SCH'
530
- primary += 'T'
531
- secondary += 'T'
532
- else
533
- primary += '0'
534
- secondary += 'T'
535
- end
536
- i += 2
537
- else
538
- if w[i + 1] =~ /[TD]/
539
- i += 2
540
- else
541
- i += 1
542
- end
543
- primary += 'T'
544
- secondary += 'T'
545
- end
78
+ i += encode_t(w, i, len, code)
546
79
  when 'V'
547
- if w[i + 1] == 'V'
548
- i += 2
549
- else
550
- i += 1
551
- end
552
- primary += 'F'
553
- secondary += 'F'
80
+ i += gen_encode(w, i, 'F', 'F', code)
554
81
  when 'W'
555
- # can also be in middle of word
556
- if w[i, 2] == 'WR'
557
- primary += 'R'
558
- secondary += 'R'
559
- i += 2
560
- else
561
- if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
562
- # Wasserman should match Vasserman
563
- if vowel?(w[i + 1])
564
- primary += 'A'
565
- secondary += 'F'
566
- else
567
- # need Uomo to match Womo
568
- primary += 'A'
569
- secondary += 'A'
570
- end
571
- end
572
- # Arnow should match Arnoff
573
- if i == last && i > 0 && vowel?(w[i - 1]) ||
574
- (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
575
- secondary += 'F'
576
- i += 1
577
- elsif w[i, 4] =~ /WICZ|WITZ/
578
- # polish e.g. 'filipowicz'
579
- primary += 'TS'
580
- secondary += 'FX'
581
- i += 4
582
- else
583
- i += 1
584
- end
585
- end
82
+ i += encode_w(w, i, len, code)
586
83
  when 'X'
587
- # french e.g. breaux
588
- if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
589
- primary += 'KS'
590
- secondary += 'KS'
591
- end
592
- if w[i + 1] =~ /[CX]/
593
- i += 2
594
- else
595
- i += 1
596
- end
84
+ i += encode_x(w, i, len, code)
597
85
  when 'Z'
598
- # chinese pinyin e.g. 'zhao'
599
- if w[i + 1] == 'H'
600
- primary += 'J'
601
- secondary += 'J'
602
- i += 2
603
- else
604
- if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
605
- primary += 'S'
606
- secondary += 'TS';
607
- else
608
- primary += 'S'
609
- secondary += 'S';
610
- end
611
- if w[i + 1] == 'Z'
612
- i += 2
613
- else
614
- i += 1
615
- end
616
- end
86
+ i += encode_z(w, i, len, code)
617
87
  else
618
88
  i += 1
619
89
  end
620
90
  end
621
- [primary[0, code_size], secondary[0, code_size]]
91
+ [code.first[0, code_size], code.last[0, code_size]]
622
92
  end
623
93
 
624
94
  def self.encode(str, options = { size: 4 })
@@ -627,14 +97,494 @@ module Phonetic
627
97
 
628
98
  private
629
99
 
630
- def self.slavo_germanic?(str)
631
- !!(str[/W|K|CZ|WITZ/])
100
# Handles the letter combinations that are special only at the start of a word.
#
# w    - the word being encoded (the patterns below are upper-case).
# code - accumulator responding to +add(primary, secondary)+.
#
# Returns the index at which scanning should continue: 1 when the first
# letter was consumed here, 0 otherwise.
def self.encode_start_of_word(w, code)
  # 'GN', 'KN', 'PN', 'WR', 'PS': the first letter is skipped, nothing emitted
  return 1 if w[0, 2] =~ /[GKP]N|WR|PS/

  if w[0] == 'X'
    # initial 'X' is emitted as 'S', e.g. 'Xavier'
    code.add('S', 'S')
  elsif w[0] =~ /[AEIOUY]/
    # every initial vowel maps to 'A'
    code.add('A', 'A')
  elsif w[0, 6] == 'CAESAR'
    # special case 'caesar'
    code.add('S', 'S')
  else
    return 0
  end
  1
end
633
118
 
634
- def self.vowel?(char)
635
- c = VOWELS[char.to_s]
636
- !c.nil? && !c.empty?
119
# Generic encoder for letters with no special rules: emits the given
# primary/secondary codes and skips a doubled letter.
#
# Returns 2 when the character after index i equals the one at i, else 1.
def self.gen_encode(w, i, primary, secondary, code)
  code.add(primary, secondary)
  doubled = w[i + 1] == w[i]
  doubled ? 2 : 1
end
123
 
124
# Encodes the 'C' found at index i of w.
#
# w    - the word being encoded (patterns are upper-case).
# i    - index of the 'C'.
# len  - word length, forwarded to encode_ch.
# code - accumulator responding to +add(primary, secondary)+.
#
# Returns the number of characters consumed.
def self.encode_c(w, i, len, code)
  if c_germanic?(w, i)
    # various germanic
    code.add('K', 'K')
    2
  elsif w[i, 2] == 'CH'
    encode_ch(w, i, len, code)
    2
  elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
    # e.g. 'czerny'
    code.add('S', 'X')
    2
  elsif w[i + 1, 3] == 'CIA'
    # e.g. 'focaccia'
    code.add('X', 'X')
    3
  elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
    # double 'C', but not if e.g. 'McClellan'
    encode_cc(w, i, code) + 2
  elsif w[i, 2] =~ /C[KGQ]/
    code.add('K', 'K')
    2
  elsif w[i, 2] =~ /C[IEY]/
    # italian vs. english
    secondary = w[i, 3] =~ /CI[OEA]/ ? 'X' : 'S'
    code.add('S', secondary)
    2
  else
    code.add('K', 'K')
    if w[i + 1, 2] =~ /\s[CQG]/
      # name sent in 'mac caffrey', 'mac gregor'
      3
    elsif w[i + 1] =~ /[CKQ]/ && w[i + 1, 2] !~ /C[EI]/
      2
    else
      1
    end
  end
end
167
+
168
# Encodes the 'D' found at index i of w.
#
# 'DG' before I/E/Y becomes 'J' (e.g. 'edge'), any other 'DG' becomes 'TK'
# (e.g. 'edgar'); everything else becomes 'T', with 'DT'/'DD' consumed as a
# pair. +len+ is accepted for signature parity and is not used.
#
# Returns the number of characters consumed.
def self.encode_d(w, i, len, code)
  if w[i, 2] == 'DG'
    soft = w[i + 2] =~ /[IEY]/
    soft ? code.add('J', 'J') : code.add('TK', 'TK')
    soft ? 3 : 2
  else
    code.add('T', 'T')
    w[i, 2] =~ /D[TD]/ ? 2 : 1
  end
end
188
+
189
# Encodes the 'G' found at index i of w.
#
# Delegates 'GH' and 'GN' to encode_gh / encode_gn and handles the remaining
# contexts inline. +len+ is accepted for signature parity and is not used.
#
# Returns the number of characters consumed: 2 in every case except a plain
# 'G' that is not followed by another 'G', which consumes 1.
def self.encode_g(w, i, len, code)
  following = w[i + 1]
  if following == 'H'
    encode_gh(w, i, code)
  elsif following == 'N'
    encode_gn(w, i, code)
  elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
    # 'tagliaro'
    code.add('KL', 'L')
  elsif (i == 0 && w[1, 2] =~ /^Y|E[SPBLYIR]|I[BLNE]/) || g_ger_or_gy?(w, i)
    # -ges-, -gep-, -gel-, -gie- at beginning, or -ger-, -gy-
    code.add('K', 'J')
  elsif following =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
    # italian e.g. 'biaggi'
    if w[0, 4] =~ /^(VAN |VON |SCH)/ || w[i + 1, 2] == 'ET'
      code.add('K', 'K')
    elsif w[i + 1, 4] =~ /IER\s/
      # NOTE(review): this needs a whitespace character after 'IER'; a
      # word-final 'IER' only matches if w is space-padded - confirm.
      code.add('J', 'J')
    else
      code.add('J', 'K')
    end
  else
    code.add('K', 'K')
    return following == 'G' ? 2 : 1
  end
  2
end
219
+
220
# Encodes the 'H' found at index i of w.
#
# 'H' is kept only when it is followed by a vowel and is either the first
# letter or preceded by a vowel. +len+ is not used.
#
# Returns 2 when 'H' was emitted (the following vowel is skipped too), else 1.
def self.encode_h(w, i, len, code)
  initial_or_after_vowel = i == 0 || (i > 0 && vowel?(w[i - 1]))
  return 1 unless initial_or_after_vowel && vowel?(w[i + 1])

  code.add('H', 'H')
  2
end
229
+
230
# Encodes the 'J' found at index i of w.
#
# w    - the word being encoded (patterns are upper-case).
# i    - index of the 'J'.
# len  - word length, used to detect a word-final 'J'.
# code - accumulator responding to +add(primary, secondary)+.
#
# Returns the number of characters consumed.
def self.encode_j(w, i, len, code)
  san_prefix = w[0, 4] =~ /SAN\s/

  # obvious spanish, 'jose', 'san jacinto'
  if w[i, 4] == 'JOSE' || san_prefix
    if (i == 0 && w[i + 4] == ' ') || san_prefix
      code.add('H', 'H')
    else
      code.add('J', 'H')
    end
    return 1
  end

  if i == 0 && w[i, 4] != 'JOSE'
    # Yankelovich/Jankelowicz
    code.add('J', 'A')
  elsif j_spanish_pron?(w, i)
    # spanish pron. of e.g. 'bajador'
    code.add('J', 'H')
  elsif i == len - 1
    code.add('J', '')
  elsif w[i + 1] !~ /[LTKSNMBZ]/ && !(i > 0 && w[i - 1] =~ /[SKL]/)
    code.add('J', 'J')
  end
  w[i + 1] == 'J' ? 2 : 1
end
258
+
259
# Encodes the 'L' found at index i of w.
#
# A single 'L' emits 'L'/'L'. A double 'L' is consumed as a pair and its
# secondary code is left empty when ll_spanish? recognises a spanish form
# (e.g. 'cabrillo', 'gallegos').
#
# Returns the number of characters consumed.
def self.encode_l(w, i, len, code)
  unless w[i + 1] == 'L'
    code.add('L', 'L')
    return 1
  end

  code.add('L', ll_spanish?(w, i, len) ? '' : 'L')
  2
end
274
+
275
# Encodes the 'M' found at index i of w.
#
# w    - the word being encoded (patterns are upper-case).
# i    - index of the 'M'.
# len  - word length, used to recognise a word-final 'UMB'.
# code - accumulator responding to +add(primary, secondary)+.
#
# The 'B' of 'UMB' is silent when 'UMB' ends the word ('dumb', 'thumb'), is
# followed by a space, or is followed by 'ER' ('plumber'); in those cases the
# 'B' is consumed together with the 'M'. A doubled 'M' is consumed as a pair.
#
# Returns the number of characters consumed.
def self.encode_m(w, i, len, code)
  code.add('M', 'M')
  # The regexp alone only sees 'UMB ' / 'UMBER'; the extra test covers an
  # 'UMB' that is the last three letters of an unpadded word.
  silent_b = i > 0 &&
             (w[i - 1, 5] =~ /UMB( |ER)/ ||
              (i + 1 == len - 1 && w[i - 1, 3] == 'UMB'))
  (silent_b || w[i + 1] == 'M') ? 2 : 1
end
282
+
283
# Encodes the 'P' found at index i of w.
#
# 'PH' becomes 'F'; any other 'P' becomes 'P', and a following 'P' or 'B' is
# consumed with it (also accounts for "campbell", "raspberry").
# +len+ is not used.
#
# Returns the number of characters consumed.
def self.encode_p(w, i, len, code)
  following = w[i + 1]
  if following == 'H'
    code.add('F', 'F')
    return 2
  end

  code.add('P', 'P')
  following =~ /[PB]/ ? 2 : 1
end
295
+
296
# Encodes the 'R' found at index i of w.
#
# When r_french? recognises a french ending (e.g. 'rogier', but not
# 'hochmeier') the primary code is left empty; the secondary is always 'R'.
#
# Returns 2 for a doubled 'R', else 1.
def self.encode_r(w, i, len, code)
  primary = r_french?(w, i, len - 1) ? '' : 'R'
  code.add(primary, 'R')
  w[i + 1] == 'R' ? 2 : 1
end
306
+
307
# Encodes the 'S' found at index i of w.
#
# w    - the word being encoded (patterns are upper-case).
# i    - index of the 'S'.
# len  - word length, used to detect a word-final 'S'.
# code - accumulator responding to +add(primary, secondary)+.
#
# Returns the number of characters consumed.
def self.encode_s(w, i, len, code)
  following = w[i + 1]
  if i > 0 && w[i - 1, 3] =~ /[IY]SL/
    # special cases 'island', 'isle', 'carlisle', 'carlysle': silent 'S'
    1
  elsif i == 0 && w[i, 5] == 'SUGAR'
    # special case 'sugar-'
    code.add('X', 'S')
    1
  elsif w[i, 2] == 'SH'
    # germanic endings keep a plain 'S'
    germanic = w[i + 1, 4] =~ /H(EIM|OEK|OL[MZ])/
    germanic ? code.add('S', 'S') : code.add('X', 'X')
    2
  elsif w[i, 3] =~ /SI[OA]/
    # italian & armenian
    code.add('S', slavo_germanic?(w) ? 'S' : 'X')
    3
  elsif (i == 0 && following =~ /[MNLW]/) || following == 'Z'
    # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match
    # 'schneider'; also -sz- in slavic languages, although in hungarian it is
    # pronounced 's'
    code.add('S', 'X')
    following == 'Z' ? 2 : 1
  elsif w[i, 2] == 'SC'
    encode_sc(w, i, code)
    3
  else
    # french e.g. 'resnais', 'artois': final 'S' is silent in the primary code
    silent = i == len - 1 && i > 1 && w[i - 2, 2] =~ /[AO]I/
    code.add(silent ? '' : 'S', 'S')
    following =~ /[SZ]/ ? 2 : 1
  end
end
351
+
352
# Encodes the 'T' found at index i of w.
#
# 'TION', 'TIA' and 'TCH' become 'X'. 'TH'/'TTH' become '0' (theta) with a
# 'T' secondary, except for 'thomas', 'thames' or germanic words, where both
# codes are 'T'. Any other 'T' becomes 'T', consuming a following 'T' or 'D'.
# +len+ is not used.
#
# Returns the number of characters consumed.
def self.encode_t(w, i, len, code)
  if w[i, 4] =~ /^(TION|TIA|TCH)/
    code.add('X', 'X')
    3
  elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
    hard = w[i + 2, 2] =~ /[OA]M/ || w[0, 4] =~ /^(VAN |VON |SCH)/
    code.add(hard ? 'T' : '0', 'T')
    2
  else
    code.add('T', 'T')
    w[i + 1] =~ /[TD]/ ? 2 : 1
  end
end
371
+
372
# Encodes the 'W' found at index i of w.
#
# w    - the word being encoded (patterns are upper-case).
# i    - index of the 'W'.
# len  - word length, used to detect a word-final 'W'.
# code - accumulator responding to +add(primary, secondary)+.
#
# Returns the number of characters consumed.
def self.encode_w(w, i, len, code)
  # 'WR' can also be in the middle of a word
  if w[i, 2] == 'WR'
    code.add('R', 'R')
    return 2
  end

  if i == 0
    if vowel?(w[i + 1])
      # Wasserman should match Vasserman
      code.add('A', 'F')
    elsif w[i, 2] == 'WH'
      # need Uomo to match Womo
      code.add('A', 'A')
    end
  end

  final_after_vowel = i == len - 1 && i > 0 && vowel?(w[i - 1])
  if final_after_vowel ||
     (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) ||
     w[0, 3] == 'SCH'
    # Arnow should match Arnoff
    code.add('', 'F')
    1
  elsif w[i, 4] =~ /WICZ|WITZ/
    # polish e.g. 'filipowicz'
    code.add('TS', 'FX')
    4
  else
    1
  end
end
402
+
403
# Encodes the 'X' found at index i of w.
#
# Emits 'KS' unless x_french? reports a silent french ending (e.g. 'breaux').
#
# Returns 2 when the next character is 'C' or 'X', else 1.
def self.encode_x(w, i, len, code)
  silent = x_french?(w, i, len - 1)
  code.add('KS', 'KS') if !silent
  return 2 if w[i + 1] =~ /[CX]/

  1
end
408
+
409
# Encodes the 'Z' found at index i of w.
#
# 'ZH' becomes 'J' (chinese pinyin e.g. 'zhao'). Otherwise the primary code is
# 'S' and the secondary is 'TS' when the 'Z' is followed by 'ZO', 'ZI' or
# 'ZA', or when the word is slavo-germanic and the 'Z' is neither the first
# letter nor preceded by 'T'; else the secondary is 'S'. +len+ is not used.
#
# Returns the number of characters consumed (2 for 'ZH' and 'ZZ', else 1).
def self.encode_z(w, i, len, code)
  following = w[i + 1]
  if following == 'H'
    code.add('J', 'J')
    return 2
  end

  affricate = w[i + 1, 2] =~ /Z[OIA]/ ||
              (slavo_germanic?(w) && i > 0 && w[i - 1] != 'T')
  code.add('S', affricate ? 'TS' : 'S')
  following == 'Z' ? 2 : 1
end
426
+
427
# Emits the codes for a 'CH' starting at index i of w.
#
# The first matching rule wins, in this order: italian 'chia', 'michael',
# greek roots / germanic-or-greek 'kh' sound, word-initial 'CH', 'Mc' names,
# and finally the default 'X'/'K'. The caller advances the index.
def self.encode_ch(w, i, len, code)
  primary, secondary =
    if w[i, 4] == 'CHIA'
      # italian 'chianti'
      %w[K K]
    elsif i > 0 && w[i, 4] == 'CHAE'
      # find 'michael'
      %w[K X]
    elsif ch_greek_roots?(w, i) || ch_germanic_or_greek?(w, i, len)
      # e.g. 'chemistry', 'chorus', or otherwise 'ch' for 'kh' sound
      %w[K K]
    elsif i == 0
      %w[X X]
    elsif w[0, 2] == 'MC'
      # e.g. "McHugh"
      %w[K K]
    else
      %w[X K]
    end
  code.add(primary, secondary)
end
450
+
451
# Emits the codes for a double 'C' starting at index i of w.
#
# Returns 1 when the character after the 'CC' was consumed as part of the
# sound, 0 otherwise; the caller adds this to its own advance.
def self.encode_cc(w, i, code)
  # 'bellocchio' but not 'bacchus'
  soft = w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
  unless soft
    # Pierce's rule
    code.add('K', 'K')
    return 0
  end

  if (i == 1 && w[i - 1] == 'A') || w[i - 1, 5] =~ /UCCEE|UCCES/
    # 'accident', 'accede', 'succeed'
    code.add('KS', 'KS')
  else
    # 'bacci', 'bertucci', other italian
    code.add('X', 'X')
  end
  1
end
469
+
470
# Emits the codes for a 'GH' starting at index i of w; the caller advances
# the index. Some contexts emit nothing at all (silent 'GH').
def self.encode_gh(w, i, code)
  if i > 0 && !vowel?(w[i - 1])
    code.add('K', 'K')
  elsif i == 0
    # ghislane, ghiradelli
    w[i + 2] == 'I' ? code.add('J', 'J') : code.add('K', 'K')
  elsif (i > 1 && w[i - 2] =~ /[BHD]/) || # e.g., 'hugh'
        (i > 2 && w[i - 3] =~ /[BHD]/) || # e.g., 'bough'
        (i > 3 && w[i - 4] =~ /[BH]/)     # e.g., 'broughton'
    # Parker's rule (with some further refinements): silent, emit nothing
    nil
  elsif i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
    # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
    code.add('F', 'F')
  elsif i > 0 && w[i - 1] != 'I'
    code.add('K', 'K')
  end
end
492
+
493
# Emits the codes for a 'GN' starting at index i of w; the caller advances
# the index.
def self.encode_gn(w, i, code)
  primary, secondary =
    if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
      %w[KN N]
    elsif w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
      # not e.g. 'cagney'
      %w[N KN]
    else
      %w[KN KN]
    end
  code.add(primary, secondary)
end
503
+
504
# Emits the codes for an 'SC' starting at index i of w; the caller advances
# the index.
def self.encode_sc(w, i, code)
  third = w[i + 2]
  primary, secondary =
    if third == 'H'
      # Schlesinger's rule
      tail = w[i + 3, 2]
      if tail =~ /OO|UY|E[DM]/
        # dutch origin, e.g. 'school', 'schooner'
        %w[SK SK]
      elsif tail =~ /E[RN]/
        # 'schermerhorn', 'schenker'
        %w[X SK]
      elsif i == 0 && !vowel?(w[3]) && w[3] != 'W'
        %w[X S]
      else
        %w[X X]
      end
    elsif third =~ /[IEY]/
      %w[S S]
    else
      %w[SK SK]
    end
  code.add(primary, secondary)
end
524
+
525
# Tells whether w contains one of the markers 'W', 'K', 'CZ' or 'WITZ'
# that the encoder treats as a sign of slavo-germanic origin.
#
# Returns true or false (never a match offset), as a predicate should.
def self.slavo_germanic?(w)
  !!(w =~ /W|K|CZ|WITZ/)
end
528
+
529
# Tells whether c contains one of the vowels A, E, I, O, U or Y.
#
# c - a single upper-case character, or nil when the caller indexed past the
#     end of the word.
#
# Returns true or false (never a match offset); nil yields false.
def self.vowel?(c)
  !!(c =~ /[AEIOUY]/)
end
532
+
533
# Tells whether the 'C' at index i sits in a germanic '-ACH-' context:
# preceded by a non-vowel plus 'A', followed by 'H', and the letter after the
# 'H' is not 'I'/'E' unless the context is 'BACHER' or 'MACHER'.
def self.c_germanic?(w, i)
  return false unless i > 1
  return false if vowel?(w[i - 2])
  return false unless w[i - 1, 3] == 'ACH'

  w[i + 2] !~ /[IE]/ || w[i - 2, 6] =~ /[BM]ACHER/
end
540
+
541
# Tells whether a word-initial 'CH' belongs to a greek root such as
# 'chemistry' or 'chorus' (but not 'chore'). Only index 0 can qualify.
def self.ch_greek_roots?(w, i)
  return false unless i == 0
  return nil unless w[1, 5] =~ /^H(ARAC|ARIS|OR|YM|IA|EM)/

  w[0, 5] != 'CHORE'
end
545
+
546
# Tells whether the 'CH' at index i should get a 'kh' sound
# (germanic, greek, or otherwise).
#
# Returns a truthy value when any of the rules below matches.
def self.ch_germanic_or_greek?(w, i, len)
  germanic_prefix = w[0, 4] =~ /^(V[AO]N\s|SCH)/
  return germanic_prefix if germanic_prefix

  # 'architect' but not 'arch'; also 'orchestra', 'orchid'
  greek = i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/
  return greek if greek

  before_t_or_s = w[i + 2] =~ /[TS]/
  return before_t_or_s if before_t_or_s

  # e.g., 'wachtler', 'wechsler', but not 'tichner'
  (i > 0 && w[i - 1] =~ /[AOUE]/ || i == 0) &&
    (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
end
556
+
557
# Tells whether the 'G' at index i starts a '-ger-' or '-gy-' that should be
# encoded 'K'/'J': the word must not begin with 'DANGER', 'RANGER' or
# 'MANGER', the 'G' must not follow 'E' or 'I', and it must not be part of
# 'RGY' / 'OGY'.
def self.g_ger_or_gy?(w, i)
  return nil unless w[i + 1, 2] =~ /^(ER|Y)/
  return false unless w[0, 6] !~ /[DRM]ANGER/
  return false if i > 0 && w[i - 1] =~ /[EI]/

  !(i > 0 && w[i - 1, 3] =~ /[RO]GY/)
end
564
+
565
# Tells whether the 'J' at index i gets the spanish pronunciation
# (e.g. 'bajador'): it follows a vowel, the word is not slavo-germanic, and
# the next letter is 'A' or 'O'.
def self.j_spanish_pron?(w, i)
  return false unless i > 0

  after_vowel = vowel?(w[i - 1])
  return after_vowel unless after_vowel
  return false if slavo_germanic?(w)

  w[i + 1] =~ /[AO]/
end
569
+
570
# Tells whether the double 'L' starting at index i belongs to a spanish form
# such as 'cabrillo' or 'gallegos'.
#
# Matches either 'ILLO' / 'ILLA' / 'ALLE' ending the word, or 'ALLE' around
# the 'LL' in a word that ends in 'AS', 'OS', 'A' or 'O'.
def self.ll_spanish?(w, i, len)
  at_end = i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILL[OA]|ALLE/
  return at_end if at_end

  last = len - 1
  spanish_ending = (last > 0 && w[last - 1, 2] =~ /[AO]S/) || w[last] =~ /[AO]/
  spanish_ending && (i > 0 && w[i - 1, 4] == 'ALLE')
end
577
+
578
# Tells whether a word-final 'R' at index i is the french '-ier' ending
# (e.g. 'rogier'): the word is not slavo-germanic, the 'R' follows 'IE', and
# that 'IE' is not preceded by 'ME' / 'MA' (excludes 'hochmeier').
#
# last - index of the final character of w.
def self.r_french?(w, i, last)
  return false unless i == last
  return false if slavo_germanic?(w)
  return false unless i > 1
  return false unless w[i - 2, 2] == 'IE'

  !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
end
584
+
585
# Tells whether a word-final 'X' at index i is a silent french ending
# (e.g. 'breaux'): it follows 'IAU' / 'EAU' or 'AU' / 'OU'.
#
# last - index of the final character of w.
def self.x_french?(w, i, last)
  return false unless i == last

  (i > 2 && w[i - 3, 3] =~ /[IE]AU/) || (i > 1 && w[i - 2, 2] =~ /[AO]U/)
end
639
589
  end
640
590
  end