RubyGems - phonetic - Versions diffs - 1.0.1 → 1.1.0 - Mend

phonetic 1.0.1 → 1.1.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (15) hide show

checksums.yaml +4 -4
data/.rspec +1 -1
data/README.md +7 -0
data/lib/phonetic.rb +1 -0
data/lib/phonetic/core_ext/string/dm_soundex.rb +12 -0
data/lib/phonetic/dm_soundex.rb +82 -0
data/lib/phonetic/dm_soundex_map.rb +233 -0
data/lib/phonetic/double_metaphone.rb +519 -569
data/lib/phonetic/metaphone.rb +43 -69
data/lib/phonetic/version.rb +1 -1
data/spec/phonetic/core_ext/string/dm_soundex_spec.rb +9 -0
data/spec/phonetic/dm_soundex_spec.rb +13 -0
data/spec/support/dm_soundex_data.rb +259 -0
data/spec/support/double_metaphone_data.rb +30 -0
metadata +11 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: af5149abea885ede20731d2ddf269a57588ca5fa
-  data.tar.gz: 2ebb72004c4fc667801a2b7087766c8cecb1ff1a
+  metadata.gz: a20da7ce0b4dab68d7671088098226a035c64b05
+  data.tar.gz: 2b721bc986d8e23ba6780bb7cab92059e6a7652b
 SHA512:
-  metadata.gz: 359113efab060b09395e6805bbd2e4b69aee42177b070a03361daad56c39fa6348ab836e77900a14b54236299b1c3b2f24b2b28cc546294bed5065961c465c72
-  data.tar.gz: b8879f75acc85d2b24b705ca1cc48265702bafaff41f4f5de0783c1950abb62e160f41e5412c48fec7ff0ba29e2a820fa96a295435eed3228fba28cec0d8cbc9
+  metadata.gz: 14325fa3846251dd1a1cbc59b38c12a32471291b45b07074387747fa9331b5ad98b1b0afaa8dbbac62872f9bf959d5e622742e5ec673f3e1294807f91b5fdc85
+  data.tar.gz: ad80a4c26cae46cbc516cc6cfebbfea39be69fbfe86426737cd94c065da57f4448ff3ffe9f892bd60af94fd834ae122c472cdc09b5fe683344ef479d1f31f90c

data/.rspec CHANGED Viewed

@@ -1,2 +1,2 @@
 --color
---format doc
+--format doc

data/README.md CHANGED Viewed

@@ -93,6 +93,13 @@ or use alias:
 'Bonnie'.nysiis # => 'BANY'
 ```
+### Daitch–Mokotoff Soundex (D–M Soundex)
+```ruby
+'Anja'.dm_soundex # => ['060000', '064000']
+'Schwarz'.dm_soundex # => ['474000', '479400']
+'Schtolteheim'.dm_soundex # => ['283560']
+```
 ## Contributing
 1. Fork it

data/lib/phonetic.rb CHANGED Viewed

@@ -7,4 +7,5 @@ require 'phonetic/double_metaphone'
 require 'phonetic/metaphone2'
 require 'phonetic/caverphone'
 require 'phonetic/caverphone2'
+require 'phonetic/dm_soundex'
 require 'phonetic/core_ext/string'

data/lib/phonetic/core_ext/string/dm_soundex.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require 'phonetic/dm_soundex'
+class String
+  # D-M Soundex values of string.
+  #
+  # Thin convenience wrapper: passes the receiver and the options hash
+  # unchanged to Phonetic::DMSoundex.encode and returns its result.
+  #
+  # @param options [Hash] forwarded as-is to Phonetic::DMSoundex.encode
+  # @return whatever Phonetic::DMSoundex.encode returns (an array of code
+  #   strings, as shown in the examples)
+  # @example
+  #    'Anja'.dm_soundex # => ['060000', '064000']
+  #    'Schwarz'.dm_soundex # => ['474000', '479400']
+  #    'Schtolteheim'.dm_soundex # => ['283560']
+  def dm_soundex(options = {})
+    Phonetic::DMSoundex.encode(self, options)
+  end
+end

data/lib/phonetic/dm_soundex.rb ADDED Viewed

@@ -0,0 +1,82 @@
+require 'phonetic/algorithm'
+require 'phonetic/dm_soundex_map'
+module Phonetic
+  # Daitch–Mokotoff Soundex (D–M Soundex) is a phonetic algorithm invented
+  # in 1985 by Jewish genealogists Gary Mokotoff and Randy Daitch.
+  #
+  # @example
+  #    Phonetic::DMSoundex.encode('Anja') # => ['060000', '064000']
+  #    Phonetic::DMSoundex.encode('Schwarz') # => ['474000', '479400']
+  #    Phonetic::DMSoundex.encode('Schtolteheim') # => ['283560']
+  class DMSoundex < Algorithm
+    def self.encode(str, options = {})
+      encode_word(str, options)
+    end
+    # Encode word to its D-M Soundex codes.
+    def self.encode_word(word, options = {})
+      w = word.strip.upcase.gsub(/[^A-Z]+/, '')
+      i = 0
+      code = init_code()
+      while i < w.size
+        if w[i] != w[i + 1]
+          c = find_code(MAP, w, i)
+          if c
+            len = c[3] + 1
+            if i == 0
+              code.add c[0]
+            elsif w[i + len] =~ /[AEIOUJY]/
+              code.add c[1]
+            else
+              code.add c[2]
+            end
+            i += c[3]
+          end
+        end
+        i += 1
+      end
+      code.result
+    end
+    private
+    def self.init_code
+      code = [[]]
+      def code.add(a)
+        case a
+        when Array
+          c = self.map{|w| w.last != a[1] ? w + [a[1]] : w}
+          self.map!{|w| w.last != a[0] ? w + [a[0]] : w}
+          self.push(*c)
+        else
+          self.map!{|w| w.last != a ? w + [a] : w}
+        end
+      end
+      def code.result
+        self.map{|w| w.join[0..5].ljust(6, '0')}.uniq
+      end
+      code
+    end
+    def self.find_code(map, w, i, last = nil, count = 0)
+      elem = map[w[i]]
+      r = case elem
+          when Array
+            elem[3] = count
+            elem
+          when Hash
+            _last = last
+             if elem['self']
+              _last = elem['self']
+              _last[3] = count
+            end
+            find_code(elem, w, i + 1, _last, count + 1)
+          when nil
+            last
+          end
+      r
+    end
+  end
+end

data/lib/phonetic/dm_soundex_map.rb ADDED Viewed

@@ -0,0 +1,233 @@
+# encoding: utf-8
+require 'phonetic/algorithm'
+module Phonetic
+  class DMSoundex < Algorithm
+    MAP = {
+      'A' => {
+        'self' => ['0', '', ''], # A
+        'I' => ['0', '1', ''],   # AI
+        'J' => ['0', '1', ''],   # AJ
+        'Y' => ['0', '1', ''],   # AY
+        'U' => ['0', '7', '']    # AU
+      },
+      'Ą' => ['', '', ['6', '']],
+      'E' => {
+        'self' => ['0', '', ''], # E
+        'I' => ['0', '1', ''],   # EI
+        'Y' => ['0', '1', ''],   # EY
+        'J' => ['0', '1', ''],   # EJ
+        'U' => ['1', '1', '']    # EU
+      },
+      'O' => {
+        'self' => ['0', '', ''], # O
+        'I' => ['0', '1', ''],   # OI
+        'J' => ['0', '1', ''],   # OJ
+        'Y' => ['0', '1', '']    # OY
+      },
+      'U' => {
+        'self' => ['0', '', ''], # U
+        'I' => ['0', '1', ''],   # UI
+        'J' => ['0', '1', ''],   # UJ
+        'Y' => ['0', '1', ''],   # UY
+        'E' => ['0', '', '']     # UE
+      },
+      'I' => {
+        'self' => ['0', '', ''], # I
+        'A' => ['1', '', ''],    # IA
+        'E' => ['1', '', ''],    # IE
+        'O' => ['1', '', ''],    # IO
+        'U' => ['1', '', '']     # IU
+      },
+      'Y' => ['1', '', ''],                               # Y
+      'J' => [['1', '4'], ['', '4'], ['', '4']],          # J
+      'B' => ['7', '7', '7'],                             # B
+      'C' => {
+        'self' => [['5', '4'], ['5', '4'], ['5', '4']],   # C
+        'H' => {
+          'self' => [['5', '4'], ['5', '4'], ['5', '4']], # CH
+          'S' => ['5', '54', '54']                        # CHS
+        },
+        'K' => [['5', '45'], ['5', '45'], ['5', '45']],   # CK
+        'S' => {
+          'self' => ['4', '4', '4'], # CS
+          'Z' => ['4', '4', '4']     # CSZ
+        },
+        'Z' => {
+          'self' => ['4', '4', '4'], # CZ
+          'S' => ['4', '4', '4']     # CZS
+        }
+      },
+      'D' => {
+        'self' => ['3', '3', '3'],   # D
+        'R' => {                     # DR
+          'S' => ['4', '4', '4'],    # DRS
+          'Z' => ['4', '4', '4']     # DRZ
+        },
+        'S' => {
+          'self' => ['4', '4', '4'], # DS
+          'H' => ['4', '4', '4']     # DSH
+        },
+        'T' => ['3', '3', '3'],      # DT
+        'Z' => {
+          'self' =>['4', '4', '4'],  # DZ
+          'H' => ['4', '4', '4'],    # DZH
+          'S' => ['4', '4', '4']     # DZS
+        }
+      },
+      'F' => {
+        'self' => ['7', '7', '7'],  # F
+        'B' => ['7', '7', '7']      # FB
+      },
+      'G' => ['5', '5', '5'],       # G
+      'H' => ['5', '5', ''],        # H
+      'K' => {
+        'self' => ['5', '5', '5'],  # K
+        'H' => ['5', '5', '5'],     # KH
+        'S' => ['5', '54', '54']    # KS
+      },
+      'L' => ['8', '8', '8'],       # L
+      'M' => {
+        'self' => ['6', '6', '6'],  # M
+        'N' => ['', '66', '66']     # MN
+      },
+      'N' => {
+        'self' => ['6', '6', '6'],  # N
+        'M' => ['', '66', '66']     # NM
+      },
+      'P' => {
+        'self' => ['7', '7', '7'],  # P
+        'F' => ['7', '7', '7'],     # PF
+        'H' => ['7', '7', '7']      # PH
+      },
+      'R' => {
+        'self' => ['9', '9', '9'],  # R
+        'S' => [['94', '4'], ['94', '4'], ['94', '4']], # RS
+        'Z' => [['4', '94'], ['4', '94'], ['4', '94']]  # RZ
+      },
+      'Q' => ['5', '5', '5'],               # Q
+      'S' => {
+        'self' => ['4', '4', '4'],          # S
+        'C' => {
+          'self' => ['2', '4', '4'],        # SC
+          'H' => {
+            'self' => ['4', '4', '4'],      # SCH
+            'T' => {
+              'self' => ['2', '43', '43'],  # SCHT
+              'S' => {                      # SCHTS
+                'C' => {                    # SCHTSC
+                  'H' => ['2', '4', '4']    # SCHTSCH
+                },
+                'H' => ['2', '4', '4']      # SCHTSH
+              },
+              'C' => {                      # SCHTC
+                'H' => ['2', '4', '4']      # SCHTCH
+              }
+            }
+          }
+        },
+        'D' => ['2', '43', '43'],           # SD
+        'H' => {
+          'self' => ['4', '4', '4'],        # SH
+          'C' => {                          # SHC
+            'H' => ['2', '4', '4']          # SHCH
+          },
+          'D' => ['2', '43', '43'],         # SHD
+          'T' => {
+            'self' => ['2', '43', '43'],    # SHT
+            'C' => {                        # SHTC
+              'H' => ['2', '4', '4']        # SHTCH
+            },
+            'S' => {                        # SHTS
+              'H' => ['2', '4', '4']        # SHTSH
+            }
+          }
+        },
+        'T' => {
+          'self' => ['2', '43', '43'],      # ST
+          'C' => {                          # STC
+            'H' => ['2', '4', '4']          # STCH
+          },
+          'S' => {                          # STS
+            'C' => {                        # STSC
+              'H' => ['2', '4', '4']        # STSCH
+            },
+            'D' => ['2', '43', '43'],       # STSD
+            'H' => ['2', '4', '4']          # STSH
+          },
+          'R' => {                          # STR
+            'S' => ['2', '4', '4'],         # STRS
+            'Z' => ['2', '4', '4']          # STRZ
+          }
+        },
+        'Z' => {
+          'self' => ['4', '4', '4'],        # SZ
+          'C' => {                          # SZC
+            'S' => ['2', '4', '4'],         # SZCS
+            'Z' => ['2', '4', '4']          # SZCZ
+          },
+          'D' => ['2', '43', '43'],         # SZD
+          'T' => ['2', '43', '43']          # SZT
+        }
+      },
+      'T' => {
+        'self' => ['3', '3', '3'],          # T
+        'C' => {
+          'self' => ['4', '4', '4'],        # TC
+          'H' => ['4', '4', '4']            # TCH
+        },
+        'H' => ['3', '3', '3'],             # TH
+        'R' => {                            # TR
+          'C' => {                          # TRC
+            'H' => ['4', '4', '4']          # TRCH
+          },
+          'S' => ['4', '4', '4'],           # TRS
+          'Z' => ['4', '4', '4']            # TRZ
+        },
+        'S' => {
+          'self' => ['4', '4', '4'],        # TS
+          'H' => ['4', '4', '4'],           # TSH
+          'C' => {                          # TSC
+            'H' => ['4', '4', '4']          # TSCH
+          },
+          'Z' => ['4', '4', '4']            # TSZ
+        },
+        'T' => {                            # TT
+          'C' => {                          # TTC
+            'H' => ['4', '4', '4']          # TTCH
+          },
+          'S' => {
+            'self' => ['4', '4', '4'],      # TTS
+            'C' => {                        # TTSC
+              'H' => ['4', '4', '4']        # TTSCH
+            },
+            'Z' => ['4', '4', '4']          # TTSZ
+          },
+          'Z' => ['4', '4', '4']            # TTZ
+        },
+        'Z' => {
+          'self' => ['4', '4', '4'],        # TZ
+          'S' => ['4', '4', '4']            # TZS
+        }
+      },
+      'X' => ['5', '54', '54'],             # X
+      'V' => ['7', '7', '7'],               # V
+      'W' => ['7', '7', '7'],               # W
+      'Z' => {
+        'self' => ['4', '4', '4'],          # Z
+        'H' => {
+          'self' => ['4', '4', '4'],        # ZH
+          'S' => {                          # ZHS
+            'H' => ['4', '4', '4']          # ZHSH
+          }
+        },
+        'S' => {
+          'self' => ['4', '4', '4'],        # ZS
+          'C' => {                          # ZSC
+            'H' => ['4', '4', '4']          # ZSCH
+          }
+        }
+      }
+    }
+  end
+end

data/lib/phonetic/double_metaphone.rb CHANGED Viewed

@@ -9,8 +9,10 @@ module Phonetic
   #
   # This implementation based on the PHP implementation by Stephen Woodbridge
   # and contains modifications of algorithm by Kevin Atkinson.
-  # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
-  # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
+  # @see http://swoodbridge.com/DoubleMetaPhone/
+  #      PHP implementation by Stephen Woodbridge
+  # @see http://aspell.net/metaphone/dmetaph.cpp
+  #      C++ implementation with modifications by Kevin Atkinson
   # @example
   #    Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
   #    Phonetic::DoubleMetaphone.encode('dumb')   # => ['TM', 'TM']
@@ -20,605 +22,73 @@ module Phonetic
   #    Phonetic::Metaphone2.encode('dumb')   # => ['TM', 'TM']
   #    Phonetic::Metaphone2.encode('edgar')  # => ['ATKR', 'ATKR']
   class DoubleMetaphone < Algorithm
-    VOWELS = 'AEIOUY'
     # Encode word to its Double Metaphone code.
     def self.encode_word(word, options = { size: 4 })
       code_size = options[:size] || 4
       w = word.strip.upcase
-      primary = ''
-      secondary = ''
+      code = ['', '']
+      def code.add(primary, secondary)
+        self[0] += primary
+        self[1] += secondary
+      end
       i = 0
       len = w.size
       last = len - 1
       # pad the original string so that we can index beyond the edge of the world
       w += ' ' * 5
-      # skip these when at start of word
-      i += 1 if ['GN','KN','PN','WR','PS'].include? w[0, 2]
-      # initial 'X' is pronounced 'Z' e.g. 'Xavier'
-      if w[0] == 'X'
-        primary += 'S'
-        secondary += 'S'
-        i += 1
-      end
-      while i < len && (primary.size < code_size || primary.size < code_size)
+      i += encode_start_of_word(w, code)
+      while i < len && (code.first.size < code_size || code.last.size < code_size)
         case w[i]
         when 'A', 'E', 'I', 'O', 'U', 'Y'
-          if i == 0
-            # all init vowels now map to 'A'
-            primary += 'A'
-            secondary += 'A'
-          end
           i += 1
         when 'B'
           # "-mb", e.g", "dumb", already skipped over...
-          primary += 'P'
-          secondary += 'P'
-          i += (w[i + 1] == 'B') ? 2 : 1
+          i += gen_encode(w, i, 'P', 'P', code)
         when 'Ç', 'ç'
-          primary += 'S'
-          secondary += 'S'
+          code.add 'S', 'S'
           i += 1
         when 'C'
-          # various germanic
-          if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
-             (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
-            primary += 'K'
-            secondary += 'K'
-            i += 2
-          # special case 'caesar'
-          elsif i == 0 && w[i, 6] == 'CAESAR'
-            primary += 'S'
-            secondary += 'S'
-            i += 2
-          # italian 'chianti'
-          elsif w[i, 4] == 'CHIA'
-            primary += 'K'
-            secondary += 'K'
-            i += 2
-          elsif w[i, 2] == 'CH'
-            # find 'michael'
-            if i > 0 && w[i, 4] == 'CHAE'
-              primary += 'K'
-              secondary += 'X'
-              i += 2
-            # greek roots e.g. 'chemistry', 'chorus'
-            elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
-                  w[0, 5] != 'CHORE'
-              primary += 'K'
-              secondary += 'K'
-              i += 2
-            else
-              # germanic, greek, or otherwise 'ch' for 'kh' sound
-              if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
-                 # 'architect but not 'arch', 'orchestra', 'orchid'
-                 (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
-                 (w[i + 2] =~ /[TS]/) ||
-                 ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
-                 # e.g., 'wachtler', 'wechsler', but not 'tichner'
-                 (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
-                primary += 'K'
-                secondary += 'K'
-              else
-                if i > 0
-                  if w[0, 2] == 'MC'
-                    # e.g., "McHugh"
-                    primary += 'K'
-                    secondary += 'K'
-                  else
-                    primary += 'X'
-                    secondary += 'K'
-                  end
-                else
-                  primary += 'X'
-                  secondary += 'X'
-                end
-              end
-              i += 2
-            end
-          elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
-            # e.g, 'czerny'
-            primary += 'S'
-            secondary += 'X'
-            i += 2
-          elsif w[i + 1, 3] == 'CIA'
-            # e.g., 'focaccia'
-            primary += 'X'
-            secondary += 'X'
-            i += 3
-          # double 'C', but not if e.g. 'McClellan'
-          elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
-            # 'bellocchio' but not 'bacchus'
-            if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
-              # 'accident', 'accede' 'succeed'
-              if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
-                # 'bacci', 'bertucci', other italian
-                primary += 'KS'
-                secondary += 'KS'
-              else
-                primary += 'X'
-                secondary += 'X'
-              end
-              i += 3
-            else
-              # Pierce's rule
-              primary += 'K'
-              secondary += 'K'
-              i += 2
-            end
-          elsif w[i, 2] =~ /CK|CG|CQ/
-            primary += 'K'
-            secondary += 'K'
-            i += 2
-          elsif w[i, 2] =~ /CI|CE|CY/
-            # italian vs. english
-            if w[i, 3] =~ /CIO|CIE|CIA/
-              primary += 'S'
-              secondary += 'X'
-            else
-              primary += 'S'
-              secondary += 'S'
-            end
-            i += 2
-          else
-            primary += 'K'
-            secondary += 'K'
-            # name sent in 'mac caffrey', 'mac gregor'
-            if w[i + 1, 2] =~ /\s[CQG]/
-              i += 3
-            else
-              if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
-                i += 2
-              else
-                i += 1
-              end
-            end
-          end
+          i += encode_c(w, i, len, code)
         when 'D'
-          if w[i, 2] == 'DG'
-            if w[i + 2] =~ /[IEY]/
-              # e.g. 'edge'
-              primary += 'J'
-              secondary += 'J'
-              i += 3
-            else
-              # e.g. 'edgar'
-              primary += 'TK'
-              secondary += 'TK'
-              i += 2
-            end
-          elsif w[i, 2] =~ /DT|DD/
-            primary += 'T'
-            secondary += 'T'
-            i += 2
-          else
-            primary += 'T'
-            secondary += 'T'
-            i += 1
-          end
-        when 'F'
-          if w[i + 1] == 'F'
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'F'
-          secondary += 'F'
+          i += encode_d(w, i, len, code)
+        when 'F', 'K', 'N'
+          i += gen_encode(w, i, w[i], w[i], code)
         when 'G'
-          if w[i + 1] == 'H'
-            if i > 0 && !vowel?(w[i - 1])
-              primary += 'K'
-              secondary += 'K'
-              i += 2
-            elsif i == 0
-              # ghislane, ghiradelli
-              if w[i + 2] == 'I'
-                primary += 'J'
-                secondary += 'J'
-              else
-                primary += 'K'
-                secondary += 'K'
-              end
-              i += 2
-            # Parker's rule (with some further refinements) - e.g., 'hugh'
-            elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
-                  # e.g., 'bough'
-                  (i > 2 && w[i - 3] =~ /[BHD]/) ||
-                  # e.g., 'broughton'
-                  (i > 3 && w[i - 4] =~ /[BH]/)
-              i += 2
-            else
-              # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
-              if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
-                primary += 'F'
-                secondary += 'F'
-              else
-                if i > 0 && w[i - 1] != 'I'
-                  primary += 'K'
-                  secondary += 'K'
-                end
-              end
-              i += 2
-            end
-          elsif w[i + 1] == 'N'
-            if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
-              primary += 'KN'
-              secondary += 'N'
-            else
-              # not e.g. 'cagney'
-              if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
-                primary += 'N'
-                secondary += 'KN'
-              else
-                primary += 'KN'
-                secondary += 'KN'
-              end
-            end
-            i += 2
-          # 'tagliaro'
-          elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
-            primary += 'KL'
-            secondary += 'L'
-            i += 2
-          # -ges-,-gep-,-gel-, -gie- at beginning
-          elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
-            primary += 'K'
-            secondary += 'J'
-            i += 2
-          # -ger-,  -gy-
-          elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
-                !(w[0, 6] =~ /[DRM]ANGER/) &&
-                !(i > 0 && w[i - 1] =~ /[EI]/) &&
-                !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
-            primary += 'K'
-            secondary += 'J'
-            i += 2
-          # italian e.g, 'biaggi'
-          elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
-            if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
-              primary += 'K'
-              secondary += 'K'
-            else
-              if w[i + 1, 4] =~ /IER\s/
-                primary += 'J'
-                secondary += 'J'
-              else
-                primary += 'J'
-                secondary += 'K'
-              end
-            end
-            i += 2
-          else
-            if w[i + 1] == 'G'
-              i += 2
-            else
-              i += 1
-            end
-            primary += 'K'
-            secondary += 'K'
-          end
+          i += encode_g(w, i, len, code)
         when 'H'
-          # only keep if first & before vowel or btw. 2 vowels
-          if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
-            primary += 'H'
-            secondary += 'H'
-            i += 2
-          else # also takes care of 'HH'
-            i += 1
-          end
+          i += encode_h(w, i, len, code)
         when 'J'
-          # obvious spanish, 'jose', 'san jacinto'
-          if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
-            if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
-              primary += 'H'
-              secondary += 'H'
-            else
-              primary += 'J'
-              secondary += 'H'
-            end
-            i += 1
-          else
-            if i == 0 && w[i, 4] != 'JOSE'
-              primary += 'J'
-              secondary += 'A'
-              # Yankelovich/Jankelowicz
-            else
-              # spanish pron. of e.g. 'bajador'
-              if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
-                primary += 'J'
-                secondary += 'H'
-              else
-                if i == last
-                  primary += 'J'
-                  #secondary += ' '
-                else
-                  if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
-                    primary += 'J'
-                    secondary += 'J'
-                  end
-                end
-              end
-            end
-            if w[i + 1] == 'J'
-              i += 2
-            else
-              i += 1
-            end
-          end
-        when 'K'
-          if w[i + 1] == 'K'
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'K'
-          secondary += 'K'
+          i += encode_j(w, i, len, code)
         when 'L'
-          if w[i + 1] == 'L'
-            # spanish e.g. 'cabrillo', 'gallegos'
-            if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
-               ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
-               (i > 0 && w[i - 1, 4] == 'ALLE'))
-              primary += 'L'
-              i += 2
-              next
-            end
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'L'
-          secondary += 'L'
+          i += encode_l(w, i, len, code)
         when 'M'
-          if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == "ER")) ||
-             # 'dumb','thumb'
-             w[i + 1] == 'M'
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'M'
-          secondary += 'M'
-        when 'N'
-          if w[i + 1] == 'N'
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'N'
-          secondary += 'N'
+          i += encode_m(w, i, len, code)
         when 'Ñ', 'ñ'
-          i += 1;
-          primary += 'N'
-          secondary += 'N'
+          code.add 'N', 'N'
+          i += 1
         when 'P'
-          if w[i + 1] == 'H'
-            primary += 'F'
-            secondary += 'F'
-            i += 2
-          else
-            # also account for "campbell", "raspberry"
-            if w[i + 1] =~ /[PB]/
-              i += 2
-            else
-              i += 1
-            end
-            primary += 'P'
-            secondary += 'P'
-          end
+          i += encode_p(w, i, len, code)
         when 'Q'
-          if w[i + 1] == 'Q'
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'K'
-          secondary += 'K'
+          i += gen_encode(w, i, 'K', 'K', code)
         when 'R'
-          # french e.g. 'rogier', but exclude 'hochmeier'
-          if i == last && !slavo_germanic?(w) &&
-             (i > 1 && w[i - 2, 2] == "IE") &&
-             !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
-            secondary += 'R'
-          else
-            primary += 'R'
-            secondary += 'R'
-          end
-          if w[i + 1] == 'R'
-            i += 2
-          else
-            i += 1
-          end
+          i += encode_r(w, i, len, code)
         when 'S'
-          # special cases 'island', 'isle', 'carlisle', 'carlysle'
-          if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
-            i += 1
-          # special case 'sugar-'
-          elsif i == 0 && w[i, 5] == 'SUGAR'
-            primary += 'X'
-            secondary += 'S'
-            i += 1
-          elsif w[i, 2] == 'SH'
-            # germanic
-            if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
-              primary += 'S'
-              secondary += 'S'
-            else
-              primary += 'X'
-              secondary += 'X'
-            end
-            i += 2
-          # italian & armenian
-          elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
-            if !slavo_germanic?(w)
-              primary += 'S'
-              secondary += 'X'
-            else
-              primary += 'S'
-              secondary += 'S'
-            end
-            i += 3
-          # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
-          # also, -sz- in slavic language altho in hungarian it is pronounced 's'
-          elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
-            primary += 'S'
-            secondary += 'X'
-            if w[i + 1] == 'Z'
-              i += 2
-            else
-              i += 1
-            end
-          elsif w[i, 2] == 'SC'
-            # Schlesinger's rule
-            if w[i + 2] == 'H'
-              # dutch origin, e.g. 'school', 'schooner'
-              if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
-                # 'schermerhorn', 'schenker'
-                if w[i + 3, 2] =~ /ER|EN/
-                  primary += 'X'
-                  secondary += 'SK'
-                else
-                  primary += 'SK'
-                  secondary += 'SK'
-                end
-                i += 3
-              else
-                if i == 0 && !vowel?(w[3]) && w[3] != 'W'
-                  primary += 'X'
-                  secondary += 'S'
-                else
-                  primary += 'X'
-                  secondary += 'X'
-                end
-                i += 3
-              end
-            elsif w[i + 2, 1] =~ /[IEY]/
-              primary += 'S'
-              secondary += 'S'
-              i += 3
-            else
-              primary += 'SK'
-              secondary += 'SK'
-              i += 3
-            end
-          else
-            # french e.g. 'resnais', 'artois'
-            if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
-              secondary += 'S'
-            else
-              primary += 'S'
-              secondary += 'S'
-            end
-            if w[i + 1] =~ /[SZ]/
-              i += 2
-            else
-              i += 1
-            end
-          end
+          i += encode_s(w, i, len, code)
         when 'T'
-          if w[i, 4] == 'TION'
-            primary += 'X'
-            secondary += 'X'
-            i += 3
-          elsif w[i, 3] =~ /TIA|TCH/
-            primary += 'X'
-            secondary += 'X'
-            i += 3
-          elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
-            # special case 'thomas', 'thames' or germanic
-            if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /VAN|VON\s/ || w[0, 3] == 'SCH'
-              primary += 'T'
-              secondary += 'T'
-            else
-              primary += '0'
-              secondary += 'T'
-            end
-            i += 2
-          else
-            if w[i + 1] =~ /[TD]/
-              i += 2
-            else
-              i += 1
-            end
-            primary += 'T'
-            secondary += 'T'
-          end
+          i += encode_t(w, i, len, code)
         when 'V'
-          if w[i + 1] == 'V'
-            i += 2
-          else
-            i += 1
-          end
-          primary += 'F'
-          secondary += 'F'
+          i += gen_encode(w, i, 'F', 'F', code)
         when 'W'
-          # can also be in middle of word
-          if w[i, 2] == 'WR'
-            primary += 'R'
-            secondary += 'R'
-            i += 2
-          else
-            if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
-              # Wasserman should match Vasserman
-              if vowel?(w[i + 1])
-                primary += 'A'
-                secondary += 'F'
-              else
-                # need Uomo to match Womo
-                primary += 'A'
-                secondary += 'A'
-              end
-            end
-            # Arnow should match Arnoff
-            if i == last && i > 0 && vowel?(w[i - 1]) ||
-               (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
-              secondary += 'F'
-              i += 1
-            elsif w[i, 4] =~ /WICZ|WITZ/
-              # polish e.g. 'filipowicz'
-              primary += 'TS'
-              secondary += 'FX'
-              i += 4
-            else
-              i += 1
-            end
-          end
+          i += encode_w(w, i, len, code)
         when 'X'
-          # french e.g. breaux
-          if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
-            primary += 'KS'
-            secondary += 'KS'
-          end
-          if w[i + 1] =~ /[CX]/
-            i += 2
-          else
-            i += 1
-          end
+          i += encode_x(w, i, len, code)
         when 'Z'
-          # chinese pinyin e.g. 'zhao'
-          if w[i + 1] == 'H'
-            primary += 'J'
-            secondary += 'J'
-            i += 2
-          else
-            if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
-              primary += 'S'
-              secondary += 'TS';
-            else
-              primary += 'S'
-              secondary += 'S';
-            end
-            if w[i + 1] == 'Z'
-              i += 2
-            else
-              i += 1
-            end
-          end
+          i += encode_z(w, i, len, code)
         else
           i += 1
         end
       end
-      [primary[0, code_size], secondary[0, code_size]]
+      [code.first[0, code_size], code.last[0, code_size]]
     end
     def self.encode(str, options = { size: 4 })
@@ -627,14 +97,494 @@ module Phonetic
     private
-    def self.slavo_germanic?(str)
-      !!(str[/W|K|CZ|WITZ/])
+    def self.encode_start_of_word(w, code)
       # Handles special-case word beginnings. May add one initial pair to
       # `code`. Returns 1 when the first character was consumed by one of the
       # rules below, otherwise 0 (presumably the caller's starting offset).
+      i = 0
+      # skip these when at start of word
+      if w[0, 2] =~ /[GKP]N|WR|PS/
+        i = 1
+      # initial 'X' is pronounced 'Z' e.g. 'Xavier' (encoded here as 'S')
+      elsif w[0] == 'X'
+        code.add 'S', 'S'
+        i = 1
+      elsif w[0] =~ /[AEIOUY]/
+        code.add 'A', 'A' # all init vowels now map to 'A'
+        i = 1
+      elsif w[0, 6] == 'CAESAR' # special case 'caesar'
+        code.add 'S', 'S'
+        i = 1
+      end
+      i
     end
-    def self.vowel?(char)
-      c = VOWELS[char.to_s]
-      !c.nil? && !c.empty?
+    def self.gen_encode(w, i, primary, secondary, code)
       # Adds the given primary/secondary pair to `code` and returns how far to
       # advance: 2 when the next character repeats w[i], otherwise 1.
+      code.add primary, secondary
+      w[i + 1] == w[i] ? 2 : 1
     end
+    def self.encode_c(w, i, len, code)
       # Encodes the 'C' at w[i] into `code` and returns the number of
       # characters consumed (1 to 3). The `when` branches are tried in order
       # and the first match wins. `len` is only forwarded to encode_ch.
+      r = 1
+      case
+      # various germanic
+      when c_germanic?(w, i)
+        code.add 'K', 'K'
+        r += 1
+      when w[i, 2] == 'CH'
         # encode_ch only adds codes; its return value is ignored and 'CH'
         # always consumes two characters
+        encode_ch(w, i, len, code)
+        r += 1
+      when w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
+        # e.g, 'czerny'
+        code.add 'S', 'X'
+        r += 1
+      when w[i + 1, 3] == 'CIA'
+        # e.g., 'focaccia'
+        code.add 'X', 'X'
+        r += 2
+      # double 'C', but not if e.g. 'McClellan'
+      when w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
         # encode_cc returns 0 or 1 extra characters on top of the 'CC' pair,
         # so this branch consumes 2 or 3 characters in total
+        r += encode_cc(w, i, code) + 1
+      when w[i, 2] =~ /C[KGQ]/
+        code.add 'K', 'K'
+        r += 1
+      when w[i, 2] =~ /C[IEY]/
+        # italian vs. english
+        if w[i, 3] =~ /CI[OEA]/
+          code.add 'S', 'X'
+        else
+          code.add 'S', 'S'
+        end
+        r += 1
+      else
+        code.add 'K', 'K'
+        # name sent in 'mac caffrey', 'mac gregor'
+        if w[i + 1, 2] =~ /\s[CQG]/
+          r += 2
         # swallow a following C/K/Q unless it starts a soft 'CE'/'CI'
+        elsif w[i + 1] =~ /[CKQ]/ && w[i + 1, 2] !~ /C[EI]/
+          r += 1
+        end
+      end
+      r
+    end
+    def self.encode_d(w, i, len, code)
       # Encodes the 'D' at w[i] into `code` and returns the number of
       # characters consumed (1 to 3). `len` is not used in this method.
+      r = 1
+      if w[i, 2] == 'DG'
+        if w[i + 2] =~ /[IEY]/
+          # e.g. 'edge'
+          code.add 'J', 'J'
+          r += 2
+        else
+          # e.g. 'edgar'
+          code.add 'TK', 'TK'
+          r += 1
+        end
       # 'DT' and 'DD' collapse to a single 'T'
+      elsif w[i, 2] =~ /D[TD]/
+        code.add 'T', 'T'
+        r += 1
+      else
+        code.add 'T', 'T'
+      end
+      r
+    end
+    def self.encode_g(w, i, len, code)
       # Encodes the 'G' at w[i] into `code` and returns the number of
       # characters consumed. Every branch consumes 2 characters except the
       # final fallback, which consumes 1 unless the next character is also
       # 'G'. `len` is not used in this method.
+      r = 2
+      if w[i + 1] == 'H'
+        encode_gh(w, i, code)
+      elsif w[i + 1] == 'N'
+        encode_gn(w, i, code)
+      # 'tagliaro'
+      elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
+        code.add 'KL', 'L'
+      # -ges-, -gep-, -gel-, -gie- at beginning
+      elsif i == 0 && w[1, 2] =~ /^Y|E[SPBLYIR]|I[BLNE]/
+        code.add 'K', 'J'
+      # -ger-,  -gy-
+      elsif g_ger_or_gy?(w, i)
+        code.add 'K', 'J'
+      # italian e.g, 'biaggi'
+      elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
         # obvious germanic ('VAN ', 'VON ', 'SCH' prefix) or a following 'ET'
+        if w[0, 4] =~ /^(VAN |VON |SCH)/ || w[i + 1, 2] == 'ET'
+          code.add 'K', 'K'
+        elsif w[i + 1, 4] =~ /IER\s/
+          code.add 'J', 'J'
+        else
+          code.add 'J', 'K'
+        end
+      else
+        r -= 1 if w[i + 1] != 'G'
+        code.add 'K', 'K'
+      end
+      r
+    end
+    def self.encode_h(w, i, len, code)
       # Encodes the 'H' at w[i]. Adds 'H' and returns 2 when the H is kept
       # (which also skips the following vowel); otherwise adds nothing and
       # returns 1. `len` is not used in this method.
+      r = 1
+      # only keep if first & before vowel or btw. 2 vowels
+      if (i == 0 || i > 0 && vowel?(w[i - 1])) && vowel?(w[i + 1])
+        code.add 'H', 'H'
+        r += 1
+      end
+      r
+    end
+    def self.encode_j(w, i, len, code)
       # Encodes the 'J' at w[i] into `code` and returns the number of
       # characters consumed: 1, or 2 for a doubled 'JJ' outside the
       # JOSE / 'SAN ' branch.
+      r = 1
+      last = len - 1
+      # obvious spanish, 'jose', 'san jacinto'
+      if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
         # w[i + 4] == ' ' presumably detects 'JOSE' followed by a space or by
         # end-of-word padding -- TODO confirm how the caller pads `w`
+        if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
+          code.add 'H', 'H'
+        else
+          code.add 'J', 'H'
+        end
+      else
         # the != 'JOSE' test is always true here, since a 'JOSE' at w[i]
         # takes the branch above
+        if i == 0 && w[i, 4] != 'JOSE'
+          code.add 'J', 'A'
+          # Yankelovich/Jankelowicz
+        else
+          # spanish pron. of e.g. 'bajador'
+          if j_spanish_pron?(w, i)
+            code.add 'J', 'H'
+          elsif i == last
+            code.add 'J', ''
           # no else: before L/T/K/S/N/M/B/Z or after S/K/L nothing is added
+          elsif w[i + 1] !~ /[LTKSNMBZ]/ && !(i > 0 && w[i - 1] =~ /[SKL]/)
+            code.add 'J', 'J'
+          end
+        end
+        r += 1 if w[i + 1] == 'J'
+      end
+      r
+    end
+    def self.encode_l(w, i, len, code)
       # Encodes the 'L' at w[i] into `code`. Returns 2 for a doubled 'LL',
       # otherwise 1. For Spanish-looking 'LL' the secondary code is empty.
+      r = 1
+      if w[i + 1] == 'L'
+        # spanish e.g. 'cabrillo', 'gallegos'
+        if ll_spanish?(w, i, len)
+          code.add 'L', ''
+        else
+          code.add 'L', 'L'
+        end
+        r += 1
+      else
+        code.add 'L', 'L'
+      end
+      r
+    end
+    def self.encode_m(w, i, len, code)
       # Encodes the 'M' at w[i] as 'M' and returns the number of characters
       # consumed: 2 when the following character is swallowed, otherwise 1.
       # `len` is not used in this method.
+      r = 1
+      # 'dumb','thumb'
       # Groups as (i > 0 && UMB-match) || next-is-'M'. The regex contains two
       # literal spaces after 'UMB' -- presumably end-of-word padding; TODO
       # confirm that the caller pads `w` with spaces.
+      r += 1 if i > 0 && w[i - 1, 5] =~ /UMB(  |ER)/ || w[i + 1] == 'M'
+      code.add 'M', 'M'
+      r
+    end
+    def self.encode_p(w, i, len, code)
       # Encodes the 'P' at w[i]: 'PH' becomes 'F', anything else 'P'.
       # Returns 2 when a following 'H', 'P' or 'B' is consumed too, else 1.
       # `len` is not used in this method.
+      r = 1
+      if w[i + 1] == 'H'
+        code.add 'F', 'F'
+        r += 1
+      else
+        # also account for "campbell", "raspberry"
+        r += 1 if w[i + 1] =~ /[PB]/
+        code.add 'P', 'P'
+      end
+      r
+    end
+    def self.encode_r(w, i, len, code)
       # Encodes the 'R' at w[i]. For a French-style ending (see r_french?)
       # only the secondary code gets 'R'. Returns 2 for a doubled 'RR',
       # otherwise 1.
+      last = len - 1
+      # french e.g. 'rogier', but exclude 'hochmeier'
+      if r_french?(w, i, last)
+        code.add '', 'R'
+      else
+        code.add 'R', 'R'
+      end
+      w[i + 1] == 'R' ? 2 : 1
+    end
+    def self.encode_s(w, i, len, code)
       # Encodes the 'S' at w[i] into `code` and returns the number of
       # characters consumed (1 to 3).
+      r = 1
+      last = len - 1
+      # special cases 'island', 'isle', 'carlisle', 'carlysle'
+      if i > 0 && w[i - 1, 3] =~ /[IY]SL/
       # (branch above is intentionally empty: the S is silent, nothing added)
+      # special case 'sugar-'
+      elsif i == 0 && w[i, 5] == 'SUGAR'
+        code.add 'X', 'S'
+      elsif w[i, 2] == 'SH'
+        # germanic
+        if w[i + 1, 4] =~ /H(EIM|OEK|OL[MZ])/
+          code.add 'S', 'S'
+        else
+          code.add 'X', 'X'
+        end
+        r += 1
+      # italian & armenian
+      elsif w[i, 3] =~ /SI[OA]/
+        if !slavo_germanic?(w)
+          code.add 'S', 'X'
+        else
+          code.add 'S', 'S'
+        end
+        r += 2
+      # german & anglicisations, e.g. 'smith' match 'schmidt',
+      # 'snider' match 'schneider' also, -sz- in slavic language altho in
+      # hungarian it is pronounced 's'
+      elsif i == 0 && w[i + 1] =~ /[MNLW]/ || w[i + 1] == 'Z'
+        code.add 'S', 'X'
+        r += 1 if w[i + 1] == 'Z'
+      elsif w[i, 2] == 'SC'
         # encode_sc only adds codes; every 'SC' case consumes 3 characters
+        encode_sc(w, i, code)
+        r += 2
+      # french e.g. 'resnais', 'artois'
+      else
+        if i == last && i > 1 && w[i - 2, 2] =~ /[AO]I/
+          code.add '', 'S'
+        else
+          code.add 'S', 'S'
+        end
+        r += 1 if w[i + 1] =~ /[SZ]/
+      end
+      r
+    end
+    def self.encode_t(w, i, len, code)
       # Encodes the 'T' at w[i] into `code` and returns the number of
       # characters consumed (1 to 3). `len` is not used in this method.
+      r = 1
+      if w[i, 4] =~ /^(TION|TIA|TCH)/
+        code.add 'X', 'X'
+        r += 2
+      elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
+        # special case 'thomas', 'thames' or germanic
+        if w[i + 2, 2] =~ /[OA]M/ || w[0, 4] =~ /^(VAN |VON |SCH)/
+          code.add 'T', 'T'
+        else
           # primary code for 'TH' is the character '0' (zero)
+          code.add '0', 'T'
+        end
+        r += 1
+      else
+        r += 1 if w[i + 1] =~ /[TD]/
+        code.add 'T', 'T'
+      end
+      r
+    end
+    def self.encode_w(w, i, len, code)
       # Encodes the 'W' at w[i] into `code` and returns the number of
       # characters consumed: 2 for 'WR', 4 for 'WICZ'/'WITZ', otherwise 1.
+      last = len - 1
+      r = 1
+      # can also be in middle of word
+      if w[i, 2] == 'WR'
+        code.add 'R', 'R'
+        r += 1
+      else
         # The two conditionals below are independent, so an initial 'W' can
         # add codes in the first and again in the second.
+        if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
+          # Wasserman should match Vasserman
+          if vowel?(w[i + 1])
+            code.add 'A', 'F'
+          else
+            # need Uomo to match Womo
+            code.add 'A', 'A'
+          end
+        end
+        # Arnow should match Arnoff
+        if i == last && i > 0 && vowel?(w[i - 1]) ||
+           i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/ ||
+           w[0, 3] == 'SCH'
+          code.add '', 'F'
+        elsif w[i, 4] =~ /WICZ|WITZ/
+          # polish e.g. 'filipowicz'
+          code.add 'TS', 'FX'
+          r += 3
+        end
+      end
+      r
+    end
+    def self.encode_x(w, i, len, code)
       # Encodes the 'X' at w[i] as 'KS' unless it is a silent French ending
       # (see x_french?). Returns 2 when the next character is 'C' or 'X',
       # otherwise 1.
+      # french e.g. breaux
+      code.add 'KS', 'KS' unless x_french?(w, i, len - 1)
+      w[i + 1] =~ /[CX]/ ? 2 : 1
+    end
+    def self.encode_z(w, i, len, code)
       # Encodes the 'Z' at w[i] into `code`. Returns 2 when a following 'H'
       # or 'Z' is consumed too, otherwise 1. `len` is not used in this method.
+      r = 1
+      # chinese pinyin e.g. 'zhao'
+      if w[i + 1] == 'H'
+        code.add 'J', 'J'
+        r += 1
+      else
         # Groups as: ZO/ZI/ZA follows, OR (slavo-germanic word AND not the
         # first character AND not preceded by 'T').
+        if w[i + 1, 2] =~ /Z[OIA]/ ||
+           slavo_germanic?(w) && i > 0 && w[i - 1] != 'T'
+          code.add 'S', 'TS';
+        else
+          code.add 'S', 'S';
+        end
+        r += 1 if w[i + 1] == 'Z'
+      end
+      r
+    end
+    def self.encode_ch(w, i, len, code)
       # Adds the codes for a 'CH' starting at w[i]. The first matching `when`
       # wins. The return value is whatever `code.add` returns; the caller in
       # encode_c ignores it and advances past 'CH' itself.
+      case
+      # italian 'chianti'
+      when w[i, 4] == 'CHIA'
+        code.add 'K', 'K'
+      # find 'michael'
+      when i > 0 && w[i, 4] == 'CHAE'
+        code.add 'K', 'X'
+      # greek roots e.g. 'chemistry', 'chorus'
+      when ch_greek_roots?(w, i)
+        code.add 'K', 'K'
+      # germanic, greek, or otherwise 'ch' for 'kh' sound
+      when ch_germanic_or_greek?(w, i, len)
+        code.add 'K', 'K'
       # any other word-initial 'CH'
+      when i == 0
+        code.add 'X', 'X'
+      when w[0, 2] == 'MC'
+        # e.g., "McHugh"
+        code.add 'K', 'K'
+      else
+        code.add 'X', 'K'
+      end
+    end
+    def self.encode_cc(w, i, code)
       # Adds the codes for a 'CC' starting at w[i]. Returns the number of
       # EXTRA characters to consume beyond the 'CC' pair: 1 when the
       # following I/E/H is part of the sound, otherwise 0.
+      r = 0
+      # 'bellocchio' but not 'bacchus'
+      if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
+        # 'accident', 'accede' 'succeed'
+        if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
+          # the 'accident' / 'succeed' cases above are encoded as 'KS'
+          code.add 'KS', 'KS'
+        else
           # 'bacci', 'bertucci', other italian
+          code.add 'X', 'X'
+        end
+        r = 1
+      else
+        # Pierce's rule
+        code.add 'K', 'K'
+      end
+      r
+    end
+    def self.encode_gh(w, i, code)
       # Adds the codes for a 'GH' starting at w[i]. Some paths add nothing
       # (silent 'GH'). The caller (encode_g) ignores the return value and
       # always advances by 2.
       # 'GH' after a non-vowel
+      if i > 0 && !vowel?(w[i - 1])
+        code.add 'K', 'K'
+      elsif i == 0
+        # ghislane, ghiradelli
+        if w[i + 2] == 'I'
+          code.add 'J', 'J'
+        else
+          code.add 'K', 'K'
+        end
+      # Parker's rule (with some further refinements)
       # i.e. silent when B/H/D is 2 or 3 characters back, or B/H is 4 back
+      elsif !(i > 1 && w[i - 2] =~ /[BHD]/ || # e.g., 'hugh'
+              i > 2 && w[i - 3] =~ /[BHD]/ || # e.g., 'bough'
+              i > 3 && w[i - 4] =~ /[BH]/)  # e.g., 'broughton'
+        # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
+        if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
+          code.add 'F', 'F'
         # after 'I' (no else branch) nothing is added
+        elsif i > 0 && w[i - 1] != 'I'
+          code.add 'K', 'K'
+        end
+      end
+    end
+    def self.encode_gn(w, i, code)
       # Adds the codes for a 'GN' starting at w[i]. The caller (encode_g)
       # ignores the return value and always advances by 2.
+      if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
+        code.add 'KN', 'N'
+      # not e.g. 'cagney'
       # NOTE(review): when reached via encode_g, w[i + 1] is 'N', so the
       # w[i + 1] != 'Y' test is always true on that path.
+      elsif w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
+        code.add 'N', 'KN'
+      else
+        code.add 'KN', 'KN'
+      end
+    end
+    def self.encode_sc(w, i, code)
       # Adds the codes for an 'SC' starting at w[i]. The caller (encode_s)
       # ignores the return value and always advances by 3.
+      # Schlesinger's rule
+      if w[i + 2] == 'H'
+        # dutch origin, e.g. 'school', 'schooner'
+        if w[i + 3, 2] =~ /OO|UY|E[DM]/
+          code.add 'SK', 'SK'
+        # 'schermerhorn', 'schenker'
+        elsif w[i + 3, 2] =~ /E[RN]/
+          code.add 'X', 'SK'
         # word-initial 'SCH' followed by a non-vowel other than 'W'
+        elsif i == 0 && !vowel?(w[3]) && w[3] != 'W'
+          code.add 'X', 'S'
+        else
+          code.add 'X', 'X'
+        end
+      elsif w[i + 2] =~ /[IEY]/
+        code.add 'S', 'S'
+      else
+        code.add 'SK', 'SK'
+      end
+    end
+    def self.slavo_germanic?(w)
       # Truthy when `w` contains 'W', 'K' or 'CZ' anywhere. Returns the match
       # index (Integer) or nil rather than true/false. The 'WITZ' alternative
       # can never be the deciding one because a bare 'W' already matches.
+      w =~ /W|K|CZ|WITZ/
+    end
+    def self.vowel?(c)
       # Truthy when `c` contains one of A, E, I, O, U, Y. Returns the match
       # index or nil rather than true/false; a nil `c` (index past the end
       # of the word) yields nil.
+      c =~ /[AEIOUY]/
+    end
+    def self.c_germanic?(w, i)
       # Truthy when the 'C' at w[i] sits in a non-vowel + 'ACH' sequence that
       # is not followed by 'I' or 'E', or is part of 'BACHER'/'MACHER'.
+      # various germanic
+      i > 1 &&
+      !vowel?(w[i - 2]) &&
+      w[i - 1, 3] == 'ACH' &&
+      (w[i + 2] !~ /[IE]/ || w[i - 2, 6] =~ /[BM]ACHER/)
+    end
+    def self.ch_greek_roots?(w, i)
       # Truthy only for a word-initial 'CH' followed by one of the listed
       # stems (HARAC, HARIS, HOR, HYM, HIA, HEM), excluding words that begin
       # with 'CHORE'.
+      # greek roots e.g. 'chemistry', 'chorus'
+      i == 0 && w[1, 5] =~ /^H(ARAC|ARIS|OR|YM|IA|EM)/ && w[0, 5] != 'CHORE'
+    end
+    def self.ch_germanic_or_greek?(w, i, len)
       # Truthy when the 'CH' at w[i] should sound like 'K'. Because && binds
       # tighter than ||, the expression is an OR of four terms:
       #   1. the word starts with 'VAN ', 'VON ' or 'SCH'
       #   2. 'CH' is inside ORCHES / ARCHIT / ORCHID
       #   3. 'CH' is followed by 'T' or 'S'
       #   4. ('CH' follows A/O/U/E or starts the word) AND
       #      (it is followed by one of LRNMBHFVW or a space, or ends the word)
+      # germanic, greek, or otherwise 'ch' for 'kh' sound
+      w[0, 4] =~ /^(V[AO]N\s|SCH)/ ||
+      # 'architect but not 'arch', 'orchestra', 'orchid'
+      i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/ ||
+      (w[i + 2] =~ /[TS]/) ||
+      (i > 0 && w[i - 1] =~ /[AOUE]/ || i == 0) &&
+      # e.g., 'wachtler', 'wechsler', but not 'tichner'
+      (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
+    end
+    def self.g_ger_or_gy?(w, i)
       # Truthy when the 'G' at w[i] is followed by 'ER' or 'Y', the first six
       # characters of the word do not contain DANGER/RANGER/MANGER, the G is
       # not preceded by 'E' or 'I', and it is not part of 'RGY'/'OGY'.
+      # -ger-,  -gy-
+      w[i + 1, 2] =~ /^(ER|Y)/ &&
+      w[0, 6] !~ /[DRM]ANGER/ &&
+      !(i > 0 && w[i - 1] =~ /[EI]/) &&
+      !(i > 0 && w[i - 1, 3] =~ /[RO]GY/)
+    end
+    def self.j_spanish_pron?(w, i)
       # Truthy when the 'J' at w[i] follows a vowel, is followed by 'A' or
       # 'O', and the word is not slavo-germanic.
+      # spanish pron. of e.g. 'bajador'
+      i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && w[i + 1] =~ /[AO]/
+    end
+    def self.ll_spanish?(w, i, len)
       # Truthy when the 'LL' at w[i] looks Spanish. Groups as A || (B && C):
       #   A: the word ends in ILLO / ILLA / ALLE, with the LL starting at
       #      len - 3
       #   B: the word ends in 'AS' / 'OS', or in 'A' / 'O'
       #   C: the LL is part of 'ALLE'
+      last = len - 1
+      # spanish e.g. 'cabrillo', 'gallegos'
+      (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILL[OA]|ALLE/) ||
+      (last > 0 && w[last - 1, 2] =~ /[AO]S/ || w[last] =~ /[AO]/) &&
+      (i > 0 && w[i - 1, 4] == 'ALLE')
+    end
+    def self.r_french?(w, i, last)
       # Truthy when the 'R' at w[i] is the final character (index `last`),
       # follows 'IE', the word is not slavo-germanic, and the 'IE' is not
       # itself preceded by 'ME' or 'MA'.
+      # french e.g. 'rogier', but exclude 'hochmeier'
+      i == last && !slavo_germanic?(w) &&
+      i > 1 && w[i - 2, 2] == 'IE' &&
+      !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
+    end
+    def self.x_french?(w, i, last)
       # Truthy when the 'X' at w[i] is the final character (index `last`)
       # and follows 'IAU' / 'EAU' or 'AU' / 'OU', i.e. a silent French ending.
+      # french e.g. breaux
+      i == last && (i > 2 && w[i - 3, 3] =~ /[IE]AU/ || i > 1 && w[i - 2, 2] =~ /[AO]U/)
+    end
   end
 end