phonetic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +98 -0
  7. data/Rakefile +1 -0
  8. data/lib/phonetic.rb +10 -0
  9. data/lib/phonetic/algorithm.rb +24 -0
  10. data/lib/phonetic/caverphone.rb +68 -0
  11. data/lib/phonetic/caverphone2.rb +69 -0
  12. data/lib/phonetic/core_ext/string.rb +3 -0
  13. data/lib/phonetic/core_ext/string/caverphone.rb +11 -0
  14. data/lib/phonetic/core_ext/string/caverphone2.rb +11 -0
  15. data/lib/phonetic/core_ext/string/double_metaphone.rb +18 -0
  16. data/lib/phonetic/core_ext/string/metaphone.rb +12 -0
  17. data/lib/phonetic/core_ext/string/nysiis.rb +12 -0
  18. data/lib/phonetic/core_ext/string/refined_soundex.rb +12 -0
  19. data/lib/phonetic/core_ext/string/soundex.rb +12 -0
  20. data/lib/phonetic/double_metaphone.rb +640 -0
  21. data/lib/phonetic/metaphone.rb +161 -0
  22. data/lib/phonetic/metaphone2.rb +5 -0
  23. data/lib/phonetic/nysiis.rb +63 -0
  24. data/lib/phonetic/refined_soundex.rb +39 -0
  25. data/lib/phonetic/soundex.rb +39 -0
  26. data/lib/phonetic/version.rb +3 -0
  27. data/phonetic.gemspec +26 -0
  28. data/spec/phonetic/algorithm_spec.rb +15 -0
  29. data/spec/phonetic/caverphone2_spec.rb +66 -0
  30. data/spec/phonetic/caverphone_spec.rb +115 -0
  31. data/spec/phonetic/core_ext/string/caverphone2_spec.rb +9 -0
  32. data/spec/phonetic/core_ext/string/caverphone_spec.rb +9 -0
  33. data/spec/phonetic/core_ext/string/double_metaphone_spec.rb +15 -0
  34. data/spec/phonetic/core_ext/string/metaphone_spec.rb +11 -0
  35. data/spec/phonetic/core_ext/string/nysiis_spec.rb +12 -0
  36. data/spec/phonetic/core_ext/string/refined_soundex_spec.rb +10 -0
  37. data/spec/phonetic/core_ext/string/soundex_spec.rb +14 -0
  38. data/spec/phonetic/double_metaphone_spec.rb +16 -0
  39. data/spec/phonetic/metaphone2_spec.rb +9 -0
  40. data/spec/phonetic/metaphone_spec.rb +81 -0
  41. data/spec/phonetic/nysiis_spec.rb +20 -0
  42. data/spec/phonetic/refined_soundex_spec.rb +13 -0
  43. data/spec/phonetic/soundex_spec.rb +24 -0
  44. data/spec/spec_helper.rb +11 -0
  45. data/spec/support/double_metaphone_data.rb +142 -0
  46. data/spec/support/nysiis_data.rb +31 -0
  47. metadata +180 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cb22f0be1272e5b72586a964943808d93f99c5c2
4
+ data.tar.gz: cfab5ebba7adc3823f9c74a92b5877681e73d36d
5
+ SHA512:
6
+ metadata.gz: da460d1d048d38d39af6970b3c456551217551d3aa1275d8ec83cc86ea6ae5fcf454842ccb0300d93726303ab9009639bd72c894d99e30d9773edd269e0f41c8
7
+ data.tar.gz: 3b94a722e973f37eccd62841d0cd64be3e0dbce0ab27766cd4384a01e375440334f74cc22b14c21804489a5e87f6052911f3decf5b1a2c1f4f24e7b6459306cc
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format doc
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 n7v
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,98 @@
1
+ # Phonetic
2
+
3
+ Ruby library for phonetic algorithms.
4
+ It supports Soundex, Metaphone, Double Metaphone, Caverphone, NYSIIS and others.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'phonetic'
11
+
12
+ And then execute:
13
+
14
+ ```shell
15
+ $ bundle
16
+ ```
17
+
18
+ Or install it yourself as:
19
+
20
+ ```shell
21
+ $ gem install phonetic
22
+ ```
23
+
24
+ ## Usage
25
+
26
+ ```ruby
27
+ require 'phonetic'
28
+ ```
29
+
30
+ ### Soundex
31
+
32
+ ```ruby
33
+ 'Ackerman'.soundex # => 'A265'
34
+ 'ammonium'.soundex # => 'A500'
35
+ 'implementation'.soundex # => 'I514'
36
+ ```
37
+
38
+ ### Refined Soundex
39
+
40
+ ```ruby
41
+ 'Caren'.refined_soundex # => 'C30908'
42
+ 'Hayers'.refined_soundex # => 'H093'
43
+ 'Lambard'.refined_soundex # => 'L7081096'
44
+ ```
45
+
46
+ ### Metaphone
47
+
48
+ ```ruby
49
+ 'Accola'.metaphone # => 'AKKL'
50
+ 'Nikki'.metaphone # => 'NK'
51
+ 'Wright'.metaphone # => 'RT'
52
+ ```
53
+
54
+ ### Double Metaphone
55
+
56
+ ```ruby
57
+ 'czerny'.double_metaphone # => ['SRN', 'XRN']
58
+ 'dumb'.double_metaphone # => ['TM', 'TM']
59
+ 'edgar'.double_metaphone # => ['ATKR', 'ATKR']
60
+ ```
61
+
62
+ or use alias:
63
+
64
+ ```ruby
65
+ 'czerny'.metaphone2 # => ['SRN', 'XRN']
66
+ 'dumb'.metaphone2 # => ['TM', 'TM']
67
+ 'edgar'.metaphone2 # => ['ATKR', 'ATKR']
68
+ ```
69
+
70
+ ### Caverphone
71
+
72
+ ```ruby
73
+ 'Lashaunda'.caverphone # => 'LSNT11'
74
+ 'Vidaurri'.caverphone # => 'FTR111'
75
+ ```
76
+
77
+ ### Caverphone 2
78
+
79
+ ```ruby
80
+ 'Stevenson'.caverphone2 # => 'STFNSN1111'
81
+ 'Peter'.caverphone2 # => 'PTA1111111'
82
+ ```
83
+
84
+ ### NYSIIS
85
+
86
+ ```ruby
87
+ 'Alexandra'.nysiis # => 'ALAXANDR'
88
+ 'Aumont'.nysiis # => 'AANAD'
89
+ 'Bonnie'.nysiis # => 'BANY'
90
+ ```
91
+
92
+ ## Contributing
93
+
94
+ 1. Fork it
95
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
96
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
97
+ 4. Push to the branch (`git push origin my-new-feature`)
98
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,10 @@
1
+ require 'phonetic/version'
2
+ require 'phonetic/nysiis'
3
+ require 'phonetic/soundex'
4
+ require 'phonetic/refined_soundex'
5
+ require 'phonetic/metaphone'
6
+ require 'phonetic/double_metaphone'
7
+ require 'phonetic/metaphone2'
8
+ require 'phonetic/caverphone'
9
+ require 'phonetic/caverphone2'
10
+ require 'phonetic/core_ext/string'
@@ -0,0 +1,24 @@
1
+ module Phonetic
2
+ # Base class for phonetic algorithms.
3
+ class Algorithm
4
+ # Generic method for encoding single word. Override it in your algorithm class.
5
+ # @param [String] word the word to encode
6
+ # @param [Hash] options the options for the algorithm
7
+ # @return [String] the word
8
+ def self.encode_word(word, options = {})
9
+ word
10
+ end
11
+
12
+ # Generic method for encoding string.
13
+ # Splits string by words and encodes it with {Algorithm.encode_word}.
14
+ #
15
+ # @param [String] str the string to encode.
16
+ # @param [Hash] options the options for algorithm.
17
+ # @return [String] the space separated codes of words from input string.
18
+ def self.encode(str, options = {})
19
+ str.scan(/\p{Word}+/).map do |word|
20
+ encode_word(word, options)
21
+ end.compact.reject(&:empty?).join(' ')
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,68 @@
1
+ require 'phonetic/algorithm'
2
+
3
+ module Phonetic
4
+ # Caverphone created by the Caversham Project at the University of Otago.
5
+ # @see http://caversham.otago.ac.nz/files/working/ctp060902.pdf Caverphone: Phonetic Matching algorithm by David Hood (2002)
6
+ # This class implements this algorithm.
7
+ # @example
8
+ # Phonetic::Caverphone.encode('Charmain') # => 'KMN111'
9
+ # Phonetic::Caverphone.encode('Ellett') # => 'ALT111'
10
+ # Phonetic::Caverphone.encode('Siegmund') # => 'SKMNT1'
11
+ class Caverphone < Algorithm
12
+ MAP = {
13
+ /^(cou|rou|tou|enou)gh/ => '\12f',
14
+ /^gn/ => '2n',
15
+ /mb$/ => 'mb',
16
+ 'cq' => '2q',
17
+ /c([iey])/ => 's\1',
18
+ 'tch' => '2ch',
19
+ /[cqx]/ => 'k',
20
+ 'v' => 'f',
21
+ 'dg' => '2g',
22
+ /ti([oa])/ => 'si\1',
23
+ 'd' => 't',
24
+ 'ph' => 'fh',
25
+ 'b' => 'p',
26
+ 'sh' => 's2',
27
+ 'z' => 's',
28
+ /^[aeiou]/ => 'A',
29
+ /[aeiou]/ => '3',
30
+ '3gh3' => '3kh3',
31
+ 'gh' => '22',
32
+ 'g' => 'k',
33
+ /s+/ => 'S',
34
+ /t+/ => 'T',
35
+ /p+/ => 'P',
36
+ /k+/ => 'K',
37
+ /f+/ => 'F',
38
+ /m+/ => 'M',
39
+ /n+/ => 'N',
40
+ 'w3' => 'W3',
41
+ /wy/ => 'Wy',
42
+ 'wh3' => 'Wh3',
43
+ 'why' => 'Why',
44
+ 'w' => '2',
45
+ /^h/ => 'A',
46
+ 'h' => '2',
47
+ 'r3' => 'R3',
48
+ 'ry' => 'Ry',
49
+ 'r' => '2',
50
+ 'l3' => 'L3',
51
+ 'ly' => 'Ly',
52
+ 'l' => '2',
53
+ 'j' => 'y',
54
+ 'y3' => 'Y3',
55
+ 'y' => '2',
56
+ '2' => '',
57
+ '3' => ''
58
+ }
59
+
60
+ # Encode word to its Caverphone code
61
+ def self.encode_word(word, options = {})
62
+ w = word.strip.downcase.gsub(/[^a-z]/, '')
63
+ MAP.each { |r, v| w.gsub!(r, v) }
64
+ w = w + '1' * 6
65
+ w[0..5]
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,69 @@
1
+ require 'phonetic/algorithm'
2
+
3
+ module Phonetic
4
+ # Caverphone 2.0 created by the Caversham Project at the University of Otago.
5
+ # @see http://caversham.otago.ac.nz/files/working/ctp150804.pdf Caverphone Revisited by David Hood (2004)
6
+ # This class implements this algorithm.
7
+ # @example
8
+ # Phonetic::Caverphone2.encode('Stevenson') # => 'STFNSN1111'
9
+ # Phonetic::Caverphone2.encode('Peter') # => 'PTA1111111'
10
+ class Caverphone2 < Algorithm
11
+ MAP = {
12
+ /e$/ => '',
13
+ /^(cou|rou|tou|enou|trou)gh/ => '\12f',
14
+ /^gn/ => '2n',
15
+ /mb$/ => 'mb',
16
+ 'cq' => '2q',
17
+ /c([iey])/ => 's\1',
18
+ 'tch' => '2ch',
19
+ /[cqx]/ => 'k',
20
+ 'v' => 'f',
21
+ 'dg' => '2g',
22
+ /ti([oa])/ => 'si\1',
23
+ 'd' => 't',
24
+ 'ph' => 'fh',
25
+ 'b' => 'p',
26
+ 'sh' => 's2',
27
+ 'z' => 's',
28
+ /^[aeiou]/ => 'A',
29
+ /[aeiou]/ => '3',
30
+ 'j' => 'y',
31
+ /^y3/ => 'Y3',
32
+ /^y/ => 'A',
33
+ /y/ => '3',
34
+ '3gh3' => '3kh3',
35
+ 'gh' => '22',
36
+ 'g' => 'k',
37
+ /s+/ => 'S',
38
+ /t+/ => 'T',
39
+ /p+/ => 'P',
40
+ /k+/ => 'K',
41
+ /f+/ => 'F',
42
+ /m+/ => 'M',
43
+ /n+/ => 'N',
44
+ 'w3' => 'W3',
45
+ 'wh3' => 'Wh3',
46
+ /w$/ => '3',
47
+ 'w' => '2',
48
+ /^h/ => 'A',
49
+ 'h' => '2',
50
+ 'r3' => 'R3',
51
+ /r$/ => '3',
52
+ 'r' => '2',
53
+ 'l3' => 'L3',
54
+ /l$/ => '3',
55
+ 'l' => '2',
56
+ '2' => '',
57
+ /3$/ => 'A',
58
+ '3' => ''
59
+ }
60
+
61
+ # Encode word to its Caverphone 2 code
62
+ def self.encode_word(word, options = {})
63
+ w = word.strip.downcase.gsub(/[^a-z]/, '')
64
+ MAP.each { |r, v| w.gsub!(r, v) }
65
+ w = w + '1' * 10
66
+ w[0..9]
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,3 @@
1
+ Gem.find_files('phonetic/core_ext/string/*.rb')
2
+ .reject{|path| path =~ /_spec/}
3
+ .each { |path| require path }
@@ -0,0 +1,11 @@
1
+ require 'phonetic/caverphone'
2
+
3
+ class String
4
+ # Caverphone value of string
5
+ # @example
6
+ # 'Lashaunda'.caverphone # => 'LSNT11'
7
+ # 'Vidaurri'.caverphone # => 'FTR111'
8
+ def caverphone(options = {})
9
+ Phonetic::Caverphone.encode(self, options)
10
+ end
11
+ end
@@ -0,0 +1,11 @@
1
+ require 'phonetic/caverphone2'
2
+
3
+ class String
4
+ # Caverphone 2 value of string
5
+ # @example
6
+ # 'Stevenson'.caverphone2 # => 'STFNSN1111'
7
+ # 'Peter'.caverphone2 # => 'PTA1111111'
8
+ def caverphone2(options = {})
9
+ Phonetic::Caverphone2.encode(self, options)
10
+ end
11
+ end
@@ -0,0 +1,18 @@
1
+ require 'phonetic/double_metaphone'
2
+
3
+ class String
4
+ # Double Metaphone code of string.
5
+ # @example
6
+ # 'czerny'.double_metaphone # => ['SRN', 'XRN']
7
+ # 'dumb'.double_metaphone # => ['TM', 'TM']
8
+ # 'edgar'.double_metaphone # => ['ATKR', 'ATKR']
9
+ # # or use alias:
10
+ # 'czerny'.metaphone2 # => ['SRN', 'XRN']
11
+ # 'dumb'.metaphone2 # => ['TM', 'TM']
12
+ # 'edgar'.metaphone2 # => ['ATKR', 'ATKR']
13
+ def double_metaphone(options = { size: 4 })
14
+ Phonetic::DoubleMetaphone.encode(self, options)
15
+ end
16
+
17
+ alias_method :metaphone2, :double_metaphone
18
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/metaphone'
2
+
3
+ class String
4
+ # Metaphone value of string.
5
+ # @example
6
+ # 'Accola'.metaphone # => 'AKKL'
7
+ # 'Nikki'.metaphone # => 'NK'
8
+ # 'Wright'.metaphone # => 'RT'
9
+ def metaphone(options = { size: 4 })
10
+ Phonetic::Metaphone.encode(self, options)
11
+ end
12
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/nysiis'
2
+
3
+ class String
4
+ # NYSIIS value of string.
5
+ # @example
6
+ # 'Alexandra'.nysiis # => 'ALAXANDR'
7
+ # 'Aumont'.nysiis # => 'AANAD'
8
+ # 'Bonnie'.nysiis # => 'BANY'
9
+ def nysiis(options = { trim: true })
10
+ Phonetic::NYSIIS.encode(self, options)
11
+ end
12
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/refined_soundex'
2
+
3
+ class String
4
+ # Refined Soundex value of string.
5
+ # @example
6
+ # 'Caren'.refined_soundex # => 'C30908'
7
+ # 'Hayers'.refined_soundex # => 'H093'
8
+ # 'Lambard'.refined_soundex # => 'L7081096'
9
+ def refined_soundex(options = { trim: true })
10
+ Phonetic::RefinedSoundex.encode(self, options)
11
+ end
12
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/soundex'
2
+
3
+ class String
4
+ # Soundex value of string
5
+ # @example
6
+ # 'Ackerman'.soundex # => 'A265'
7
+ # 'ammonium'.soundex # => 'A500'
8
+ # 'implementation'.soundex # => 'I514'
9
+ def soundex(options = { trim: true })
10
+ Phonetic::Soundex.encode(self, options)
11
+ end
12
+ end
@@ -0,0 +1,640 @@
1
+ # encoding: utf-8
2
+
3
+ require 'phonetic/algorithm'
4
+
5
+ module Phonetic
6
+ # The Double Metaphone phonetic encoding algorithm is the second generation
7
+ # of the Metaphone algorithm. Its original implementation was described
8
+ # by Lawrence Philips in the June 2000 issue of C/C++ Users Journal.
9
+ #
10
+ # This implementation is based on the PHP implementation by Stephen Woodbridge
11
+ # and contains modifications to the algorithm by Kevin Atkinson.
12
+ # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
13
+ # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
14
+ # @example
15
+ # Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
16
+ # Phonetic::DoubleMetaphone.encode('dumb') # => ['TM', 'TM']
17
+ # Phonetic::DoubleMetaphone.encode('edgar') # => ['ATKR', 'ATKR']
18
+ # # or use alias:
19
+ # Phonetic::Metaphone2.encode('czerny') # => ['SRN', 'XRN']
20
+ # Phonetic::Metaphone2.encode('dumb') # => ['TM', 'TM']
21
+ # Phonetic::Metaphone2.encode('edgar') # => ['ATKR', 'ATKR']
22
+ class DoubleMetaphone < Algorithm
23
+ VOWELS = 'AEIOUY'
24
+
25
+ # Encode word to its Double Metaphone code.
26
+ def self.encode_word(word, options = { size: 4 })
27
+ code_size = options[:size] || 4
28
+ w = word.strip.upcase
29
+ primary = ''
30
+ secondary = ''
31
+ i = 0
32
+ len = w.size
33
+ last = len - 1
34
+ # pad the original string so that we can index beyond the edge of the world
35
+ w += ' ' * 5
36
+ # skip these when at start of word
37
+ i += 1 if ['GN','KN','PN','WR','PS'].include? w[0, 2]
38
+ # initial 'X' is pronounced 'Z' e.g. 'Xavier'
39
+ if w[0] == 'X'
40
+ primary += 'S'
41
+ secondary += 'S'
42
+ i += 1
43
+ end
44
+ while i < len && (primary.size < code_size || primary.size < code_size)
45
+ case w[i]
46
+ when 'A', 'E', 'I', 'O', 'U', 'Y'
47
+ if i == 0
48
+ # all init vowels now map to 'A'
49
+ primary += 'A'
50
+ secondary += 'A'
51
+ end
52
+ i += 1
53
+ when 'B'
54
+ # "-mb", e.g., "dumb", already skipped over...
55
+ primary += 'P'
56
+ secondary += 'P'
57
+ i += (w[i + 1] == 'B') ? 2 : 1
58
+ when 'Ç', 'ç'
59
+ primary += 'S'
60
+ secondary += 'S'
61
+ i += 1
62
+ when 'C'
63
+ # various germanic
64
+ if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
65
+ (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
66
+ primary += 'K'
67
+ secondary += 'K'
68
+ i += 2
69
+ # special case 'caesar'
70
+ elsif i == 0 && w[i, 6] == 'CAESAR'
71
+ primary += 'S'
72
+ secondary += 'S'
73
+ i += 2
74
+ # italian 'chianti'
75
+ elsif w[i, 4] == 'CHIA'
76
+ primary += 'K'
77
+ secondary += 'K'
78
+ i += 2
79
+ elsif w[i, 2] == 'CH'
80
+ # find 'michael'
81
+ if i > 0 && w[i, 4] == 'CHAE'
82
+ primary += 'K'
83
+ secondary += 'X'
84
+ i += 2
85
+ # greek roots e.g. 'chemistry', 'chorus'
86
+ elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
87
+ w[0, 5] != 'CHORE'
88
+ primary += 'K'
89
+ secondary += 'K'
90
+ i += 2
91
+ else
92
+ # germanic, greek, or otherwise 'ch' for 'kh' sound
93
+ if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
94
+ # 'architect' (but not 'arch'), 'orchestra', 'orchid'
95
+ (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
96
+ (w[i + 2] =~ /[TS]/) ||
97
+ ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
98
+ # e.g., 'wachtler', 'wechsler', but not 'tichner'
99
+ (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
100
+ primary += 'K'
101
+ secondary += 'K'
102
+ else
103
+ if i > 0
104
+ if w[0, 2] == 'MC'
105
+ # e.g., "McHugh"
106
+ primary += 'K'
107
+ secondary += 'K'
108
+ else
109
+ primary += 'X'
110
+ secondary += 'K'
111
+ end
112
+ else
113
+ primary += 'X'
114
+ secondary += 'X'
115
+ end
116
+ end
117
+ i += 2
118
+ end
119
+ elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
120
+ # e.g, 'czerny'
121
+ primary += 'S'
122
+ secondary += 'X'
123
+ i += 2
124
+ elsif w[i + 1, 3] == 'CIA'
125
+ # e.g., 'focaccia'
126
+ primary += 'X'
127
+ secondary += 'X'
128
+ i += 3
129
+ # double 'C', but not if e.g. 'McClellan'
130
+ elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
131
+ # 'bellocchio' but not 'bacchus'
132
+ if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
133
+ # 'accident', 'accede' 'succeed'
134
+ if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
135
+ # 'bacci', 'bertucci', other italian
136
+ primary += 'KS'
137
+ secondary += 'KS'
138
+ else
139
+ primary += 'X'
140
+ secondary += 'X'
141
+ end
142
+ i += 3
143
+ else
144
+ # Pierce's rule
145
+ primary += 'K'
146
+ secondary += 'K'
147
+ i += 2
148
+ end
149
+ elsif w[i, 2] =~ /CK|CG|CQ/
150
+ primary += 'K'
151
+ secondary += 'K'
152
+ i += 2
153
+ elsif w[i, 2] =~ /CI|CE|CY/
154
+ # italian vs. english
155
+ if w[i, 3] =~ /CIO|CIE|CIA/
156
+ primary += 'S'
157
+ secondary += 'X'
158
+ else
159
+ primary += 'S'
160
+ secondary += 'S'
161
+ end
162
+ i += 2
163
+ else
164
+ primary += 'K'
165
+ secondary += 'K'
166
+ # name sent in 'mac caffrey', 'mac gregor'
167
+ if w[i + 1, 2] =~ /\s[CQG]/
168
+ i += 3
169
+ else
170
+ if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
171
+ i += 2
172
+ else
173
+ i += 1
174
+ end
175
+ end
176
+ end
177
+ when 'D'
178
+ if w[i, 2] == 'DG'
179
+ if w[i + 2] =~ /[IEY]/
180
+ # e.g. 'edge'
181
+ primary += 'J'
182
+ secondary += 'J'
183
+ i += 3
184
+ else
185
+ # e.g. 'edgar'
186
+ primary += 'TK'
187
+ secondary += 'TK'
188
+ i += 2
189
+ end
190
+ elsif w[i, 2] =~ /DT|DD/
191
+ primary += 'T'
192
+ secondary += 'T'
193
+ i += 2
194
+ else
195
+ primary += 'T'
196
+ secondary += 'T'
197
+ i += 1
198
+ end
199
+ when 'F'
200
+ if w[i + 1] == 'F'
201
+ i += 2
202
+ else
203
+ i += 1
204
+ end
205
+ primary += 'F'
206
+ secondary += 'F'
207
+ when 'G'
208
+ if w[i + 1] == 'H'
209
+ if i > 0 && !vowel?(w[i - 1])
210
+ primary += 'K'
211
+ secondary += 'K'
212
+ i += 2
213
+ elsif i == 0
214
+ # ghislane, ghiradelli
215
+ if w[i + 2] == 'I'
216
+ primary += 'J'
217
+ secondary += 'J'
218
+ else
219
+ primary += 'K'
220
+ secondary += 'K'
221
+ end
222
+ i += 2
223
+ # Parker's rule (with some further refinements) - e.g., 'hugh'
224
+ elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
225
+ # e.g., 'bough'
226
+ (i > 2 && w[i - 3] =~ /[BHD]/) ||
227
+ # e.g., 'broughton'
228
+ (i > 3 && w[i - 4] =~ /[BH]/)
229
+ i += 2
230
+ else
231
+ # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
232
+ if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
233
+ primary += 'F'
234
+ secondary += 'F'
235
+ else
236
+ if i > 0 && w[i - 1] != 'I'
237
+ primary += 'K'
238
+ secondary += 'K'
239
+ end
240
+ end
241
+ i += 2
242
+ end
243
+ elsif w[i + 1] == 'N'
244
+ if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
245
+ primary += 'KN'
246
+ secondary += 'N'
247
+ else
248
+ # not e.g. 'cagney'
249
+ if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
250
+ primary += 'N'
251
+ secondary += 'KN'
252
+ else
253
+ primary += 'KN'
254
+ secondary += 'KN'
255
+ end
256
+ end
257
+ i += 2
258
+ # 'tagliaro'
259
+ elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
260
+ primary += 'KL'
261
+ secondary += 'L'
262
+ i += 2
263
+ # -ges-,-gep-,-gel-, -gie- at beginning
264
+ elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
265
+ primary += 'K'
266
+ secondary += 'J'
267
+ i += 2
268
+ # -ger-, -gy-
269
+ elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
270
+ !(w[0, 6] =~ /[DRM]ANGER/) &&
271
+ !(i > 0 && w[i - 1] =~ /[EI]/) &&
272
+ !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
273
+ primary += 'K'
274
+ secondary += 'J'
275
+ i += 2
276
+ # italian e.g, 'biaggi'
277
+ elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
278
+ if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
279
+ primary += 'K'
280
+ secondary += 'K'
281
+ else
282
+ if w[i + 1, 4] =~ /IER\s/
283
+ primary += 'J'
284
+ secondary += 'J'
285
+ else
286
+ primary += 'J'
287
+ secondary += 'K'
288
+ end
289
+ end
290
+ i += 2
291
+ else
292
+ if w[i + 1] == 'G'
293
+ i += 2
294
+ else
295
+ i += 1
296
+ end
297
+ primary += 'K'
298
+ secondary += 'K'
299
+ end
300
+ when 'H'
301
+ # only keep if first & before vowel or btw. 2 vowels
302
+ if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
303
+ primary += 'H'
304
+ secondary += 'H'
305
+ i += 2
306
+ else # also takes care of 'HH'
307
+ i += 1
308
+ end
309
+ when 'J'
310
+ # obvious spanish, 'jose', 'san jacinto'
311
+ if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
312
+ if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
313
+ primary += 'H'
314
+ secondary += 'H'
315
+ else
316
+ primary += 'J'
317
+ secondary += 'H'
318
+ end
319
+ i += 1
320
+ else
321
+ if i == 0 && w[i, 4] != 'JOSE'
322
+ primary += 'J'
323
+ secondary += 'A'
324
+ # Yankelovich/Jankelowicz
325
+ else
326
+ # spanish pron. of e.g. 'bajador'
327
+ if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
328
+ primary += 'J'
329
+ secondary += 'H'
330
+ else
331
+ if i == last
332
+ primary += 'J'
333
+ #secondary += ' '
334
+ else
335
+ if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
336
+ primary += 'J'
337
+ secondary += 'J'
338
+ end
339
+ end
340
+ end
341
+ end
342
+ if w[i + 1] == 'J'
343
+ i += 2
344
+ else
345
+ i += 1
346
+ end
347
+ end
348
+ when 'K'
349
+ if w[i + 1] == 'K'
350
+ i += 2
351
+ else
352
+ i += 1
353
+ end
354
+ primary += 'K'
355
+ secondary += 'K'
356
+ when 'L'
357
+ if w[i + 1] == 'L'
358
+ # spanish e.g. 'cabrillo', 'gallegos'
359
+ if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
360
+ ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
361
+ (i > 0 && w[i - 1, 4] == 'ALLE'))
362
+ primary += 'L'
363
+ i += 2
364
+ next
365
+ end
366
+ i += 2
367
+ else
368
+ i += 1
369
+ end
370
+ primary += 'L'
371
+ secondary += 'L'
372
+ when 'M'
373
+ if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == "ER")) ||
374
+ # 'dumb','thumb'
375
+ w[i + 1] == 'M'
376
+ i += 2
377
+ else
378
+ i += 1
379
+ end
380
+ primary += 'M'
381
+ secondary += 'M'
382
+ when 'N'
383
+ if w[i + 1] == 'N'
384
+ i += 2
385
+ else
386
+ i += 1
387
+ end
388
+ primary += 'N'
389
+ secondary += 'N'
390
+ when 'Ñ', 'ñ'
391
+ i += 1;
392
+ primary += 'N'
393
+ secondary += 'N'
394
+ when 'P'
395
+ if w[i + 1] == 'H'
396
+ primary += 'F'
397
+ secondary += 'F'
398
+ i += 2
399
+ else
400
+ # also account for "campbell", "raspberry"
401
+ if w[i + 1] =~ /[PB]/
402
+ i += 2
403
+ else
404
+ i += 1
405
+ end
406
+ primary += 'P'
407
+ secondary += 'P'
408
+ end
409
+ when 'Q'
410
+ if w[i + 1] == 'Q'
411
+ i += 2
412
+ else
413
+ i += 1
414
+ end
415
+ primary += 'K'
416
+ secondary += 'K'
417
+ when 'R'
418
+ # french e.g. 'rogier', but exclude 'hochmeier'
419
+ if i == last && !slavo_germanic?(w) &&
420
+ (i > 1 && w[i - 2, 2] == "IE") &&
421
+ !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
422
+ secondary += 'R'
423
+ else
424
+ primary += 'R'
425
+ secondary += 'R'
426
+ end
427
+ if w[i + 1] == 'R'
428
+ i += 2
429
+ else
430
+ i += 1
431
+ end
432
+ when 'S'
433
+ # special cases 'island', 'isle', 'carlisle', 'carlysle'
434
+ if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
435
+ i += 1
436
+ # special case 'sugar-'
437
+ elsif i == 0 && w[i, 5] == 'SUGAR'
438
+ primary += 'X'
439
+ secondary += 'S'
440
+ i += 1
441
+ elsif w[i, 2] == 'SH'
442
+ # germanic
443
+ if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
444
+ primary += 'S'
445
+ secondary += 'S'
446
+ else
447
+ primary += 'X'
448
+ secondary += 'X'
449
+ end
450
+ i += 2
451
+ # italian & armenian
452
+ elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
453
+ if !slavo_germanic?(w)
454
+ primary += 'S'
455
+ secondary += 'X'
456
+ else
457
+ primary += 'S'
458
+ secondary += 'S'
459
+ end
460
+ i += 3
461
+ # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
462
+ # also, -sz- in slavic languages, although in hungarian it is pronounced 's'
463
+ elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
464
+ primary += 'S'
465
+ secondary += 'X'
466
+ if w[i + 1] == 'Z'
467
+ i += 2
468
+ else
469
+ i += 1
470
+ end
471
+ elsif w[i, 2] == 'SC'
472
+ # Schlesinger's rule
473
+ if w[i + 2] == 'H'
474
+ # dutch origin, e.g. 'school', 'schooner'
475
+ if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
476
+ # 'schermerhorn', 'schenker'
477
+ if w[i + 3, 2] =~ /ER|EN/
478
+ primary += 'X'
479
+ secondary += 'SK'
480
+ else
481
+ primary += 'SK'
482
+ secondary += 'SK'
483
+ end
484
+ i += 3
485
+ else
486
+ if i == 0 && !vowel?(w[3]) && w[3] != 'W'
487
+ primary += 'X'
488
+ secondary += 'S'
489
+ else
490
+ primary += 'X'
491
+ secondary += 'X'
492
+ end
493
+ i += 3
494
+ end
495
+ elsif w[i + 2, 1] =~ /[IEY]/
496
+ primary += 'S'
497
+ secondary += 'S'
498
+ i += 3
499
+ else
500
+ primary += 'SK'
501
+ secondary += 'SK'
502
+ i += 3
503
+ end
504
+ else
505
+ # french e.g. 'resnais', 'artois'
506
+ if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
507
+ secondary += 'S'
508
+ else
509
+ primary += 'S'
510
+ secondary += 'S'
511
+ end
512
+ if w[i + 1] =~ /[SZ]/
513
+ i += 2
514
+ else
515
+ i += 1
516
+ end
517
+ end
518
+ when 'T'
519
+ if w[i, 4] == 'TION'
520
+ primary += 'X'
521
+ secondary += 'X'
522
+ i += 3
523
+ elsif w[i, 3] =~ /TIA|TCH/
524
+ primary += 'X'
525
+ secondary += 'X'
526
+ i += 3
527
+ elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
528
+ # special case 'thomas', 'thames' or germanic
529
+ if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /VAN|VON\s/ || w[0, 3] == 'SCH'
530
+ primary += 'T'
531
+ secondary += 'T'
532
+ else
533
+ primary += '0'
534
+ secondary += 'T'
535
+ end
536
+ i += 2
537
+ else
538
+ if w[i + 1] =~ /[TD]/
539
+ i += 2
540
+ else
541
+ i += 1
542
+ end
543
+ primary += 'T'
544
+ secondary += 'T'
545
+ end
546
+ when 'V'
547
+ if w[i + 1] == 'V'
548
+ i += 2
549
+ else
550
+ i += 1
551
+ end
552
+ primary += 'F'
553
+ secondary += 'F'
554
+ when 'W'
555
+ # can also be in middle of word
556
+ if w[i, 2] == 'WR'
557
+ primary += 'R'
558
+ secondary += 'R'
559
+ i += 2
560
+ else
561
+ if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
562
+ # Wasserman should match Vasserman
563
+ if vowel?(w[i + 1])
564
+ primary += 'A'
565
+ secondary += 'F'
566
+ else
567
+ # need Uomo to match Womo
568
+ primary += 'A'
569
+ secondary += 'A'
570
+ end
571
+ end
572
+ # Arnow should match Arnoff
573
+ if i == last && i > 0 && vowel?(w[i - 1]) ||
574
+ (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
575
+ secondary += 'F'
576
+ i += 1
577
+ elsif w[i, 4] =~ /WICZ|WITZ/
578
+ # polish e.g. 'filipowicz'
579
+ primary += 'TS'
580
+ secondary += 'FX'
581
+ i += 4
582
+ else
583
+ i += 1
584
+ end
585
+ end
586
+ when 'X'
587
+ # french e.g. breaux
588
+ if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
589
+ primary += 'KS'
590
+ secondary += 'KS'
591
+ end
592
+ if w[i + 1] =~ /[CX]/
593
+ i += 2
594
+ else
595
+ i += 1
596
+ end
597
+ when 'Z'
598
+ # chinese pinyin e.g. 'zhao'
599
+ if w[i + 1] == 'H'
600
+ primary += 'J'
601
+ secondary += 'J'
602
+ i += 2
603
+ else
604
+ if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
605
+ primary += 'S'
606
+ secondary += 'TS';
607
+ else
608
+ primary += 'S'
609
+ secondary += 'S';
610
+ end
611
+ if w[i + 1] == 'Z'
612
+ i += 2
613
+ else
614
+ i += 1
615
+ end
616
+ end
617
+ else
618
+ i += 1
619
+ end
620
+ end
621
+ [primary[0, code_size], secondary[0, code_size]]
622
+ end
623
+
624
+ def self.encode(str, options = { size: 4 })
625
+ encode_word(str, options)
626
+ end
627
+
628
+ private
629
+
630
+ def self.slavo_germanic?(str)
631
+ !!(str[/W|K|CZ|WITZ/])
632
+ end
633
+
634
+ def self.vowel?(char)
635
+ c = VOWELS[char.to_s]
636
+ !c.nil? && !c.empty?
637
+ end
638
+
639
+ end
640
+ end