phonetic 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +98 -0
  7. data/Rakefile +1 -0
  8. data/lib/phonetic.rb +10 -0
  9. data/lib/phonetic/algorithm.rb +24 -0
  10. data/lib/phonetic/caverphone.rb +68 -0
  11. data/lib/phonetic/caverphone2.rb +69 -0
  12. data/lib/phonetic/core_ext/string.rb +3 -0
  13. data/lib/phonetic/core_ext/string/caverphone.rb +11 -0
  14. data/lib/phonetic/core_ext/string/caverphone2.rb +11 -0
  15. data/lib/phonetic/core_ext/string/double_metaphone.rb +18 -0
  16. data/lib/phonetic/core_ext/string/metaphone.rb +12 -0
  17. data/lib/phonetic/core_ext/string/nysiis.rb +12 -0
  18. data/lib/phonetic/core_ext/string/refined_soundex.rb +12 -0
  19. data/lib/phonetic/core_ext/string/soundex.rb +12 -0
  20. data/lib/phonetic/double_metaphone.rb +640 -0
  21. data/lib/phonetic/metaphone.rb +161 -0
  22. data/lib/phonetic/metaphone2.rb +5 -0
  23. data/lib/phonetic/nysiis.rb +63 -0
  24. data/lib/phonetic/refined_soundex.rb +39 -0
  25. data/lib/phonetic/soundex.rb +39 -0
  26. data/lib/phonetic/version.rb +3 -0
  27. data/phonetic.gemspec +26 -0
  28. data/spec/phonetic/algorithm_spec.rb +15 -0
  29. data/spec/phonetic/caverphone2_spec.rb +66 -0
  30. data/spec/phonetic/caverphone_spec.rb +115 -0
  31. data/spec/phonetic/core_ext/string/caverphone2_spec.rb +9 -0
  32. data/spec/phonetic/core_ext/string/caverphone_spec.rb +9 -0
  33. data/spec/phonetic/core_ext/string/double_metaphone_spec.rb +15 -0
  34. data/spec/phonetic/core_ext/string/metaphone_spec.rb +11 -0
  35. data/spec/phonetic/core_ext/string/nysiis_spec.rb +12 -0
  36. data/spec/phonetic/core_ext/string/refined_soundex_spec.rb +10 -0
  37. data/spec/phonetic/core_ext/string/soundex_spec.rb +14 -0
  38. data/spec/phonetic/double_metaphone_spec.rb +16 -0
  39. data/spec/phonetic/metaphone2_spec.rb +9 -0
  40. data/spec/phonetic/metaphone_spec.rb +81 -0
  41. data/spec/phonetic/nysiis_spec.rb +20 -0
  42. data/spec/phonetic/refined_soundex_spec.rb +13 -0
  43. data/spec/phonetic/soundex_spec.rb +24 -0
  44. data/spec/spec_helper.rb +11 -0
  45. data/spec/support/double_metaphone_data.rb +142 -0
  46. data/spec/support/nysiis_data.rb +31 -0
  47. metadata +180 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cb22f0be1272e5b72586a964943808d93f99c5c2
4
+ data.tar.gz: cfab5ebba7adc3823f9c74a92b5877681e73d36d
5
+ SHA512:
6
+ metadata.gz: da460d1d048d38d39af6970b3c456551217551d3aa1275d8ec83cc86ea6ae5fcf454842ccb0300d93726303ab9009639bd72c894d99e30d9773edd269e0f41c8
7
+ data.tar.gz: 3b94a722e973f37eccd62841d0cd64be3e0dbce0ab27766cd4384a01e375440334f74cc22b14c21804489a5e87f6052911f3decf5b1a2c1f4f24e7b6459306cc
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format doc
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 n7v
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,98 @@
1
+ # Phonetic
2
+
3
+ Ruby library for phonetic algorithms.
4
+ It supports Soundex, Metaphone, Double Metaphone, Caverphone, NYSIIS and others.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'phonetic'
11
+
12
+ And then execute:
13
+
14
+ ```shell
15
+ $ bundle
16
+ ```
17
+
18
+ Or install it yourself as:
19
+
20
+ ```shell
21
+ $ gem install phonetic
22
+ ```
23
+
24
+ ## Usage
25
+
26
+ ```ruby
27
+ require 'phonetic'
28
+ ```
29
+
30
+ ### Soundex
31
+
32
+ ```ruby
33
+ 'Ackerman'.soundex # => 'A265'
34
+ 'ammonium'.soundex # => 'A500'
35
+ 'implementation'.soundex # => 'I514'
36
+ ```
37
+
38
+ ### Refined Soundex
39
+
40
+ ```ruby
41
+ 'Caren'.refined_soundex # => 'C30908'
42
+ 'Hayers'.refined_soundex # => 'H093'
43
+ 'Lambard'.refined_soundex # => 'L7081096'
44
+ ```
45
+
46
+ ### Metaphone
47
+
48
+ ```ruby
49
+ 'Accola'.metaphone # => 'AKKL'
50
+ 'Nikki'.metaphone # => 'NK'
51
+ 'Wright'.metaphone # => 'RT'
52
+ ```
53
+
54
+ ### Double Metaphone
55
+
56
+ ```ruby
57
+ 'czerny'.double_metaphone # => ['SRN', 'XRN']
58
+ 'dumb'.double_metaphone # => ['TM', 'TM']
59
+ 'edgar'.double_metaphone # => ['ATKR', 'ATKR']
60
+ ```
61
+
62
+ or use alias:
63
+
64
+ ```ruby
65
+ 'czerny'.metaphone2 # => ['SRN', 'XRN']
66
+ 'dumb'.metaphone2 # => ['TM', 'TM']
67
+ 'edgar'.metaphone2 # => ['ATKR', 'ATKR']
68
+ ```
69
+
70
+ ### Caverphone
71
+
72
+ ```ruby
73
+ 'Lashaunda'.caverphone # => 'LSNT11'
74
+ 'Vidaurri'.caverphone # => 'FTR111'
75
+ ```
76
+
77
+ ### Caverphone 2
78
+
79
+ ```ruby
80
+ 'Stevenson'.caverphone2 # => 'STFNSN1111'
81
+ 'Peter'.caverphone2 # => 'PTA1111111'
82
+ ```
83
+
84
+ ### NYSIIS
85
+
86
+ ```ruby
87
+ 'Alexandra'.nysiis # => 'ALAXANDR'
88
+ 'Aumont'.nysiis # => 'AANAD'
89
+ 'Bonnie'.nysiis # => 'BANY'
90
+ ```
91
+
92
+ ## Contributing
93
+
94
+ 1. Fork it
95
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
96
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
97
+ 4. Push to the branch (`git push origin my-new-feature`)
98
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,10 @@
1
+ require 'phonetic/version'
2
+ require 'phonetic/nysiis'
3
+ require 'phonetic/soundex'
4
+ require 'phonetic/refined_soundex'
5
+ require 'phonetic/metaphone'
6
+ require 'phonetic/double_metaphone'
7
+ require 'phonetic/metaphone2'
8
+ require 'phonetic/caverphone'
9
+ require 'phonetic/caverphone2'
10
+ require 'phonetic/core_ext/string'
@@ -0,0 +1,24 @@
1
module Phonetic
  # Base class for phonetic algorithms.
  #
  # Subclasses override {Algorithm.encode_word}; {Algorithm.encode} applies it
  # to every word found in a string.
  class Algorithm
    # Generic method for encoding a single word. Override it in your algorithm
    # class; this base implementation returns the word unchanged.
    #
    # @param [String] word the word to encode
    # @param [Hash] options the options for the algorithm (ignored here)
    # @return [String] the word
    def self.encode_word(word, options = {})
      word
    end

    # Generic method for encoding a string.
    # Splits the string into words (runs of +\p{Word}+ characters) and encodes
    # each one with {Algorithm.encode_word}. Words whose code is nil or empty
    # are dropped from the result.
    #
    # @param [#to_s] str the string to encode; nil is treated as an empty string.
    # @param [Hash] options the options for algorithm, passed to every encode_word call.
    # @return [String] the space separated codes of words from input string.
    def self.encode(str, options = {})
      # to_s keeps nil and other non-String input from raising NoMethodError.
      codes = str.to_s.scan(/\p{Word}+/).map { |word| encode_word(word, options) }
      codes.compact.reject(&:empty?).join(' ')
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require 'phonetic/algorithm'
2
+
3
module Phonetic
  # Caverphone created by the Caversham Project at the University of Otago.
  # @see http://caversham.otago.ac.nz/files/working/ctp060902.pdf Caverphone: Phonetic Matching algorithm by David Hood (2002)
  # This class implements this algorithm.
  # @example
  #   Phonetic::Caverphone.encode('Charmain') # => 'KMN111'
  #   Phonetic::Caverphone.encode('Ellett') # => 'ALT111'
  #   Phonetic::Caverphone.encode('Siegmund') # => 'SKMNT1'
  class Caverphone < Algorithm
    # Rewrite rules applied in insertion order by {Caverphone.encode_word}.
    # Keys are patterns (String keys match literally), values are replacements.
    # '2' and '3' are temporary placeholders that the last two rules delete.
    MAP = {
      /^(cou|rou|tou|enou)gh/ => '\12f',
      /^gn/ => '2n',
      # A final "mb" loses its silent b ("m2"); mapping it to itself was a no-op.
      /mb$/ => 'm2',
      'cq' => '2q',
      /c([iey])/ => 's\1',
      'tch' => '2ch',
      /[cqx]/ => 'k',
      'v' => 'f',
      'dg' => '2g',
      /ti([oa])/ => 'si\1',
      'd' => 't',
      'ph' => 'fh',
      'b' => 'p',
      'sh' => 's2',
      'z' => 's',
      /^[aeiou]/ => 'A',
      /[aeiou]/ => '3',
      '3gh3' => '3kh3',
      'gh' => '22',
      'g' => 'k',
      /s+/ => 'S',
      /t+/ => 'T',
      /p+/ => 'P',
      /k+/ => 'K',
      /f+/ => 'F',
      /m+/ => 'M',
      /n+/ => 'N',
      'w3' => 'W3',
      /wy/ => 'Wy',
      'wh3' => 'Wh3',
      'why' => 'Why',
      'w' => '2',
      /^h/ => 'A',
      'h' => '2',
      'r3' => 'R3',
      'ry' => 'Ry',
      'r' => '2',
      'l3' => 'L3',
      'ly' => 'Ly',
      'l' => '2',
      'j' => 'y',
      'y3' => 'Y3',
      'y' => '2',
      '2' => '',
      '3' => ''
    }.freeze

    # Encode word to its Caverphone code.
    #
    # The word is stripped, lower-cased and reduced to the letters a-z, then
    # every rule of MAP is applied in order. The result is padded with '1'
    # and cut to exactly six characters.
    #
    # @param [String] word the word to encode
    # @param [Hash] options unused by this algorithm
    # @return [String] the six character code
    def self.encode_word(word, options = {})
      w = word.strip.downcase.gsub(/[^a-z]/, '')
      MAP.each { |pattern, replacement| w.gsub!(pattern, replacement) }
      (w + '1' * 6)[0, 6]
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'phonetic/algorithm'
2
+
3
module Phonetic
  # Caverphone 2.0 created by the Caversham Project at the University of Otago.
  # @see http://caversham.otago.ac.nz/files/working/ctp150804.pdf Caverphone Revisited by David Hood (2004)
  # This class implements this algorithm.
  # @example
  #   Phonetic::Caverphone2.encode('Stevenson') # => 'STFNSN1111'
  #   Phonetic::Caverphone2.encode('Peter') # => 'PTA1111111'
  class Caverphone2 < Algorithm
    # Rewrite rules applied in insertion order by {Caverphone2.encode_word}.
    # Keys are patterns (String keys match literally), values are replacements.
    # '2' and '3' are temporary placeholders that the final rules delete
    # (a trailing '3' becomes 'A' first).
    MAP = {
      /e$/ => '',
      /^(cou|rou|tou|enou|trou)gh/ => '\12f',
      /^gn/ => '2n',
      # A final "mb" loses its silent b ("m2"); mapping it to itself was a no-op.
      /mb$/ => 'm2',
      'cq' => '2q',
      /c([iey])/ => 's\1',
      'tch' => '2ch',
      /[cqx]/ => 'k',
      'v' => 'f',
      'dg' => '2g',
      /ti([oa])/ => 'si\1',
      'd' => 't',
      'ph' => 'fh',
      'b' => 'p',
      'sh' => 's2',
      'z' => 's',
      /^[aeiou]/ => 'A',
      /[aeiou]/ => '3',
      'j' => 'y',
      /^y3/ => 'Y3',
      /^y/ => 'A',
      /y/ => '3',
      '3gh3' => '3kh3',
      'gh' => '22',
      'g' => 'k',
      /s+/ => 'S',
      /t+/ => 'T',
      /p+/ => 'P',
      /k+/ => 'K',
      /f+/ => 'F',
      /m+/ => 'M',
      /n+/ => 'N',
      'w3' => 'W3',
      'wh3' => 'Wh3',
      /w$/ => '3',
      'w' => '2',
      /^h/ => 'A',
      'h' => '2',
      'r3' => 'R3',
      /r$/ => '3',
      'r' => '2',
      'l3' => 'L3',
      /l$/ => '3',
      'l' => '2',
      '2' => '',
      /3$/ => 'A',
      '3' => ''
    }.freeze

    # Encode word to its Caverphone 2 code.
    #
    # The word is stripped, lower-cased and reduced to the letters a-z, then
    # every rule of MAP is applied in order. The result is padded with '1'
    # and cut to exactly ten characters.
    #
    # @param [String] word the word to encode
    # @param [Hash] options unused by this algorithm
    # @return [String] the ten character code
    def self.encode_word(word, options = {})
      w = word.strip.downcase.gsub(/[^a-z]/, '')
      MAP.each { |pattern, replacement| w.gsub!(pattern, replacement) }
      (w + '1' * 10)[0, 10]
    end
  end
end
@@ -0,0 +1,3 @@
1
# Require every String extension found under phonetic/core_ext/string,
# skipping any path that looks like a spec file.
Gem.find_files('phonetic/core_ext/string/*.rb').each do |path|
  next if path =~ /_spec/

  require path
end
@@ -0,0 +1,11 @@
1
+ require 'phonetic/caverphone'
2
+
3
+ class String
4
+ # Caverphone code of the string; delegates to Phonetic::Caverphone.encode and passes options through unchanged.
5
+ # @example
6
+ # 'Lashaunda'.caverphone # => 'LSNT11'
7
+ # 'Vidaurri'.caverphone # => 'FTR111'
8
+ def caverphone(options = {})
9
+ Phonetic::Caverphone.encode(self, options)
10
+ end
11
+ end
+ end
@@ -0,0 +1,11 @@
1
+ require 'phonetic/caverphone2'
2
+
3
+ class String
4
+ # Caverphone 2 code of the string; delegates to Phonetic::Caverphone2.encode and passes options through unchanged.
5
+ # @example
6
+ # 'Stevenson'.caverphone2 # => 'STFNSN1111'
7
+ # 'Peter'.caverphone2 # => 'PTA1111111'
8
+ def caverphone2(options = {})
9
+ Phonetic::Caverphone2.encode(self, options)
10
+ end
11
+ end
+ end
@@ -0,0 +1,18 @@
1
+ require 'phonetic/double_metaphone'
2
+
3
+ class String
4
+ # Double Metaphone codes of the string as a [primary, secondary] pair; delegates to Phonetic::DoubleMetaphone.encode (options default to { size: 4 }).
5
+ # @example
6
+ # 'czerny'.double_metaphone # => ['SRN', 'XRN']
7
+ # 'dumb'.double_metaphone # => ['TM', 'TM']
8
+ # 'edgar'.double_metaphone # => ['ATKR', 'ATKR']
9
+ # # or use the metaphone2 alias defined below:
10
+ # 'czerny'.metaphone2 # => ['SRN', 'XRN']
11
+ # 'dumb'.metaphone2 # => ['TM', 'TM']
12
+ # 'edgar'.metaphone2 # => ['ATKR', 'ATKR']
13
+ def double_metaphone(options = { size: 4 })
14
+ Phonetic::DoubleMetaphone.encode(self, options)
15
+ end
16
+
17
+ alias_method :metaphone2, :double_metaphone
18
+ end
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/metaphone'
2
+
3
+ class String
4
+ # Metaphone code of the string; delegates to Phonetic::Metaphone.encode (options default to { size: 4 }).
5
+ # @example
6
+ # 'Accola'.metaphone # => 'AKKL'
7
+ # 'Nikki'.metaphone # => 'NK'
8
+ # 'Wright'.metaphone # => 'RT'
9
+ def metaphone(options = { size: 4 })
10
+ Phonetic::Metaphone.encode(self, options)
11
+ end
12
+ end
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/nysiis'
2
+
3
+ class String
4
+ # NYSIIS code of the string; delegates to Phonetic::NYSIIS.encode (options default to { trim: true }).
5
+ # @example
6
+ # 'Alexandra'.nysiis # => 'ALAXANDR'
7
+ # 'Aumont'.nysiis # => 'AANAD'
8
+ # 'Bonnie'.nysiis # => 'BANY'
9
+ def nysiis(options = { trim: true })
10
+ Phonetic::NYSIIS.encode(self, options)
11
+ end
12
+ end
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/refined_soundex'
2
+
3
+ class String
4
+ # Refined Soundex code of the string; delegates to Phonetic::RefinedSoundex.encode (options default to { trim: true }).
5
+ # @example
6
+ # 'Caren'.refined_soundex # => 'C30908'
7
+ # 'Hayers'.refined_soundex # => 'H093'
8
+ # 'Lambard'.refined_soundex # => 'L7081096'
9
+ def refined_soundex(options = { trim: true })
10
+ Phonetic::RefinedSoundex.encode(self, options)
11
+ end
12
+ end
+ end
@@ -0,0 +1,12 @@
1
+ require 'phonetic/soundex'
2
+
3
+ class String
4
+ # Soundex code of the string; delegates to Phonetic::Soundex.encode (options default to { trim: true }).
5
+ # @example
6
+ # 'Ackerman'.soundex # => 'A265'
7
+ # 'ammonium'.soundex # => 'A500'
8
+ # 'implementation'.soundex # => 'I514'
9
+ def soundex(options = { trim: true })
10
+ Phonetic::Soundex.encode(self, options)
11
+ end
12
+ end
+ end
@@ -0,0 +1,640 @@
1
+ # encoding: utf-8
2
+
3
+ require 'phonetic/algorithm'
4
+
5
module Phonetic
  # The Double Metaphone phonetic encoding algorithm is the second generation
  # of the Metaphone algorithm. Its original implementation was described
  # by Lawrence Philips in the June 2000 issue of C/C++ Users Journal.
  #
  # This implementation is based on the PHP implementation by Stephen Woodbridge
  # and contains modifications of the algorithm by Kevin Atkinson.
  # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
  # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
  # @example
  #   Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
  #   Phonetic::DoubleMetaphone.encode('dumb') # => ['TM', 'TM']
  #   Phonetic::DoubleMetaphone.encode('edgar') # => ['ATKR', 'ATKR']
  #   # or use alias:
  #   Phonetic::Metaphone2.encode('czerny') # => ['SRN', 'XRN']
  #   Phonetic::Metaphone2.encode('dumb') # => ['TM', 'TM']
  #   Phonetic::Metaphone2.encode('edgar') # => ['ATKR', 'ATKR']
  class DoubleMetaphone < Algorithm
    VOWELS = 'AEIOUY'

    # Encode word to its Double Metaphone code.
    #
    # Walks the upper-cased word from left to right, appending to a primary and
    # a secondary code until both have reached the requested size or the word
    # is exhausted.
    #
    # @param [String] word the word (or name) to encode
    # @param [Hash] options +:size+ is the maximum length of each code (default 4)
    # @return [Array<String>] the primary and the secondary code, in that order
    def self.encode_word(word, options = { size: 4 })
      code_size = options[:size] || 4
      w = word.strip.upcase
      primary = ''
      secondary = ''
      i = 0
      len = w.size
      last = len - 1
      # pad the original string so that we can index beyond the edge of the world
      w += ' ' * 5
      # skip these when at start of word
      i += 1 if ['GN', 'KN', 'PN', 'WR', 'PS'].include? w[0, 2]
      # initial 'X' is pronounced 'Z' e.g. 'Xavier'
      if w[0] == 'X'
        primary += 'S'
        secondary += 'S'
        i += 1
      end
      # Keep going while EITHER code is still short: the secondary can lag
      # behind the primary (e.g. 'GN' adds 'KN' to one and 'N' to the other).
      while i < len && (primary.size < code_size || secondary.size < code_size)
        case w[i]
        when 'A', 'E', 'I', 'O', 'U', 'Y'
          if i == 0
            # all init vowels now map to 'A'
            primary += 'A'
            secondary += 'A'
          end
          i += 1
        when 'B'
          # "-mb", e.g", "dumb", already skipped over...
          primary += 'P'
          secondary += 'P'
          i += (w[i + 1] == 'B') ? 2 : 1
        when 'Ç', 'ç'
          primary += 'S'
          secondary += 'S'
          i += 1
        when 'C'
          # various germanic
          if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
             (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
            primary += 'K'
            secondary += 'K'
            i += 2
          # special case 'caesar'
          elsif i == 0 && w[i, 6] == 'CAESAR'
            primary += 'S'
            secondary += 'S'
            i += 2
          # italian 'chianti'
          elsif w[i, 4] == 'CHIA'
            primary += 'K'
            secondary += 'K'
            i += 2
          elsif w[i, 2] == 'CH'
            # find 'michael'
            if i > 0 && w[i, 4] == 'CHAE'
              primary += 'K'
              secondary += 'X'
              i += 2
            # greek roots e.g. 'chemistry', 'chorus'
            elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
                  w[0, 5] != 'CHORE'
              primary += 'K'
              secondary += 'K'
              i += 2
            else
              # germanic, greek, or otherwise 'ch' for 'kh' sound
              if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
                 # 'architect but not 'arch', 'orchestra', 'orchid'
                 (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
                 (w[i + 2] =~ /[TS]/) ||
                 ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
                 # e.g., 'wachtler', 'wechsler', but not 'tichner'
                 (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
                primary += 'K'
                secondary += 'K'
              else
                if i > 0
                  if w[0, 2] == 'MC'
                    # e.g., "McHugh"
                    primary += 'K'
                    secondary += 'K'
                  else
                    primary += 'X'
                    secondary += 'K'
                  end
                else
                  primary += 'X'
                  secondary += 'X'
                end
              end
              i += 2
            end
          elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
            # e.g, 'czerny'
            primary += 'S'
            secondary += 'X'
            i += 2
          elsif w[i + 1, 3] == 'CIA'
            # e.g., 'focaccia'
            primary += 'X'
            secondary += 'X'
            i += 3
          # double 'C', but not if e.g. 'McClellan'
          elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
            # 'bellocchio' but not 'bacchus'
            if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
              # 'accident', 'accede' 'succeed'
              if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
                primary += 'KS'
                secondary += 'KS'
              else
                # 'bacci', 'bertucci', other italian
                primary += 'X'
                secondary += 'X'
              end
              i += 3
            else
              # Pierce's rule
              primary += 'K'
              secondary += 'K'
              i += 2
            end
          elsif w[i, 2] =~ /CK|CG|CQ/
            primary += 'K'
            secondary += 'K'
            i += 2
          elsif w[i, 2] =~ /CI|CE|CY/
            # italian vs. english
            if w[i, 3] =~ /CIO|CIE|CIA/
              primary += 'S'
              secondary += 'X'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += 2
          else
            primary += 'K'
            secondary += 'K'
            # name sent in 'mac caffrey', 'mac gregor'
            if w[i + 1, 2] =~ /\s[CQG]/
              i += 3
            else
              if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
                i += 2
              else
                i += 1
              end
            end
          end
        when 'D'
          if w[i, 2] == 'DG'
            if w[i + 2] =~ /[IEY]/
              # e.g. 'edge'
              primary += 'J'
              secondary += 'J'
              i += 3
            else
              # e.g. 'edgar'
              primary += 'TK'
              secondary += 'TK'
              i += 2
            end
          elsif w[i, 2] =~ /DT|DD/
            primary += 'T'
            secondary += 'T'
            i += 2
          else
            primary += 'T'
            secondary += 'T'
            i += 1
          end
        when 'F'
          if w[i + 1] == 'F'
            i += 2
          else
            i += 1
          end
          primary += 'F'
          secondary += 'F'
        when 'G'
          if w[i + 1] == 'H'
            if i > 0 && !vowel?(w[i - 1])
              primary += 'K'
              secondary += 'K'
              i += 2
            elsif i == 0
              # ghislane, ghiradelli
              if w[i + 2] == 'I'
                primary += 'J'
                secondary += 'J'
              else
                primary += 'K'
                secondary += 'K'
              end
              i += 2
            # Parker's rule (with some further refinements) - e.g., 'hugh'
            elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
                  # e.g., 'bough'
                  (i > 2 && w[i - 3] =~ /[BHD]/) ||
                  # e.g., 'broughton'
                  (i > 3 && w[i - 4] =~ /[BH]/)
              i += 2
            else
              # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
              if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
                primary += 'F'
                secondary += 'F'
              else
                if i > 0 && w[i - 1] != 'I'
                  primary += 'K'
                  secondary += 'K'
                end
              end
              i += 2
            end
          elsif w[i + 1] == 'N'
            if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
              primary += 'KN'
              secondary += 'N'
            else
              # not e.g. 'cagney'
              if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
                primary += 'N'
                secondary += 'KN'
              else
                primary += 'KN'
                secondary += 'KN'
              end
            end
            i += 2
          # 'tagliaro'
          elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
            primary += 'KL'
            secondary += 'L'
            i += 2
          # -ges-,-gep-,-gel-, -gie- at beginning
          elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
            primary += 'K'
            secondary += 'J'
            i += 2
          # -ger-, -gy-
          elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
                !(w[0, 6] =~ /[DRM]ANGER/) &&
                !(i > 0 && w[i - 1] =~ /[EI]/) &&
                !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
            primary += 'K'
            secondary += 'J'
            i += 2
          # italian e.g, 'biaggi'
          elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
            if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
              primary += 'K'
              secondary += 'K'
            else
              if w[i + 1, 4] =~ /IER\s/
                primary += 'J'
                secondary += 'J'
              else
                primary += 'J'
                secondary += 'K'
              end
            end
            i += 2
          else
            if w[i + 1] == 'G'
              i += 2
            else
              i += 1
            end
            primary += 'K'
            secondary += 'K'
          end
        when 'H'
          # only keep if first & before vowel or btw. 2 vowels
          if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
            primary += 'H'
            secondary += 'H'
            i += 2
          else # also takes care of 'HH'
            i += 1
          end
        when 'J'
          # obvious spanish, 'jose', 'san jacinto'
          if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
            if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
              primary += 'H'
              secondary += 'H'
            else
              primary += 'J'
              secondary += 'H'
            end
            i += 1
          else
            if i == 0 && w[i, 4] != 'JOSE'
              # Yankelovich/Jankelowicz
              primary += 'J'
              secondary += 'A'
            else
              # spanish pron. of e.g. 'bajador'
              if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
                primary += 'J'
                secondary += 'H'
              else
                if i == last
                  # a final 'J' contributes to the primary code only
                  primary += 'J'
                else
                  if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
                    primary += 'J'
                    secondary += 'J'
                  end
                end
              end
            end
            if w[i + 1] == 'J'
              i += 2
            else
              i += 1
            end
          end
        when 'K'
          if w[i + 1] == 'K'
            i += 2
          else
            i += 1
          end
          primary += 'K'
          secondary += 'K'
        when 'L'
          if w[i + 1] == 'L'
            # spanish e.g. 'cabrillo', 'gallegos'
            if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
               ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
                (i > 0 && w[i - 1, 4] == 'ALLE'))
              # the 'LL' is silent in the secondary code
              primary += 'L'
              i += 2
              next
            end
            i += 2
          else
            i += 1
          end
          primary += 'L'
          secondary += 'L'
        when 'M'
          if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == "ER")) ||
             # 'dumb','thumb'
             w[i + 1] == 'M'
            i += 2
          else
            i += 1
          end
          primary += 'M'
          secondary += 'M'
        when 'N'
          if w[i + 1] == 'N'
            i += 2
          else
            i += 1
          end
          primary += 'N'
          secondary += 'N'
        when 'Ñ', 'ñ'
          i += 1
          primary += 'N'
          secondary += 'N'
        when 'P'
          if w[i + 1] == 'H'
            primary += 'F'
            secondary += 'F'
            i += 2
          else
            # also account for "campbell", "raspberry"
            if w[i + 1] =~ /[PB]/
              i += 2
            else
              i += 1
            end
            primary += 'P'
            secondary += 'P'
          end
        when 'Q'
          if w[i + 1] == 'Q'
            i += 2
          else
            i += 1
          end
          primary += 'K'
          secondary += 'K'
        when 'R'
          # french e.g. 'rogier', but exclude 'hochmeier'
          if i == last && !slavo_germanic?(w) &&
             (i > 1 && w[i - 2, 2] == "IE") &&
             !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
            secondary += 'R'
          else
            primary += 'R'
            secondary += 'R'
          end
          if w[i + 1] == 'R'
            i += 2
          else
            i += 1
          end
        when 'S'
          # special cases 'island', 'isle', 'carlisle', 'carlysle'
          if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
            i += 1
          # special case 'sugar-'
          elsif i == 0 && w[i, 5] == 'SUGAR'
            primary += 'X'
            secondary += 'S'
            i += 1
          elsif w[i, 2] == 'SH'
            # germanic
            if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
              primary += 'S'
              secondary += 'S'
            else
              primary += 'X'
              secondary += 'X'
            end
            i += 2
          # italian & armenian
          elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
            if !slavo_germanic?(w)
              primary += 'S'
              secondary += 'X'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += 3
          # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
          # also, -sz- in slavic language altho in hungarian it is pronounced 's'
          elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
            primary += 'S'
            secondary += 'X'
            if w[i + 1] == 'Z'
              i += 2
            else
              i += 1
            end
          elsif w[i, 2] == 'SC'
            # Schlesinger's rule
            if w[i + 2] == 'H'
              # dutch origin, e.g. 'school', 'schooner'
              if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
                # 'schermerhorn', 'schenker'
                if w[i + 3, 2] =~ /ER|EN/
                  primary += 'X'
                  secondary += 'SK'
                else
                  primary += 'SK'
                  secondary += 'SK'
                end
                i += 3
              else
                if i == 0 && !vowel?(w[3]) && w[3] != 'W'
                  primary += 'X'
                  secondary += 'S'
                else
                  primary += 'X'
                  secondary += 'X'
                end
                i += 3
              end
            elsif w[i + 2, 1] =~ /[IEY]/
              primary += 'S'
              secondary += 'S'
              i += 3
            else
              primary += 'SK'
              secondary += 'SK'
              i += 3
            end
          else
            # french e.g. 'resnais', 'artois'
            if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
              secondary += 'S'
            else
              primary += 'S'
              secondary += 'S'
            end
            if w[i + 1] =~ /[SZ]/
              i += 2
            else
              i += 1
            end
          end
        when 'T'
          if w[i, 4] == 'TION'
            primary += 'X'
            secondary += 'X'
            i += 3
          elsif w[i, 3] =~ /TIA|TCH/
            primary += 'X'
            secondary += 'X'
            i += 3
          elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
            # special case 'thomas', 'thames' or germanic.
            # The VAN/VON alternation is grouped so that only a leading
            # "VAN " / "VON " prefix counts, as in the other germanic checks.
            if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH'
              primary += 'T'
              secondary += 'T'
            else
              primary += '0'
              secondary += 'T'
            end
            i += 2
          else
            if w[i + 1] =~ /[TD]/
              i += 2
            else
              i += 1
            end
            primary += 'T'
            secondary += 'T'
          end
        when 'V'
          if w[i + 1] == 'V'
            i += 2
          else
            i += 1
          end
          primary += 'F'
          secondary += 'F'
        when 'W'
          # can also be in middle of word
          if w[i, 2] == 'WR'
            primary += 'R'
            secondary += 'R'
            i += 2
          else
            if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
              # Wasserman should match Vasserman
              if vowel?(w[i + 1])
                primary += 'A'
                secondary += 'F'
              else
                # need Uomo to match Womo
                primary += 'A'
                secondary += 'A'
              end
            end
            # Arnow should match Arnoff
            if i == last && i > 0 && vowel?(w[i - 1]) ||
               (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
              secondary += 'F'
              i += 1
            elsif w[i, 4] =~ /WICZ|WITZ/
              # polish e.g. 'filipowicz'
              primary += 'TS'
              secondary += 'FX'
              i += 4
            else
              i += 1
            end
          end
        when 'X'
          # french e.g. breaux
          if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
            primary += 'KS'
            secondary += 'KS'
          end
          if w[i + 1] =~ /[CX]/
            i += 2
          else
            i += 1
          end
        when 'Z'
          # chinese pinyin e.g. 'zhao'
          if w[i + 1] == 'H'
            primary += 'J'
            secondary += 'J'
            i += 2
          else
            if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
              primary += 'S'
              secondary += 'TS'
            else
              primary += 'S'
              secondary += 'S'
            end
            if w[i + 1] == 'Z'
              i += 2
            else
              i += 1
            end
          end
        else
          i += 1
        end
      end
      [primary[0, code_size], secondary[0, code_size]]
    end

    # Encode a string to its Double Metaphone codes.
    # Unlike {Algorithm.encode} the string is not split into words; it is
    # passed to {DoubleMetaphone.encode_word} as a whole.
    #
    # @param [String] str the string to encode
    # @param [Hash] options +:size+ is the maximum length of each code (default 4)
    # @return [Array<String>] the primary and the secondary code
    def self.encode(str, options = { size: 4 })
      encode_word(str, options)
    end

    # The helpers below are public class methods: the bare `private` that used
    # to precede them has no effect on `def self.` definitions, so they are
    # left public to keep existing callers working.

    # True when the string contains 'W', 'K', 'CZ' or 'WITZ'.
    def self.slavo_germanic?(str)
      !!(str[/W|K|CZ|WITZ/])
    end

    # True when char (converted with to_s) is a non-empty substring of VOWELS.
    def self.vowel?(char)
      c = VOWELS[char.to_s]
      !c.nil? && !c.empty?
    end
  end
end