phonetic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +98 -0
- data/Rakefile +1 -0
- data/lib/phonetic.rb +10 -0
- data/lib/phonetic/algorithm.rb +24 -0
- data/lib/phonetic/caverphone.rb +68 -0
- data/lib/phonetic/caverphone2.rb +69 -0
- data/lib/phonetic/core_ext/string.rb +3 -0
- data/lib/phonetic/core_ext/string/caverphone.rb +11 -0
- data/lib/phonetic/core_ext/string/caverphone2.rb +11 -0
- data/lib/phonetic/core_ext/string/double_metaphone.rb +18 -0
- data/lib/phonetic/core_ext/string/metaphone.rb +12 -0
- data/lib/phonetic/core_ext/string/nysiis.rb +12 -0
- data/lib/phonetic/core_ext/string/refined_soundex.rb +12 -0
- data/lib/phonetic/core_ext/string/soundex.rb +12 -0
- data/lib/phonetic/double_metaphone.rb +640 -0
- data/lib/phonetic/metaphone.rb +161 -0
- data/lib/phonetic/metaphone2.rb +5 -0
- data/lib/phonetic/nysiis.rb +63 -0
- data/lib/phonetic/refined_soundex.rb +39 -0
- data/lib/phonetic/soundex.rb +39 -0
- data/lib/phonetic/version.rb +3 -0
- data/phonetic.gemspec +26 -0
- data/spec/phonetic/algorithm_spec.rb +15 -0
- data/spec/phonetic/caverphone2_spec.rb +66 -0
- data/spec/phonetic/caverphone_spec.rb +115 -0
- data/spec/phonetic/core_ext/string/caverphone2_spec.rb +9 -0
- data/spec/phonetic/core_ext/string/caverphone_spec.rb +9 -0
- data/spec/phonetic/core_ext/string/double_metaphone_spec.rb +15 -0
- data/spec/phonetic/core_ext/string/metaphone_spec.rb +11 -0
- data/spec/phonetic/core_ext/string/nysiis_spec.rb +12 -0
- data/spec/phonetic/core_ext/string/refined_soundex_spec.rb +10 -0
- data/spec/phonetic/core_ext/string/soundex_spec.rb +14 -0
- data/spec/phonetic/double_metaphone_spec.rb +16 -0
- data/spec/phonetic/metaphone2_spec.rb +9 -0
- data/spec/phonetic/metaphone_spec.rb +81 -0
- data/spec/phonetic/nysiis_spec.rb +20 -0
- data/spec/phonetic/refined_soundex_spec.rb +13 -0
- data/spec/phonetic/soundex_spec.rb +24 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/support/double_metaphone_data.rb +142 -0
- data/spec/support/nysiis_data.rb +31 -0
- metadata +180 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cb22f0be1272e5b72586a964943808d93f99c5c2
|
4
|
+
data.tar.gz: cfab5ebba7adc3823f9c74a92b5877681e73d36d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: da460d1d048d38d39af6970b3c456551217551d3aa1275d8ec83cc86ea6ae5fcf454842ccb0300d93726303ab9009639bd72c894d99e30d9773edd269e0f41c8
|
7
|
+
data.tar.gz: 3b94a722e973f37eccd62841d0cd64be3e0dbce0ab27766cd4384a01e375440334f74cc22b14c21804489a5e87f6052911f3decf5b1a2c1f4f24e7b6459306cc
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 n7v
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
# Phonetic
|
2
|
+
|
3
|
+
Ruby library for phonetic algorithms.
|
4
|
+
It supports Soundex, Metaphone, Double Metaphone, Caverphone, NYSIIS and others.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'phonetic'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
```shell
|
15
|
+
$ bundle
|
16
|
+
```
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
```shell
|
21
|
+
$ gem install phonetic
|
22
|
+
```
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'phonetic'
|
28
|
+
```
|
29
|
+
|
30
|
+
### Soundex
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
'Ackerman'.soundex # => 'A265'
|
34
|
+
'ammonium'.soundex # => 'A500'
|
35
|
+
'implementation'.soundex # => 'I514'
|
36
|
+
```
|
37
|
+
|
38
|
+
### Refined Soundex
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
'Caren'.refined_soundex # => 'C30908'
|
42
|
+
'Hayers'.refined_soundex # => 'H093'
|
43
|
+
'Lambard'.refined_soundex # => 'L7081096'
|
44
|
+
```
|
45
|
+
|
46
|
+
### Metaphone
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
'Accola'.metaphone # => 'AKKL'
|
50
|
+
'Nikki'.metaphone # => 'NK'
|
51
|
+
'Wright'.metaphone # => 'RT'
|
52
|
+
```
|
53
|
+
|
54
|
+
### Double Metaphone
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
'czerny'.double_metaphone # => ['SRN', 'XRN']
|
58
|
+
'dumb'.double_metaphone # => ['TM', 'TM']
|
59
|
+
'edgar'.double_metaphone # => ['ATKR', 'ATKR']
|
60
|
+
```
|
61
|
+
|
62
|
+
Or use the `metaphone2` alias:
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
'czerny'.metaphone2 # => ['SRN', 'XRN']
|
66
|
+
'dumb'.metaphone2 # => ['TM', 'TM']
|
67
|
+
'edgar'.metaphone2 # => ['ATKR', 'ATKR']
|
68
|
+
```
|
69
|
+
|
70
|
+
### Caverphone
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
'Lashaunda'.caverphone # => 'LSNT11'
|
74
|
+
'Vidaurri'.caverphone # => 'FTR111'
|
75
|
+
```
|
76
|
+
|
77
|
+
### Caverphone 2
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
'Stevenson'.caverphone2 # => 'STFNSN1111'
|
81
|
+
'Peter'.caverphone2 # => 'PTA1111111'
|
82
|
+
```
|
83
|
+
|
84
|
+
### NYSIIS
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
'Alexandra'.nysiis # => 'ALAXANDR'
|
88
|
+
'Aumont'.nysiis # => 'AANAD'
|
89
|
+
'Bonnie'.nysiis # => 'BANY'
|
90
|
+
```
|
91
|
+
|
92
|
+
## Contributing
|
93
|
+
|
94
|
+
1. Fork it
|
95
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
96
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
97
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
98
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/lib/phonetic.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'phonetic/version'
|
2
|
+
require 'phonetic/nysiis'
|
3
|
+
require 'phonetic/soundex'
|
4
|
+
require 'phonetic/refined_soundex'
|
5
|
+
require 'phonetic/metaphone'
|
6
|
+
require 'phonetic/double_metaphone'
|
7
|
+
require 'phonetic/metaphone2'
|
8
|
+
require 'phonetic/caverphone'
|
9
|
+
require 'phonetic/caverphone2'
|
10
|
+
require 'phonetic/core_ext/string'
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Phonetic
  # Common parent of the phonetic encoders. Subclasses supply their own
  # {Algorithm.encode_word}; {Algorithm.encode} then applies it word by word.
  class Algorithm
    # Encodes a single word. The base implementation is the identity and is
    # meant to be overridden by concrete algorithms.
    #
    # @param [String] word the word to encode
    # @param [Hash] options algorithm-specific options (unused here)
    # @return [String] the word itself, unchanged
    def self.encode_word(word, options = {})
      word
    end

    # Encodes every word of a string.
    # Words are the maximal runs matched by +\p{Word}+; each one is passed to
    # {Algorithm.encode_word}, and nil or empty codes are dropped.
    #
    # @param [String] str the string to encode
    # @param [Hash] options options forwarded to {Algorithm.encode_word}
    # @return [String] the remaining codes joined by a single space
    def self.encode(str, options = {})
      codes = []
      str.scan(/\p{Word}+/) do |token|
        code = encode_word(token, options)
        codes << code unless code.nil? || code.empty?
      end
      codes.join(' ')
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'phonetic/algorithm'
|
2
|
+
|
3
|
+
module Phonetic
  # Caverphone created by the Caversham Project at the University of Otago.
  # @see http://caversham.otago.ac.nz/files/working/ctp060902.pdf Caverphone: Phonetic Matching algorithm by David Hood (2002)
  # This class implements this algorithm.
  # @example
  #    Phonetic::Caverphone.encode('Charmain') # => 'KMN111'
  #    Phonetic::Caverphone.encode('Ellett') # => 'ALT111'
  #    Phonetic::Caverphone.encode('Siegmund') # => 'SKMNT1'
  class Caverphone < Algorithm
    # Ordered rewrite rules (pattern => replacement). They are applied one
    # after another to the lower-cased word, so the insertion order of this
    # hash is significant. '2' and '3' are temporary placeholders that the
    # last two rules strip out again.
    MAP = {
      /^(cou|rou|tou|enou)gh/ => '\12f',
      /^gn/ => '2n',
      # A trailing "mb" becomes "m2" so that the silent 'b' is not turned
      # into 'p' by the 'b' rule further down.
      /mb$/ => 'm2',
      'cq' => '2q',
      /c([iey])/ => 's\1',
      'tch' => '2ch',
      /[cqx]/ => 'k',
      'v' => 'f',
      'dg' => '2g',
      /ti([oa])/ => 'si\1',
      'd' => 't',
      'ph' => 'fh',
      'b' => 'p',
      'sh' => 's2',
      'z' => 's',
      /^[aeiou]/ => 'A',
      /[aeiou]/ => '3',
      '3gh3' => '3kh3',
      'gh' => '22',
      'g' => 'k',
      /s+/ => 'S',
      /t+/ => 'T',
      /p+/ => 'P',
      /k+/ => 'K',
      /f+/ => 'F',
      /m+/ => 'M',
      /n+/ => 'N',
      'w3' => 'W3',
      /wy/ => 'Wy',
      'wh3' => 'Wh3',
      'why' => 'Why',
      'w' => '2',
      /^h/ => 'A',
      'h' => '2',
      'r3' => 'R3',
      'ry' => 'Ry',
      'r' => '2',
      'l3' => 'L3',
      'ly' => 'Ly',
      'l' => '2',
      'j' => 'y',
      'y3' => 'Y3',
      'y' => '2',
      '2' => '',
      '3' => ''
    }.freeze

    # Encode word to its Caverphone code.
    #
    # @param [String] word the word to encode; characters other than a-z
    #   (after lower-casing) are discarded before the rules run
    # @param [Hash] options accepted for interface compatibility, not used
    # @return [String] a code of exactly six characters, right-padded with '1'
    def self.encode_word(word, options = {})
      w = word.strip.downcase.gsub(/[^a-z]/, '')
      MAP.each { |pattern, replacement| w.gsub!(pattern, replacement) }
      (w + '1' * 6)[0..5]
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'phonetic/algorithm'
|
2
|
+
|
3
|
+
module Phonetic
  # Caverphone 2.0 created by the Caversham Project at the University of Otago.
  # @see http://caversham.otago.ac.nz/files/working/ctp150804.pdf Caverphone Revisited by David Hood (2004)
  # This class implements this algorithm.
  # @example
  #    Phonetic::Caverphone2.encode('Stevenson') # => 'STFNSN1111'
  #    Phonetic::Caverphone2.encode('Peter') # => 'PTA1111111'
  class Caverphone2 < Algorithm
    # Ordered rewrite rules (pattern => replacement). They are applied one
    # after another to the lower-cased word, so the insertion order of this
    # hash is significant. '2' and '3' are temporary placeholders; a final
    # '3' is turned into 'A' and the remaining ones are stripped at the end.
    MAP = {
      /e$/ => '',
      /^(cou|rou|tou|enou|trou)gh/ => '\12f',
      /^gn/ => '2n',
      # A trailing "mb" becomes "m2" so that the silent 'b' is not turned
      # into 'p' by the 'b' rule further down.
      /mb$/ => 'm2',
      'cq' => '2q',
      /c([iey])/ => 's\1',
      'tch' => '2ch',
      /[cqx]/ => 'k',
      'v' => 'f',
      'dg' => '2g',
      /ti([oa])/ => 'si\1',
      'd' => 't',
      'ph' => 'fh',
      'b' => 'p',
      'sh' => 's2',
      'z' => 's',
      /^[aeiou]/ => 'A',
      /[aeiou]/ => '3',
      'j' => 'y',
      /^y3/ => 'Y3',
      /^y/ => 'A',
      /y/ => '3',
      '3gh3' => '3kh3',
      'gh' => '22',
      'g' => 'k',
      /s+/ => 'S',
      /t+/ => 'T',
      /p+/ => 'P',
      /k+/ => 'K',
      /f+/ => 'F',
      /m+/ => 'M',
      /n+/ => 'N',
      'w3' => 'W3',
      'wh3' => 'Wh3',
      /w$/ => '3',
      'w' => '2',
      /^h/ => 'A',
      'h' => '2',
      'r3' => 'R3',
      /r$/ => '3',
      'r' => '2',
      'l3' => 'L3',
      /l$/ => '3',
      'l' => '2',
      '2' => '',
      /3$/ => 'A',
      '3' => ''
    }.freeze

    # Encode word to its Caverphone 2 code.
    #
    # @param [String] word the word to encode; characters other than a-z
    #   (after lower-casing) are discarded before the rules run
    # @param [Hash] options accepted for interface compatibility, not used
    # @return [String] a code of exactly ten characters, right-padded with '1'
    def self.encode_word(word, options = {})
      w = word.strip.downcase.gsub(/[^a-z]/, '')
      MAP.each { |pattern, replacement| w.gsub!(pattern, replacement) }
      (w + '1' * 10)[0..9]
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'phonetic/caverphone'
|
2
|
+
|
3
|
+
class String
  # Returns the Caverphone code of this string by delegating to
  # Phonetic::Caverphone.encode.
  # @param [Hash] options options passed through to the encoder
  # @example
  #    'Lashaunda'.caverphone # => 'LSNT11'
  #    'Vidaurri'.caverphone # => 'FTR111'
  def caverphone(options = {})
    encoder = Phonetic::Caverphone
    encoder.encode(self, options)
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'phonetic/caverphone2'
|
2
|
+
|
3
|
+
class String
  # Returns the Caverphone 2 code of this string by delegating to
  # Phonetic::Caverphone2.encode.
  # @param [Hash] options options passed through to the encoder
  # @example
  #    'Stevenson'.caverphone2 # => 'STFNSN1111'
  #    'Peter'.caverphone2 # => 'PTA1111111'
  def caverphone2(options = {})
    encoder = Phonetic::Caverphone2
    encoder.encode(self, options)
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'phonetic/double_metaphone'
|
2
|
+
|
3
|
+
class String
  # Returns the Double Metaphone code of this string by delegating to
  # Phonetic::DoubleMetaphone.encode. Also available as +metaphone2+.
  # @param [Hash] options options passed through to the encoder
  #   (defaults to +{ size: 4 }+)
  # @example
  #    'czerny'.double_metaphone # => ['SRN', 'XRN']
  #    'dumb'.double_metaphone   # => ['TM', 'TM']
  #    'edgar'.double_metaphone  # => ['ATKR', 'ATKR']
  #    # or use alias:
  #    'czerny'.metaphone2 # => ['SRN', 'XRN']
  #    'dumb'.metaphone2   # => ['TM', 'TM']
  #    'edgar'.metaphone2  # => ['ATKR', 'ATKR']
  def double_metaphone(options = { size: 4 })
    encoder = Phonetic::DoubleMetaphone
    encoder.encode(self, options)
  end

  alias metaphone2 double_metaphone
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/metaphone'
|
2
|
+
|
3
|
+
class String
  # Returns the Metaphone code of this string by delegating to
  # Phonetic::Metaphone.encode.
  # @param [Hash] options options passed through to the encoder
  #   (defaults to +{ size: 4 }+)
  # @example
  #    'Accola'.metaphone # => 'AKKL'
  #    'Nikki'.metaphone # => 'NK'
  #    'Wright'.metaphone # => 'RT'
  def metaphone(options = { size: 4 })
    encoder = Phonetic::Metaphone
    encoder.encode(self, options)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/nysiis'
|
2
|
+
|
3
|
+
class String
  # Returns the NYSIIS code of this string by delegating to
  # Phonetic::NYSIIS.encode.
  # @param [Hash] options options passed through to the encoder
  #   (defaults to +{ trim: true }+)
  # @example
  #    'Alexandra'.nysiis # => 'ALAXANDR'
  #    'Aumont'.nysiis # => 'AANAD'
  #    'Bonnie'.nysiis # => 'BANY'
  def nysiis(options = { trim: true })
    encoder = Phonetic::NYSIIS
    encoder.encode(self, options)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/refined_soundex'
|
2
|
+
|
3
|
+
class String
  # Returns the Refined Soundex code of this string by delegating to
  # Phonetic::RefinedSoundex.encode.
  # @param [Hash] options options passed through to the encoder
  #   (defaults to +{ trim: true }+)
  # @example
  #    'Caren'.refined_soundex # => 'C30908'
  #    'Hayers'.refined_soundex # => 'H093'
  #    'Lambard'.refined_soundex # => 'L7081096'
  def refined_soundex(options = { trim: true })
    encoder = Phonetic::RefinedSoundex
    encoder.encode(self, options)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/soundex'
|
2
|
+
|
3
|
+
class String
  # Returns the Soundex code of this string by delegating to
  # Phonetic::Soundex.encode.
  # @param [Hash] options options passed through to the encoder
  #   (defaults to +{ trim: true }+)
  # @example
  #    'Ackerman'.soundex # => 'A265'
  #    'ammonium'.soundex # => 'A500'
  #    'implementation'.soundex # => 'I514'
  def soundex(options = { trim: true })
    encoder = Phonetic::Soundex
    encoder.encode(self, options)
  end
end
|
@@ -0,0 +1,640 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'phonetic/algorithm'
|
4
|
+
|
5
|
+
module Phonetic
  # The Double Metaphone phonetic encoding algorithm is the second generation
  # of the Metaphone algorithm. Its original implementation was described
  # by Lawrence Philips in the June 2000 issue of C/C++ Users Journal.
  #
  # This implementation is based on the PHP implementation by Stephen Woodbridge
  # and contains modifications of the algorithm by Kevin Atkinson.
  # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
  # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
  # @example
  #    Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
  #    Phonetic::DoubleMetaphone.encode('dumb')   # => ['TM', 'TM']
  #    Phonetic::DoubleMetaphone.encode('edgar')  # => ['ATKR', 'ATKR']
  #    # or use alias:
  #    Phonetic::Metaphone2.encode('czerny') # => ['SRN', 'XRN']
  #    Phonetic::Metaphone2.encode('dumb')   # => ['TM', 'TM']
  #    Phonetic::Metaphone2.encode('edgar')  # => ['ATKR', 'ATKR']
  class DoubleMetaphone < Algorithm
    VOWELS = 'AEIOUY'

    # Encode word to its Double Metaphone code.
    #
    # The word is upper-cased and scanned left to right; each rule appends to
    # a primary and a secondary code and advances the cursor +i+ by the number
    # of characters it consumed.
    #
    # @param [String] word the word to encode
    # @param [Hash] options +:size+ is the maximum length of each code
    #   (4 when the key is missing)
    # @return [Array<String>] the pair [primary, secondary], each truncated
    #   to at most +:size+ characters
    def self.encode_word(word, options = { size: 4 })
      code_size = options[:size] || 4
      w = word.strip.upcase
      primary = ''
      secondary = ''
      i = 0
      len = w.size
      last = len - 1
      # pad the original string so that we can index beyond the edge of the world
      w += ' ' * 5
      # skip these when at start of word
      i += 1 if ['GN', 'KN', 'PN', 'WR', 'PS'].include? w[0, 2]
      # initial 'X' is pronounced 'Z' e.g. 'Xavier'
      if w[0] == 'X'
        primary += 'S'
        secondary += 'S'
        i += 1
      end
      # keep going until BOTH codes are full; the secondary code can lag
      # behind the primary one because some rules only extend one of them
      while i < len && (primary.size < code_size || secondary.size < code_size)
        case w[i]
        when 'A', 'E', 'I', 'O', 'U', 'Y'
          if i == 0
            # all init vowels now map to 'A'
            primary += 'A'
            secondary += 'A'
          end
          i += 1
        when 'B'
          # "-mb", e.g. "dumb", already skipped over...
          primary += 'P'
          secondary += 'P'
          i += (w[i + 1] == 'B') ? 2 : 1
        when 'Ç', 'ç'
          primary += 'S'
          secondary += 'S'
          i += 1
        when 'C'
          # various germanic
          if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
             (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
            primary += 'K'
            secondary += 'K'
            i += 2
          # special case 'caesar'
          elsif i == 0 && w[i, 6] == 'CAESAR'
            primary += 'S'
            secondary += 'S'
            i += 2
          # italian 'chianti'
          elsif w[i, 4] == 'CHIA'
            primary += 'K'
            secondary += 'K'
            i += 2
          elsif w[i, 2] == 'CH'
            # find 'michael'
            if i > 0 && w[i, 4] == 'CHAE'
              primary += 'K'
              secondary += 'X'
              i += 2
            # greek roots e.g. 'chemistry', 'chorus'
            elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
                  w[0, 5] != 'CHORE'
              primary += 'K'
              secondary += 'K'
              i += 2
            else
              # germanic, greek, or otherwise 'ch' for 'kh' sound:
              # - 'architect' but not 'arch'; 'orchestra', 'orchid'
              # - e.g. 'wachtler', 'wechsler', but not 'tichner'
              if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
                 (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
                 (w[i + 2] =~ /[TS]/) ||
                 ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
                 (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
                primary += 'K'
                secondary += 'K'
              else
                if i > 0
                  if w[0, 2] == 'MC'
                    # e.g., "McHugh"
                    primary += 'K'
                    secondary += 'K'
                  else
                    primary += 'X'
                    secondary += 'K'
                  end
                else
                  primary += 'X'
                  secondary += 'X'
                end
              end
              i += 2
            end
          elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
            # e.g, 'czerny'
            primary += 'S'
            secondary += 'X'
            i += 2
          elsif w[i + 1, 3] == 'CIA'
            # e.g., 'focaccia'
            primary += 'X'
            secondary += 'X'
            i += 3
          # double 'C', but not if e.g. 'McClellan'
          elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
            # 'bellocchio' but not 'bacchus'
            if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
              # 'accident', 'accede' 'succeed'
              if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
                primary += 'KS'
                secondary += 'KS'
              else
                # 'bacci', 'bertucci', other italian
                primary += 'X'
                secondary += 'X'
              end
              i += 3
            else
              # Pierce's rule
              primary += 'K'
              secondary += 'K'
              i += 2
            end
          elsif w[i, 2] =~ /CK|CG|CQ/
            primary += 'K'
            secondary += 'K'
            i += 2
          elsif w[i, 2] =~ /CI|CE|CY/
            # italian vs. english
            if w[i, 3] =~ /CIO|CIE|CIA/
              primary += 'S'
              secondary += 'X'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += 2
          else
            primary += 'K'
            secondary += 'K'
            # name sent in 'mac caffrey', 'mac gregor'
            if w[i + 1, 2] =~ /\s[CQG]/
              i += 3
            else
              if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
                i += 2
              else
                i += 1
              end
            end
          end
        when 'D'
          if w[i, 2] == 'DG'
            if w[i + 2] =~ /[IEY]/
              # e.g. 'edge'
              primary += 'J'
              secondary += 'J'
              i += 3
            else
              # e.g. 'edgar'
              primary += 'TK'
              secondary += 'TK'
              i += 2
            end
          elsif w[i, 2] =~ /DT|DD/
            primary += 'T'
            secondary += 'T'
            i += 2
          else
            primary += 'T'
            secondary += 'T'
            i += 1
          end
        when 'F'
          i += (w[i + 1] == 'F') ? 2 : 1
          primary += 'F'
          secondary += 'F'
        when 'G'
          if w[i + 1] == 'H'
            if i > 0 && !vowel?(w[i - 1])
              primary += 'K'
              secondary += 'K'
              i += 2
            elsif i == 0
              # ghislane, ghiradelli
              if w[i + 2] == 'I'
                primary += 'J'
                secondary += 'J'
              else
                primary += 'K'
                secondary += 'K'
              end
              i += 2
            # Parker's rule (with some further refinements) - e.g., 'hugh',
            # 'bough', 'broughton': the 'GH' is silent
            elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
                  (i > 2 && w[i - 3] =~ /[BHD]/) ||
                  (i > 3 && w[i - 4] =~ /[BH]/)
              i += 2
            else
              # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
              if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
                primary += 'F'
                secondary += 'F'
              else
                if i > 0 && w[i - 1] != 'I'
                  primary += 'K'
                  secondary += 'K'
                end
              end
              i += 2
            end
          elsif w[i + 1] == 'N'
            if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
              primary += 'KN'
              secondary += 'N'
            else
              # not e.g. 'cagney'
              if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
                primary += 'N'
                secondary += 'KN'
              else
                primary += 'KN'
                secondary += 'KN'
              end
            end
            i += 2
          # 'tagliaro'
          elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
            primary += 'KL'
            secondary += 'L'
            i += 2
          # -ges-,-gep-,-gel-, -gie- at beginning
          elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
            primary += 'K'
            secondary += 'J'
            i += 2
          # -ger-, -gy-
          elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
                !(w[0, 6] =~ /[DRM]ANGER/) &&
                !(i > 0 && w[i - 1] =~ /[EI]/) &&
                !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
            primary += 'K'
            secondary += 'J'
            i += 2
          # italian e.g, 'biaggi'
          elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
            if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
              primary += 'K'
              secondary += 'K'
            else
              if w[i + 1, 4] =~ /IER\s/
                primary += 'J'
                secondary += 'J'
              else
                primary += 'J'
                secondary += 'K'
              end
            end
            i += 2
          else
            i += (w[i + 1] == 'G') ? 2 : 1
            primary += 'K'
            secondary += 'K'
          end
        when 'H'
          # only keep if first & before vowel or btw. 2 vowels
          if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
            primary += 'H'
            secondary += 'H'
            i += 2
          else # also takes care of 'HH'
            i += 1
          end
        when 'J'
          # obvious spanish, 'jose', 'san jacinto'
          if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
            if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
              primary += 'H'
              secondary += 'H'
            else
              primary += 'J'
              secondary += 'H'
            end
            i += 1
          else
            if i == 0 && w[i, 4] != 'JOSE'
              # Yankelovich/Jankelowicz
              primary += 'J'
              secondary += 'A'
            else
              # spanish pron. of e.g. 'bajador'
              if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
                primary += 'J'
                secondary += 'H'
              else
                if i == last
                  # a final 'J' only contributes to the primary code
                  primary += 'J'
                else
                  if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
                    primary += 'J'
                    secondary += 'J'
                  end
                end
              end
            end
            i += (w[i + 1] == 'J') ? 2 : 1
          end
        when 'K'
          i += (w[i + 1] == 'K') ? 2 : 1
          primary += 'K'
          secondary += 'K'
        when 'L'
          if w[i + 1] == 'L'
            # spanish e.g. 'cabrillo', 'gallegos'
            if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
               ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
                (i > 0 && w[i - 1, 4] == 'ALLE'))
              # the 'LL' is only voiced in the primary code
              primary += 'L'
              i += 2
              next
            end
            i += 2
          else
            i += 1
          end
          primary += 'L'
          secondary += 'L'
        when 'M'
          # 'dumb', 'thumb' (silent 'B' is consumed here) or a doubled 'M'
          if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == 'ER')) ||
             w[i + 1] == 'M'
            i += 2
          else
            i += 1
          end
          primary += 'M'
          secondary += 'M'
        when 'N'
          i += (w[i + 1] == 'N') ? 2 : 1
          primary += 'N'
          secondary += 'N'
        when 'Ñ', 'ñ'
          i += 1
          primary += 'N'
          secondary += 'N'
        when 'P'
          if w[i + 1] == 'H'
            primary += 'F'
            secondary += 'F'
            i += 2
          else
            # also account for "campbell", "raspberry"
            i += (w[i + 1] =~ /[PB]/) ? 2 : 1
            primary += 'P'
            secondary += 'P'
          end
        when 'Q'
          i += (w[i + 1] == 'Q') ? 2 : 1
          primary += 'K'
          secondary += 'K'
        when 'R'
          # french e.g. 'rogier', but exclude 'hochmeier'
          if i == last && !slavo_germanic?(w) &&
             (i > 1 && w[i - 2, 2] == 'IE') &&
             !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
            secondary += 'R'
          else
            primary += 'R'
            secondary += 'R'
          end
          i += (w[i + 1] == 'R') ? 2 : 1
        when 'S'
          # special cases 'island', 'isle', 'carlisle', 'carlysle'
          if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
            i += 1
          # special case 'sugar-'
          elsif i == 0 && w[i, 5] == 'SUGAR'
            primary += 'X'
            secondary += 'S'
            i += 1
          elsif w[i, 2] == 'SH'
            # germanic
            if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
              primary += 'S'
              secondary += 'S'
            else
              primary += 'X'
              secondary += 'X'
            end
            i += 2
          # italian & armenian
          elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
            if !slavo_germanic?(w)
              primary += 'S'
              secondary += 'X'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += 3
          # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
          # also, -sz- in slavic language altho in hungarian it is pronounced 's'
          elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
            primary += 'S'
            secondary += 'X'
            i += (w[i + 1] == 'Z') ? 2 : 1
          elsif w[i, 2] == 'SC'
            # Schlesinger's rule
            if w[i + 2] == 'H'
              # dutch origin, e.g. 'school', 'schooner'
              if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
                # 'schermerhorn', 'schenker'
                if w[i + 3, 2] =~ /ER|EN/
                  primary += 'X'
                  secondary += 'SK'
                else
                  primary += 'SK'
                  secondary += 'SK'
                end
                i += 3
              else
                if i == 0 && !vowel?(w[3]) && w[3] != 'W'
                  primary += 'X'
                  secondary += 'S'
                else
                  primary += 'X'
                  secondary += 'X'
                end
                i += 3
              end
            elsif w[i + 2, 1] =~ /[IEY]/
              primary += 'S'
              secondary += 'S'
              i += 3
            else
              primary += 'SK'
              secondary += 'SK'
              i += 3
            end
          else
            # french e.g. 'resnais', 'artois'
            if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
              secondary += 'S'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += (w[i + 1] =~ /[SZ]/) ? 2 : 1
          end
        when 'T'
          if w[i, 4] == 'TION'
            primary += 'X'
            secondary += 'X'
            i += 3
          elsif w[i, 3] =~ /TIA|TCH/
            primary += 'X'
            secondary += 'X'
            i += 3
          elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
            # special case 'thomas', 'thames' or germanic.
            # The alternation is grouped so that the trailing whitespace is
            # required for both "VAN " and "VON " prefixes.
            if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH'
              primary += 'T'
              secondary += 'T'
            else
              # '0' is the code used for the 'th' sound
              primary += '0'
              secondary += 'T'
            end
            i += 2
          else
            i += (w[i + 1] =~ /[TD]/) ? 2 : 1
            primary += 'T'
            secondary += 'T'
          end
        when 'V'
          i += (w[i + 1] == 'V') ? 2 : 1
          primary += 'F'
          secondary += 'F'
        when 'W'
          # can also be in middle of word
          if w[i, 2] == 'WR'
            primary += 'R'
            secondary += 'R'
            i += 2
          else
            if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
              # Wasserman should match Vasserman
              if vowel?(w[i + 1])
                primary += 'A'
                secondary += 'F'
              else
                # need Uomo to match Womo
                primary += 'A'
                secondary += 'A'
              end
            end
            # Arnow should match Arnoff
            if i == last && i > 0 && vowel?(w[i - 1]) ||
               (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
              secondary += 'F'
              i += 1
            elsif w[i, 4] =~ /WICZ|WITZ/
              # polish e.g. 'filipowicz'
              primary += 'TS'
              secondary += 'FX'
              i += 4
            else
              i += 1
            end
          end
        when 'X'
          # french e.g. breaux
          unless i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/))
            primary += 'KS'
            secondary += 'KS'
          end
          i += (w[i + 1] =~ /[CX]/) ? 2 : 1
        when 'Z'
          # chinese pinyin e.g. 'zhao'
          if w[i + 1] == 'H'
            primary += 'J'
            secondary += 'J'
            i += 2
          else
            if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
              primary += 'S'
              secondary += 'TS'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += (w[i + 1] == 'Z') ? 2 : 1
          end
        else
          # any other character (digits, spaces, punctuation, ...) is skipped
          i += 1
        end
      end
      [primary[0, code_size], secondary[0, code_size]]
    end

    # Encode a string to its Double Metaphone code.
    # Unlike the generic {Algorithm.encode}, the string is not split into
    # words: it is handed to {DoubleMetaphone.encode_word} as a whole.
    #
    # @param [String] str the string to encode
    # @param [Hash] options see {DoubleMetaphone.encode_word}
    # @return [Array<String>] the pair [primary, secondary]
    def self.encode(str, options = { size: 4 })
      encode_word(str, options)
    end

    # Internal helper (a bare `private` would not apply to singleton methods,
    # so this stays callable from outside, as before).
    # @return [Boolean] true when +str+ contains 'W', 'K', 'CZ' or 'WITZ'
    def self.slavo_germanic?(str)
      !!(str[/W|K|CZ|WITZ/])
    end

    # Internal helper.
    # @return [Boolean] true when +char+ (converted with +to_s+) is a
    #   non-empty substring of {VOWELS}; nil and '' yield false
    def self.vowel?(char)
      c = VOWELS[char.to_s]
      !c.nil? && !c.empty?
    end
  end
end
|