phonetic 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +98 -0
- data/Rakefile +1 -0
- data/lib/phonetic.rb +10 -0
- data/lib/phonetic/algorithm.rb +24 -0
- data/lib/phonetic/caverphone.rb +68 -0
- data/lib/phonetic/caverphone2.rb +69 -0
- data/lib/phonetic/core_ext/string.rb +3 -0
- data/lib/phonetic/core_ext/string/caverphone.rb +11 -0
- data/lib/phonetic/core_ext/string/caverphone2.rb +11 -0
- data/lib/phonetic/core_ext/string/double_metaphone.rb +18 -0
- data/lib/phonetic/core_ext/string/metaphone.rb +12 -0
- data/lib/phonetic/core_ext/string/nysiis.rb +12 -0
- data/lib/phonetic/core_ext/string/refined_soundex.rb +12 -0
- data/lib/phonetic/core_ext/string/soundex.rb +12 -0
- data/lib/phonetic/double_metaphone.rb +640 -0
- data/lib/phonetic/metaphone.rb +161 -0
- data/lib/phonetic/metaphone2.rb +5 -0
- data/lib/phonetic/nysiis.rb +63 -0
- data/lib/phonetic/refined_soundex.rb +39 -0
- data/lib/phonetic/soundex.rb +39 -0
- data/lib/phonetic/version.rb +3 -0
- data/phonetic.gemspec +26 -0
- data/spec/phonetic/algorithm_spec.rb +15 -0
- data/spec/phonetic/caverphone2_spec.rb +66 -0
- data/spec/phonetic/caverphone_spec.rb +115 -0
- data/spec/phonetic/core_ext/string/caverphone2_spec.rb +9 -0
- data/spec/phonetic/core_ext/string/caverphone_spec.rb +9 -0
- data/spec/phonetic/core_ext/string/double_metaphone_spec.rb +15 -0
- data/spec/phonetic/core_ext/string/metaphone_spec.rb +11 -0
- data/spec/phonetic/core_ext/string/nysiis_spec.rb +12 -0
- data/spec/phonetic/core_ext/string/refined_soundex_spec.rb +10 -0
- data/spec/phonetic/core_ext/string/soundex_spec.rb +14 -0
- data/spec/phonetic/double_metaphone_spec.rb +16 -0
- data/spec/phonetic/metaphone2_spec.rb +9 -0
- data/spec/phonetic/metaphone_spec.rb +81 -0
- data/spec/phonetic/nysiis_spec.rb +20 -0
- data/spec/phonetic/refined_soundex_spec.rb +13 -0
- data/spec/phonetic/soundex_spec.rb +24 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/support/double_metaphone_data.rb +142 -0
- data/spec/support/nysiis_data.rb +31 -0
- metadata +180 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cb22f0be1272e5b72586a964943808d93f99c5c2
|
4
|
+
data.tar.gz: cfab5ebba7adc3823f9c74a92b5877681e73d36d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: da460d1d048d38d39af6970b3c456551217551d3aa1275d8ec83cc86ea6ae5fcf454842ccb0300d93726303ab9009639bd72c894d99e30d9773edd269e0f41c8
|
7
|
+
data.tar.gz: 3b94a722e973f37eccd62841d0cd64be3e0dbce0ab27766cd4384a01e375440334f74cc22b14c21804489a5e87f6052911f3decf5b1a2c1f4f24e7b6459306cc
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 n7v
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
# Phonetic
|
2
|
+
|
3
|
+
Ruby library for phonetic algorithms.
|
4
|
+
It supports Soundex, Metaphone, Double Metaphone, Caverphone, NYSIIS and others.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'phonetic'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
```shell
|
15
|
+
$ bundle
|
16
|
+
```
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
```shell
|
21
|
+
$ gem install phonetic
|
22
|
+
```
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'phonetic'
|
28
|
+
```
|
29
|
+
|
30
|
+
### Soundex
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
'Ackerman'.soundex # => 'A265'
|
34
|
+
'ammonium'.soundex # => 'A500'
|
35
|
+
'implementation'.soundex # => 'I514'
|
36
|
+
```
|
37
|
+
|
38
|
+
### Refined Soundex
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
'Caren'.refined_soundex # => 'C30908'
|
42
|
+
'Hayers'.refined_soundex # => 'H093'
|
43
|
+
'Lambard'.refined_soundex # => 'L7081096'
|
44
|
+
```
|
45
|
+
|
46
|
+
### Metaphone
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
'Accola'.metaphone # => 'AKKL'
|
50
|
+
'Nikki'.metaphone # => 'NK'
|
51
|
+
'Wright'.metaphone # => 'RT'
|
52
|
+
```
|
53
|
+
|
54
|
+
### Double Metaphone
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
'czerny'.double_metaphone # => ['SRN', 'XRN']
|
58
|
+
'dumb'.double_metaphone # => ['TM', 'TM']
|
59
|
+
'edgar'.double_metaphone # => ['ATKR', 'ATKR']
|
60
|
+
```
|
61
|
+
|
62
|
+
or use alias:
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
'czerny'.metaphone2 # => ['SRN', 'XRN']
|
66
|
+
'dumb'.metaphone2 # => ['TM', 'TM']
|
67
|
+
'edgar'.metaphone2 # => ['ATKR', 'ATKR']
|
68
|
+
```
|
69
|
+
|
70
|
+
### Caverphone
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
'Lashaunda'.caverphone # => 'LSNT11'
|
74
|
+
'Vidaurri'.caverphone # => 'FTR111'
|
75
|
+
```
|
76
|
+
|
77
|
+
### Caverphone 2
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
'Stevenson'.caverphone2 # => 'STFNSN1111'
|
81
|
+
'Peter'.caverphone2 # => 'PTA1111111'
|
82
|
+
```
|
83
|
+
|
84
|
+
### NYSIIS
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
'Alexandra'.nysiis # => 'ALAXANDR'
|
88
|
+
'Aumont'.nysiis # => 'AANAD'
|
89
|
+
'Bonnie'.nysiis # => 'BANY'
|
90
|
+
```
|
91
|
+
|
92
|
+
## Contributing
|
93
|
+
|
94
|
+
1. Fork it
|
95
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
96
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
97
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
98
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/lib/phonetic.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'phonetic/version'
|
2
|
+
require 'phonetic/nysiis'
|
3
|
+
require 'phonetic/soundex'
|
4
|
+
require 'phonetic/refined_soundex'
|
5
|
+
require 'phonetic/metaphone'
|
6
|
+
require 'phonetic/double_metaphone'
|
7
|
+
require 'phonetic/metaphone2'
|
8
|
+
require 'phonetic/caverphone'
|
9
|
+
require 'phonetic/caverphone2'
|
10
|
+
require 'phonetic/core_ext/string'
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Phonetic
  # Common ancestor of the phonetic algorithm classes.
  #
  # Subclasses override {Algorithm.encode_word}; {Algorithm.encode} then
  # applies it to every word of a string.
  class Algorithm
    # Encodes a single word. This base implementation returns the word
    # unchanged and ignores the options.
    #
    # @param [String] word the word to encode
    # @param [Hash] options the options for the algorithm
    # @return [String] the word
    def self.encode_word(word, options = {})
      word
    end

    # Encodes a whole string word by word.
    #
    # Words are the runs of +\p{Word}+ characters found in the string; each
    # one is passed to {Algorithm.encode_word}. Codes that are +nil+ or empty
    # are dropped and the rest are joined with single spaces.
    #
    # @param [String] str the string to encode.
    # @param [Hash] options the options for algorithm.
    # @return [String] the space separated codes of words from input string.
    def self.encode(str, options = {})
      codes = str.scan(/\p{Word}+/).map { |word| encode_word(word, options) }
      codes.compact.reject(&:empty?).join(' ')
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'phonetic/algorithm'
|
2
|
+
|
3
|
+
module Phonetic
  # Caverphone created by the Caversham Project at the University of Otago.
  # @see http://caversham.otago.ac.nz/files/working/ctp060902.pdf Caverphone: Phonetic Matching algorithm by David Hood (2002)
  # This class implements this algorithm.
  # @example
  #    Phonetic::Caverphone.encode('Charmain') # => 'KMN111'
  #    Phonetic::Caverphone.encode('Ellett') # => 'ALT111'
  #    Phonetic::Caverphone.encode('Siegmund') # => 'SKMNT1'
  class Caverphone < Algorithm
    # Ordered substitution rules (pattern => replacement). They are applied
    # one after another with +gsub!+, so the insertion order of this hash is
    # significant. '2' and '3' are temporary placeholders that the last two
    # rules strip from the result.
    MAP = {
      /^(cou|rou|tou|enou)gh/ => '\12f',
      /^gn/ => '2n',
      # A trailing "mb" becomes "m2": the placeholder hides the 'b' from the
      # later b => p rule, so e.g. "thumb" encodes as TM1111 rather than TMP111.
      /mb$/ => 'm2',
      'cq' => '2q',
      /c([iey])/ => 's\1',
      'tch' => '2ch',
      /[cqx]/ => 'k',
      'v' => 'f',
      'dg' => '2g',
      /ti([oa])/ => 'si\1',
      'd' => 't',
      'ph' => 'fh',
      'b' => 'p',
      'sh' => 's2',
      'z' => 's',
      /^[aeiou]/ => 'A',
      /[aeiou]/ => '3',
      '3gh3' => '3kh3',
      'gh' => '22',
      'g' => 'k',
      /s+/ => 'S',
      /t+/ => 'T',
      /p+/ => 'P',
      /k+/ => 'K',
      /f+/ => 'F',
      /m+/ => 'M',
      /n+/ => 'N',
      'w3' => 'W3',
      /wy/ => 'Wy',
      'wh3' => 'Wh3',
      'why' => 'Why',
      'w' => '2',
      /^h/ => 'A',
      'h' => '2',
      'r3' => 'R3',
      'ry' => 'Ry',
      'r' => '2',
      'l3' => 'L3',
      'ly' => 'Ly',
      'l' => '2',
      'j' => 'y',
      'y3' => 'Y3',
      'y' => '2',
      '2' => '',
      '3' => ''
    }

    # Encode word to its Caverphone code.
    #
    # The word is stripped, downcased and reduced to the letters a-z before
    # the rules in {MAP} are applied. The result is padded with '1' and cut
    # to exactly six characters.
    #
    # @param [String] word the word to encode
    # @param [Hash] options unused by this algorithm
    # @return [String] the six character Caverphone code
    def self.encode_word(word, options = {})
      w = word.strip.downcase.gsub(/[^a-z]/, '')
      MAP.each { |r, v| w.gsub!(r, v) }
      (w + '1' * 6)[0..5]
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'phonetic/algorithm'
|
2
|
+
|
3
|
+
module Phonetic
  # Caverphone 2.0 created by the Caversham Project at the University of Otago.
  # @see http://caversham.otago.ac.nz/files/working/ctp150804.pdf Caverphone Revisited by David Hood (2004)
  # This class implements this algorithm.
  # @example
  #    Phonetic::Caverphone2.encode('Stevenson') # => 'STFNSN1111'
  #    Phonetic::Caverphone2.encode('Peter') # => 'PTA1111111'
  class Caverphone2 < Algorithm
    # Ordered substitution rules (pattern => replacement). They are applied
    # one after another with +gsub!+, so the insertion order of this hash is
    # significant. '2' and '3' are temporary placeholders; '2' is removed,
    # a final '3' becomes 'A' and the remaining '3's are removed.
    MAP = {
      /e$/ => '',
      /^(cou|rou|tou|enou|trou)gh/ => '\12f',
      /^gn/ => '2n',
      # A trailing "mb" becomes "m2": the placeholder hides the 'b' from the
      # later b => p rule, so the silent 'b' does not end up in the code.
      /mb$/ => 'm2',
      'cq' => '2q',
      /c([iey])/ => 's\1',
      'tch' => '2ch',
      /[cqx]/ => 'k',
      'v' => 'f',
      'dg' => '2g',
      /ti([oa])/ => 'si\1',
      'd' => 't',
      'ph' => 'fh',
      'b' => 'p',
      'sh' => 's2',
      'z' => 's',
      /^[aeiou]/ => 'A',
      /[aeiou]/ => '3',
      'j' => 'y',
      /^y3/ => 'Y3',
      /^y/ => 'A',
      /y/ => '3',
      '3gh3' => '3kh3',
      'gh' => '22',
      'g' => 'k',
      /s+/ => 'S',
      /t+/ => 'T',
      /p+/ => 'P',
      /k+/ => 'K',
      /f+/ => 'F',
      /m+/ => 'M',
      /n+/ => 'N',
      'w3' => 'W3',
      'wh3' => 'Wh3',
      /w$/ => '3',
      'w' => '2',
      /^h/ => 'A',
      'h' => '2',
      'r3' => 'R3',
      /r$/ => '3',
      'r' => '2',
      'l3' => 'L3',
      /l$/ => '3',
      'l' => '2',
      '2' => '',
      /3$/ => 'A',
      '3' => ''
    }

    # Encode word to its Caverphone 2 code.
    #
    # The word is stripped, downcased and reduced to the letters a-z before
    # the rules in {MAP} are applied. The result is padded with '1' and cut
    # to exactly ten characters.
    #
    # @param [String] word the word to encode
    # @param [Hash] options unused by this algorithm
    # @return [String] the ten character Caverphone 2 code
    def self.encode_word(word, options = {})
      w = word.strip.downcase.gsub(/[^a-z]/, '')
      MAP.each { |r, v| w.gsub!(r, v) }
      (w + '1' * 10)[0..9]
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'phonetic/caverphone'
|
2
|
+
|
3
|
+
class String
  # Returns the Caverphone code of this string by delegating to
  # Phonetic::Caverphone.encode.
  #
  # @param [Hash] options passed through to the encoder
  # @example
  #    'Lashaunda'.caverphone # => 'LSNT11'
  #    'Vidaurri'.caverphone # => 'FTR111'
  def caverphone(options = {})
    Phonetic::Caverphone.encode(self, options)
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'phonetic/caverphone2'
|
2
|
+
|
3
|
+
class String
  # Returns the Caverphone 2 code of this string by delegating to
  # Phonetic::Caverphone2.encode.
  #
  # @param [Hash] options passed through to the encoder
  # @example
  #    'Stevenson'.caverphone2 # => 'STFNSN1111'
  #    'Peter'.caverphone2 # => 'PTA1111111'
  def caverphone2(options = {})
    Phonetic::Caverphone2.encode(self, options)
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'phonetic/double_metaphone'
|
2
|
+
|
3
|
+
class String
  # Returns the Double Metaphone codes of this string by delegating to
  # Phonetic::DoubleMetaphone.encode. Also available as +metaphone2+.
  #
  # @param [Hash] options passed through to the encoder
  #   (defaults to +{ size: 4 }+)
  # @example
  #    'czerny'.double_metaphone # => ['SRN', 'XRN']
  #    'dumb'.double_metaphone   # => ['TM', 'TM']
  #    'edgar'.double_metaphone  # => ['ATKR', 'ATKR']
  #    # or use alias:
  #    'czerny'.metaphone2 # => ['SRN', 'XRN']
  #    'dumb'.metaphone2   # => ['TM', 'TM']
  #    'edgar'.metaphone2  # => ['ATKR', 'ATKR']
  def double_metaphone(options = { size: 4 })
    Phonetic::DoubleMetaphone.encode(self, options)
  end

  alias_method :metaphone2, :double_metaphone
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/metaphone'
|
2
|
+
|
3
|
+
class String
  # Returns the Metaphone code of this string by delegating to
  # Phonetic::Metaphone.encode.
  #
  # @param [Hash] options passed through to the encoder
  #   (defaults to +{ size: 4 }+)
  # @example
  #    'Accola'.metaphone # => 'AKKL'
  #    'Nikki'.metaphone # => 'NK'
  #    'Wright'.metaphone # => 'RT'
  def metaphone(options = { size: 4 })
    Phonetic::Metaphone.encode(self, options)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/nysiis'
|
2
|
+
|
3
|
+
class String
  # NYSIIS value of string, obtained by delegating to
  # Phonetic::NYSIIS.encode.
  #
  # @param [Hash] options passed through to the encoder
  #   (defaults to +{ trim: true }+)
  # @example
  #    'Alexandra'.nysiis # => 'ALAXANDR'
  #    'Aumont'.nysiis # => 'AANAD'
  #    'Bonnie'.nysiis # => 'BANY'
  def nysiis(options = { trim: true })
    Phonetic::NYSIIS.encode(self, options)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/refined_soundex'
|
2
|
+
|
3
|
+
class String
  # Returns the Refined Soundex code of this string by delegating to
  # Phonetic::RefinedSoundex.encode.
  #
  # @param [Hash] options passed through to the encoder
  #   (defaults to +{ trim: true }+)
  # @example
  #    'Caren'.refined_soundex # => 'C30908'
  #    'Hayers'.refined_soundex # => 'H093'
  #    'Lambard'.refined_soundex # => 'L7081096'
  def refined_soundex(options = { trim: true })
    Phonetic::RefinedSoundex.encode(self, options)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'phonetic/soundex'
|
2
|
+
|
3
|
+
class String
  # Returns the Soundex code of this string by delegating to
  # Phonetic::Soundex.encode.
  #
  # @param [Hash] options passed through to the encoder
  #   (defaults to +{ trim: true }+)
  # @example
  #    'Ackerman'.soundex # => 'A265'
  #    'ammonium'.soundex # => 'A500'
  #    'implementation'.soundex # => 'I514'
  def soundex(options = { trim: true })
    Phonetic::Soundex.encode(self, options)
  end
end
|
@@ -0,0 +1,640 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'phonetic/algorithm'
|
4
|
+
|
5
|
+
module Phonetic
  # The Double Metaphone phonetic encoding algorithm is the second generation
  # of the Metaphone algorithm. Its original implementation was described
  # by Lawrence Philips in the June 2000 issue of C/C++ Users Journal.
  #
  # This implementation is based on the PHP implementation by Stephen Woodbridge
  # and contains modifications of the algorithm by Kevin Atkinson.
  # @see http://swoodbridge.com/DoubleMetaPhone/ PHP implementation by Stephen Woodbridge
  # @see http://aspell.net/metaphone/dmetaph.cpp C++ implementation with modifications by Kevin Atkinson
  # @example
  #    Phonetic::DoubleMetaphone.encode('czerny') # => ['SRN', 'XRN']
  #    Phonetic::DoubleMetaphone.encode('dumb')   # => ['TM', 'TM']
  #    Phonetic::DoubleMetaphone.encode('edgar')  # => ['ATKR', 'ATKR']
  #    # or use alias:
  #    Phonetic::Metaphone2.encode('czerny') # => ['SRN', 'XRN']
  #    Phonetic::Metaphone2.encode('dumb')   # => ['TM', 'TM']
  #    Phonetic::Metaphone2.encode('edgar')  # => ['ATKR', 'ATKR']
  class DoubleMetaphone < Algorithm
    # Letters treated as vowels by {DoubleMetaphone.vowel?}.
    VOWELS = 'AEIOUY'

    # Encode word to its Double Metaphone code.
    #
    # The word is stripped and upcased, then scanned left to right with the
    # index +i+. Each branch appends to the primary and/or secondary code and
    # advances +i+ past the letters it consumed. Scanning stops at the end of
    # the word or once BOTH codes have reached the requested size.
    #
    # @param [String] word the word to encode
    # @param [Hash] options the options for the algorithm
    # @option options [Integer] :size (4) maximum length of each code
    # @return [Array<String>] two element array: [primary, secondary]
    def self.encode_word(word, options = { size: 4 })
      code_size = options[:size] || 4
      w = word.strip.upcase
      primary = ''
      secondary = ''
      i = 0
      len = w.size
      last = len - 1
      # pad the original string so that we can index beyond the edge of the world
      w += ' ' * 5
      # skip these when at start of word
      i += 1 if ['GN', 'KN', 'PN', 'WR', 'PS'].include? w[0, 2]
      # initial 'X' is pronounced 'Z' e.g. 'Xavier'
      if w[0] == 'X'
        primary += 'S'
        secondary += 'S'
        i += 1
      end
      # Keep scanning while either code is still shorter than requested; the
      # two codes can grow at different rates (e.g. 'KL' vs 'L' for '-gli-').
      while i < len && (primary.size < code_size || secondary.size < code_size)
        case w[i]
        when 'A', 'E', 'I', 'O', 'U', 'Y'
          if i == 0
            # all init vowels now map to 'A'
            primary += 'A'
            secondary += 'A'
          end
          i += 1
        when 'B'
          # "-mb", e.g", "dumb", already skipped over...
          primary += 'P'
          secondary += 'P'
          i += (w[i + 1] == 'B') ? 2 : 1
        when 'Ç', 'ç'
          primary += 'S'
          secondary += 'S'
          i += 1
        when 'C'
          # various germanic
          if i > 1 && !vowel?(w[i - 2]) && w[i - 1, 3] == 'ACH' &&
             (w[i + 2] != 'I' && (w[i + 2] != 'E' || w[i - 2, 6] =~ /[BM]ACHER/))
            primary += 'K'
            secondary += 'K'
            i += 2
          # special case 'caesar'
          elsif i == 0 && w[i, 6] == 'CAESAR'
            primary += 'S'
            secondary += 'S'
            i += 2
          # italian 'chianti'
          elsif w[i, 4] == 'CHIA'
            primary += 'K'
            secondary += 'K'
            i += 2
          elsif w[i, 2] == 'CH'
            # find 'michael'
            if i > 0 && w[i, 4] == 'CHAE'
              primary += 'K'
              secondary += 'X'
              i += 2
            # greek roots e.g. 'chemistry', 'chorus'
            elsif i == 0 && (w[i + 1, 5] =~ /HARAC|HARIS/ || w[i + 1, 3] =~ /HOR|HYM|HIA|HEM/) &&
                  w[0, 5] != 'CHORE'
              primary += 'K'
              secondary += 'K'
              i += 2
            else
              # germanic, greek, or otherwise 'ch' for 'kh' sound
              if (w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH') ||
                 # 'architect but not 'arch', 'orchestra', 'orchid'
                 (i > 1 && w[i - 2, 6] =~ /ORCHES|ARCHIT|ORCHID/) ||
                 (w[i + 2] =~ /[TS]/) ||
                 ((i > 0 && w[i - 1] =~ /[AOUE]/) || i == 0) &&
                 # e.g., 'wachtler', 'wechsler', but not 'tichner'
                 (w[i + 2] =~ /[LRNMBHFVW ]/ || i + 2 >= len)
                primary += 'K'
                secondary += 'K'
              else
                if i > 0
                  if w[0, 2] == 'MC'
                    # e.g., "McHugh"
                    primary += 'K'
                    secondary += 'K'
                  else
                    primary += 'X'
                    secondary += 'K'
                  end
                else
                  primary += 'X'
                  secondary += 'X'
                end
              end
              i += 2
            end
          elsif w[i, 2] == 'CZ' && !(i > 1 && w[i - 2, 4] == 'WICZ')
            # e.g, 'czerny'
            primary += 'S'
            secondary += 'X'
            i += 2
          elsif w[i + 1, 3] == 'CIA'
            # e.g., 'focaccia'
            primary += 'X'
            secondary += 'X'
            i += 3
          # double 'C', but not if e.g. 'McClellan'
          elsif w[i, 2] == 'CC' && !(i == 1 && w[0] == 'M')
            # 'bellocchio' but not 'bacchus'
            if w[i + 2, 1] =~ /[IEH]/ && w[i + 2, 2] != 'HU'
              # 'accident', 'accede' 'succeed'
              if i == 1 && w[i - 1] == 'A' || w[i - 1, 5] =~ /UCCEE|UCCES/
                primary += 'KS'
                secondary += 'KS'
              else
                # 'bacci', 'bertucci', other italian
                primary += 'X'
                secondary += 'X'
              end
              i += 3
            else
              # Pierce's rule
              primary += 'K'
              secondary += 'K'
              i += 2
            end
          elsif w[i, 2] =~ /CK|CG|CQ/
            primary += 'K'
            secondary += 'K'
            i += 2
          elsif w[i, 2] =~ /CI|CE|CY/
            # italian vs. english
            if w[i, 3] =~ /CIO|CIE|CIA/
              primary += 'S'
              secondary += 'X'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += 2
          else
            primary += 'K'
            secondary += 'K'
            # name sent in 'mac caffrey', 'mac gregor'
            if w[i + 1, 2] =~ /\s[CQG]/
              i += 3
            else
              if w[i + 1] =~ /[CKQ]/ && !(w[i + 1, 2] =~ /CE|CI/)
                i += 2
              else
                i += 1
              end
            end
          end
        when 'D'
          if w[i, 2] == 'DG'
            if w[i + 2] =~ /[IEY]/
              # e.g. 'edge'
              primary += 'J'
              secondary += 'J'
              i += 3
            else
              # e.g. 'edgar'
              primary += 'TK'
              secondary += 'TK'
              i += 2
            end
          elsif w[i, 2] =~ /DT|DD/
            primary += 'T'
            secondary += 'T'
            i += 2
          else
            primary += 'T'
            secondary += 'T'
            i += 1
          end
        when 'F'
          if w[i + 1] == 'F'
            i += 2
          else
            i += 1
          end
          primary += 'F'
          secondary += 'F'
        when 'G'
          if w[i + 1] == 'H'
            if i > 0 && !vowel?(w[i - 1])
              primary += 'K'
              secondary += 'K'
              i += 2
            elsif i == 0
              # ghislane, ghiradelli
              if w[i + 2] == 'I'
                primary += 'J'
                secondary += 'J'
              else
                primary += 'K'
                secondary += 'K'
              end
              i += 2
            # Parker's rule (with some further refinements) - e.g., 'hugh'
            elsif (i > 1 && w[i - 2] =~ /[BHD]/) ||
                  # e.g., 'bough'
                  (i > 2 && w[i - 3] =~ /[BHD]/) ||
                  # e.g., 'broughton'
                  (i > 3 && w[i - 4] =~ /[BH]/)
              i += 2
            else
              # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
              if i > 2 && w[i - 1] == 'U' && w[i - 3] =~ /[CGLRT]/
                primary += 'F'
                secondary += 'F'
              else
                if i > 0 && w[i - 1] != 'I'
                  primary += 'K'
                  secondary += 'K'
                end
              end
              i += 2
            end
          elsif w[i + 1] == 'N'
            if i == 1 && vowel?(w[0]) && !slavo_germanic?(w)
              primary += 'KN'
              secondary += 'N'
            else
              # not e.g. 'cagney'
              if w[i + 2, 2] != 'EY' && w[i + 1] != 'Y' && !slavo_germanic?(w)
                primary += 'N'
                secondary += 'KN'
              else
                primary += 'KN'
                secondary += 'KN'
              end
            end
            i += 2
          # 'tagliaro'
          elsif w[i + 1, 2] == 'LI' && !slavo_germanic?(w)
            primary += 'KL'
            secondary += 'L'
            i += 2
          # -ges-,-gep-,-gel-, -gie- at beginning
          elsif i == 0 && (w[i + 1] == 'Y' || w[i + 1, 2] =~ /ES|EP|EB|EL|EY|IB|IL|IN|IE|EI|ER/)
            primary += 'K'
            secondary += 'J'
            i += 2
          # -ger-,  -gy-
          elsif (w[i + 1, 2] == 'ER' || w[i + 1] == 'Y') &&
                !(w[0, 6] =~ /[DRM]ANGER/) &&
                !(i > 0 && w[i - 1] =~ /[EI]/) &&
                !(i > 0 && w[i - 1, 3] =~ /RGY|OGY/)
            primary += 'K'
            secondary += 'J'
            i += 2
          # italian e.g, 'biaggi'
          elsif w[i + 1] =~ /[EIY]/ || (i > 0 && w[i - 1, 4] =~ /[AO]GGI/)
            if w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH' || w[i + 1, 2] == 'ET'
              primary += 'K'
              secondary += 'K'
            else
              if w[i + 1, 4] =~ /IER\s/
                primary += 'J'
                secondary += 'J'
              else
                primary += 'J'
                secondary += 'K'
              end
            end
            i += 2
          else
            if w[i + 1] == 'G'
              i += 2
            else
              i += 1
            end
            primary += 'K'
            secondary += 'K'
          end
        when 'H'
          # only keep if first & before vowel or btw. 2 vowels
          if (i == 0 || (i > 0 && vowel?(w[i - 1]))) && vowel?(w[i + 1])
            primary += 'H'
            secondary += 'H'
            i += 2
          else # also takes care of 'HH'
            i += 1
          end
        when 'J'
          # obvious spanish, 'jose', 'san jacinto'
          if w[i, 4] == 'JOSE' || w[0, 4] =~ /SAN\s/
            if i == 0 && w[i + 4] == ' ' || w[0, 4] =~ /SAN\s/
              primary += 'H'
              secondary += 'H'
            else
              primary += 'J'
              secondary += 'H'
            end
            i += 1
          else
            if i == 0 && w[i, 4] != 'JOSE'
              # Yankelovich/Jankelowicz
              primary += 'J'
              secondary += 'A'
            else
              # spanish pron. of e.g. 'bajador'
              if i > 0 && vowel?(w[i - 1]) && !slavo_germanic?(w) && (w[i + 1] == 'A' || w[i + 1] == 'O')
                primary += 'J'
                secondary += 'H'
              else
                if i == last
                  # a final 'J' contributes to the primary code only
                  primary += 'J'
                else
                  if !(w[i + 1] =~ /[LTKSNMBZ]/) && !(i > 0 && w[i - 1] =~ /[SKL]/)
                    primary += 'J'
                    secondary += 'J'
                  end
                end
              end
            end
            if w[i + 1] == 'J'
              i += 2
            else
              i += 1
            end
          end
        when 'K'
          if w[i + 1] == 'K'
            i += 2
          else
            i += 1
          end
          primary += 'K'
          secondary += 'K'
        when 'L'
          if w[i + 1] == 'L'
            # spanish e.g. 'cabrillo', 'gallegos'
            if (i == len - 3 && i > 0 && w[i - 1, 4] =~ /ILLO|ILLA|ALLE/) ||
               ((last > 0 && w[last - 1, 2] =~ /AS|OS/ || w[last] =~ /[AO]/) &&
               (i > 0 && w[i - 1, 4] == 'ALLE'))
              # the 'LL' contributes to the primary code only
              primary += 'L'
              i += 2
              next
            end
            i += 2
          else
            i += 1
          end
          primary += 'L'
          secondary += 'L'
        when 'M'
          if (i > 0 && w[i - 1, 3] == 'UMB' && (i + 1 == last || w[i + 2, 2] == "ER")) ||
             # 'dumb','thumb'
             w[i + 1] == 'M'
            i += 2
          else
            i += 1
          end
          primary += 'M'
          secondary += 'M'
        when 'N'
          if w[i + 1] == 'N'
            i += 2
          else
            i += 1
          end
          primary += 'N'
          secondary += 'N'
        when 'Ñ', 'ñ'
          i += 1
          primary += 'N'
          secondary += 'N'
        when 'P'
          if w[i + 1] == 'H'
            primary += 'F'
            secondary += 'F'
            i += 2
          else
            # also account for "campbell", "raspberry"
            if w[i + 1] =~ /[PB]/
              i += 2
            else
              i += 1
            end
            primary += 'P'
            secondary += 'P'
          end
        when 'Q'
          if w[i + 1] == 'Q'
            i += 2
          else
            i += 1
          end
          primary += 'K'
          secondary += 'K'
        when 'R'
          # french e.g. 'rogier', but exclude 'hochmeier'
          if i == last && !slavo_germanic?(w) &&
             (i > 1 && w[i - 2, 2] == "IE") &&
             !(i > 3 && w[i - 4, 2] =~ /M[EA]/)
            secondary += 'R'
          else
            primary += 'R'
            secondary += 'R'
          end
          if w[i + 1] == 'R'
            i += 2
          else
            i += 1
          end
        when 'S'
          # special cases 'island', 'isle', 'carlisle', 'carlysle'
          if i > 0 && w[i - 1, 3] =~ /ISL|YSL/
            i += 1
          # special case 'sugar-'
          elsif i == 0 && w[i, 5] == 'SUGAR'
            primary += 'X'
            secondary += 'S'
            i += 1
          elsif w[i, 2] == 'SH'
            # germanic
            if w[i + 1, 4] =~ /HEIM|HOEK|HOLM|HOLZ/
              primary += 'S'
              secondary += 'S'
            else
              primary += 'X'
              secondary += 'X'
            end
            i += 2
          # italian & armenian
          elsif w[i, 3] =~ /SIO|SIA/ || w[i, 4] == 'SIAN'
            if !slavo_germanic?(w)
              primary += 'S'
              secondary += 'X'
            else
              primary += 'S'
              secondary += 'S'
            end
            i += 3
          # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
          # also, -sz- in slavic language altho in hungarian it is pronounced 's'
          elsif (i == 0 && w[i + 1] =~ /[MNLW]/) || w[i + 1] == 'Z'
            primary += 'S'
            secondary += 'X'
            if w[i + 1] == 'Z'
              i += 2
            else
              i += 1
            end
          elsif w[i, 2] == 'SC'
            # Schlesinger's rule
            if w[i + 2] == 'H'
              # dutch origin, e.g. 'school', 'schooner'
              if w[i + 3, 2] =~ /OO|ER|EN|UY|ED|EM/
                # 'schermerhorn', 'schenker'
                if w[i + 3, 2] =~ /ER|EN/
                  primary += 'X'
                  secondary += 'SK'
                else
                  primary += 'SK'
                  secondary += 'SK'
                end
                i += 3
              else
                if i == 0 && !vowel?(w[3]) && w[3] != 'W'
                  primary += 'X'
                  secondary += 'S'
                else
                  primary += 'X'
                  secondary += 'X'
                end
                i += 3
              end
            elsif w[i + 2, 1] =~ /[IEY]/
              primary += 'S'
              secondary += 'S'
              i += 3
            else
              primary += 'SK'
              secondary += 'SK'
              i += 3
            end
          else
            # french e.g. 'resnais', 'artois'
            if i == last && i > 1 && w[i - 2, 2] =~ /AI|OI/
              secondary += 'S'
            else
              primary += 'S'
              secondary += 'S'
            end
            if w[i + 1] =~ /[SZ]/
              i += 2
            else
              i += 1
            end
          end
        when 'T'
          if w[i, 4] == 'TION'
            primary += 'X'
            secondary += 'X'
            i += 3
          elsif w[i, 3] =~ /TIA|TCH/
            primary += 'X'
            secondary += 'X'
            i += 3
          elsif w[i, 2] == 'TH' || w[i, 3] == 'TTH'
            # special case 'thomas', 'thames' or germanic
            # (the prefix test is grouped so that only "VAN " / "VON " match,
            # consistent with the other germanic checks in this method)
            if w[i + 2, 2] =~ /OM|AM/ || w[0, 4] =~ /(VAN|VON)\s/ || w[0, 3] == 'SCH'
              primary += 'T'
              secondary += 'T'
            else
              primary += '0'
              secondary += 'T'
            end
            i += 2
          else
            if w[i + 1] =~ /[TD]/
              i += 2
            else
              i += 1
            end
            primary += 'T'
            secondary += 'T'
          end
        when 'V'
          if w[i + 1] == 'V'
            i += 2
          else
            i += 1
          end
          primary += 'F'
          secondary += 'F'
        when 'W'
          # can also be in middle of word
          if w[i, 2] == 'WR'
            primary += 'R'
            secondary += 'R'
            i += 2
          else
            if i == 0 && (vowel?(w[i + 1]) || w[i, 2] == 'WH')
              # Wasserman should match Vasserman
              if vowel?(w[i + 1])
                primary += 'A'
                secondary += 'F'
              else
                # need Uomo to match Womo
                primary += 'A'
                secondary += 'A'
              end
            end
            # Arnow should match Arnoff
            if i == last && i > 0 && vowel?(w[i - 1]) ||
               (i > 0 && w[i - 1, 5] =~ /EWSKI|EWSKY|OWSKI|OWSKY/) || w[0, 3] == 'SCH'
              secondary += 'F'
              i += 1
            elsif w[i, 4] =~ /WICZ|WITZ/
              # polish e.g. 'filipowicz'
              primary += 'TS'
              secondary += 'FX'
              i += 4
            else
              i += 1
            end
          end
        when 'X'
          # french e.g. breaux
          if !(i == last && ((i > 2 && w[i - 3, 3] =~ /IAU|EAU/) || (i > 1 && w[i - 2, 2] =~ /AU|OU/)))
            primary += 'KS'
            secondary += 'KS'
          end
          if w[i + 1] =~ /[CX]/
            i += 2
          else
            i += 1
          end
        when 'Z'
          # chinese pinyin e.g. 'zhao'
          if w[i + 1] == 'H'
            primary += 'J'
            secondary += 'J'
            i += 2
          else
            if w[i + 1, 2] =~ /ZO|ZI|ZA/ || slavo_germanic?(w) && (i > 0 && w[i - 1] != 'T')
              primary += 'S'
              secondary += 'TS'
            else
              primary += 'S'
              secondary += 'S'
            end
            if w[i + 1] == 'Z'
              i += 2
            else
              i += 1
            end
          end
        else
          # any other character contributes nothing to the codes
          i += 1
        end
      end
      [primary[0, code_size], secondary[0, code_size]]
    end

    # Encode string to its Double Metaphone code.
    #
    # Unlike {Algorithm.encode}, the string is not split into words: the
    # whole string is passed to {DoubleMetaphone.encode_word} as one word.
    #
    # @param [String] str the string to encode
    # @param [Hash] options the options for the algorithm
    # @return [Array<String>] two element array: [primary, secondary]
    def self.encode(str, options = { size: 4 })
      encode_word(str, options)
    end

    # NOTE: `private` does not apply to methods defined with `def self.`,
    # so the helpers below remain callable from outside the class.
    private

    # @param [String] str the (upcased) word being encoded
    # @return [Boolean] true if the string contains 'W', 'K', 'CZ' or 'WITZ'
    def self.slavo_germanic?(str)
      !!(str[/W|K|CZ|WITZ/])
    end

    # @param [String, nil] char a single character (nil is treated as '')
    # @return [Boolean] true if char is a non-empty substring of {VOWELS}
    def self.vowel?(char)
      c = VOWELS[char.to_s]
      !c.nil? && !c.empty?
    end

  end
end
|