soundcord 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,7 +1,9 @@
1
1
  require 'rake/testtask'
2
2
 
3
3
  Rake::TestTask.new do |t|
4
- t.libs << 'test'
4
+ t.libs << "test"
5
+ t.test_files = FileList['test/test*.rb']
6
+ t.verbose = true
5
7
  end
6
8
 
7
9
  desc "Run tests"
data/lib/algorithm.rb ADDED
@@ -0,0 +1,134 @@
1
+ # encoding: utf-8
2
+
3
+ class SoundCord
4
+ private
5
+ def self.process_text text
6
+ load_language unless language
7
+
8
+ text = text.downcase
9
+
10
+ lang_yml.each do |key, values|
11
+ if key == "terminations"
12
+ text = process_group text, values, :terminations => true
13
+ elsif key == "initiations"
14
+ text = process_group text, values, :initiations => true
15
+ elsif key == "follow_ups"
16
+ text = process_follow_ups text, values, options
17
+ elsif key == "second_followed"
18
+ text = process_second_followed text, values, options
19
+ elsif key == "vowels_proonunciation_insignificance"
20
+ text = process_vowels_proonunciation_insignificance text, values, options
21
+ elsif !key.include? "duplicate"
22
+ text = process_group text, values, options
23
+ end
24
+ end
25
+
26
+ text = remove_duplicity text, :duplicate_exceptions => (lang_yml["duplicate_exceptions"])
27
+
28
+ text.upcase
29
+ end
30
+
31
+ def self.remove_duplicity text, options
32
+ options[:duplicate_exceptions] = [] unless options[:duplicate_exceptions]
33
+
34
+ text.split(//).inject("") do |s, n|
35
+ last_s_char = s[s.length-1..s.length-1]
36
+ s + ((last_s_char === n &&
37
+ !options[:duplicate_exceptions].include?(n)) ? '' : n )
38
+ end
39
+ end
40
+
41
+ def self.process_group text, group, options
42
+ group.each do |key, values|
43
+ if values
44
+ text = simple_replace text, key, values, options
45
+ else
46
+ text = simple_replace text, '', key, options
47
+ end
48
+ end
49
+ return text
50
+ end
51
+
52
+ def self.process_follow_ups text, group, options = {}
53
+ group.each do |key, prefixes|
54
+ prefixes.each do |prefix, sufixes|
55
+ regexp = mount_follow_up_regexp prefix, sufixes
56
+ text = text.gsub regexp, key
57
+ end
58
+ end
59
+ return text
60
+ end
61
+
62
+ def self.process_second_followed text, group, options = {}
63
+ group.each do |key, prefixes|
64
+ prefixes.each do |prefix, sufixes|
65
+ regexp = mount_second_followed_by_regexp prefix, sufixes
66
+ text =~ regexp
67
+ replacing = ($1 ? $1 : '') + key
68
+ text = text.gsub regexp, replacing
69
+ end
70
+ end
71
+ return text
72
+ end
73
+
74
+ def process_vowels_proonunciation_insignificance text, group
75
+ group.each do |key, value|
76
+ regexp = mount_vowels_proonunciation_insignificance_regexp value
77
+ text =~ regexp
78
+ text = text.gsub regexp, $1
79
+ end
80
+ return text
81
+ end
82
+
83
+ def self.process_followed_by_consonant_regexp text, group
84
+ group.each do |key, value|
85
+ regexp = mount_followed_by_consonant_regexp value
86
+ text = text.gsub regexp, ''
87
+ end
88
+ return text
89
+ end
90
+
91
+ def self.simple_replace text, key, values, options
92
+ regexp = mount_regexp values, options
93
+ text.gsub regexp, key.to_s
94
+ end
95
+
96
+ def self.mount_regexp sentence, options = { :terminations => false, :initiations => false }
97
+ regexp = "/"
98
+ regexp += "^" if options[:initiations]
99
+ regexp += "("
100
+ regexp += sentence.kind_of?(Array) ? sentence.join("|") : sentence
101
+ regexp += ")"
102
+ regexp += "\\b" if options[:terminations]
103
+ regexp += "/"
104
+ eval(regexp)
105
+ end
106
+
107
+ def self.mount_follow_up_regexp prefix, sufix, options = {}
108
+ regexp = options[:not_eval] ? "" : "/"
109
+ regexp += prefix
110
+ regexp += "(?="
111
+ regexp += "("
112
+ regexp += sufix.kind_of?(Array) ? sufix.join("|") : sufix
113
+ regexp += "))"
114
+ regexp += "/" unless options[:not_eval]
115
+ options[:not_eval] ? regexp : eval(regexp)
116
+ end
117
+
118
+ def self.mount_second_followed_by_regexp char, group
119
+ regexp = "/" + not_first(char) + mount_follow_up_regexp(char, group, :not_eval => true) + "/"
120
+ eval(regexp)
121
+ end
122
+
123
+ def self.mount_vowels_proonunciation_insignificance_regexp char
124
+ eval "/([aeiou])#{char}(?=\b|[^aeiou])/"
125
+ end
126
+
127
+ def self.mount_followed_by_consonant_regexp char
128
+ eval "[#{char}](?![aeiou])"
129
+ end
130
+
131
+ def self.not_first char
132
+ "([^#{char}]|^)"
133
+ end
134
+ end
data/lib/config.rb ADDED
@@ -0,0 +1,25 @@
1
+ require 'yaml'
2
+
3
+ class SoundCord
4
+ DEFAULT_LANGUAGE = 'pt-BR'
5
+ LANGUAGES_DIRECTORY = "#{Dir.pwd}/lib/soundcord/languages/"
6
+
7
+ def self.load_language lang = DEFAULT_LANGUAGE
8
+ @language = lang
9
+ @lang_yml = YAML::load_file(LANGUAGES_DIRECTORY + "#{lang}.yml")[language]
10
+ @options = { :use_vowels => false }
11
+ end
12
+
13
+ def self.language
14
+ @language
15
+ end
16
+
17
+ def self.options
18
+ @options
19
+ end
20
+
21
+ private
22
+ def self.lang_yml
23
+ @lang_yml
24
+ end
25
+ end
data/lib/soundcord.rb CHANGED
@@ -2,98 +2,19 @@
2
2
 
3
3
  require 'soundcord/integrations/string'
4
4
  require 'soundcord/integrations/array'
5
+ require 'algorithm'
6
+ require 'config'
5
7
 
6
8
  class SoundCord
7
- def self.phonetize text, options = { :use_vogals => false }
8
- return handle_text(text, options)
9
+ def self.phonetize text
10
+ process_text(text)
9
11
  end
10
12
 
11
- def self.compare term_1, term_2, options = { :use_vogals => false }
12
- homophone? term_1, term_2, options
13
+ def self.compare term_1, term_2
14
+ homophone? term_1, term_2
13
15
  end
14
16
 
15
- def self.homophone? term_1, term_2, options = { :use_vogals => false }
16
- phonetize(term_1, options) == phonetize(term_2, options)
17
- end
18
-
19
- private
20
- def self.handle_text text, options = { :use_vogals => false }
21
- text = text.downcase
22
-
23
- text = remove_duplicity text
24
- text = handle_special_chars text
25
- text = handle_unusual_chars text
26
- text = handle_unusual_combinations text
27
- text = handle_terminations text
28
- text = remove_vogals(text) unless options[:use_vogals]
29
- text = remove_unwanted_chars text
30
-
31
- text.upcase
32
- end
33
-
34
- def self.handle_special_chars text
35
- text = text.gsub /(á|à|â|ã)/, 'a'
36
- text = text.gsub /(é|è|ê)/, 'e'
37
- text = text.gsub /(í|ì|î)/, 'i'
38
- text = text.gsub /(ó|ò|ô|õ)/, 'o'
39
- text = text.gsub /(ú|ù|û)/, 'u'
40
- end
41
-
42
- def self.handle_unusual_chars text
43
- text = text.gsub /y/, 'i'
44
- end
45
-
46
- def self.handle_unusual_combinations text
47
- text = text.gsub /(br|bl)/, 'b'
48
-
49
- text = text.gsub /ph/, 'f'
50
-
51
- text = text.gsub /(gr|mg|ng|rg|gl)/, 'g'
52
-
53
- text = text.gsub /(ge|gi|rj|mj|nj)/, 'j'
54
-
55
- text = text.gsub /(ce|ci|ch|cs)/, 's'
56
-
57
- text = text.gsub /ct/, 't'
58
-
59
- text = text.gsub /(q|ca|co|cu|ck|c)/, 'k'
60
-
61
- text = text.gsub /lh/, 'l'
62
-
63
- text = text.gsub /rm/, 'sm'
64
-
65
- text = text.gsub /(rm|gm|md|sm|ao\b)/, 'm'
66
-
67
- text = text.gsub /n/, 'm'
68
-
69
- text = text.gsub /ao\b/, 'm'
70
-
71
- text = text.gsub /nh/, 'n'
72
-
73
- text = text.gsub /pr/, 'p'
74
-
75
- text = text.gsub /(ç|x|ts|c|z|rs)/, 's'
76
-
77
- text = text.gsub /(tr|tl|lt|rt|st)/, 's'
78
-
79
- text = text.gsub /w/, 'v'
80
- end
81
-
82
- def self.handle_terminations text
83
- text = text.gsub /(s|z|r|m|n|ao|l)\b/, ''
84
- end
85
-
86
- def self.remove_vogals text
87
- text = text.gsub /(a|e|i|o|u)/, ''
88
- end
89
-
90
- def self.remove_unwanted_chars text
91
- text = text.gsub /h/, ''
92
- end
93
-
94
- def self.remove_duplicity text
95
- text.split(//).inject("") do |s,n|
96
- s + ((s[s.length-1..s.length-1] === n) ? '' : n )
97
- end
17
+ def self.homophone? term_1, term_2
18
+ phonetize(term_1) == phonetize(term_2)
98
19
  end
99
20
  end
@@ -1,4 +1,7 @@
1
1
  class Array
2
+ # Search for possible homophone matches within the array object for a given string word
3
+ # Params:
4
+ # +value+:: string to be phonetized and compared with the array items
2
5
  def homophones value
3
6
  self.select { |i| i.homophone? value }
4
7
  end
@@ -1,23 +1,21 @@
1
1
  class String
2
- def phonetize options = { :use_vogals => false }
3
- SoundCord.phonetize self, options
2
+ # Returns the phonetic version of the object string
3
+ # Params:
4
+ # +use_vowels+:: enables the vowel comparison feature (if available)
5
+ def phonetize
6
+ SoundCord.phonetize self
4
7
  end
5
8
 
9
+ # Returns the phonetic version of the passed string
10
+ # Params:
11
+ # +use_vowels+:: enables the vowel comparison feature (if available)
6
12
  def self.phonetize value
7
13
  value.phonetize
8
14
  end
9
15
 
10
- # DEPRECATED: Please use homophone? instead.
11
- def compare_phntc compared
12
- warn "[DEPRECATION] `compare_phntc` is deprecated. Please use `homophone?` instead."
13
- self.homophone? compared
14
- end
15
- # DEPRECATED: Please use homophone? instead.
16
- def compare_phonetically compared
17
- warn "[DEPRECATION] `compare_phonetically` is deprecated. Please use `homophone?` instead."
18
- self.homophone? compared
19
- end
20
-
16
+ # Compares the passed value with the object value, both in their phonetic version
17
+ # Params:
18
+ # +use_vowels+:: enables the vowel comparison feature (if available)
21
19
  def homophone? compared
22
20
  SoundCord.homophone? self, compared
23
21
  end
data/soundcord.gemspec CHANGED
@@ -2,11 +2,11 @@ Gem::Specification.new do |s|
2
2
  s.name = %q{soundcord}
3
3
  s.author = 'Lukas Alexandre'
4
4
  s.email = 'lukeskytm@gmail.com'
5
- s.homepage = 'https://github.com/lukasalexandre/soundcord'
6
- s.version = "0.1.1"
5
+ s.homepage = 'http://lukasalexandre.github.com/soundcord'
6
+ s.version = "0.2.0"
7
7
  s.date = Date.today
8
- s.summary = %q{A phonetic algorithm implementation}
9
- s.description = "A phonetic algorithm to make comparison by phonetically similar terms easier."
8
+ s.summary = %q{A phonetic algorithm for indexing of words by their pronunciation.}
9
+ s.description = %q{"Make comparisons of phonetically similar terms easier."}
10
10
  s.files = Dir["{lib/**/*.rb,README.rdoc,test/**/*.rb,Rakefile,*.gemspec}"]
11
11
  s.require_paths = ["lib"]
12
12
  end
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test/unit'
4
+ require 'soundcord'
5
+
6
+ class SoundCordTest < Test::Unit::TestCase
7
+ def test_simple_words
8
+ assert_equal "J", "João".phonetize
9
+ assert_equal "MR", "Maria".phonetize
10
+ assert_equal "LM", "Helena".phonetize
11
+ assert_equal "VLM", "Valmir".phonetize
12
+ assert_equal "VLM", "Walmir".phonetize
13
+ end
14
+ def test_simple_comparations
15
+ assert_equal true, "Joao".homophone?("João")
16
+ assert_equal true, "Helena".homophone?("Elena")
17
+ assert_equal true, "Walmir".homophone?("Valmir")
18
+ assert_equal true, "Marria".homophone?("Maria")
19
+ assert_equal true, "Wagner".homophone?("Vagner")
20
+ assert_equal true, "Mirela".homophone?("Mirella")
21
+ assert_equal true, "Artur".homophone?("Arthur")
22
+ assert_equal true, "Diego".homophone?("Dyego")
23
+ assert_equal true, "Felipe".homophone?("Phelipe")
24
+ assert_equal true, "Filipe".homophone?("Felipe")
25
+ assert_equal true, "Phelipe".homophone?("Filipe")
26
+ assert_equal true, "Philippe".homophone?("Felipe")
27
+ end
28
+ def test_use_vogals_option
29
+ assert_equal "ELEMA", "Helena".phonetize(:use_vowels => true)
30
+ end
31
+ def test_special_chars
32
+ assert_equal true, "Luçia".homophone?("lucia")
33
+ assert_equal true, "Lúcio".homophone?("lucio")
34
+ end
35
+ def test_find_in_collection
36
+ list = %w( saola paulo saulo ricardo sallo )
37
+ expected = %w( saola saulo sallo )
38
+ assert_equal expected, list.homophones("saulo")
39
+ list = %w( leonardo lucene rodrigo luciana lussene )
40
+ expected = %w( lucene luciana lussene )
41
+ assert_equal expected, list.homophones("lucene")
42
+ end
43
+ end
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test/unit'
4
+ require 'config'
5
+
6
+ class SoundCordTest < Test::Unit::TestCase
7
+ def test_language_set_up
8
+ SoundCord.load_language "pt-BR"
9
+ assert_equal "pt-BR", SoundCord.language
10
+ end
11
+ def test_language_set_up
12
+ SoundCord.load_language "en"
13
+ assert_equal "en", SoundCord.language
14
+ end
15
+ end
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+
3
+ require "benchmark"
4
+ require 'test/unit'
5
+ require 'soundcord'
6
+
7
+ class SoundCordTest < Test::Unit::TestCase
8
+ # pt-BR
9
+ def test_with_100_words_pt_br
10
+ SoundCord.load_language 'pt-BR'
11
+
12
+ list_of_random_words = []
13
+ 100.times do
14
+ list_of_random_words << (0...8).map{65.+(rand(25)).chr}.join
15
+ end
16
+ time = Benchmark.measure do
17
+ list_of_random_words.each { |i| i.phonetize }
18
+ end
19
+ assert_block do
20
+ time.real < 0.5
21
+ end
22
+ end
23
+ end
@@ -4,14 +4,19 @@ require 'test/unit'
4
4
  require 'soundcord'
5
5
 
6
6
  class SoundCordTest < Test::Unit::TestCase
7
- def test_simple_words
8
- assert_equal "João".phonetize, "J"
9
- assert_equal "Maria".phonetize, "MR"
10
- assert_equal "Helena".phonetize, "LM"
11
- assert_equal "Valmir".phonetize, "VLM"
12
- assert_equal "Walmir".phonetize, "VLM"
13
- end
14
- def test_simple_comparations
7
+ # pt-BR
8
+ def test_simple_words_pt_br
9
+ SoundCord.load_language 'pt-BR'
10
+
11
+ assert_equal "J", "João".phonetize
12
+ assert_equal "MR", "Maria".phonetize
13
+ assert_equal "LM", "Helena".phonetize
14
+ assert_equal "VLM", "Valmir".phonetize
15
+ assert_equal "VLM", "Walmir".phonetize
16
+ end
17
+ def test_simple_comparisons_pt_br
18
+ SoundCord.load_language 'pt-BR'
19
+
15
20
  assert_equal true, "Joao".homophone?("João")
16
21
  assert_equal true, "Helena".homophone?("Elena")
17
22
  assert_equal true, "Walmir".homophone?("Valmir")
@@ -25,10 +30,15 @@ class SoundCordTest < Test::Unit::TestCase
25
30
  assert_equal true, "Phelipe".homophone?("Filipe")
26
31
  assert_equal true, "Philippe".homophone?("Felipe")
27
32
  end
28
- def test_use_vogals_option
29
- assert_equal "ELEMA", "Helena".phonetize(:use_vogals => true)
33
+ def test_special_chars_pt_br
34
+ SoundCord.load_language 'pt-BR'
35
+
36
+ assert_equal true, "Luçia".homophone?("lucia")
37
+ assert_equal true, "Lúcio".homophone?("lucio")
30
38
  end
31
- def test_find_in_collection
39
+ def test_find_in_collection_pt_br
40
+ SoundCord.load_language 'pt-BR'
41
+
32
42
  list = %w( saola paulo saulo ricardo sallo )
33
43
  expected = %w( saola saulo sallo )
34
44
  assert_equal expected, list.homophones("saulo")
@@ -36,4 +46,70 @@ class SoundCordTest < Test::Unit::TestCase
36
46
  expected = %w( lucene luciana lussene )
37
47
  assert_equal expected, list.homophones("lucene")
38
48
  end
49
+
50
+ # en
51
+ def test_initiations_en
52
+ SoundCord.load_language 'en'
53
+
54
+ assert_equal "RL", "aerial".phonetize
55
+ assert_equal "RP", "wrap".phonetize
56
+ assert_equal "SN", "xeno".phonetize
57
+ assert_equal "TFR", "whatever".phonetize
58
+ assert_equal "NM", "gnome".phonetize
59
+ assert_equal "NF", "knife".phonetize
60
+ assert_equal "NMNK", "pneumonic".phonetize
61
+ end
62
+
63
+ def test_unusual_combinations_en
64
+ SoundCord.load_language 'en'
65
+
66
+ assert_equal "0TR", "theater".phonetize
67
+ assert_equal "TX", "touch".phonetize
68
+ assert_equal "XL", "shell".phonetize
69
+ assert_equal "KRX", "crutch".phonetize
70
+ assert_equal "FS", "phase".phonetize
71
+ assert_equal "BKR", "beggar".phonetize
72
+ end
73
+
74
+ def test_terminations_en
75
+ SoundCord.load_language 'en'
76
+
77
+ assert_equal "LM", "lmb".phonetize
78
+ end
79
+
80
+ def test_middle_en
81
+ SoundCord.load_language 'en'
82
+
83
+ # couldn't remember a better word with SCH in the middle
84
+ assert_equal "PRSK", "porsche".phonetize
85
+ end
86
+
87
+ def test_duplicate_exceptions_en
88
+ SoundCord.load_language 'en'
89
+
90
+ assert_equal "GKLS", "goggles".phonetize
91
+ end
92
+
93
+ def test_special_chars_en
94
+ SoundCord.load_language 'en'
95
+
96
+ assert_equal true, "Qeyla".homophone?("keyla")
97
+ assert_equal true, "Courtiney".homophone?("kourtiney")
98
+ assert_equal true, "Quartz".homophone?("kuarts")
99
+ assert_equal true, "falue".homophone?("value")
100
+ assert_equal true, "data".homophone?("tada")
101
+ end
102
+
103
+ def test_second_follwed_by_en
104
+ SoundCord.load_language 'en'
105
+
106
+ assert_equal "JM", "ogema".phonetize
107
+ end
108
+
109
+ def test_vowels_pronunciation_insignificance_en
110
+ SoundCord.load_language 'en'
111
+
112
+ assert_equal "MSX", "messiah".phonetize
113
+ assert_equal "ML", "mehlia".phonetize
114
+ end
39
115
  end
metadata CHANGED
@@ -1,75 +1,58 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: soundcord
3
- version: !ruby/object:Gem::Version
4
- hash: 25
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Lukas Alexandre
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-06-18 00:00:00 -03:00
19
- default_executable:
12
+ date: 2012-07-12 00:00:00.000000000 Z
20
13
  dependencies: []
21
-
22
- description: A phonetic algorithm to make comparison by phonetically similar terms easier.
14
+ description: ! '"Make comparisons of phonetically similar terms easier."'
23
15
  email: lukeskytm@gmail.com
24
16
  executables: []
25
-
26
17
  extensions: []
27
-
28
18
  extra_rdoc_files: []
29
-
30
- files:
19
+ files:
20
+ - lib/algorithm.rb
21
+ - lib/config.rb
31
22
  - lib/soundcord/integrations/array.rb
32
23
  - lib/soundcord/integrations/string.rb
33
24
  - lib/soundcord/version.rb
34
25
  - lib/soundcord.rb
26
+ - test/languages/pt_br/test_soundcord.rb
35
27
  - test/test_array.rb
28
+ - test/test_config.rb
29
+ - test/test_performance.rb
36
30
  - test/test_soundcord.rb
37
31
  - test/test_string.rb
38
32
  - Rakefile
39
33
  - soundcord.gemspec
40
- has_rdoc: true
41
- homepage: https://github.com/lukasalexandre/soundcord
34
+ homepage: http://lukasalexandre.github.com/soundcord
42
35
  licenses: []
43
-
44
36
  post_install_message:
45
37
  rdoc_options: []
46
-
47
- require_paths:
38
+ require_paths:
48
39
  - lib
49
- required_ruby_version: !ruby/object:Gem::Requirement
40
+ required_ruby_version: !ruby/object:Gem::Requirement
50
41
  none: false
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- hash: 3
55
- segments:
56
- - 0
57
- version: "0"
58
- required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
47
  none: false
60
- requirements:
61
- - - ">="
62
- - !ruby/object:Gem::Version
63
- hash: 3
64
- segments:
65
- - 0
66
- version: "0"
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
67
52
  requirements: []
68
-
69
53
  rubyforge_project:
70
- rubygems_version: 1.5.3
54
+ rubygems_version: 1.8.24
71
55
  signing_key:
72
56
  specification_version: 3
73
- summary: A phonetic algorithm implementation
57
+ summary: A phonetic algorithm for indexing of words by their pronunciation.
74
58
  test_files: []
75
-