soundcord 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,7 +1,9 @@
1
1
  require 'rake/testtask'
2
2
 
3
3
  Rake::TestTask.new do |t|
4
- t.libs << 'test'
4
+ t.libs << "test"
5
+ t.test_files = FileList['test/test*.rb']
6
+ t.verbose = true
5
7
  end
6
8
 
7
9
  desc "Run tests"
data/lib/algorithm.rb ADDED
@@ -0,0 +1,134 @@
1
+ # encoding: utf-8
2
+
3
+ class SoundCord
4
+ private
5
+ def self.process_text text
6
+ load_language unless language
7
+
8
+ text = text.downcase
9
+
10
+ lang_yml.each do |key, values|
11
+ if key == "terminations"
12
+ text = process_group text, values, :terminations => true
13
+ elsif key == "initiations"
14
+ text = process_group text, values, :initiations => true
15
+ elsif key == "follow_ups"
16
+ text = process_follow_ups text, values, options
17
+ elsif key == "second_followed"
18
+ text = process_second_followed text, values, options
19
+ elsif key == "vowels_proonunciation_insignificance"
20
+ text = process_vowels_proonunciation_insignificance text, values, options
21
+ elsif !key.include? "duplicate"
22
+ text = process_group text, values, options
23
+ end
24
+ end
25
+
26
+ text = remove_duplicity text, :duplicate_exceptions => (lang_yml["duplicate_exceptions"])
27
+
28
+ text.upcase
29
+ end
30
+
31
+ def self.remove_duplicity text, options
32
+ options[:duplicate_exceptions] = [] unless options[:duplicate_exceptions]
33
+
34
+ text.split(//).inject("") do |s, n|
35
+ last_s_char = s[s.length-1..s.length-1]
36
+ s + ((last_s_char === n &&
37
+ !options[:duplicate_exceptions].include?(n)) ? '' : n )
38
+ end
39
+ end
40
+
41
+ def self.process_group text, group, options
42
+ group.each do |key, values|
43
+ if values
44
+ text = simple_replace text, key, values, options
45
+ else
46
+ text = simple_replace text, '', key, options
47
+ end
48
+ end
49
+ return text
50
+ end
51
+
52
+ def self.process_follow_ups text, group, options = {}
53
+ group.each do |key, prefixes|
54
+ prefixes.each do |prefix, sufixes|
55
+ regexp = mount_follow_up_regexp prefix, sufixes
56
+ text = text.gsub regexp, key
57
+ end
58
+ end
59
+ return text
60
+ end
61
+
62
+ def self.process_second_followed text, group, options = {}
63
+ group.each do |key, prefixes|
64
+ prefixes.each do |prefix, sufixes|
65
+ regexp = mount_second_followed_by_regexp prefix, sufixes
66
+ text =~ regexp
67
+ replacing = ($1 ? $1 : '') + key
68
+ text = text.gsub regexp, replacing
69
+ end
70
+ end
71
+ return text
72
+ end
73
+
74
+ def process_vowels_proonunciation_insignificance text, group
75
+ group.each do |key, value|
76
+ regexp = mount_vowels_proonunciation_insignificance_regexp value
77
+ text =~ regexp
78
+ text = text.gsub regexp, $1
79
+ end
80
+ return text
81
+ end
82
+
83
+ def self.process_followed_by_consonant_regexp text, group
84
+ group.each do |key, value|
85
+ regexp = mount_followed_by_consonant_regexp value
86
+ text = text.gsub regexp, ''
87
+ end
88
+ return text
89
+ end
90
+
91
+ def self.simple_replace text, key, values, options
92
+ regexp = mount_regexp values, options
93
+ text.gsub regexp, key.to_s
94
+ end
95
+
96
+ def self.mount_regexp sentence, options = { :terminations => false, :initiations => false }
97
+ regexp = "/"
98
+ regexp += "^" if options[:initiations]
99
+ regexp += "("
100
+ regexp += sentence.kind_of?(Array) ? sentence.join("|") : sentence
101
+ regexp += ")"
102
+ regexp += "\\b" if options[:terminations]
103
+ regexp += "/"
104
+ eval(regexp)
105
+ end
106
+
107
+ def self.mount_follow_up_regexp prefix, sufix, options = {}
108
+ regexp = options[:not_eval] ? "" : "/"
109
+ regexp += prefix
110
+ regexp += "(?="
111
+ regexp += "("
112
+ regexp += sufix.kind_of?(Array) ? sufix.join("|") : sufix
113
+ regexp += "))"
114
+ regexp += "/" unless options[:not_eval]
115
+ options[:not_eval] ? regexp : eval(regexp)
116
+ end
117
+
118
+ def self.mount_second_followed_by_regexp char, group
119
+ regexp = "/" + not_first(char) + mount_follow_up_regexp(char, group, :not_eval => true) + "/"
120
+ eval(regexp)
121
+ end
122
+
123
+ def self.mount_vowels_proonunciation_insignificance_regexp char
124
+ eval "/([aeiou])#{char}(?=\b|[^aeiou])/"
125
+ end
126
+
127
+ def self.mount_followed_by_consonant_regexp char
128
+ eval "[#{char}](?![aeiou])"
129
+ end
130
+
131
+ def self.not_first char
132
+ "([^#{char}]|^)"
133
+ end
134
+ end
data/lib/config.rb ADDED
@@ -0,0 +1,25 @@
1
+ require 'yaml'
2
+
3
+ class SoundCord
4
+ DEFAULT_LANGUAGE = 'pt-BR'
5
+ LANGUAGES_DIRECTORY = "#{Dir.pwd}/lib/soundcord/languages/"
6
+
7
+ def self.load_language lang = DEFAULT_LANGUAGE
8
+ @language = lang
9
+ @lang_yml = YAML::load_file(LANGUAGES_DIRECTORY + "#{lang}.yml")[language]
10
+ @options = { :use_vowels => false }
11
+ end
12
+
13
+ def self.language
14
+ @language
15
+ end
16
+
17
+ def self.options
18
+ @options
19
+ end
20
+
21
+ private
22
+ def self.lang_yml
23
+ @lang_yml
24
+ end
25
+ end
data/lib/soundcord.rb CHANGED
@@ -2,98 +2,19 @@
2
2
 
3
3
  require 'soundcord/integrations/string'
4
4
  require 'soundcord/integrations/array'
5
+ require 'algorithm'
6
+ require 'config'
5
7
 
6
8
  class SoundCord
7
- def self.phonetize text, options = { :use_vogals => false }
8
- return handle_text(text, options)
9
+ def self.phonetize text
10
+ process_text(text)
9
11
  end
10
12
 
11
- def self.compare term_1, term_2, options = { :use_vogals => false }
12
- homophone? term_1, term_2, options
13
+ def self.compare term_1, term_2
14
+ homophone? term_1, term_2
13
15
  end
14
16
 
15
- def self.homophone? term_1, term_2, options = { :use_vogals => false }
16
- phonetize(term_1, options) == phonetize(term_2, options)
17
- end
18
-
19
- private
20
- def self.handle_text text, options = { :use_vogals => false }
21
- text = text.downcase
22
-
23
- text = remove_duplicity text
24
- text = handle_special_chars text
25
- text = handle_unusual_chars text
26
- text = handle_unusual_combinations text
27
- text = handle_terminations text
28
- text = remove_vogals(text) unless options[:use_vogals]
29
- text = remove_unwanted_chars text
30
-
31
- text.upcase
32
- end
33
-
34
- def self.handle_special_chars text
35
- text = text.gsub /(á|à|â|ã)/, 'a'
36
- text = text.gsub /(é|è|ê)/, 'e'
37
- text = text.gsub /(í|ì|î)/, 'i'
38
- text = text.gsub /(ó|ò|ô|õ)/, 'o'
39
- text = text.gsub /(ú|ù|û)/, 'u'
40
- end
41
-
42
- def self.handle_unusual_chars text
43
- text = text.gsub /y/, 'i'
44
- end
45
-
46
- def self.handle_unusual_combinations text
47
- text = text.gsub /(br|bl)/, 'b'
48
-
49
- text = text.gsub /ph/, 'f'
50
-
51
- text = text.gsub /(gr|mg|ng|rg|gl)/, 'g'
52
-
53
- text = text.gsub /(ge|gi|rj|mj|nj)/, 'j'
54
-
55
- text = text.gsub /(ce|ci|ch|cs)/, 's'
56
-
57
- text = text.gsub /ct/, 't'
58
-
59
- text = text.gsub /(q|ca|co|cu|ck|c)/, 'k'
60
-
61
- text = text.gsub /lh/, 'l'
62
-
63
- text = text.gsub /rm/, 'sm'
64
-
65
- text = text.gsub /(rm|gm|md|sm|ao\b)/, 'm'
66
-
67
- text = text.gsub /n/, 'm'
68
-
69
- text = text.gsub /ao\b/, 'm'
70
-
71
- text = text.gsub /nh/, 'n'
72
-
73
- text = text.gsub /pr/, 'p'
74
-
75
- text = text.gsub /(ç|x|ts|c|z|rs)/, 's'
76
-
77
- text = text.gsub /(tr|tl|lt|rt|st)/, 's'
78
-
79
- text = text.gsub /w/, 'v'
80
- end
81
-
82
- def self.handle_terminations text
83
- text = text.gsub /(s|z|r|m|n|ao|l)\b/, ''
84
- end
85
-
86
- def self.remove_vogals text
87
- text = text.gsub /(a|e|i|o|u)/, ''
88
- end
89
-
90
- def self.remove_unwanted_chars text
91
- text = text.gsub /h/, ''
92
- end
93
-
94
- def self.remove_duplicity text
95
- text.split(//).inject("") do |s,n|
96
- s + ((s[s.length-1..s.length-1] === n) ? '' : n )
97
- end
17
+ def self.homophone? term_1, term_2
18
+ phonetize(term_1) == phonetize(term_2)
98
19
  end
99
20
  end
@@ -1,4 +1,7 @@
1
1
  class Array
2
+ # Search possible homophone matches within the array object for a given string word
3
+ # Params:
4
+ # +value+:: string to be phonetized and compared with the array items
2
5
  def homophones value
3
6
  self.select { |i| i.homophone? value }
4
7
  end
@@ -1,23 +1,21 @@
1
1
  class String
2
- def phonetize options = { :use_vogals => false }
3
- SoundCord.phonetize self, options
2
+ # Returns the phonetic version of the object string
3
+ # Params:
4
+ # +use_vowels+:: enables the vowel comparison feature (if available)
5
+ def phonetize
6
+ SoundCord.phonetize self
4
7
  end
5
8
 
9
+ # Returns the phonetic version of the passed string
10
+ # Params:
11
+ # +use_vowels+:: enables the vowel comparison feature (if available)
6
12
  def self.phonetize value
7
13
  value.phonetize
8
14
  end
9
15
 
10
- # DEPRECATED: Please use homophone? instead.
11
- def compare_phntc compared
12
- warn "[DEPRECATION] `compare_phntc` is deprecated. Please use `homophone?` instead."
13
- self.homophone? compared
14
- end
15
- # DEPRECATED: Please use homophone? instead.
16
- def compare_phonetically compared
17
- warn "[DEPRECATION] `compare_phonetically` is deprecated. Please use `homophone?` instead."
18
- self.homophone? compared
19
- end
20
-
16
+ # Compares the passed value with the object value, both in their phonetic version
17
+ # Params:
18
+ # +use_vowels+:: enables the vowel comparison feature (if available)
21
19
  def homophone? compared
22
20
  SoundCord.homophone? self, compared
23
21
  end
data/soundcord.gemspec CHANGED
@@ -2,11 +2,11 @@ Gem::Specification.new do |s|
2
2
  s.name = %q{soundcord}
3
3
  s.author = 'Lukas Alexandre'
4
4
  s.email = 'lukeskytm@gmail.com'
5
- s.homepage = 'https://github.com/lukasalexandre/soundcord'
6
- s.version = "0.1.1"
5
+ s.homepage = 'http://lukasalexandre.github.com/soundcord'
6
+ s.version = "0.2.0"
7
7
  s.date = Date.today
8
- s.summary = %q{A phonetic algorithm implementation}
9
- s.description = "A phonetic algorithm to make comparison by phonetically similar terms easier."
8
+ s.summary = %q{A phonetic algorithm for indexing of words by their pronunciation.}
9
+ s.description = %q{"Make comparisons of phonetically similar terms easier."}
10
10
  s.files = Dir["{lib/**/*.rb,README.rdoc,test/**/*.rb,Rakefile,*.gemspec}"]
11
11
  s.require_paths = ["lib"]
12
12
  end
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test/unit'
4
+ require 'soundcord'
5
+
6
+ class SoundCordTest < Test::Unit::TestCase
7
+ def test_simple_words
8
+ assert_equal "J", "João".phonetize
9
+ assert_equal "MR", "Maria".phonetize
10
+ assert_equal "LM", "Helena".phonetize
11
+ assert_equal "VLM", "Valmir".phonetize
12
+ assert_equal "VLM", "Walmir".phonetize
13
+ end
14
+ def test_simple_comparations
15
+ assert_equal true, "Joao".homophone?("João")
16
+ assert_equal true, "Helena".homophone?("Elena")
17
+ assert_equal true, "Walmir".homophone?("Valmir")
18
+ assert_equal true, "Marria".homophone?("Maria")
19
+ assert_equal true, "Wagner".homophone?("Vagner")
20
+ assert_equal true, "Mirela".homophone?("Mirella")
21
+ assert_equal true, "Artur".homophone?("Arthur")
22
+ assert_equal true, "Diego".homophone?("Dyego")
23
+ assert_equal true, "Felipe".homophone?("Phelipe")
24
+ assert_equal true, "Filipe".homophone?("Felipe")
25
+ assert_equal true, "Phelipe".homophone?("Filipe")
26
+ assert_equal true, "Philippe".homophone?("Felipe")
27
+ end
28
+ def test_use_vogals_option
29
+ assert_equal "ELEMA", "Helena".phonetize(:use_vowels => true)
30
+ end
31
+ def test_special_chars
32
+ assert_equal true, "Luçia".homophone?("lucia")
33
+ assert_equal true, "Lúcio".homophone?("lucio")
34
+ end
35
+ def test_find_in_collection
36
+ list = %w( saola paulo saulo ricardo sallo )
37
+ expected = %w( saola saulo sallo )
38
+ assert_equal expected, list.homophones("saulo")
39
+ list = %w( leonardo lucene rodrigo luciana lussene )
40
+ expected = %w( lucene luciana lussene )
41
+ assert_equal expected, list.homophones("lucene")
42
+ end
43
+ end
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test/unit'
4
+ require 'config'
5
+
6
+ class SoundCordTest < Test::Unit::TestCase
7
+ def test_language_set_up
8
+ SoundCord.load_language "pt-BR"
9
+ assert_equal "pt-BR", SoundCord.language
10
+ end
11
+ def test_language_set_up
12
+ SoundCord.load_language "en"
13
+ assert_equal "en", SoundCord.language
14
+ end
15
+ end
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+
3
+ require "benchmark"
4
+ require 'test/unit'
5
+ require 'soundcord'
6
+
7
+ class SoundCordTest < Test::Unit::TestCase
8
+ # pt-BR
9
+ def test_with_100_words_pt_br
10
+ SoundCord.load_language 'pt-BR'
11
+
12
+ list_of_random_words = []
13
+ 100.times do
14
+ list_of_random_words << (0...8).map{65.+(rand(25)).chr}.join
15
+ end
16
+ time = Benchmark.measure do
17
+ list_of_random_words.each { |i| i.phonetize }
18
+ end
19
+ assert_block do
20
+ time.real < 0.5
21
+ end
22
+ end
23
+ end
@@ -4,14 +4,19 @@ require 'test/unit'
4
4
  require 'soundcord'
5
5
 
6
6
  class SoundCordTest < Test::Unit::TestCase
7
- def test_simple_words
8
- assert_equal "João".phonetize, "J"
9
- assert_equal "Maria".phonetize, "MR"
10
- assert_equal "Helena".phonetize, "LM"
11
- assert_equal "Valmir".phonetize, "VLM"
12
- assert_equal "Walmir".phonetize, "VLM"
13
- end
14
- def test_simple_comparations
7
+ # pt-BR
8
+ def test_simple_words_pt_br
9
+ SoundCord.load_language 'pt-BR'
10
+
11
+ assert_equal "J", "João".phonetize
12
+ assert_equal "MR", "Maria".phonetize
13
+ assert_equal "LM", "Helena".phonetize
14
+ assert_equal "VLM", "Valmir".phonetize
15
+ assert_equal "VLM", "Walmir".phonetize
16
+ end
17
+ def test_simple_comparisons_pt_br
18
+ SoundCord.load_language 'pt-BR'
19
+
15
20
  assert_equal true, "Joao".homophone?("João")
16
21
  assert_equal true, "Helena".homophone?("Elena")
17
22
  assert_equal true, "Walmir".homophone?("Valmir")
@@ -25,10 +30,15 @@ class SoundCordTest < Test::Unit::TestCase
25
30
  assert_equal true, "Phelipe".homophone?("Filipe")
26
31
  assert_equal true, "Philippe".homophone?("Felipe")
27
32
  end
28
- def test_use_vogals_option
29
- assert_equal "ELEMA", "Helena".phonetize(:use_vogals => true)
33
+ def test_special_chars_pt_br
34
+ SoundCord.load_language 'pt-BR'
35
+
36
+ assert_equal true, "Luçia".homophone?("lucia")
37
+ assert_equal true, "Lúcio".homophone?("lucio")
30
38
  end
31
- def test_find_in_collection
39
+ def test_find_in_collection_pt_br
40
+ SoundCord.load_language 'pt-BR'
41
+
32
42
  list = %w( saola paulo saulo ricardo sallo )
33
43
  expected = %w( saola saulo sallo )
34
44
  assert_equal expected, list.homophones("saulo")
@@ -36,4 +46,70 @@ class SoundCordTest < Test::Unit::TestCase
36
46
  expected = %w( lucene luciana lussene )
37
47
  assert_equal expected, list.homophones("lucene")
38
48
  end
49
+
50
+ # en
51
+ def test_initiations_en
52
+ SoundCord.load_language 'en'
53
+
54
+ assert_equal "RL", "aerial".phonetize
55
+ assert_equal "RP", "wrap".phonetize
56
+ assert_equal "SN", "xeno".phonetize
57
+ assert_equal "TFR", "whatever".phonetize
58
+ assert_equal "NM", "gnome".phonetize
59
+ assert_equal "NF", "knife".phonetize
60
+ assert_equal "NMNK", "pneumonic".phonetize
61
+ end
62
+
63
+ def test_unusual_combinations_en
64
+ SoundCord.load_language 'en'
65
+
66
+ assert_equal "0TR", "theater".phonetize
67
+ assert_equal "TX", "touch".phonetize
68
+ assert_equal "XL", "shell".phonetize
69
+ assert_equal "KRX", "crutch".phonetize
70
+ assert_equal "FS", "phase".phonetize
71
+ assert_equal "BKR", "beggar".phonetize
72
+ end
73
+
74
+ def test_terminations_en
75
+ SoundCord.load_language 'en'
76
+
77
+ assert_equal "LM", "lmb".phonetize
78
+ end
79
+
80
+ def test_middle_en
81
+ SoundCord.load_language 'en'
82
+
83
+ # couldn't remember a better word with SCH in the middle
84
+ assert_equal "PRSK", "porsche".phonetize
85
+ end
86
+
87
+ def test_duplicate_exceptions_en
88
+ SoundCord.load_language 'en'
89
+
90
+ assert_equal "GKLS", "goggles".phonetize
91
+ end
92
+
93
+ def test_special_chars_en
94
+ SoundCord.load_language 'en'
95
+
96
+ assert_equal true, "Qeyla".homophone?("keyla")
97
+ assert_equal true, "Courtiney".homophone?("kourtiney")
98
+ assert_equal true, "Quartz".homophone?("kuarts")
99
+ assert_equal true, "falue".homophone?("value")
100
+ assert_equal true, "data".homophone?("tada")
101
+ end
102
+
103
+ def test_second_follwed_by_en
104
+ SoundCord.load_language 'en'
105
+
106
+ assert_equal "JM", "ogema".phonetize
107
+ end
108
+
109
+ def test_vowels_pronunciation_insignificance_en
110
+ SoundCord.load_language 'en'
111
+
112
+ assert_equal "MSX", "messiah".phonetize
113
+ assert_equal "ML", "mehlia".phonetize
114
+ end
39
115
  end
metadata CHANGED
@@ -1,75 +1,58 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: soundcord
3
- version: !ruby/object:Gem::Version
4
- hash: 25
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Lukas Alexandre
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-06-18 00:00:00 -03:00
19
- default_executable:
12
+ date: 2012-07-12 00:00:00.000000000 Z
20
13
  dependencies: []
21
-
22
- description: A phonetic algorithm to make comparison by phonetically similar terms easier.
14
+ description: ! '"Make comparisons of phonetically similar terms easier."'
23
15
  email: lukeskytm@gmail.com
24
16
  executables: []
25
-
26
17
  extensions: []
27
-
28
18
  extra_rdoc_files: []
29
-
30
- files:
19
+ files:
20
+ - lib/algorithm.rb
21
+ - lib/config.rb
31
22
  - lib/soundcord/integrations/array.rb
32
23
  - lib/soundcord/integrations/string.rb
33
24
  - lib/soundcord/version.rb
34
25
  - lib/soundcord.rb
26
+ - test/languages/pt_br/test_soundcord.rb
35
27
  - test/test_array.rb
28
+ - test/test_config.rb
29
+ - test/test_performance.rb
36
30
  - test/test_soundcord.rb
37
31
  - test/test_string.rb
38
32
  - Rakefile
39
33
  - soundcord.gemspec
40
- has_rdoc: true
41
- homepage: https://github.com/lukasalexandre/soundcord
34
+ homepage: http://lukasalexandre.github.com/soundcord
42
35
  licenses: []
43
-
44
36
  post_install_message:
45
37
  rdoc_options: []
46
-
47
- require_paths:
38
+ require_paths:
48
39
  - lib
49
- required_ruby_version: !ruby/object:Gem::Requirement
40
+ required_ruby_version: !ruby/object:Gem::Requirement
50
41
  none: false
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- hash: 3
55
- segments:
56
- - 0
57
- version: "0"
58
- required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
47
  none: false
60
- requirements:
61
- - - ">="
62
- - !ruby/object:Gem::Version
63
- hash: 3
64
- segments:
65
- - 0
66
- version: "0"
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
67
52
  requirements: []
68
-
69
53
  rubyforge_project:
70
- rubygems_version: 1.5.3
54
+ rubygems_version: 1.8.24
71
55
  signing_key:
72
56
  specification_version: 3
73
- summary: A phonetic algorithm implementation
57
+ summary: A phonetic algorithm for indexing of words by their pronunciation.
74
58
  test_files: []
75
-