RubyGems - soundcord - Versions diffs - 0.1.1 → 0.2.0 - Mend

soundcord 0.1.1 → 0.2.0

Files changed (12) hide show

data/Rakefile +3 -1
data/lib/algorithm.rb +134 -0
data/lib/config.rb +25 -0
data/lib/soundcord.rb +8 -87
data/lib/soundcord/integrations/array.rb +3 -0
data/lib/soundcord/integrations/string.rb +11 -13
data/soundcord.gemspec +4 -4
data/test/languages/pt_br/test_soundcord.rb +43 -0
data/test/test_config.rb +15 -0
data/test/test_performance.rb +23 -0
data/test/test_soundcord.rb +87 -11
metadata +26 -43

data/Rakefile CHANGED Viewed

@@ -1,7 +1,9 @@
 require 'rake/testtask'
 Rake::TestTask.new do |t|
-  t.libs << 'test'
+  t.libs << "test"
+  t.test_files = FileList['test/test*.rb']
+  t.verbose = true
 end
 desc "Run tests"

data/lib/algorithm.rb ADDED Viewed

@@ -0,0 +1,134 @@
+# encoding: utf-8
+# Text-processing half of SoundCord: turns a word into its phonetic key by
+# applying, in order, the replacement rules of the currently loaded language
+# (see load_language / lang_yml / options, which are defined in config.rb).
+class SoundCord
+  # NOTE: a bare `private` only affects instance methods. The `def self.`
+  # methods below stay publicly callable; the keyword is kept as a marker
+  # that they are internal helpers.
+  private
+  # Builds the phonetic key of +text+.
+  # Loads the default language when none has been loaded yet, lower-cases the
+  # text, applies every rule group of the language file (dispatching on the
+  # group name), collapses repeated characters and upper-cases the result.
+  # Params:
+  # +text+:: string to be phonetized
+  def self.process_text(text)
+    load_language unless language
+    text = text.downcase
+    lang_yml.each do |key, values|
+      text =
+        case key
+        when "terminations"
+          process_group(text, values, :terminations => true)
+        when "initiations"
+          process_group(text, values, :initiations => true)
+        when "follow_ups"
+          process_follow_ups(text, values, options)
+        when "second_followed"
+          process_second_followed(text, values, options)
+        when "vowels_proonunciation_insignificance"
+          process_vowels_proonunciation_insignificance(text, values, options)
+        else
+          # Groups whose name contains "duplicate" configure remove_duplicity
+          # and are not replacement rules themselves.
+          key.include?("duplicate") ? text : process_group(text, values, options)
+        end
+    end
+    text = remove_duplicity(text, :duplicate_exceptions => lang_yml["duplicate_exceptions"])
+    text.upcase
+  end
+  # Collapses runs of the same character into a single occurrence.
+  # Params:
+  # +text+:: string to clean up
+  # +options+:: hash; +:duplicate_exceptions+ lists characters that may repeat
+  # The options hash is only read, never modified.
+  def self.remove_duplicity(text, options = {})
+    exceptions = options[:duplicate_exceptions] || []
+    text.split(//).inject("") do |result, char|
+      repeated = result[-1, 1] == char
+      repeated && !exceptions.include?(char) ? result : result + char
+    end
+  end
+  # Applies a group of plain replacements to +text+.
+  # +group+ is either a hash of replacement => pattern(s) or a list of
+  # patterns; list entries have no replacement and are removed from the text.
+  def self.process_group(text, group, options)
+    group.each do |key, values|
+      text = if values
+        simple_replace(text, key, values, options)
+      else
+        simple_replace(text, '', key, options)
+      end
+    end
+    text
+  end
+  # Replaces each prefix with +key+ when it is immediately followed by one of
+  # its suffixes. +group+ maps replacement => { prefix => suffix(es) }.
+  def self.process_follow_ups(text, group, options = {})
+    group.each do |key, prefixes|
+      prefixes.each do |prefix, sufixes|
+        text = text.gsub(mount_follow_up_regexp(prefix, sufixes), key)
+      end
+    end
+    text
+  end
+  # Like process_follow_ups, but only for a prefix that is not preceded by
+  # the same character. The preceding character is part of the match, so it
+  # is re-inserted per match through the block form of gsub (a single global
+  # $1 would carry the first match's character into every replacement).
+  def self.process_second_followed(text, group, options = {})
+    group.each do |key, prefixes|
+      prefixes.each do |prefix, sufixes|
+        regexp = mount_second_followed_by_regexp(prefix, sufixes)
+        text = text.gsub(regexp) { "#{Regexp.last_match(1)}#{key}" }
+      end
+    end
+    text
+  end
+  # Drops a character that directly follows a vowel and is itself followed by
+  # a word boundary or a non-vowel, keeping the vowel.
+  # +options+ is accepted (and ignored) because process_text passes it.
+  def self.process_vowels_proonunciation_insignificance(text, group, options = {})
+    group.each do |key, value|
+      # List-style groups yield the character as +key+ with no value.
+      regexp = mount_vowels_proonunciation_insignificance_regexp(value || key)
+      text = text.gsub(regexp, '\1')
+    end
+    text
+  end
+  # Removes each configured character when it is not followed by a vowel.
+  def self.process_followed_by_consonant_regexp(text, group)
+    group.each do |key, value|
+      text = text.gsub(mount_followed_by_consonant_regexp(value || key), '')
+    end
+    text
+  end
+  # Replaces every occurrence of +values+ (one pattern or a list) with +key+.
+  def self.simple_replace(text, key, values, options)
+    text.gsub(mount_regexp(values, options), key.to_s)
+  end
+  # Joins a list of alternatives with "|"; a single value is used as is.
+  def self.alternation(choices)
+    choices.respond_to?(:join) ? choices.join("|") : choices.to_s
+  end
+  # Builds the regexp for a plain replacement.
+  # Params:
+  # +sentence+:: pattern source or list of alternative pattern sources
+  # +options+:: +:initiations+ anchors at line start, +:terminations+
+  #             requires a word boundary after the match
+  # Built with Regexp.new instead of eval so that language-file content is
+  # never executed as Ruby code.
+  def self.mount_regexp(sentence, options = { :terminations => false, :initiations => false })
+    head = options[:initiations] ? "^" : ""
+    tail = options[:terminations] ? "\\b" : ""
+    Regexp.new("#{head}(#{alternation(sentence)})#{tail}")
+  end
+  # Builds "prefix followed by one of the suffixes" using a lookahead, so
+  # only the prefix is consumed. Returns the pattern source as a String when
+  # +:not_eval+ is set, otherwise a Regexp.
+  def self.mount_follow_up_regexp(prefix, sufix, options = {})
+    source = "#{prefix}(?=(#{alternation(sufix)}))"
+    options[:not_eval] ? source : Regexp.new(source)
+  end
+  # Follow-up regexp restricted to a +char+ not preceded by the same +char+.
+  def self.mount_second_followed_by_regexp(char, group)
+    Regexp.new(not_first(char) + mount_follow_up_regexp(char, group, :not_eval => true))
+  end
+  # Vowel (captured), then +char+, then a word boundary or a non-vowel.
+  # "\\b" keeps the backslash: a bare "\b" in a double-quoted string is a
+  # backspace character, not a word boundary.
+  def self.mount_vowels_proonunciation_insignificance_regexp(char)
+    Regexp.new("([aeiou])#{char}(?=\\b|[^aeiou])")
+  end
+  # +char+ not followed by a vowel.
+  def self.mount_followed_by_consonant_regexp(char)
+    Regexp.new("[#{char}](?![aeiou])")
+  end
+  # Pattern source matching any character other than +char+, or line start.
+  def self.not_first(char)
+    "([^#{char}]|^)"
+  end
+end

data/lib/config.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require 'yaml'
+# Language configuration for SoundCord: which language file is loaded and
+# the rule set read from it.
+class SoundCord
+  DEFAULT_LANGUAGE = 'pt-BR'
+  # Anchored to this file's location instead of Dir.pwd, so the language
+  # files are found regardless of the directory the process was started in.
+  # The trailing slash is kept for code that concatenates a file name.
+  LANGUAGES_DIRECTORY = File.expand_path('soundcord/languages', File.dirname(__FILE__)) + '/'
+  # Loads the rule set of +lang+ from LANGUAGES_DIRECTORY/<lang>.yml.
+  # Params:
+  # +lang+:: language code; also the top-level key read from the YAML file
+  # The file is read before any state is assigned, so a failed load (for
+  # example a missing file) leaves the previously loaded language untouched.
+  def self.load_language(lang = DEFAULT_LANGUAGE)
+    definitions = YAML.load_file(File.join(LANGUAGES_DIRECTORY, "#{lang}.yml"))[lang]
+    @language = lang
+    @lang_yml = definitions
+    @options  = { :use_vowels => false }
+  end
+  # Code of the currently loaded language, or nil when none was loaded.
+  def self.language
+    @language
+  end
+  # Processing options; reset by load_language.
+  def self.options
+    @options
+  end
+  # NOTE: a bare `private` does not hide `def self.` methods; lang_yml stays
+  # callable from outside and is internal by convention only.
+  private
+  # Rule set read from the language file by load_language.
+  def self.lang_yml
+    @lang_yml
+  end
+end

data/lib/soundcord.rb CHANGED Viewed

@@ -2,98 +2,19 @@
 require 'soundcord/integrations/string'
 require 'soundcord/integrations/array'
+require 'algorithm'
+require 'config'
 class SoundCord
-  def self.phonetize text, options = { :use_vogals => false }
-    return handle_text(text, options)
+  def self.phonetize text
+    process_text(text)
   end
-  def self.compare term_1, term_2, options = { :use_vogals => false }
-    homophone? term_1, term_2, options
+  def self.compare term_1, term_2
+    homophone? term_1, term_2
   end
-  def self.homophone? term_1, term_2, options = { :use_vogals => false }
-    phonetize(term_1, options) == phonetize(term_2, options)
-  end
-  private
-  def self.handle_text text, options = { :use_vogals => false }
-    text = text.downcase
-    text = remove_duplicity text
-    text = handle_special_chars text
-    text = handle_unusual_chars text
-    text = handle_unusual_combinations text
-    text = handle_terminations text
-    text = remove_vogals(text) unless options[:use_vogals]
-    text = remove_unwanted_chars text
-    text.upcase
-  end
-  def self.handle_special_chars text
-    text = text.gsub /(á|à|â|ã)/, 'a'
-    text = text.gsub /(é|è|ê)/, 'e'
-    text = text.gsub /(í|ì|î)/, 'i'
-    text = text.gsub /(ó|ò|ô|õ)/, 'o'
-    text = text.gsub /(ú|ù|û)/, 'u'
-  end
-  def self.handle_unusual_chars text
-    text = text.gsub /y/, 'i'
-  end
-  def self.handle_unusual_combinations text
-    text = text.gsub /(br|bl)/, 'b'
-    text = text.gsub /ph/, 'f'
-    text = text.gsub /(gr|mg|ng|rg|gl)/, 'g'
-    text = text.gsub /(ge|gi|rj|mj|nj)/, 'j'
-    text = text.gsub /(ce|ci|ch|cs)/, 's'
-    text = text.gsub /ct/, 't'
-    text = text.gsub /(q|ca|co|cu|ck|c)/, 'k'
-    text = text.gsub /lh/, 'l'
-    text = text.gsub /rm/, 'sm'
-    text = text.gsub /(rm|gm|md|sm|ao\b)/, 'm'
-    text = text.gsub /n/, 'm'
-    text = text.gsub /ao\b/, 'm'
-    text = text.gsub /nh/, 'n'
-    text = text.gsub /pr/, 'p'
-    text = text.gsub /(ç|x|ts|c|z|rs)/, 's'
-    text = text.gsub /(tr|tl|lt|rt|st)/, 's'
-    text = text.gsub /w/, 'v'
-  end
-  def self.handle_terminations text
-    text = text.gsub /(s|z|r|m|n|ao|l)\b/, ''
-  end
-  def self.remove_vogals text
-    text = text.gsub /(a|e|i|o|u)/, ''
-  end
-  def self.remove_unwanted_chars text
-    text = text.gsub /h/, ''
-  end
-  def self.remove_duplicity text
-    text.split(//).inject("") do |s,n|
-      s + ((s[s.length-1..s.length-1] === n) ? '' : n )
-    end
+  def self.homophone? term_1, term_2
+    phonetize(term_1) == phonetize(term_2)
   end
 end

data/lib/soundcord/integrations/array.rb CHANGED Viewed

@@ -1,4 +1,7 @@
 class Array
+  # Searches the array for items that are homophones of the given string
+  # Params:
+  # +value+:: string to be phonetized and compared with the array items
   def homophones value
     self.select { |i| i.homophone? value }
   end

data/lib/soundcord/integrations/string.rb CHANGED Viewed

@@ -1,23 +1,21 @@
 class String
-  def phonetize options = { :use_vogals => false }
-    SoundCord.phonetize self, options
+  # Returns the phonetic version of the object string
+  # Takes no parameters; phonetization is delegated to
+  # SoundCord.phonetize
+  def phonetize
+    SoundCord.phonetize self
   end
+  # Returns the phonetic version of the passed string
+  # Params:
+  # +value+:: string to be phonetized
   def self.phonetize value
     value.phonetize
   end
-  # DEPRECATED: Please use homophone? instead.
-  def compare_phntc compared
-    warn "[DEPRECATION] `compare_phntc` is deprecated. Please use `homophone?` instead."
-    self.homophone? compared
-  end
-  # DEPRECATED: Please use homophone? instead.
-  def compare_phonetically compared
-    warn "[DEPRECATION] `compare_phonetically` is deprecated. Please use `homophone?` instead."
-    self.homophone? compared
-  end
+  # Compares the passed value with the object value, both in their phonetic version
+  # Params:
+  # +compared+:: string whose phonetic version is compared with the receiver's
   def homophone? compared
     SoundCord.homophone? self, compared
   end

data/soundcord.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@ Gem::Specification.new do |s|
   s.name = %q{soundcord}
   s.author = 'Lukas Alexandre'
   s.email = 'lukeskytm@gmail.com'
-  s.homepage = 'https://github.com/lukasalexandre/soundcord'
-  s.version = "0.1.1"
+  s.homepage = 'http://lukasalexandre.github.com/soundcord'
+  s.version = "0.2.0"
   s.date = Date.today
-  s.summary = %q{A phonetic algorithm implementation}
-  s.description = "A phonetic algorithm to make comparison by phonetically similar terms easier."
+  s.summary = %q{A phonetic algorithm for indexing of words by their pronunciation.}
+  s.description = %q{"Make comparisons of phonetically similar terms easier."}
   s.files = Dir["{lib/**/*.rb,README.rdoc,test/**/*.rb,Rakefile,*.gemspec}"]
   s.require_paths = ["lib"]
 end

data/test/languages/pt_br/test_soundcord.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# encoding: utf-8
+require 'test/unit'
+require 'soundcord'
+# pt-BR behaviour of the String/Array integrations.
+class SoundCordTest < Test::Unit::TestCase
+  # The loaded language is shared class-level state; load pt-BR before every
+  # test so the results do not depend on which test ran previously.
+  def setup
+    SoundCord.load_language 'pt-BR'
+  end
+  def test_simple_words
+    assert_equal "J", "João".phonetize
+    assert_equal "MR", "Maria".phonetize
+    assert_equal "LM", "Helena".phonetize
+    assert_equal "VLM", "Valmir".phonetize
+    assert_equal "VLM", "Walmir".phonetize
+  end
+  def test_simple_comparisons
+    assert_equal true, "Joao".homophone?("João")
+    assert_equal true, "Helena".homophone?("Elena")
+    assert_equal true, "Walmir".homophone?("Valmir")
+    assert_equal true, "Marria".homophone?("Maria")
+    assert_equal true, "Wagner".homophone?("Vagner")
+    assert_equal true, "Mirela".homophone?("Mirella")
+    assert_equal true, "Artur".homophone?("Arthur")
+    assert_equal true, "Diego".homophone?("Dyego")
+    assert_equal true, "Felipe".homophone?("Phelipe")
+    assert_equal true, "Filipe".homophone?("Felipe")
+    assert_equal true, "Phelipe".homophone?("Filipe")
+    assert_equal true, "Philippe".homophone?("Felipe")
+  end
+  # No use_vowels test here: String#phonetize takes no arguments in this
+  # version, so a call such as phonetize(:use_vowels => true) can only raise
+  # ArgumentError.
+  def test_special_chars
+    assert_equal true, "Luçia".homophone?("lucia")
+    assert_equal true, "Lúcio".homophone?("lucio")
+  end
+  def test_find_in_collection
+    list = %w( saola paulo saulo ricardo sallo )
+    expected = %w( saola saulo sallo )
+    assert_equal expected, list.homophones("saulo")
+    list = %w( leonardo lucene rodrigo luciana lussene )
+    expected = %w( lucene luciana lussene )
+    assert_equal expected, list.homophones("lucene")
+  end
+end

data/test/test_config.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# encoding: utf-8
+require 'test/unit'
+require 'config'
+# Checks that SoundCord.load_language records the requested language.
+class SoundCordTest < Test::Unit::TestCase
+  # Each test needs its own method name: two definitions with the same name
+  # leave only the last one, so the other language would never be checked.
+  def test_language_set_up_pt_br
+    SoundCord.load_language "pt-BR"
+    assert_equal "pt-BR", SoundCord.language
+  end
+  def test_language_set_up_en
+    SoundCord.load_language "en"
+    assert_equal "en", SoundCord.language
+  end
+end

data/test/test_performance.rb ADDED Viewed

@@ -0,0 +1,23 @@
+# encoding: utf-8
+require "benchmark"
+require 'test/unit'
+require 'soundcord'
+# Coarse performance guard for the pt-BR rule set.
+class SoundCordTest < Test::Unit::TestCase
+  # pt-BR
+  # Phonetizes 100 random 8-letter upper-case words and expects the whole
+  # batch to finish in under half a second of wall-clock time.
+  def test_with_100_words_pt_br
+    SoundCord.load_language 'pt-BR'
+    # rand(26) covers the full A..Z range (65 + 0..25).
+    list_of_random_words = Array.new(100) do
+      Array.new(8) { (65 + rand(26)).chr }.join
+    end
+    time = Benchmark.measure do
+      list_of_random_words.each { |word| word.phonetize }
+    end
+    assert time.real < 0.5, "phonetizing 100 words took #{time.real}s"
+  end
+end

data/test/test_soundcord.rb CHANGED Viewed

@@ -4,14 +4,19 @@ require 'test/unit'
 require 'soundcord'
 class SoundCordTest < Test::Unit::TestCase
-  def test_simple_words
-    assert_equal "João".phonetize, "J"
-    assert_equal "Maria".phonetize, "MR"
-    assert_equal "Helena".phonetize, "LM"
-    assert_equal "Valmir".phonetize, "VLM"
-    assert_equal "Walmir".phonetize, "VLM"
-  end
-  def test_simple_comparations
+  # pt-BR
+  def test_simple_words_pt_br
+    SoundCord.load_language 'pt-BR'
+    assert_equal "J", "João".phonetize
+    assert_equal "MR", "Maria".phonetize
+    assert_equal "LM", "Helena".phonetize
+    assert_equal "VLM", "Valmir".phonetize
+    assert_equal "VLM", "Walmir".phonetize
+  end
+  def test_simple_comparisons_pt_br
+    SoundCord.load_language 'pt-BR'
     assert_equal true, "Joao".homophone?("João")
     assert_equal true, "Helena".homophone?("Elena")
     assert_equal true, "Walmir".homophone?("Valmir")
@@ -25,10 +30,15 @@ class SoundCordTest < Test::Unit::TestCase
     assert_equal true, "Phelipe".homophone?("Filipe")
     assert_equal true, "Philippe".homophone?("Felipe")
   end
-  def test_use_vogals_option
-    assert_equal "ELEMA", "Helena".phonetize(:use_vogals => true)
+  def test_special_chars_pt_br
+    SoundCord.load_language 'pt-BR'
+    assert_equal true, "Luçia".homophone?("lucia")
+    assert_equal true, "Lúcio".homophone?("lucio")
   end
-  def test_find_in_collection
+  def test_find_in_collection_pt_br
+    SoundCord.load_language 'pt-BR'
     list = %w( saola paulo saulo ricardo sallo )
     expected = %w( saola saulo sallo )
     assert_equal expected, list.homophones("saulo")
@@ -36,4 +46,70 @@ class SoundCordTest < Test::Unit::TestCase
     expected = %w( lucene luciana lussene )
     assert_equal expected, list.homophones("lucene")
   end
+  # en
+  def test_initiations_en
+    SoundCord.load_language 'en'
+    assert_equal "RL", "aerial".phonetize
+    assert_equal "RP", "wrap".phonetize
+    assert_equal "SN", "xeno".phonetize
+    assert_equal "TFR", "whatever".phonetize
+    assert_equal "NM", "gnome".phonetize
+    assert_equal "NF", "knife".phonetize
+    assert_equal "NMNK", "pneumonic".phonetize
+  end
+  def test_unusual_combinations_en
+    SoundCord.load_language 'en'
+    assert_equal "0TR", "theater".phonetize
+    assert_equal "TX", "touch".phonetize
+    assert_equal "XL", "shell".phonetize
+    assert_equal "KRX", "crutch".phonetize
+    assert_equal "FS", "phase".phonetize
+    assert_equal "BKR", "beggar".phonetize
+  end
+  def test_terminations_en
+    SoundCord.load_language 'en'
+    assert_equal "LM", "lmb".phonetize
+  end
+  def test_middle_en
+    SoundCord.load_language 'en'
+    # couldn't remember a better word with SCH in the middle
+    assert_equal "PRSK", "porsche".phonetize
+  end
+  def test_duplicate_exceptions_en
+    SoundCord.load_language 'en'
+    assert_equal "GKLS", "goggles".phonetize
+  end
+  def test_special_chars_en
+    SoundCord.load_language 'en'
+    assert_equal true, "Qeyla".homophone?("keyla")
+    assert_equal true, "Courtiney".homophone?("kourtiney")
+    assert_equal true, "Quartz".homophone?("kuarts")
+    assert_equal true, "falue".homophone?("value")
+    assert_equal true, "data".homophone?("tada")
+  end
+  def test_second_follwed_by_en
+    SoundCord.load_language 'en'
+    assert_equal "JM", "ogema".phonetize
+  end
+  def test_vowels_pronunciation_insignificance_en
+    SoundCord.load_language 'en'
+    assert_equal "MSX", "messiah".phonetize
+    assert_equal "ML", "mehlia".phonetize
+  end
 end

metadata CHANGED Viewed

@@ -1,75 +1,58 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: soundcord
-version: !ruby/object:Gem::Version
-  hash: 25
+version: !ruby/object:Gem::Version
+  version: 0.2.0
   prerelease:
-  segments:
-  - 0
-  - 1
-  - 1
-  version: 0.1.1
 platform: ruby
-authors:
+authors:
 - Lukas Alexandre
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-06-18 00:00:00 -03:00
-default_executable:
+date: 2012-07-12 00:00:00.000000000 Z
 dependencies: []
-description: A phonetic algorithm to make comparison by phonetically similar terms easier.
+description: ! '"Make comparisons of phonetically similar terms easier."'
 email: lukeskytm@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files: []
-files:
+files:
+- lib/algorithm.rb
+- lib/config.rb
 - lib/soundcord/integrations/array.rb
 - lib/soundcord/integrations/string.rb
 - lib/soundcord/version.rb
 - lib/soundcord.rb
+- test/languages/pt_br/test_soundcord.rb
 - test/test_array.rb
+- test/test_config.rb
+- test/test_performance.rb
 - test/test_soundcord.rb
 - test/test_string.rb
 - Rakefile
 - soundcord.gemspec
-has_rdoc: true
-homepage: https://github.com/lukasalexandre/soundcord
+homepage: http://lukasalexandre.github.com/soundcord
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.5.3
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
-summary: A phonetic algorithm implementation
+summary: A phonetic algorithm for indexing of words by their pronunciation.
 test_files: []