engtagger 0.2.0 → 0.3.0

data/lib/engtagger.rb CHANGED
@@ -3,30 +3,17 @@
 
  $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
  require 'rubygems'
- require 'kconv'
  require 'porter'
+ require 'lru_redux'
 
- # use hpricot for extracting English text from docs with XML like tags
- begin
- require 'hpricot'
- rescue LoadError
- $no_hpricot = true
- end
-
- # File paths
- $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
- $word_path = File.join($lexpath, "pos_words.hash")
- $tag_path = File.join($lexpath, "pos_tags.hash")
-
- # for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
- class Module
- def memoize(method)
+ module BoundedSpaceMemoizable
+ def memoize(method, max_cache_size=100000)
  # alias_method is faster than define_method + old.bind(self).call
  alias_method "__memoized__#{method}", method
  module_eval <<-EOF
- def #{method}(*a, &b)
- # assumes the block won't change the result if the args are the same
- (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+ def #{method}(*a)
+ @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
+ @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
 
  end
  EOF
  end
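This memoization change is the heart of the release: the unbounded hash cache from the old eigenclass snippet (which also monkey-patched Module globally) becomes a mixin backed by an LRU cache from the lru_redux gem, so a long-running tagger no longer grows without limit. A minimal standalone sketch of the same pattern; the Fib class and the cache size of 1000 are illustrative only, and lru_redux must be installed:

    require 'lru_redux'

    module BoundedSpaceMemoizable
      def memoize(method, max_cache_size = 100_000)
        # alias_method is faster than define_method + old.bind(self).call
        alias_method "__memoized__#{method}", method
        module_eval <<-EOF
          def #{method}(*a)
            @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
            @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
          end
        EOF
      end
    end

    class Fib
      extend BoundedSpaceMemoizable

      def fib(n)
        n < 2 ? n : fib(n - 1) + fib(n - 2)
      end
      # keep at most 1000 argument lists; older entries are evicted LRU-first
      memoize("fib", 1000)
    end

    puts Fib.new.fib(80)  # returns immediately; the unmemoized version would not

Note that the cache key is the full argument array, and the new wrapper drops block support, so memoized methods must be deterministic in their arguments alone.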
@@ -34,27 +21,39 @@ end
 
  # English part-of-speech tagger class
  class EngTagger
+ extend BoundedSpaceMemoizable
+
+ # File paths
+ DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+ DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
+ DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
  #################
  # Class methods #
  #################
-
- # Return a class variable that holds probability data
+
+ # Return a class variable that holds probability data.
+ #
+ # @return [Hash] the probability data
+ #
  def self.hmm
  return @@hmm
  end
 
- # Return a class variable that holds lexical data
+ # Return a class variable that holds lexical data.
+ #
+ # @return [Hash] the lexicon
+ #
  def self.lexicon
  return @@lexicon
  end
-
- # Return a regexp from a string argument that matches an XML-style pos tag
+
+ # Return a regexp from a string argument that matches an XML-style pos tag
  def self.get_ext(tag = nil)
  return nil unless tag
  return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
  end
-
+
  # Regexps to match XML-style part-of-speech tags
  NUM = get_ext('cd')
  GER = get_ext('vbg')
@@ -70,22 +69,37 @@ class EngTagger
  VB = get_ext('vb')
  VBG = get_ext('vbg')
  VBD = get_ext('vbd')
- PART = get_ext('vbn')
+ PART = get_ext('vbn')
  VBP = get_ext('vbp')
  VBZ = get_ext('vbz')
  JJ = get_ext('jj')
  JJR = get_ext('jjr')
  JJS = get_ext('jjs')
+ RB = get_ext('rb')
+ RBR = get_ext('rbr')
+ RBS = get_ext('rbs')
+ RP = get_ext('rp')
+ WRB = get_ext('wrb')
+ WDT = get_ext('wdt')
+ WP = get_ext('wp')
+ WPS = get_ext('wps')
+ CC = get_ext('cc')
+ IN = get_ext('in')
 
- # Convert a Treebank-style, abbreviated tag into verbose definitions
+ # Convert a Treebank-style, abbreviated tag into verbose definitions
+ #
+ # @param tag [#to_s] the tag in question
+ # @return [String] the definition, if available
+ #
  def self.explain_tag(tag)
+ tag = tag.to_s.downcase
  if TAGS[tag]
  return TAGS[tag]
  else
  return tag
  end
- end
-
+ end
+
  # The folloging is to make a hash to convert a pos tag to its definition
  # used by the explain_tag method
  tags = [
@@ -132,35 +146,35 @@ class EngTagger
  "PPR", "Punctuation, quotation mark right",
  "PPS", "Punctuation, colon, semicolon, elipsis",
  "LRB", "Punctuation, left bracket",
- "RRB", "Punctuation, right bracket"
- ]
+ "RRB", "Punctuation, right bracket"
+ ]
  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
  TAGS = Hash[*tags]
-
+
  # Hash storing config values:
  #
  # * :unknown_word_tag
  # => (String) Tag to assign to unknown words
- # * :stem
+ # * :stem
  # => (Boolean) Stem single words using Porter module
  # * :weight_noun_phrases
- # => (Boolean) When returning occurrence counts for a noun phrase, multiply
+ # => (Boolean) When returning occurrence counts for a noun phrase, multiply
  # the valuethe number of words in the NP.
- # * :longest_noun_phrase
- # => (Integer) Will ignore noun phrases longer than this threshold. This
+ # * :longest_noun_phrase
+ # => (Integer) Will ignore noun phrases longer than this threshold. This
  # affects only the get_words() and get_nouns() methods.
- # * :relax
- # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+ # * :relax
+ # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
  # uncommon words, particularly words used polysemously
  # * :tag_lex
- # => (String) Name of the YAML file containing a hash of adjacent part of
+ # => (String) Name of the YAML file containing a hash of adjacent part of
  # speech tags and the probability of each
  # * :word_lex
- # => (String) Name of the YAML file containing a hash of words and corresponding
+ # => (String) Name of the YAML file containing a hash of words and corresponding
  # parts of speech
  # * :unknown_lex
- # => (String) Name of the YAML file containing a hash of tags for unknown
+ # => (String) Name of the YAML file containing a hash of tags for unknown
  # words and corresponding parts of speech
  # * :tag_path
  # => (String) Directory path of tag_lex
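With the new `tag = tag.to_s.downcase` line, explain_tag now resolves symbol or uppercase arguments; in 0.2.0 a failed lookup simply returned the argument unchanged. For example, given the punctuation entries shown above and the downcase/underscore transform applied to the tags array:

    EngTagger.explain_tag("lrb")  # => "punctuation_left_bracket"
    EngTagger.explain_tag(:LRB)   # same result in 0.3.0; in 0.2.0 this returned :LRB untouched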
@@ -169,12 +183,12 @@ class EngTagger
  # * :debug
  # => (Boolean) Print debug messages
  attr_accessor :conf
-
+
  ###############
  # Constructor #
  ###############
-
- # Take a hash of parameters that override default values.
+
+ # Take a hash of parameters that override default values.
  # See above for details.
  def initialize(params = {})
  @conf = Hash.new
@@ -186,13 +200,13 @@ class EngTagger
  @conf[:tag_lex] = 'tags.yml'
  @conf[:word_lex] = 'words.yml'
  @conf[:unknown_lex] = 'unknown.yml'
- @conf[:word_path] = $word_path
- @conf[:tag_path] = $tag_path
+ @conf[:word_path] = DEFAULT_WORDPATH
+ @conf[:tag_path] = DEFAULT_TAGPATH
  @conf[:debug] = false
  # assuming that we start analyzing from the beginninga new sentence...
- @conf[:current_tag] = 'pp'
- @conf.merge!(params)
- unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
+ @conf[:current_tag] = 'pp'
+ @conf.merge!(params) if params
+ unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
  print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
  @@hmm = Hash.new
  @@lexicon = Hash.new
@@ -206,11 +220,38 @@ class EngTagger
  end
  @@mnp = get_max_noun_regex
  end
-
+
  ##################
  # Public methods #
  ##################
-
+
+ # Return an array of pairs of the form `["word", :tag]`.
+ #
+ # @param text [String] the input text
+ # @return [Array] the tagged words
+ #
+ def tag_pairs(text)
+ return [] unless valid_text(text)
+
+ out = clean_text(text).map do |word|
+ cleaned_word = clean_word word
+ tag = assign_tag(@conf[:current_tag], cleaned_word)
+ @conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
+ [word, tag.to_sym]
+ end
+
+ # reset the tagger state
+ reset
+
+ out
+ end
+
+ # Examine the string provided and return it fully tagged in XML style.
+ #
+ # @param text [String] the input text
+ # @param verbose [false, true] whether to use verbose tags
+ # @return [String] the marked-up string
+ #
  # Examine the string provided and return it fully tagged in XML style
  def add_tags(text, verbose = false)
  return nil unless valid_text(text)
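tag_pairs is the main new public API in 0.3.0, returning word/symbol pairs instead of an XML string. A usage sketch; the exact tags depend on the bundled lexicon, so the output shown is illustrative:

    require 'engtagger'

    tgr = EngTagger.new
    tgr.tag_pairs("The dog runs quickly")
    # => [["The", :det], ["dog", :nn], ["runs", :vbz], ["quickly", :rb]]  (illustrative)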
@@ -222,15 +263,15 @@
  tag = assign_tag(@conf[:current_tag], cleaned_word)
  @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
  tag = EngTagger.explain_tag(tag) if verbose
- tagged << '<' + tag + '>' + word + '</' + tag + '>'
+ tagged << '<' + tag + '>' + word + '</' + tag + '>'
  end
  reset
  return tagged.join(' ')
  end
-
- # Given a text string, return as many nouns and noun phrases as possible.
+
+ # Given a text string, return as many nouns and noun phrases as possible.
  # Applies add_tags and involves three stages:
- #
+ #
  # * Tag the text
  # * Extract all the maximal noun phrases
  # * Recursively extract all noun phrases from the MNPs
@@ -244,19 +285,19 @@
  return get_noun_phrases(tagged)
  end
  end
-
- # Return an easy-on-the-eyes tagged version of a text string.
+
+ # Return an easy-on-the-eyes tagged version of a text string.
  # Applies add_tags and reformats to be easier to read.
  def get_readable(text, verbose = false)
  return nil unless valid_text(text)
  tagged = add_tags(text, verbose)
- tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+ tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
+ #!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
  $1 + '/' + $2.upcase
  end
- return tagged
  end
-
- # Return an array of sentences (without POS tags) from a text.
+
+ # Return an array of sentences (without POS tags) from a text.
  def get_sentences(text)
  return nil unless valid_text(text)
  tagged = add_tags(text)
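For comparison, the two older entry points in these hunks wrap the same tagging loop. Dropping the explicit `return tagged` in get_readable is safe because the gsub assignment is the method's final expression. A usage sketch with illustrative output:

    tgr = EngTagger.new
    tgr.add_tags("The dog runs")
    # => "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>"   (illustrative)
    tgr.get_readable("The dog runs")
    # => "The/DET dog/NN runs/VBZ"                       (illustrative)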
@@ -270,25 +311,19 @@
  sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
  sentence.gsub(Regexp.new(" (\W+)$")){$1}
  sentence.gsub(Regexp.new("^(`+) ")){$1}
- end
+ end
  return sentences
  end
-
+
  # Given a POS-tagged text, this method returns a hash of all proper nouns
  # and their occurrence frequencies. The method is greedy and will
  # return multi-word phrases, if possible, so it would find ``Linguistic
- # Data Consortium'' as a single unit, rather than as three individual
- # proper nouns. This method does not stem the found words.
+ # Data Consortium'' as a single unit, rather than as three individual
+ # proper nouns. This method does not stem the found words.
  def get_proper_nouns(tagged)
  return nil unless valid_text(tagged)
- trimmed = tagged.scan(NNP).map do |n|
- strip_tags(n)
- end
- nnp = Hash.new(0)
- trimmed.each do |n|
- next unless n.length < 100 # sanity check on word length
- nnp[n] += 1 unless n =~ /\A\s*\z/
- end
+ tags = [NNP]
+ nnp = build_matches_hash(build_trimmed(tagged, tags))
  # Now for some fancy resolution stuff...
  nnp.keys.each do |key|
  words = key.split(/\s/)
@@ -301,7 +336,7 @@
  /\A([a-z])[a-z]*\z/ =~ word
  $1
  end.join ''
- # If that acronym has been seen,
+ # If that acronym has been seen,
  # remove it and add the values to
  # the full name
  if nnp[acronym]
@@ -312,167 +347,170 @@
  end
  return nnp
  end
-
- # Given a POS-tagged text, this method returns all nouns and their
- # occurrence frequencies.
+
+ # Given a POS-tagged text, this method returns all nouns and their
+ # occurrence frequencies.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_nouns(tagged)
  return nil unless valid_text(tagged)
- NN
- trimmed = tagged.scan(NN).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [NN]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ # Returns all types of verbs and does not descriminate between the
+ # various kinds. Combines all other verb methods listed in this
+ # class.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_verbs(tagged)
+ return nil unless valid_text(tagged)
+ tags = [VB, VBD, VBG, PART, VBP, VBZ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+
  def get_infinitive_verbs(tagged)
  return nil unless valid_text(tagged)
- VB
- trimmed = tagged.scan(VB).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VB]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_past_tense_verbs(tagged)
  return nil unless valid_text(tagged)
- VBD
- trimmed = tagged.scan(VBD).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBD]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_gerund_verbs(tagged)
  return nil unless valid_text(tagged)
- VBG
- trimmed = tagged.scan(VB).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBG]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_passive_verbs(tagged)
  return nil unless valid_text(tagged)
- PART
- trimmed = tagged.scan(PART).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [PART]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
-
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_base_present_verbs(tagged)
  return nil unless valid_text(tagged)
- VBP
- trimmed = tagged.scan(VBP).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBP]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_present_verbs(tagged)
  return nil unless valid_text(tagged)
- VBZ
- trimmed = tagged.scan(VBZ).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBZ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJ
- trimmed = tagged.scan(JJ).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [JJ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_comparative_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJR
- trimmed = tagged.scan(JJR).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
- end
+ tags = [JJR]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_superlative_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJS
- trimmed = tagged.scan(JJS).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [JJS]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_adverbs(tagged)
+ return nil unless valid_text(tagged)
+ tags = [RB, RBR, RBS, RP]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_interrogatives(tagged)
+ return nil unless valid_text(tagged)
+ tags = [WRB, WDT, WP, WPS]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ # To be consistent with documentation's naming of 'interrogative'
+ # parts of speech as 'question'
+ alias_method :get_question_parts, :get_interrogatives
+
+ # Returns all types of conjunctions and does not discriminate
+ # between the various kinds. E.g. coordinating, subordinating,
+ # correlative...
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_conjunctions(tagged)
+ return nil unless valid_text(tagged)
+ tags = [CC, IN]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
  # Given a POS-tagged text, this method returns only the maximal noun phrases.
- # May be called directly, but is also used by get_noun_phrases
+ # May be called directly, but is also used by `get_noun_phrases`.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_max_noun_phrases(tagged)
- return unless valid_text(tagged)
- mn_phrases = tagged.scan(@@mnp).map do |m|
- strip_tags(m)
- end
+ return nil unless valid_text(tagged)
+ tags = [@@mnp]
+ mn_phrases = build_trimmed(tagged, tags)
  ret = Hash.new(0)
  mn_phrases.each do |p|
  p = stem(p) unless p =~ /\s/ # stem single words
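All of the repetitive scan/stem/count bodies above collapse into the two private helpers introduced in a later hunk, and 0.3.0 gains adverb, interrogative, and conjunction extraction plus an all-verbs aggregate. A usage sketch; counts are keyed by Porter-stemmed forms, so every key and count below is illustrative:

    tgr = EngTagger.new
    tagged = tgr.add_tags("Where is the dog quietly sleeping and dreaming?")
    tgr.get_verbs(tagged)           # e.g. {"is"=>1, "sleep"=>1, "dream"=>1}
    tgr.get_adverbs(tagged)         # e.g. {"quietli"=>1}  (Porter stem of "quietly")
    tgr.get_question_parts(tagged)  # alias of get_interrogatives, e.g. {"where"=>1}
    tgr.get_conjunctions(tagged)    # e.g. {"and"=>1}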
@@ -482,11 +520,15 @@ class EngTagger
  end
 
  # Similar to get_words, but requires a POS-tagged text as an argument.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_noun_phrases(tagged)
  return nil unless valid_text(tagged)
  found = Hash.new(0)
  phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
- scanned = tagged.scan(@@mnp)
+ scanned = tagged.scan(@@mnp)
  # Find MNPs in the text, one sentence at a time
  # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
  mn_phrases = []
@@ -495,9 +537,9 @@
  mn_phrases += m.split(phrase_ext)
  end
  mn_phrases.each do |mnp|
- # Split the phrase into an array of words, and create a loop for each word,
- # shortening the phrase by removing the word in the first position.
- # Record the phrase and any single nouns that are found
+ # Split the phrase into an array of words, and create a loop for each word,
+ # shortening the phrase by removing the word in the first position.
+ # Record the phrase and any single nouns that are found
  words = mnp.split
  words.length.times do |i|
  found[words.join(' ')] += 1 if words.length > 1
@@ -519,12 +561,12 @@
  multiplier = word_count if @conf[:weight_noun_phrases]
  ret[k] += multiplier * v
  end
- return ret
+ return ret
  end
-
- # Reads some included corpus data and saves it in a stored hash on the
- # local file system. This is called automatically if the tagger can't
- # find the stored lexicon.
+
+ # Reads some included corpus data and saves it in a stored hash on the
+ # local file system. This is called automatically if the tagger can't
+ # find the stored lexicon.
  def install
  puts "Creating part-of-speech lexicon" if @conf[:debug]
  load_tags(@conf[:tag_lex])
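The `ret[k] += multiplier * v` accumulation above deserves a worked example: with :weight_noun_phrases enabled, a phrase's occurrence count is scaled by its word count. Illustrative numbers:

    word_count = 3          # e.g. "linguistic data consortium"
    v          = 4          # the phrase occurred four times
    multiplier = word_count # with @conf[:weight_noun_phrases] = true
    multiplier * v          # => 12 added to ret[k]; only 4 when weighting is off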
@@ -542,7 +584,23 @@ class EngTagger
  # Private methods #
  ###################
 
- :private
+ private
+
+ def build_trimmed(tagged, tags)
+ tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+ strip_tags(n)
+ end
+ end
+
+ def build_matches_hash(trimmed)
+ ret = Hash.new(0)
+ trimmed.each do |n|
+ n = stem(n)
+ next unless n.length < 100 # sanity check on word length
+ ret[n] += 1 unless n =~ /\A\s*\z/
+ end
+ ret
+ end
 
  # Downcase the first letter of word
  def lcfirst(word)
@@ -552,8 +610,8 @@ class EngTagger
  # Upcase the first letter of word
  def ucfirst(word)
  word.split(//)[0].upcase + word.split(//)[1..-1].join
- end
-
+ end
+
  # Return the word stem as given by Stemmable module. This can be
  # turned off with the class parameter @conf[:stem] => false.
  def stem(word)
@@ -561,8 +619,8 @@ class EngTagger
  return word.stem
  end
 
- # This method will reset the preceeding tag to a sentence ender (PP).
- # This prepares the first word of a new sentence to be tagged correctly.
+ # This method will reset the preceeding tag to a sentence ender (PP).
+ # This prepares the first word of a new sentence to be tagged correctly.
  def reset
  @conf[:current_tag] = 'pp'
  end
@@ -581,7 +639,7 @@ class EngTagger
  return true
  end
  end
-
+
  # Return a text string with the part-of-speech tags removed
  def strip_tags(tagged, downcase = false)
  return nil unless valid_text(tagged)
@@ -595,18 +653,11 @@ class EngTagger
  return text
  end
  end
-
- # Strip the provided text of HTML-style tags and separate off any punctuation
- # in preparation for tagging
+
+ # Strip the provided text and separate off any punctuation in preparation for tagging
  def clean_text(text)
  return false unless valid_text(text)
- text = text.toutf8
- unless $no_hpricot
- # Strip out any markup and convert entities to their proper form
- cleaned_text = Hpricot(text).inner_text
- else
- cleaned_text = text
- end
+ cleaned_text = text.encode('utf-8')
  tokenized = []
  # Tokenize the text (splitting on punctuation as you go)
  cleaned_text.split(/\s+/).each do |line|
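Dropping kconv and hpricot means clean_text no longer guesses encodings or strips markup: String#encode transcodes only from the string's declared encoding. A hedged sketch of the behavioral difference:

    "café".encode('utf-8')   # no-op for a string already in UTF-8
    latin1 = "caf\xE9".force_encoding('ISO-8859-1')
    latin1.encode('utf-8')   # => "café"; unlike Kconv#toutf8, a wrongly declared
                             # encoding raises an error instead of being guessed around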
@@ -615,41 +666,43 @@ class EngTagger
  words = split_sentences(tokenized)
  return words
  end
-
- # This handles all of the trailing periods, keeping those that
+
+ # This handles all of the trailing periods, keeping those that
  # belong on abbreviations and removing those that seem to be
  # at the end of sentences. This method makes some assumptions
  # about the use of capitalization in the incoming text
  def split_sentences(array)
  tokenized = array
- people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+ people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
  supt det mssrs rev)
  army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
  inst = %w(dept univ assn bros ph.d)
- place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+ place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
  hwy hway la pde pd plz pl rd st tce)
  comp = %w(mfg inc ltd co corp)
- state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
- ind ia kans kan ken ky la me md is mass mich minn miss mo mont
- neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+ state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
  va wash wis wisc wy wyo usafa alta man ont que sask yuk)
  month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
  misc = %w(vs etc no esp)
- abbr = Hash.new
+ abbr = Hash.new
  [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
  abbr[i] = true
  end
  words = Array.new
  tokenized.each_with_index do |t, i|
- if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+ if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
+ tokenized[i] =~ /\A(.+)\.\z/
  w = $1
- # Don't separate the period off words that
+ # Don't separate the period off words that
  # meet any of the following conditions:
  #
  # 1. It is defined in one of the lists above
- # 2. It is only one letter long: Alfred E. Sloan
+ # 2. It is only one letter long: Alfred E. Sloan
  # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
- unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+ unless abbr[w.downcase] or
+ [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
  words << w
  words << '.'
  next
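The refactored condition replaces two chained `=~` checks with `Regexp#match?` over an array, which avoids setting the global match variables and requires Ruby 2.4+. The same three escape hatches apply; for example:

    w = "U.S.A"   # token "U.S.A." with its final period captured off
    [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
    # => true (repeating letter-dot), so the period stays attached

    w = "ran"
    [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
    # => false; if "ran" is also absent from the abbreviation lists,
    #    the period is split off as a sentence boundary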
@@ -664,8 +717,8 @@ class EngTagger
  end
  return words
  end
-
- # Separate punctuation from words, where appropriate. This leaves trailing
+
+ # Separate punctuation from words, where appropriate. This leaves trailing
  # periods in place to be dealt with later. Called by the clean_text method.
  def split_punct(text)
  # If there's no punctuation, return immediately
@@ -675,27 +728,27 @@ class EngTagger
 
  # Put quotes into a standard format
  text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
- text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
- text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+ text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+ text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
  text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
  text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
+
  # Handle all other punctuation
  text = text.gsub(/--+/o, " - ") # Convert and separate dashes
  text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
- text = text.gsub(/:/o, " :") # Shift semicolons off
- text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+ text = text.gsub(/:/o, " : ") # Shift semicolons off
+ text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
  text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
  text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
 
  # English-specific contractions
  text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
- text = text.gsub(/n't\b/o, " n't") # Separate off n't
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
  text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
  result = text.split(' ')
  return result
- end
-
+ end
+
  # Given a preceding tag, assign a tag word. Called by the add_tags method.
  # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
  def assign_tag(prev_tag, word)
@@ -709,7 +762,7 @@ class EngTagger
  best_so_far = 0
  w = @@lexicon[word]
  t = @@hmm
-
+
  # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
  # which is used in most POS taggers
  best_tag = ""
@@ -724,9 +777,9 @@
  else
  next
  end
-
- # Bayesian logic:
- # P = P( tag | prev_tag ) * P( tag | word )
+
+ # Bayesian logic:
+ # P = P( tag | prev_tag ) * P( tag | word )
  probability = t[prev_tag][tag] * (pw + 1)
  # Set the tag with maximal probability
  if probability > best_so_far
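The scoring line multiplies the tag-transition probability by an add-one-smoothed word emission count, so unseen tag/word combinations still receive a nonzero score. Illustrative arithmetic with hypothetical numbers:

    t  = { "det" => { "nn" => 0.475, "jj" => 0.250 } }  # hypothetical transition probs
    pw = { "nn" => 30, "jj" => 5 }                      # hypothetical counts for this word
    t["det"]["nn"] * (pw["nn"] + 1)   # => 14.725, so "nn" wins
    t["det"]["jj"] * (pw["jj"] + 1)   # => 1.5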
@@ -735,18 +788,18 @@
  end
  end
  return best_tag
- end
-
- # This method determines whether a word should be considered in its
+ end
+
+ # This method determines whether a word should be considered in its
  # lower or upper case form. This is useful in considering proper nouns
- # and words that begin sentences. Called by add_tags.
+ # and words that begin sentences. Called by add_tags.
  def clean_word(word)
  lcf = lcfirst(word)
  # seen this word as it appears (lower or upper case)
  if @@lexicon[word]
  return word
  elsif @@lexicon[lcf]
- # seen this word only as lower case
+ # seen this word only as lower case
  return lcf
  else
  # never seen this word. guess.
@@ -754,14 +807,13 @@
  end
  end
 
- # This changes any word not appearing in the lexicon to identifiable
- # classes of words handled by a simple unknown word classification
+ # This changes any word not appearing in the lexicon to identifiable
+ # classes of words handled by a simple unknown word classification
  # metric. Called by the clean_word method.
  def classify_unknown_word(word)
  if /[\(\{\[]/ =~ word # Left brackets
  classified = "*LRB*"
- elsif
- /[\)\}\]]/ =~ word # Right brackets
+ elsif /[\)\}\]]/ =~ word # Right brackets
  classified = "*RRB*"
  elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
  classified = "*NUM*"
@@ -800,33 +852,33 @@
  end
  return classified
  end
-
- # This returns a compiled regexp for extracting maximal noun phrases
+
+ # This returns a compiled regexp for extracting maximal noun phrases
  # from a POS-tagged text.
  def get_max_noun_regex
  regex = /
- # optional number, gerund - adjective -participle
- (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
- # Followed by one or more nouns
- (?:#{NN})+
- (?:
- # Optional preposition, determinant, cardinal
- (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
- # Optional gerund-adjective -participle
- (?:#{GER}|#{ADJ}|#{PART})*
- # one or more nouns
- (?:#{NN})+
- )*
- /xo #/
- return regex
- end
-
- # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
- # YAML data parser. It will load a YAML document with a collection of key:
- # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
- # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
- def load_tags(lexicon)
- path = File.join($lexpath, lexicon)
+ # optional number, gerund - adjective -participle
+ (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+ # Followed by one or more nouns
+ (?:#{NN})+
+ (?:
+ # Optional preposition, determinant, cardinal
+ (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+ # Optional gerund-adjective -participle
+ (?:#{GER}|#{ADJ}|#{PART})*
+ # one or more nouns
+ (?:#{NN})+
+ )*
+ /xo #/
+ return regex
+ end
+
+ # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+ # YAML data parser. It will load a YAML document with a collection of key:
+ # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+ # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+ def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
+ path = File.join(lexpath, lexicon)
  fh = File.open(path, 'r')
  while line = fh.gets
  /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
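The naive parser's regexes are easiest to read against a sample line. The step that splits the second capture into items is elided from these hunks, so the comma split below is an assumption made for illustration:

    line = 'det: { jj: 0.2, nn: 0.5, vb: 0.0002 }'
    /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
    key, body = $1, $2             # => "det", "jj: 0.2, nn: 0.5, vb: 0.0002"
    pairs = {}
    body.split(', ').each do |i|   # assumed split; not shown in the diff
      /([^:]+):\s*(.+)/ =~ i
      pairs[$1] = $2.to_f
    end
    pairs                          # => {"jj"=>0.2, "nn"=>0.5, "vb"=>0.0002}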
@@ -837,19 +889,19 @@
  pairs = {}
  items.each do |i|
  /([^:]+):\s*(.+)/ =~ i
- pairs[$1] = $2.to_f
+ pairs[$1] = $2.to_f
  end
  @@hmm[key] = pairs
  end
  fh.close
  end
 
- # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
- # YAML data parser. It will load a YAML document with a collection of key:
- # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
- # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
- def load_words(lexicon)
- path = File.join($lexpath, lexicon)
+ # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+ # YAML data parser. It will load a YAML document with a collection of key:
+ # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+ # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+ def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
+ path = File.join(lexpath, lexicon)
  fh = File.open(path, 'r')
  while line = fh.gets
  /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
@@ -860,15 +912,14 @@
  pairs = {}
  items.each do |i|
  /([^:]+):\s*(.+)/ =~ i
- pairs[$1] = $2.to_f
+ pairs[$1] = $2.to_f
  end
  @@lexicon[key] = pairs
  end
  fh.close
  end
-
- #memoize the stem and assign_tag methods
+
+ #memoize the stem and assign_tag methods
  memoize("stem")
- memoize("assign_tag")
+ memoize("assign_tag")
  end
-