engtagger 0.3.2 → 0.4.0

data/lib/engtagger.rb CHANGED
@@ -1,20 +1,21 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
-require 'rubygems'
-require 'engtagger/porter'
-require 'lru_redux'
+# frozen_string_literal: true
+
+require "rubygems"
+require "lru_redux"
+require_relative "engtagger/porter"
 
 module BoundedSpaceMemoizable
-  def memoize(method, max_cache_size=100000)
+  def memoize(method, max_cache_size = 100_000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
-    module_eval <<-EOF
+    module_eval <<-MODEV
      def #{method}(*a)
        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
      end
-    EOF
+    MODEV
   end
 end
 
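Note: BoundedSpaceMemoizable trades unbounded memoization for an LRU cache capped at max_cache_size argument lists, so a long-running tagger cannot grow without limit. A minimal sketch of the behavior, reusing the module exactly as defined above (the Slow class and its cache size of 2 are hypothetical, for illustration only):

    require "engtagger" # loads BoundedSpaceMemoizable and lru_redux

    class Slow
      extend BoundedSpaceMemoizable

      def square(n)
        sleep 0.1 # stand-in for an expensive computation
        n * n
      end
      memoize("square", 2) # keep only the 2 most recently used argument lists
    end

    s = Slow.new
    s.square(3)               # computed, then cached
    s.square(3)               # served from the LRU cache
    s.square(4); s.square(5)  # evicts the oldest entry, bounding memory
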
@@ -23,7 +24,7 @@ class EngTagger
   extend BoundedSpaceMemoizable
 
   # File paths
-  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), "engtagger")
   DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
   DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
@@ -36,7 +37,7 @@ class EngTagger
   # @return [Hash] the probability data
   #
   def self.hmm
-    return @@hmm
+    @@hmm
   end
 
   # Return a class variable that holds lexical data.
@@ -44,46 +45,47 @@ class EngTagger
   # @return [Hash] the lexicon
   #
   def self.lexicon
-    return @@lexicon
+    @@lexicon
   end
 
   # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
-    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+
+    Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
 
   # Regexps to match XML-style part-of-speech tags
-  NUM = get_ext('cd')
-  GER = get_ext('vbg')
-  ADJ = get_ext('jj[rs]*')
-  NN = get_ext('nn[sp]*')
-  NNP = get_ext('nnp')
-  PREP = get_ext('in')
-  DET = get_ext('det')
-  PAREN = get_ext('[lr]rb')
-  QUOT = get_ext('ppr')
-  SEN = get_ext('pp')
-  WORD = get_ext('\w+')
-  VB = get_ext('vb')
-  VBG = get_ext('vbg')
-  VBD = get_ext('vbd')
-  PART = get_ext('vbn')
-  VBP = get_ext('vbp')
-  VBZ = get_ext('vbz')
-  JJ = get_ext('jj')
-  JJR = get_ext('jjr')
-  JJS = get_ext('jjs')
-  RB = get_ext('rb')
-  RBR = get_ext('rbr')
-  RBS = get_ext('rbs')
-  RP = get_ext('rp')
-  WRB = get_ext('wrb')
-  WDT = get_ext('wdt')
-  WP = get_ext('wp')
-  WPS = get_ext('wps')
-  CC = get_ext('cc')
-  IN = get_ext('in')
+  NUM = get_ext("cd")
+  GER = get_ext("vbg")
+  ADJ = get_ext("jj[rs]*")
+  NN = get_ext("nn[sp]*")
+  NNP = get_ext("nnp")
+  PREP = get_ext("in")
+  DET = get_ext("det")
+  PAREN = get_ext("[lr]rb")
+  QUOT = get_ext("ppr")
+  SEN = get_ext("pp")
+  WORD = get_ext("\w+")
+  VB = get_ext("vb")
+  VBG = get_ext("vbg")
+  VBD = get_ext("vbd")
+  PART = get_ext("vbn")
+  VBP = get_ext("vbp")
+  VBZ = get_ext("vbz")
+  JJ = get_ext("jj")
+  JJR = get_ext("jjr")
+  JJS = get_ext("jjs")
+  RB = get_ext("rb")
+  RBR = get_ext("rbr")
+  RBS = get_ext("rbs")
+  RP = get_ext("rp")
+  WRB = get_ext("wrb")
+  WDT = get_ext("wdt")
+  WP = get_ext("wp")
+  WPS = get_ext("wps")
+  CC = get_ext("cc")
+  IN = get_ext("in")
 
   # Convert a Treebank-style, abbreviated tag into verbose definitions
   #
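Note: the constant block above just precompiles get_ext for each Treebank tag; each constant matches one XML-style tagged span. A quick sketch, assuming the gem is installed (the pattern shown in the comment is approximate):

    require "engtagger"

    EngTagger.get_ext("vbz")  # => a Regexp along the lines of /<vbz>[^<]+<\/vbz> */
    EngTagger::NN =~ "<det>the</det> <nn>dog</nn>"  # matches the <nn>dog</nn> span
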
@@ -92,11 +94,7 @@ class EngTagger
   #
   def self.explain_tag(tag)
     tag = tag.to_s.downcase
-    if TAGS[tag]
-      return TAGS[tag]
-    else
-      return tag
-    end
+    TAGS[tag] || tag
   end
 
   # The folloging is to make a hash to convert a pos tag to its definition
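Note: explain_tag is now a single expression. Given the TAGS table assembled just below (tag names downcased, definitions snake_cased), for example:

    require "engtagger"

    EngTagger.explain_tag("lrb")    # => "punctuation_left_bracket" (from the "LRB" entry)
    EngTagger.explain_tag("bogus")  # => "bogus" -- unknown tags fall through unchanged
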
@@ -147,8 +145,8 @@ class EngTagger
     "LRB", "Punctuation, left bracket",
     "RRB", "Punctuation, right bracket"
   ]
-  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
-  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+  tags = tags.collect { |t| t.downcase.gsub(/[.,'\-\s]+/, "_") }
+  tags = tags.collect { |t| t.gsub("&", "and").gsub("/", "or") }
   TAGS = Hash[*tags]
 
   # Hash storing config values:
@@ -190,30 +188,30 @@ class EngTagger
   # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
-    @conf = Hash.new
-    @conf[:unknown_word_tag] = ''
+    @conf = {}
+    @conf[:unknown_word_tag] = ""
     @conf[:stem] = false
     @conf[:weight_noun_phrases] = false
     @conf[:longest_noun_phrase] = 5
     @conf[:relax] = false
-    @conf[:tag_lex] = 'tags.yml'
-    @conf[:word_lex] = 'words.yml'
-    @conf[:unknown_lex] = 'unknown.yml'
+    @conf[:tag_lex] = "tags.yml"
+    @conf[:word_lex] = "words.yml"
+    @conf[:unknown_lex] = "unknown.yml"
     @conf[:word_path] = DEFAULT_WORDPATH
     @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
     @conf.merge!(params) if params
-    unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
+    if !File.exist?(@conf[:word_path]) || !File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-      @@hmm = Hash.new
-      @@lexicon = Hash.new
+      @@hmm = {}
+      @@lexicon = {}
     else
-      lexf = File.open(@conf[:word_path], 'r')
+      lexf = File.open(@conf[:word_path], "r")
       @@lexicon = Marshal.load(lexf)
       lexf.close
-      hmmf = File.open(@conf[:tag_path], 'r')
+      hmmf = File.open(@conf[:tag_path], "r")
       @@hmm = Marshal.load(hmmf)
       hmmf.close
     end
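Note: the constructor merges user-supplied params over these defaults, so any @conf key can be overridden at construction time. The values below are illustrative, not recommendations:

    require "engtagger"

    tgr = EngTagger.new(longest_noun_phrase: 3, weight_noun_phrases: true, relax: true)
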
@@ -235,7 +233,7 @@ class EngTagger
     out = clean_text(text).map do |word|
       cleaned_word = clean_word word
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
+      @conf[:current_tag] = tag = tag && !tag.empty? ? tag : "nn"
       [word, tag.to_sym]
     end
 
@@ -254,18 +252,18 @@ class EngTagger
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = []
     words = clean_text(text)
-    tags = Array.new
     words.each do |word|
       cleaned_word = clean_word(word)
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+      @conf[:current_tag] = tag = tag && tag != "" ? tag : "nn"
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged << '<' + tag + '>' + word + '</' + tag + '>'
+      tagged << "<#{tag}>#{word}</#{tag}>"
     end
     reset
-    return tagged.join(' ')
+    tagged.join(" ")
   end
 
   # Given a text string, return as many nouns and noun phrases as possible.
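Note: add_tags wraps each token in an XML-style tag pair (now via string interpolation) and resets the tagger for the next call. Exact tags depend on the bundled lexicon, so the output here is indicative:

    require "engtagger"

    tgr = EngTagger.new
    tgr.add_tags("The dog runs")
    # => e.g. "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>"
    tgr.add_tags("The dog runs", true)  # verbose form expands tags via explain_tag
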
@@ -277,11 +275,12 @@ class EngTagger
   #
   def get_words(text)
     return false unless valid_text(text)
+
     tagged = add_tags(text)
-    if(@conf[:longest_noun_phrase] <= 1)
-      return get_nouns(tagged)
+    if @conf[:longest_noun_phrase] <= 1
+      get_nouns(tagged)
     else
-      return get_noun_phrases(tagged)
+      get_noun_phrases(tagged)
     end
   end
 
@@ -289,29 +288,29 @@ class EngTagger
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = add_tags(text, verbose)
-    tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
-      #!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
-      $1 + '/' + $2.upcase
+    tagged.gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
+      "#{$1}/#{$2.upcase}"
     end
   end
 
   # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
+
     tagged = add_tags(text)
-    sentences = Array.new
-    tagged.split(/<\/pp>/).each do |line|
+    sentences = []
+    tagged.split(%r{</pp>}).each do |line|
       sentences << strip_tags(line)
     end
     sentences = sentences.map do |sentence|
-      sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
-      sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
-      sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
-      sentence.gsub(Regexp.new(" (\W+)$")){$1}
-      sentence.gsub(Regexp.new("^(`+) ")){$1}
+      sentence.gsub(Regexp.new(" ('s?) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (\W+) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (`+) ")) { " " + $1 }
+      sentence.gsub(Regexp.new(" (\W+)$")) { $1 }
+      sentence.gsub(Regexp.new("^(`+) ")) { $1 }
     end
-    return sentences
   end
 
   # Given a POS-tagged text, this method returns a hash of all proper nouns
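Note: get_readable reshapes the XML-style output into word/TAG pairs, and get_sentences splits the tagged text on the sentence-final </pp> tag before stripping tags. Output shapes are indicative:

    require "engtagger"

    tgr = EngTagger.new
    tgr.get_readable("The dog runs quickly.")
    # => e.g. "The/DET dog/NN runs/VBZ quickly/RB ./PP"
    tgr.get_sentences("It rained. We stayed inside.")
    # => e.g. ["It rained .", "We stayed inside ."]
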
@@ -321,30 +320,31 @@ class EngTagger
   # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NNP]
     nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
-    nnp.keys.each do |key|
+    nnp.each_key do |key|
       words = key.split(/\s/)
       # Let's say this is an organization's name --
       # (and it's got at least three words)
       # is there a corresponding acronym in this hash?
-      if words.length > 2
-        # Make a (naive) acronym out of this name
-        acronym = words.map do |word|
-          /\A([a-z])[a-z]*\z/ =~ word
-          $1
-        end.join ''
-        # If that acronym has been seen,
-        # remove it and add the values to
-        # the full name
-        if nnp[acronym]
-          nnp[key] += nnp[acronym]
-          nnp.delete(acronym)
-        end
+      next if words.length <= 2
+
+      # Make a (naive) acronym out of this name
+      acronym = words.map do |word|
+        /\A([a-z])[a-z]*\z/ =~ word
+        $1
+      end.join " "
+      # If that acronym has been seen,
+      # remove it and add the values to
+      # the full name
+      if nnp[acronym]
+        nnp[key] += nnp[acronym]
+        nnp.delete(acronym)
       end
     end
-    return nnp
+    nnp
   end
 
   # Given a POS-tagged text, this method returns all nouns and their
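Note: get_proper_nouns expects already-tagged input, counts the <nnp> spans, and (for names of three or more words) folds a matching acronym's count into the full name. A hedged usage sketch:

    require "engtagger"

    tgr = EngTagger.new
    tagged = tgr.add_tags("Alice met Bob in London")
    tgr.get_proper_nouns(tagged)
    # => e.g. {"Alice" => 1, "Bob" => 1, "London" => 1}
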
@@ -355,6 +355,7 @@ class EngTagger
   #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -368,6 +369,7 @@ class EngTagger
   #
   def get_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB, VBD, VBG, PART, VBP, VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -379,6 +381,7 @@ class EngTagger
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -389,6 +392,7 @@ class EngTagger
   #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBD]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -399,6 +403,7 @@ class EngTagger
   #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBG]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -409,6 +414,7 @@ class EngTagger
   #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [PART]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -419,6 +425,7 @@ class EngTagger
   #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -429,6 +436,7 @@ class EngTagger
   #
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -439,6 +447,7 @@ class EngTagger
   #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -449,6 +458,7 @@ class EngTagger
   #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJR]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -459,6 +469,7 @@ class EngTagger
   #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -469,6 +480,7 @@ class EngTagger
   #
   def get_adverbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [RB, RBR, RBS, RP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -479,13 +491,14 @@ class EngTagger
   #
   def get_interrogatives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [WRB, WDT, WP, WPS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # To be consistent with documentation's naming of 'interrogative'
   # parts of speech as 'question'
-  alias_method :get_question_parts, :get_interrogatives
+  alias get_question_parts get_interrogatives
 
   # Returns all types of conjunctions and does not discriminate
   # between the various kinds. E.g. coordinating, subordinating,
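Note: every get_* extractor above follows the same template -- validate, select a list of tag regexps, and build a frequency hash -- and the Ruby-style alias keeps get_question_parts working. Indicative usage:

    require "engtagger"

    tgr = EngTagger.new
    tagged = tgr.add_tags("Why did the quick brown fox jump so quickly?")
    tgr.get_adjectives(tagged)      # => e.g. {"quick" => 1, "brown" => 1}
    tgr.get_interrogatives(tagged)  # => e.g. {"Why" => 1}
    tgr.get_question_parts(tagged)  # same result through the alias
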
@@ -496,6 +509,7 @@ class EngTagger
   #
   def get_conjunctions(tagged)
     return nil unless valid_text(tagged)
+
     tags = [CC, IN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -508,14 +522,15 @@ class EngTagger
   #
   def get_max_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     tags = [@@mnp]
     mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
-      p = stem(p) unless p =~ /\s/ # stem single words
+      p = stem(p) unless p =~ /\s/ # stem single words
       ret[p] += 1 unless p =~ /\A\s*\z/
     end
-    return ret
+    ret
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
@@ -525,9 +540,10 @@ class EngTagger
   #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-    scanned = tagged.scan(@@mnp)
+    scanned = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -540,14 +556,14 @@ class EngTagger
       # shortening the phrase by removing the word in the first position.
       # Record the phrase and any single nouns that are found
       words = mnp.split
-      words.length.times do |i|
-        found[words.join(' ')] += 1 if words.length > 1
+      words.length.times do
+        found[words.join(" ")] += 1 if words.length > 1
         w = words.shift
         found[w] += 1 if w =~ /#{NN}/
       end
     end
     ret = Hash.new(0)
-    found.keys.each do |f|
+    found.each_key do |f|
       k = strip_tags(f)
       v = found[f]
       # We weight by the word count to favor long noun phrases
@@ -555,12 +571,13 @@ class EngTagger
       word_count = space_count.length + 1
       # Throttle MNPs if necessary
      next if word_count > @conf[:longest_noun_phrase]
-      k = stem(k) unless word_count > 1 # stem single words
+
+      k = stem(k) unless word_count > 1 # stem single words
      multiplier = 1
      multiplier = word_count if @conf[:weight_noun_phrases]
      ret[k] += multiplier * v
    end
-    return ret
+    ret
  end
 
  # Reads some included corpus data and saves it in a stored hash on the
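Note: get_noun_phrases counts every sub-phrase of each maximal noun phrase; with :weight_noun_phrases set it multiplies each count by the phrase's word count, and :longest_noun_phrase throttles phrase length. Counts below are indicative only:

    require "engtagger"

    tgr = EngTagger.new(weight_noun_phrases: true, longest_noun_phrase: 5)
    tagged = tgr.add_tags("The fat cat sat on the fat mat")
    tgr.get_noun_phrases(tagged)
    # => e.g. {"fat cat" => 2, "cat" => 1, "fat mat" => 2, "mat" => 1}
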
@@ -571,10 +588,10 @@ class EngTagger
     load_tags(@conf[:tag_lex])
     load_words(@conf[:word_lex])
     load_words(@conf[:unknown_lex])
-    File.open(@conf[:word_path], 'w') do |f|
+    File.open(@conf[:word_path], "w") do |f|
       Marshal.dump(@@lexicon, f)
     end
-    File.open(@conf[:tag_path], 'w') do |f|
+    File.open(@conf[:tag_path], "w") do |f|
       Marshal.dump(@@hmm, f)
     end
   end
@@ -596,6 +613,7 @@ class EngTagger
     trimmed.each do |n|
       n = stem(n)
       next unless n.length < 100 # sanity check on word length
+
       ret[n] += 1 unless n =~ /\A\s*\z/
     end
     ret
@@ -603,25 +621,24 @@ class EngTagger
 
   # Downcase the first letter of word
   def lcfirst(word)
-    word.split(//)[0].downcase + word.split(//)[1..-1].join
+    word.split(//)[0].downcase + word.split(//)[1..].join
   end
 
   # Upcase the first letter of word
   def ucfirst(word)
-    word.split(//)[0].upcase + word.split(//)[1..-1].join
+    word.split(//)[0].upcase + word.split(//)[1..].join
   end
 
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
-    return word unless @conf[:stem]
-    return word.stem
+    @conf[:stem] ? word.stem : word
   end
 
   # This method will reset the preceeding tag to a sentence ender (PP).
   # This prepares the first word of a new sentence to be tagged correctly.
   def reset
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
   end
 
   # Check whether the text is a valid string
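Note: stem collapses to one conditional -- a no-op unless the tagger was built with stem: true, in which case it delegates to the bundled Porter stemmer's String#stem:

    require "engtagger"

    EngTagger.new.stem("running")              # => "running" (stemming is off by default)
    EngTagger.new(stem: true).stem("running")  # => "run", per the Porter stemmer
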
@@ -629,41 +646,38 @@ class EngTagger
     if !text
       # there's nothing to parse
       "method call on uninitialized variable" if @conf[:debug]
-      return false
+      false
     elsif /\A\s*\z/ =~ text
       # text is an empty string, nothing to parse
-      return false
+      false
     else
       # $text is valid
-      return true
+      true
     end
   end
 
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
+
     text = tagged.gsub(/<[^>]+>/m, "")
     text = text.gsub(/\s+/m, " ")
     text = text.gsub(/\A\s*/, "")
     text = text.gsub(/\s*\z/, "")
-    if downcase
-      return text.downcase
-    else
-      return text
-    end
+    downcase ? text.downcase : text
   end
 
   # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-    cleaned_text = text.encode('utf-8')
+
+    cleaned_text = text.encode("utf-8")
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
       tokenized += split_punct(line)
     end
-    words = split_sentences(tokenized)
-    return words
+    split_sentences(tokenized)
   end
 
   # This handles all of the trailing periods, keeping those that
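Note: strip_tags deletes the XML-style tags and squeezes whitespace, with an optional downcase; this behavior is directly readable off the method above:

    require "engtagger"

    tgr = EngTagger.new
    tgr.strip_tags("<det>The</det>  <nn>dog</nn>")        # => "The dog"
    tgr.strip_tags("<det>The</det>  <nn>dog</nn>", true)  # => "the dog"
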
@@ -672,27 +686,26 @@ class EngTagger
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                supt det mssrs rev)
-    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-    inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-               hwy hway la pde pd plz pl rd st tce)
-    comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+    people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                supt det mssrs rev]
+    army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+    inst = %w[dept univ assn bros ph.d]
+    place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce]
+    comp = %w[mfg inc ltd co corp]
+    state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-               va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-    misc = %w(vs etc no esp)
-    abbr = Hash.new
+               va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+    month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+    misc = %w[vs etc no esp]
+    abbr = {}
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
-    words = Array.new
-    tokenized.each_with_index do |t, i|
-      if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
-          tokenized[i] =~ /\A(.+)\.\z/
+    words = []
+    tokenized.each_with_index do |_, i|
+      if tokenized[i + 1] && tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
        w = $1
        # Don't separate the period off words that
        # meet any of the following conditions:
@@ -700,21 +713,20 @@ class EngTagger
        # 1. It is defined in one of the lists above
        # 2. It is only one letter long: Alfred E. Sloan
        # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-        unless abbr[w.downcase] or
-            [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
+        unless abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
          words << w
-          words << '.'
+          words << "."
          next
        end
      end
      words << tokenized[i]
    end
    # If the final word ends in a period..
-    if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+    if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
      words[-1] = $1
-      words.push '.'
+      words.push "."
    end
-    return words
+    words
  end
 
  # Separate punctuation from words, where appropriate. This leaves trailing
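Note: split_sentences keeps the trailing period on known abbreviations (titles, states, months, single initials, letter-dot runs like U.S.A.) and splits it off everywhere else. Via the clean_text entry point, indicatively:

    require "engtagger"

    tgr = EngTagger.new
    tgr.clean_text("Dr. Smith arrived.")
    # => e.g. ["Dr.", "Smith", "arrived", "."]
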
@@ -722,39 +734,40 @@ class EngTagger
   def split_punct(text)
     # If there's no punctuation, return immediately
     return [text] if /\A\w+\z/ =~ text
+
     # Sanity checks
     text = text.gsub(/\W{10,}/o, " ")
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + " ` " : " ` " } # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o) { $1 + " ' " } # Separate right single quotes
 
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " : ") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-    text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
-    text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+    text = text.gsub(/(\.\.\.+)/o) { " " + $1 + " " } # Shift ellipses off
+    text = text.gsub(/([(\[{}\])])/o) { " " + $1 + " " } # Shift off brackets
+    text = text.gsub(/([!?#$%;~|])/o) { " " + $1 + " " } # Shift off other ``standard'' punctuation
 
     # English-specific contractions
-    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-    text = text.gsub(/n't\b/o, " n't") # Separate off n't
-    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
-    result = text.split(' ')
-    return result
+    text = text.gsub(/([A-Za-z])'([dms])\b/o) { $1 + " '" + $2 } # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o) { " '" + $1 } # Separate off 've, 'll, 're
+    text.split(" ")
   end
 
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
-    if word == "-unknown-"
+    case word
+    when "-unknown-"
       # classify unknown words accordingly
       return @conf[:unknown_word_tag]
-    elsif word == "-sym-"
+    when "-sym-"
       # If this is a symbol, tag it as a symbol
       return "sym"
     end
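Note: split_punct normalizes quotes to ` / '' and shifts punctuation and contractions off into their own tokens before sentence handling. An indicative round trip:

    require "engtagger"

    tgr = EngTagger.new
    tgr.clean_text(%q{She isn't "done", is she?})
    # => e.g. ["She", "is", "n't", "``", "done", "''", ",", "is", "she", "?"]
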
@@ -765,13 +778,13 @@ class EngTagger
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
-    t[prev_tag].keys.each do |tag|
+    t[prev_tag].each_key do |tag|
       # With @config[:relax] set, this method
       # will also include any `open classes' of POS tags
       pw = 0
       if w[tag]
         pw = w[tag]
-      elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+      elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
         pw = 0
       else
         next
@@ -786,7 +799,7 @@ class EngTagger
         best_tag = tag
       end
     end
-    return best_tag
+    best_tag
   end
 
   # This method determines whether a word should be considered in its
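Note: the loop above is the heart of the modified Viterbi step -- each candidate tag that can follow prev_tag is scored from the bigram (transition) table and the word's emission weight, keeping the argmax. A toy rendering of the idea; both tables here are made up, not the gem's real data:

    transitions = { "pp" => { "det" => 0.4, "nn" => 0.2 } } # P(tag | prev_tag)
    emissions   = { "det" => 0.9, "nn" => 0.05 }            # word-given-tag weight

    best_tag, = transitions["pp"].max_by { |tag, p| p * emissions.fetch(tag, 0.0) }
    best_tag  # => "det"
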
@@ -796,13 +809,13 @@ class EngTagger
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
-      return word
+      word
     elsif @@lexicon[lcf]
       # seen this word only as lower case
-      return lcf
+      lcf
     else
       # never seen this word. guess.
-      return classify_unknown_word(word)
+      classify_unknown_word(word)
     end
   end
 
@@ -810,52 +823,52 @@ class EngTagger
   # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
-    if /[\(\{\[]/ =~ word # Left brackets
-      classified = "*LRB*"
-    elsif /[\)\}\]]/ =~ word # Right brackets
-      classified = "*RRB*"
-    elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
-      classified = "*NUM*"
-    elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
-      classified = "*NUM*"
-    elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
-      classified = "*ORD*"
-    elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
-      classified = "-abr-"
-    elsif /\w-\w/o =~ word # Hyphenated word
+    case word
+    when /[({\[]/ # Left brackets
+      "*LRB*"
+    when /[)}\]]/ # Right brackets
+      "*RRB*"
+    when /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
+      "*NUM*"
+    when %r{\A\d+[\d/:-]+\d\z} # Other number constructs
+      "*NUM*"
+    when /\A-?\d+\w+\z/o # Ordinal number
+      "*ORD*"
+    when /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
+      "-abr-"
+    when /\w-\w/o # Hyphenated word
       /-([^-]+)\z/ =~ word
       h_suffix = $1
-      if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+      if h_suffix && (@@lexicon[h_suffix] && @@lexicon[h_suffix]["jj"])
        # last part of this is defined as an adjective
-        classified = "-hyp-adj-"
+        "-hyp-adj-"
      else
        # last part of this is not defined as an adjective
-        classified = "-hyp-"
+        "-hyp-"
      end
-    elsif /\A\W+\z/o =~ word
-      classified = "-sym-" # Symbol
-    elsif word == ucfirst(word)
-      classified = "-cap-" # Capitalized word
-    elsif /ing\z/o =~ word
-      classified = "-ing-" # Ends in 'ing'
-    elsif /s\z/o =~ word
-      classified = "-s-" # Ends in 's'
-    elsif /tion\z/o =~ word
-      classified = "-tion-" # Ends in 'tion'
-    elsif /ly\z/o =~ word
-      classified = "-ly-" # Ends in 'ly'
-    elsif /ed\z/o =~ word
-      classified = "-ed-" # Ends in 'ed
+    when /\A\W+\z/o
+      "-sym-" # Symbol
+    when ucfirst(word)
+      "-cap-" # Capitalized word
+    when /ing\z/o
+      "-ing-" # Ends in "ing"
+    when /s\z/o
+      "-s-" # Ends in "s"
+    when /tion\z/o
+      "-tion-" # Ends in "tion"
+    when /ly\z/o
+      "-ly-" # Ends in "ly"
+    when /ed\z/o
+      "-ed-" # Ends in "ed
     else
-      classified = "-unknown-" # Completely unknown
+      "-unknown-" # Completely unknown
     end
-    return classified
   end
 
   # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
-    regex = /
+    /
      # optional number, gerund - adjective -participle
      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
      # Followed by one or more nouns
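Note: the rewritten classify_unknown_word keeps the same fall-through order, so the first matching class wins (the "-ing-"/"-s-"/"-ed-" suffix buckets only apply after the capitalization check). A quick probe, using send since the method is internal to the tagger:

    require "engtagger"

    tgr = EngTagger.new
    %w[12.5 3rd NASA Paris running].map { |w| tgr.send(:classify_unknown_word, w) }
    # => ["*NUM*", "*ORD*", "-abr-", "-cap-", "-ing-"]
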
@@ -868,8 +881,7 @@ class EngTagger
      # one or more nouns
      (?:#{NN})+
      )*
-    /xo #/
-    return regex
+    /xo
   end
 
   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
@@ -878,12 +890,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key, data = $1, $2
-      tags = Hash.new
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
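Note: the parsing regexp above expects one map per line, exactly as the comment shows. Its behavior on a sample line is directly reproducible in irb:

    line = "det: { jj: 0.2, nn: 0.5, vb: 0.0002 }"
    /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
    $1  # => "det"
    $2  # => "jj: 0.2, nn: 0.5, vb: 0.0002"
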
@@ -901,12 +914,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key, data = $1, $2
-      tags = Hash.new
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -918,7 +932,7 @@ class EngTagger
     fh.close
   end
 
-  #memoize the stem and assign_tag methods
+  # memoize the stem and assign_tag methods
   memoize("stem")
   memoize("assign_tag")
 end
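Note: none of the refactoring above changes the public API; typical 0.4.0 usage is the same as in 0.3.2:

    require "engtagger"

    tgr = EngTagger.new
    text = "Alice chased the big fat cat."
    tagged = tgr.add_tags(text)  # XML-style tagged string
    tgr.get_words(text)          # nouns / noun phrases with counts
    tgr.get_readable(text)       # "word/TAG" rendering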